In [136]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

from urllib.parse import quote

# Connection settings are read from a local file rather than written in the notebook.
with open("config.json") as config_file:
    config = json.load(config_file)

# Build the SQLAlchemy URL.
# - "postgresql" is the dialect name; the legacy "postgres" spelling is rejected by
#   SQLAlchemy 1.4 and later.
# - user and password are percent-encoded so characters such as "@", ":" or "/"
#   cannot be mistaken for URL delimiters.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    quote(str(config['user']), safe=''),
    quote(str(config['password']), safe=''),
    config['host'],
    config['port'],
    config['database'],
)

# Unqualified table names in the queries resolve against this schema via search_path.
# Falls back to the previously hard-coded value when the config has no "schema" key.
dbschema = config.get('schema', 'augur_data')
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [137]:
# Two headline counts, returned as (data_point, count) rows:
#   - the number of rows in pull_requests
#   - the number of rows in repo_info, summed across all repos ('platform metadata')
# The query text has no placeholders, so it is a plain string rather than an f-string.
# The label spelling inside the SQL is left untouched because it is emitted as data.
check_prs_query = """
select 'pull requets collected' as data_point, count(*) as count from pull_requests
union 
SELECT
data_point, 
SUM ( COUNT ) AS COUNT 
FROM
(
SELECT 'platform metadata' as data_point, 
repo_id,
SUM ( last_collected ) AS COUNT 
FROM
( SELECT repo_id, MAX ( data_collection_date ), COUNT ( * ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) A 
GROUP BY
repo_id
) b group by data_point;
"""
# read_sql_query builds the frame itself, so no empty DataFrame is created first.
check_prs = pd.read_sql_query(check_prs_query, con=engine)
print(check_prs)

               data_point     count
0       platform metadata  210715.0
1  pull requets collected  305418.0


In [138]:
# Per-repository comparison of the pull-request count reported in the platform
# metadata (repo_info.pull_request_count) with the rows actually stored in
# pull_requests.
#   yy: one row per repo that has repo_info rows with pull_request_count >= 1
#       (largest reported count, and how many such metadata rows exist).
#   zz: for repos whose repo_git contains github.com, the number of rows in
#       pull_requests set against the pull_request_count of the most recent
#       repo_info row; ratio_abs = |(collected + 1) / (reported + 1)|, so a value
#       below 1 means fewer pull requests were collected than reported.
# The LEFT OUTER JOIN keeps yy repos that have no zz match (zz columns come back NULL).
# NOTE(review): SELECT * returns repo_id and repo_name from both yy and zz, so the
# resulting DataFrame carries duplicate column names.
# NOTE(review): the f-prefix is unnecessary here -- the query has no {placeholders}.
missing_query = f"""
SELECT
    * 
FROM
    (
    SELECT
        repo_info.repo_id,
        repo.repo_name,
repo.repo_git, 
        MAX ( pull_request_count ) AS max_pr_count,
        COUNT ( * ) AS meta_count 
    FROM
        repo_info,
        repo -- WHERE issues_enabled = 'true' 
    WHERE
        pull_request_count >= 1
        AND repo.repo_id = repo_info.repo_id 
    GROUP BY
        repo_info.repo_id,
        repo.repo_name, 
 repo.repo_git 
    ORDER BY
        repo_info.repo_id,
        repo.repo_name 
    ) yy
    LEFT OUTER JOIN (
    SELECT A
        .repo_id,
        A.repo_name,
        b.pull_request_count,
        d.repo_id AS pull_request_repo_id,
        e.last_collected,
        f.last_pr_collected,
        COUNT ( * ) AS pull_requests_collected,
        ( b.pull_request_count - COUNT ( * ) ) AS pull_requests_missing,
        ABS ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_abs,
        ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_prs 
    FROM
        augur_data.repo A,
        augur_data.pull_requests d,
        augur_data.repo_info b,
        ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e,
        ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM augur_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f 
    WHERE
        A.repo_id = b.repo_id 
        AND LOWER ( A.repo_git ) LIKE'%github.com%' 
        AND A.repo_id = d.repo_id 
        AND b.repo_id = d.repo_id 
        AND e.repo_id = A.repo_id 
        AND b.data_collection_date = e.last_collected 
        AND f.repo_id = A.repo_id -- AND d.pull_request_id IS NULL
        
    GROUP BY
        A.repo_id,
        d.repo_id,
        b.pull_request_count,
        e.last_collected,
        f.last_pr_collected 
    ORDER BY
        ratio_abs desc
    ) zz ON yy.repo_id = zz.repo_id 
ORDER BY
    ratio_abs;
    """
# Run the comparison; rows arrive ordered by ratio_abs as requested by the outer ORDER BY.
missing_data = pd.read_sql_query(missing_query, con=engine)
print(missing_data)

      repo_id                     repo_name  \
0       30139  bitnami-docker-node-exporter   
1       30488        k8s-simple-app-example   
2       26491       steeltoe-developer-code   
3       30481                          kbld   
4       30485             setup-k14s-action   
...       ...                           ...   
2186    26408                 kibosh-sample   
2187    25460       rabbitmq-heroku-example   
2188    25506                         ranch   
2189    25606  prometheus_rabbitmq_exporter   
2190    26811             pcfbackup-release   

                                               repo_git  max_pr_count  \
0     https://github.com/bitnami/bitnami-docker-node...             3   
1        https://github.com/k14s/k8s-simple-app-example             4   
2     https://github.com/platform-acceleration-lab/s...             5   
3                          https://github.com/k14s/kbld            22   
4             https://github.com/k14s/setup-k14s-action            62 

In [139]:
# Keep only repositories whose collected/reported ratio is below 1, i.e. where
# fewer pull requests were collected than the platform metadata reports.
# Rows without a ratio never satisfy the comparison and are dropped as well.
is_incomplete = missing_data['ratio_abs'] < 1
df = missing_data.loc[is_incomplete]

# Narrow the frame to the columns needed to triage each gap.
triage_columns = ['repo_id', 'last_pr_collected', 'pull_requests_missing', 'ratio_abs', 'repo_git']
df1 = df[triage_columns]
print(df1)

    repo_id  repo_id   last_pr_collected  pull_requests_missing  ratio_abs  \
0     30139  30139.0 2020-12-07 19:36:24                    1.0   0.750000   
1     30488  30488.0 2020-12-08 02:31:04                    1.0   0.800000   
2     26491  26491.0 2020-12-07 11:40:53                    1.0   0.833333   
3     30481  30481.0 2020-12-07 22:37:50                    2.0   0.913043   
4     30485  30485.0 2020-12-27 16:39:57                    4.0   0.936508   
..      ...      ...                 ...                    ...        ...   
78    30661  30661.0 2020-12-07 23:20:11                    1.0   0.998695   
79    25663  25663.0 2020-12-29 11:42:58                    4.0   0.998744   
80    25657  25657.0 2020-12-21 23:55:42                    1.0   0.998771   
81    25522  25522.0 2020-12-27 17:24:53                    1.0   0.998847   
82    25760  25760.0 2020-12-29 09:05:59                    3.0   0.999339   

                                             repo_git  
0   htt

In [140]:
# Same filter and projection as the preceding cell: repositories with a
# collected/reported pull-request ratio below 1, reduced to the triage columns.
df = missing_data[missing_data['ratio_abs'].lt(1)]
df1 = df.loc[:, ['repo_id', 'last_pr_collected', 'pull_requests_missing', 'ratio_abs', 'repo_git']]
print(df1)

    repo_id  repo_id   last_pr_collected  pull_requests_missing  ratio_abs  \
0     30139  30139.0 2020-12-07 19:36:24                    1.0   0.750000   
1     30488  30488.0 2020-12-08 02:31:04                    1.0   0.800000   
2     26491  26491.0 2020-12-07 11:40:53                    1.0   0.833333   
3     30481  30481.0 2020-12-07 22:37:50                    2.0   0.913043   
4     30485  30485.0 2020-12-27 16:39:57                    4.0   0.936508   
..      ...      ...                 ...                    ...        ...   
78    30661  30661.0 2020-12-07 23:20:11                    1.0   0.998695   
79    25663  25663.0 2020-12-29 11:42:58                    4.0   0.998744   
80    25657  25657.0 2020-12-21 23:55:42                    1.0   0.998771   
81    25522  25522.0 2020-12-27 17:24:53                    1.0   0.998847   
82    25760  25760.0 2020-12-29 09:05:59                    3.0   0.999339   

                                             repo_git  
0   htt

In [141]:
# Print, one per line, the latest pull-request collection timestamp of every
# under-collected repository.
for last_collected_at in df1['last_pr_collected']:
    print(last_collected_at)

2020-12-07 19:36:24
2020-12-08 02:31:04
2020-12-07 11:40:53
2020-12-07 22:37:50
2020-12-27 16:39:57
2020-12-27 10:19:09
2020-12-27 12:13:46
2020-12-27 14:41:20
2020-12-27 13:42:29
2020-12-07 19:29:47
2020-12-27 13:11:01
2020-12-27 12:54:17
2020-12-27 13:10:45
2020-12-07 08:07:12
2020-12-07 18:27:23
2020-12-07 16:18:14
2020-12-07 21:22:48
2020-12-27 10:54:42
2020-12-07 14:02:03
2020-12-07 16:45:35
2020-12-27 13:25:37
2020-12-27 13:35:04
2020-12-07 14:37:56
2020-12-10 10:13:24
2020-12-29 11:30:05
2020-12-07 19:26:59
2020-12-27 13:08:11
2020-12-27 13:09:40
2020-12-07 19:07:58
2020-12-10 16:23:30
2020-12-08 19:38:28
2020-12-07 19:51:34
2020-12-27 15:48:51
2020-12-07 23:28:50
2020-12-07 19:13:12
2020-12-07 15:58:50
2020-12-07 16:36:55
2020-12-07 09:44:58
2020-12-07 08:34:38
2020-12-29 07:32:39
2020-12-07 18:42:47
2020-12-07 16:25:44
2020-12-07 09:08:42
2020-12-07 09:41:57
2020-12-07 18:52:43
2020-12-07 08:18:34
2020-12-07 21:48:23
2020-12-29 06:07:59
2020-12-27 14:00:00
2020-12-29 08:26:33


In [142]:
#Repos for gathering data

import datetime

# Reporting window: the 180 days that end on the last day of the previous month.
current = datetime.date.today()
today = f"'{current}'"

# Stepping back one day from the first of this month lands on the final day of
# the previous month, whatever that month's length.
first_current = current.replace(day=1)
last_month = first_current - datetime.timedelta(days=1)
start = last_month - datetime.timedelta(days=180)

# The dates are wrapped in single quotes because the SQL that consumes them
# splices the values in verbatim.
# NOTE(review): end_date is a bare date compared with <= against a timestamp
# column; if the database reads it as midnight, commits made later on that last
# day are excluded -- confirm this is intended.
end_date = f"'{last_month}'"
start_date = f"'{start}'"

def get_commits_by_repo(start_date, end_date, engine):
    """Count distinct commits per repository inside a date window.

    Commits are matched on ``cmt_author_timestamp`` (inclusive on both ends) and
    rows whose ``cmt_author_name`` hits any of the exclusion filters in the query
    (automation, pipeline, bot and CI style author names) are left out.

    Parameters
    ----------
    start_date, end_date : str
        SQL date literals that ALREADY include their surrounding single quotes,
        e.g. ``"'2020-06-03'"``. They are spliced into the statement verbatim,
        so only pass trusted, locally generated values -- never user input.
    engine
        Connection handed to ``pandas.read_sql_query`` as ``con``.

    Returns
    -------
    pandas.DataFrame
        One row per repository with the distinct-commit count, ``repo_id``,
        ``repo_name`` and ``repo_path``, ordered by ascending commit count.
        The count column is unaliased; downstream code in this notebook reads
        it as ``count``.
    """
    # The doubled percent signs are kept exactly as written; consecutive LIKE
    # wildcards match the same strings as a single one.
    repo_list_commits_query = f"""
            SELECT COUNT(DISTINCT commits.cmt_commit_hash), repo.repo_id, repo.repo_name, repo.repo_path from repo, commits
            WHERE 
                repo.repo_id = commits.repo_id
                AND commits.cmt_author_timestamp >= {start_date}
                AND commits.cmt_author_timestamp <= {end_date}
                AND cmt_author_name NOT LIKE '%%utomation%%'
                AND cmt_author_name NOT LIKE '%%ipeline%%'
                AND cmt_author_name NOT LIKE '%%Cloud Foundry%%'
                AND cmt_author_name NOT LIKE 'snyk%%'
                AND cmt_author_name NOT LIKE '%%bot'
                AND cmt_author_name NOT LIKE 'dependabot%%'
                AND cmt_author_name NOT LIKE '%%Bot'
                AND cmt_author_name NOT LIKE '%%BOT'
                AND cmt_author_name != 'cfcr'
                AND cmt_author_name != 'CFCR'
                AND cmt_author_name != 'Travis CI'
                AND cmt_author_name != 'Cloud Foundry London'
                AND cmt_author_name != 'pivotal-rabbitmq-ci'
                AND cmt_author_name != 'Bitnami Containers'
                AND cmt_author_name != 'Spring Operator'
                AND cmt_author_name != 'Spring Buildmaster'
            GROUP BY repo.repo_id
            ORDER BY COUNT(DISTINCT commits.cmt_commit_hash);
            """
    # read_sql_query constructs the result frame directly; no placeholder frame needed.
    return pd.read_sql_query(repo_list_commits_query, con=engine)
    
# Commit counts per repository for the reporting window.
repo_list_commits = get_commits_by_repo(start_date, end_date, engine)

# Keep only the repositories with more than 60 counted commits in that window.
is_active = repo_list_commits['count'] > 60
top = repo_list_commits[is_active]

In [143]:
# List every active repository with its collected/reported pull-request ratio.
print('repo_id ratio repo_url')
i = 0
for index, repo in top.iterrows():
    repo_id = repo['repo_id']

    # pull_request_repo_id is only populated for repos that matched the zz side
    # of the join in missing_query, so the selection can legitimately be empty.
    matches = missing_data.loc[missing_data['pull_request_repo_id'] == repo_id, 'ratio_abs']
    if matches.empty:
        # No ratio available: print a placeholder padded to the usual width.
        # This is an explicit check instead of a bare except, so genuine
        # errors (e.g. a missing column) are no longer silently swallowed.
        ratio_abs = 'NA  '
    else:
        ratio_abs = format(matches.iloc[0], '.2f')

    repo_info = str(repo_id) + '   ' + str(ratio_abs) + '  ' + repo['repo_path'] + repo['repo_name']
    print(repo_info)
    i += 1

print('Number of repos:', i)

repo_id ratio repo_url
30545   1.17  github.com/spring-projects-experimental/spring-graphql
25736   1.00  github.com/Spring-projects/spring-data-rest
30661   1.00  github.com/spring-cloud/spring-cloud-stream
27435   1.00  github.com/pivotal-cf/docs-cloud-cache
30434   1.00  github.com/vmware-samples/rabbitmq-oauth-example-app
30660   1.00  github.com/spring-cloud/spring-cloud-build
30630   1.00  github.com/spring-cloud/spring-cloud-stream-binder-kafka
30662   1.00  github.com/spring-cloud/spring-cloud-zookeeper
28028   1.00  github.com/vmware/captive-web-view
30477   0.95  github.com/k14s/kapp-controller
30334   1.00  github.com/vmware-labs/service-bindings
28126   1.00  github.com/concourse/hush-house
28117   1.00  github.com/concourse/registry-image-resource
30481   0.91  github.com/k14s/kbld
27511   1.00  github.com/pivotal-cf/docs-pas
25635   1.00  github.com/spring-io/spring-javaformat
30650   1.00  github.com/spring-cloud/spring-cloud-gcp
27940   0.99  github.com/tern-tools/tern


In [144]:
# List only the active repositories whose pull-request collection is incomplete
# (ratio, rounded to two decimals, below 1).
print('repo_id ratio repo_url')
i = 0
for index, repo in top.iterrows():
    repo_id = repo['repo_id']

    matches = missing_data.loc[missing_data['pull_request_repo_id'] == repo_id, 'ratio_abs']
    if matches.empty:
        # No ratio for this repo, so it cannot be reported as incomplete. An
        # explicit check replaces the bare except + 99.0 sentinel so that real
        # errors are not hidden.
        continue

    # Round first, then compare: a ratio that rounds up to 1.00 is treated as complete.
    ratio_abs = float(format(matches.iloc[0], '.2f'))
    if ratio_abs < 1:
        # Fixed two-decimal rendering keeps the columns aligned (0.90, not 0.9).
        repo_info = str(repo_id) + '   ' + format(ratio_abs, '.2f') + '  ' + repo['repo_path'] + repo['repo_name']
        print(repo_info)
        i += 1

print('Number of repos:', i)

repo_id ratio repo_url
30477   0.95  github.com/k14s/kapp-controller
30481   0.91  github.com/k14s/kbld
27940   0.99  github.com/tern-tools/tern
28147   0.98  github.com/goharbor/harbor-helm
25656   0.99  github.com/Spring-projects/spring-amqp
30125   0.99  github.com/bitnami/kube-prod-runtime
30485   0.94  github.com/k14s/setup-k14s-action
28149   0.99  github.com/goharbor/website
28014   0.98  github.com/vmware/declarative-cluster-management
30489   0.97  github.com/k14s/ytt
25776   0.99  github.com/Spring-projects/spring-data-elasticsearch
25823   0.99  github.com/Spring-projects/spring-kafka
27850   0.99  github.com/vmware/govmomi
25435   0.99  github.com/rabbitmq/rabbitmq-dotnet-client
30898   0.99  github.com/micrometer-metrics/micrometer
28143   0.98  github.com/goharbor/harbor-operator
30455   0.99  github.com/kubeapps/kubeapps
27512   0.98  github.com/pivotal-cf/docs-ops-manager
27950   0.99  github.com/vmware/differential-datalog
28037   0.98  github.com/vmware-tanzu/octant
3

In [None]:
repo_id ratio repo_url
25657   0.54  github.com/Spring-projects/spring-batch
25441   0.77  github.com/rabbitmq/rabbitmq-management
25650   0.97  github.com/Spring-projects/spring-data-commons
25435   0.64  github.com/rabbitmq/rabbitmq-dotnet-client
30039   0.33  github.com/bitnami/charts
25663   0.33  github.com/Spring-projects/spring-framework
25432   0.29  github.com/rabbitmq/rabbitmq-server
Number of repos: 7

In [107]:
# Display the configuration in use, masking the password so the credential is
# not captured in the notebook's saved output.
with open("config.json") as config_file:
    shown_config = json.load(config_file)
if "password" in shown_config:
    shown_config["password"] = "********"
print(json.dumps(shown_config, indent=4, sort_keys=True))

{
    "connection_string": "sqlite:///:memory:",
    "database": "panthers_project",
    "host": "vista.osshealth.io",
    "password": "********",
    "port": 5433,
    "schema": "augur_data",
    "user": "dawn",
    "user_type": "read_only"
}
