See [schema](https://oss-augur.readthedocs.io/en/dev/_images/schema.png) for details.

In [4]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

with open("config.json") as config_file:
    config = json.load(config_file)

database_connection_string = 'postgres+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [5]:
missing_query = f"""
SELECT
    * 
FROM
    (
    SELECT
        repo_info.repo_id,
        repo.repo_name,
repo.repo_git, 
        MAX ( pull_request_count ) AS max_pr_count,
        COUNT ( * ) AS meta_count 
    FROM
        repo_info,
        repo -- WHERE issues_enabled = 'true' 
    WHERE
        pull_request_count >= 1
        AND repo.repo_id = repo_info.repo_id 
    GROUP BY
        repo_info.repo_id,
        repo.repo_name, 
 repo.repo_git 
    ORDER BY
        repo_info.repo_id,
        repo.repo_name 
    ) yy
    LEFT OUTER JOIN (
    SELECT A
        .repo_id,
        A.repo_name,
        b.pull_request_count,
        d.repo_id AS pull_request_repo_id,
        e.last_collected,
        f.last_pr_collected,
        COUNT ( * ) AS pull_requests_collected,
        ( b.pull_request_count - COUNT ( * ) ) AS pull_requests_missing,
        ABS ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_abs,
        ( CAST ( ( COUNT ( * ) ) + 1 AS DOUBLE PRECISION ) / CAST ( b.pull_request_count + 1 AS DOUBLE PRECISION ) ) AS ratio_prs 
    FROM
        augur_data.repo A,
        augur_data.pull_requests d,
        augur_data.repo_info b,
        ( SELECT repo_id, MAX ( data_collection_date ) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id ) e,
        ( SELECT repo_id, MAX ( data_collection_date ) AS last_pr_collected FROM augur_data.pull_requests GROUP BY repo_id ORDER BY repo_id ) f 
    WHERE
        A.repo_id = b.repo_id 
        AND LOWER ( A.repo_git ) LIKE'%github.com%' 
        AND A.repo_id = d.repo_id 
        AND b.repo_id = d.repo_id 
        AND e.repo_id = A.repo_id 
        AND b.data_collection_date = e.last_collected 
        AND f.repo_id = A.repo_id -- AND d.pull_request_id IS NULL
        
    GROUP BY
        A.repo_id,
        d.repo_id,
        b.pull_request_count,
        e.last_collected,
        f.last_pr_collected 
    ORDER BY
        ratio_abs desc
    ) zz ON yy.repo_id = zz.repo_id 
ORDER BY
    ratio_abs;
    """
missing_data = pd.read_sql_query(missing_query, con=engine)
print(missing_data)

      repo_id                        repo_name  \
0       27511                         docs-pas   
1       27426   pcf-metrics-trace-example-node   
2       27908                          admiral   
3       27395                       redisutils   
4       27605  docs-new-relic-dotnet-buildpack   
...       ...                              ...   
1718    30413                        dev-utils   
1719    28063                     git-resource   
1720    26440                         docs-ksm   
1721    28024           terraform-provider-vmc   
1722    27938          terraform-provider-vra7   

                                               repo_git  max_pr_count  \
0                https://github.com/pivotal-cf/docs-pas             6   
1     https://github.com/pivotal-cf/pcf-metrics-trac...             8   
2                     https://github.com/vmware/admiral            20   
3              https://github.com/pivotal-cf/redisutils            14   
4     https://github.com/pivotal-c

In [2]:
table_query = f"""
SELECT
   tablename
FROM
   pg_catalog.pg_tables
WHERE
   schemaname = 'augur_data';
    """
tables = pd.read_sql_query(table_query, con=engine)
print(tables)

                   tablename
0               contributors
1                _git_census
2   contributor_affiliations
3       contributors_aliases
4       dm_repo_group_annual
5     pull_request_assignees
6        pull_request_events
7         pull_request_teams
8                repo_groups
9         repo_test_coverage
10           repos_fetch_log
11                  settings
12             unknown_cache
13               utility_log
14           working_commits
15                repo_stats
16        commit_comment_ref
17        pull_request_files
18                      repo
19                  releases
20              analysis_log
21      chaoss_metric_status
22            commit_parents
23      contributors_history
24            dm_repo_annual
25                   commits
26     dm_repo_group_monthly
27      dm_repo_group_weekly
28           dm_repo_monthly
29            dm_repo_weekly
30                   exclude
31                    issues
32              issue_labels
33         iss

In [4]:
desc_table_query = f"""
SELECT
   COLUMN_NAME
FROM
   information_schema.COLUMNS
WHERE
   TABLE_NAME = 'repo';
    """
table_desc = pd.read_sql_query(desc_table_query, con=engine)
print(table_desc)


                     column_name
0                        repo_id
1                  repo_group_id
2                       repo_git
3                      repo_path
4                      repo_name
5                     repo_added
6                    repo_status
7                      repo_type
8                            url
9                       owner_id
10                   description
11              primary_language
12                    created_at
13                   forked_from
14                    updated_at
15                   tool_source
16                  tool_version
17                   data_source
18          data_collection_date
19                 repo_archived
20  repo_archived_date_collected


In [5]:
get_id_query = f"""
SELECT
   rg_name, repo_group_id
FROM
   repo_groups;
    """
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

                      rg_name  repo_group_id
0            pivotal-cloudops          25167
1                 projectriff          60005
2             cf-platform-eng          25168
3   platform-acceleration-lab          25169
4                   concourse          25183
5                vmware-tanzu          25182
6                      vmware          25181
7                    appsuite          25177
8                     gemfire          25178
9                  pivotal-cf          25179
10    pivotal-cf-experimental          25180
11   Pivotal-Data-Engineering          25163
12                   cfmobile          25164
13             pivotaltracker          25165
14                   goharbor          25184
15         Pivotal-DataFabric          25162
16             projectcontour          60000
17                    bitnami          25186
18               pcfdev-forks          25166
19                carbonblack          60002
20                 tern-tools          60003
21  Pivota

In [5]:
get_id_query = f"""
SELECT
   cntrb_login, cntrb_email, cntrb_canonical, cntrb_full_name, gh_login 
FROM
   contributors
WHERE
   gh_login = 'bryanl' or cntrb_email = 'bryanliles@gmail.com' or cntrb_email = 'lilesb@vmware.com' or cntrb_email = 'bryan@Bryans-MacBook-Pro.local';
    """
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

  cntrb_login                     cntrb_email                 cntrb_canonical  \
0      bryanl                  iam@smartic.us                  iam@smartic.us   
1        None               lilesb@vmware.com  bryan@Bryans-MacBook-Pro.local   
2        None            bryanliles@gmail.com  bryan@Bryans-MacBook-Pro.local   
3        None  bryan@Bryans-MacBook-Pro.local  bryan@Bryans-MacBook-Pro.local   

  cntrb_full_name gh_login  
0            None   bryanl  
1          lilesb     None  
2     Bryan Liles     None  
3     Bryan Liles     None  


In [14]:
get_id_query = f"""
SELECT cmt_author_name, cmt_author_email 
FROM commits 
WHERE cmt_author_name LIKE '%%Richard Johnson%%'
GROUP BY cmt_author_email, cmt_author_name;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

   cmt_author_name                     cmt_author_email
0  Richard Johnson                  rjohnson@pivotal.io
1  Richard Johnson  rjohnson@Richards-MacBook-Pro.local


In [21]:
get_id_query = f"""
SELECT commits.cmt_author_name, commits.cmt_author_email, commits.repo_id, 
       contributors.cntrb_canonical, contributors.cntrb_full_name
FROM commits, contributors 
WHERE commits.repo_id = 28037
    and 
    and (commits.cmt_author_name like '%%iles%%' or commits.cmt_author_name like 'bryanl' or commits.cmt_author_email = 'iam@smartic.us')
GROUP BY commits.cmt_author_email, commits.cmt_author_name, commits.repo_id, contributors.cntrb_canonical, contributors.cntrb_full_name;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

#cntrb_canonical, cntrb_full_name 
#                           FROM contributors

      cmt_author_name                cmt_author_email  repo_id  \
0              bryanl            bryanliles@gmail.com    28037   
1              lilesb            bryanliles@gmail.com    28037   
2              bryanl            bryanliles@gmail.com    28037   
3         Bryan Liles  bryan@Bryans-MacBook-Pro.local    28037   
4         Bryan Liles            bryanliles@gmail.com    28037   
...               ...                             ...      ...   
35225          lilesb               lilesb@vmware.com    28037   
35226          lilesb               lilesb@vmware.com    28037   
35227     Bryan Liles            bryanliles@gmail.com    28037   
35228     Bryan Liles            bryanliles@gmail.com    28037   
35229     Bryan Liles            bryanliles@gmail.com    28037   

                       cntrb_canonical              cntrb_full_name  
0                   tezhang@pivotal.io                zhangteng5513  
1                  ouduobiao@gmail.com                         oudb

In [7]:
get_id_query = f"""
SELECT cntrb_canonical, canonical_full_name 
FROM commits 
WHERE cmt_author_email LIKE '%%bitnami%%'
GROUP BY cmt_author_email, cmt_author_name, repo_id;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

ProgrammingError: (psycopg2.errors.UndefinedColumn) column "cntrb_canonical" does not exist
LINE 2: SELECT cntrb_canonical, canonical_full_name 
               ^

[SQL: 
SELECT cntrb_canonical, canonical_full_name 
FROM commits 
WHERE cmt_author_email LIKE '%%bitnami%%'
GROUP BY cmt_author_email, cmt_author_name, repo_id;
]
(Background on this error at: http://sqlalche.me/e/f405)

In [5]:

repo_info_query = f"""
SELECT
   *
FROM
   repo
WHERE
   repo_group_id = 60005;
    """
repo_info = pd.read_sql_query(repo_info_query, con=engine)
print(repo_info)

    repo_id  repo_group_id                                           repo_git  \
0     30402          60005  https://github.com/projectriff/streaming-proce...   
1     30397          60005  https://github.com/projectriff/java-function-b...   
2     30389          60005  https://github.com/projectriff/projectriff.io.git   
3     30418          60005  https://github.com/projectriff/streaming-http-...   
4     30387          60005            https://github.com/projectriff/riff.git   
5     30410          60005  https://github.com/projectriff/stream-client-g...   
6     30415          60005        https://github.com/projectriff/bindings.git   
7     30416          60005   https://github.com/projectriff/octant-plugin.git   
8     30406          60005  https://github.com/projectriff/streaming-http-...   
9     30417          60005  https://github.com/projectriff/reconciler-runt...   
10    30400          60005          https://github.com/projectriff/system.git   
11    30396          60005  

In [21]:
    repo_list_commits_query = f"""
            SELECT COUNT(DISTINCT commits.cmt_commit_hash), repo.repo_path, repo.repo_group_id from repo, commits
            WHERE 
                repo.repo_id = commits.repo_id
                AND commits.cmt_author_timestamp >= '2020-01-01'
                AND commits.cmt_author_timestamp <= '2020-04-15'
            GROUP BY repo.repo_group_id, repo.repo_path
            ORDER BY COUNT(DISTINCT commits.cmt_commit_hash);
            """
    repo_list_commits = pd.read_sql_query(repo_list_commits_query, con=engine)


In [22]:
repo_list_commits

Unnamed: 0,count,repo_path,repo_group_id
0,3,github.com/appsuite/,25177
1,4,github.com/pivotal-gss/,25176
2,5,github.com/pivotaltracker/,25165
3,6,github.com/Pivotal-sg/,25173
4,8,github.com/pivotalsoftware/,25174
5,9,github.com/Pivotal-Field-Engineering/,25170
6,24,github.com/pivotal-cloudops/,25167
7,35,github.com/Pivotal-Data-Engineering/,25163
8,81,github.com/gemfire/,25178
9,95,github.com/pivotalservices/,25171


In [48]:
all_commits_query = f"""
        SELECT DISTINCT(commits.cmt_commit_hash), repo.repo_id, repo.repo_group_id, repo.repo_name, repo.repo_path, 
            commits.cmt_author_email, commits.cmt_author_timestamp from repo, commits
        WHERE 
            repo.repo_id = commits.repo_id
            AND commits.cmt_author_email NOT LIKE 'containers%%'
            AND commits.cmt_author_email NOT LIKE 'k8s%%'
            AND commits.cmt_author_email NOT LIKE 'cf-%%'
        GROUP BY repo.repo_id, commits.cmt_commit_hash, commits.cmt_author_email, commits.cmt_author_timestamp
        ORDER BY repo.repo_id;
        """
all_commits = pd.read_sql_query(all_commits_query, con=engine)
by_repo = all_commits.loc[all_commits.groupby('repo_id').cmt_author_timestamp.idxmax()].sort_values('cmt_author_timestamp')

In [49]:
by_repo

Unnamed: 0,cmt_commit_hash,repo_id,repo_group_id,repo_name,repo_path,cmt_author_email,cmt_author_timestamp
39434,26e84dc10d4bf442660bb0212b3d428640c1f5b7,25455,25155,rabbithub,github.com/rabbitmq/,tonyg@rabbitmq.com,2010-09-05 22:15:13+00:00
1031636,c009a0a48a66d6fb0c6ccd3b51b20200fdb58655,27839,25181,pyvco,github.com/vmware/,yann.hodique@gmail.com,2010-11-07 15:55:02+00:00
39617,ec16a25cc6175f4cdf10421f4ac488090d6a0696,25456,25155,rabbitmq-ha,github.com/rabbitmq/,matthew@lshift.net,2010-12-16 14:38:01+00:00
160753,b99de99785ce2bd1e8b88839d3e9a62e34dff659,25689,25157,cloudfoundry-sandbox,github.com/Spring-projects/,peter@cacoethes.co.uk,2011-06-07 15:13:23+00:00
164104,cf988c19af52baf5c87b74805a6080bbcd70ccea,25702,25157,protocol-fallback-demo,github.com/Spring-projects/,ozhurakousky@pivotal.io,2011-08-04 13:13:17+00:00
...,...,...,...,...,...,...,...
1157591,013e151266b2239dfdfe6756c8952612bed69594,28003,25181,validation-app-engine,github.com/vmware/,pradeepsinghbiet@gmail.com,2020-04-13 18:10:38+00:00
923417,4170bccbe3835072a3cb5fc53c1ae10f8064a90e,27510,25179,docs-tiledev,github.com/pivotal-cf/,jncd@pivotal.io,2020-04-13 18:24:48+00:00
622680,36b99dba1f77bc92b240d9ad7c7c620fefadd1ec,26978,25172,LicenseFinder,github.com/pivotal/,slattanzio@pivotal.io,2020-04-13 18:29:52+00:00
926459,5a2e73a8d0c149ffe1f00da85c095090d2969cd5,27513,25179,docs-pks,github.com/pivotal-cf/,vfedzkovich@pivotal.io,2020-04-13 18:41:51+00:00


In [50]:
by_org = all_commits.loc[all_commits.groupby('repo_group_id').cmt_author_timestamp.idxmax()].sort_values('cmt_author_timestamp')
for index, row in by_org.iterrows():
    print 
    top_contribs = all_commits.loc[all_commits['repo_group_id'] == row.repo_group_id].cmt_author_email.value_counts()
    print(row.repo_path, top_contribs.index[0], top_contribs[0], top_contribs.index[1], top_contribs[1])

github.com/GemXD/ swale@snappydata.io 10 gmlewis@google.com 1
github.com/pcfdev-forks/ suraci.alex@gmail.com 4908 justin@specialbusservice.com 3074
github.com/cfmobile/ david_syer@hotmail.com 921 hone02@gmail.com 723
github.com/pivotal-education/ droberts@gopivotal.com 110 scottyfred@gmail.com 78
github.com/Pivotal-DataFabric/ jpatel-pivotal@pivotal.io 255 jgronline@gmail.com 171
github.com/pivotal-gss/ bruce@momjian.us 8781 tgl@sss.pgh.pa.us 7844
github.com/pivotalsoftware/ haoyuan@cs.berkeley.edu 1073 Florian.Schoppmann@emc.com 550
github.com/pivotaltracker/ KrauseFx@gmail.com 8712 oliver@cocoanetics.com 2225
github.com/Pivotal-sg/ raymond.penners@intenct.nl 846 asim@chuhnk.me 224
github.com/Pivotal-Field-Engineering/ bruce@momjian.us 7033 tgl@sss.pgh.pa.us 4924
github.com/Pivotal-Data-Engineering/ ben@gimbo.net 1516 randy@mathysphere.com 331
github.com/appsuite/ bdussault@pivotal.io 228 kparikh@gopivotal.com 177
github.com/gemfire/ tomaz@tomaz.me 2390 pquerna@apache.org 357
github.c