See [schema](https://oss-augur.readthedocs.io/en/dev/_images/schema.png) for details.

In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

with open("config.json") as config_file:
    config = json.load(config_file)

database_connection_string = 'postgres+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [3]:
table_query = f"""
SELECT
   tablename
FROM
   pg_catalog.pg_tables
WHERE
   schemaname = 'augur_data';
    """
tables = pd.read_sql_query(table_query, con=engine)
print(tables)

                   tablename
0                _git_census
1       contributors_aliases
2   contributor_affiliations
3       dm_repo_group_annual
4     pull_request_assignees
5        pull_request_events
6         pull_request_teams
7                repo_groups
8         repo_test_coverage
9            repos_fetch_log
10                  settings
11             unknown_cache
12               utility_log
13           working_commits
14                repo_stats
15        commit_comment_ref
16              analysis_log
17      chaoss_metric_status
18            commit_parents
19      contributors_history
20            dm_repo_annual
21                   commits
22              contributors
23     dm_repo_group_monthly
24      dm_repo_group_weekly
25           dm_repo_monthly
26            dm_repo_weekly
27                   exclude
28                    issues
29              issue_labels
30         issue_message_ref
31              issue_events
32           issue_assignees
33            

In [8]:
desc_table_query = f"""
SELECT
   COLUMN_NAME
FROM
   information_schema.COLUMNS
WHERE
   TABLE_NAME = 'repo';
    """
table_desc = pd.read_sql_query(desc_table_query, con=engine)
print(table_desc)


             column_name
0                repo_id
1          repo_group_id
2               repo_git
3              repo_path
4              repo_name
5             repo_added
6            repo_status
7              repo_type
8                    url
9               owner_id
10           description
11      primary_language
12            created_at
13           forked_from
14            updated_at
15           tool_source
16          tool_version
17           data_source
18  data_collection_date


In [2]:
get_id_query = f"""
SELECT
   rg_name, repo_group_id
FROM
   repo_groups;
    """
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

                      rg_name  repo_group_id
0          Pivotal-DataFabric          25162
1                 SteeltoeOSS          25161
2               spring-guides          25160
3                greenplum-db          25159
4                       GemXD          25158
5             Spring-projects          25157
6                   spring-io          25156
7                    rabbitmq          25155
8                 pivotal-gss          25176
9           pivotal-education          25175
10            pivotalsoftware          25174
11                 Pivotal-sg          25173
12                    pivotal          25172
13            pivotalservices          25171
14  Pivotal-Field-Engineering          25170
15                      Rails              1
16  platform-acceleration-lab          25169
17            cf-platform-eng          25168
18           pivotal-cloudops          25167
19               pcfdev-forks          25166
20             pivotaltracker          25165
21        

In [17]:
get_id_query = f"""
SELECT
   cntrb_login, cntrb_email, cntrb_canonical, cntrb_full_name, gh_login 
FROM
   contributors
WHERE
   gh_login = 'bryanl' or cntrb_email = 'bryanliles@gmail.com' or cntrb_email = 'lilesb@vmware.com' or cntrb_email = 'bryan@Bryans-MacBook-Pro.local';
    """
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

  cntrb_login     cntrb_email cntrb_canonical cntrb_full_name gh_login
0      bryanl  iam@smartic.us  iam@smartic.us            None   bryanl


In [18]:
get_id_query = f"""
SELECT cmt_author_name, cmt_author_email, repo_id 
FROM commits 
WHERE repo_id = 28037 and (cmt_author_name like '%%iles%%' or cmt_author_name like 'bryanl' or cmt_author_email = 'iam@smartic.us')
GROUP BY cmt_author_email, cmt_author_name, repo_id;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

  cmt_author_name                cmt_author_email  repo_id
0     Bryan Liles  bryan@Bryans-MacBook-Pro.local    28037
1          bryanl            bryanliles@gmail.com    28037
2     Bryan Liles            bryanliles@gmail.com    28037
3          lilesb            bryanliles@gmail.com    28037
4          lilesb               lilesb@vmware.com    28037


In [21]:
get_id_query = f"""
SELECT commits.cmt_author_name, commits.cmt_author_email, commits.repo_id, 
       contributors.cntrb_canonical, contributors.cntrb_full_name
FROM commits, contributors 
WHERE commits.repo_id = 28037
    and 
    and (commits.cmt_author_name like '%%iles%%' or commits.cmt_author_name like 'bryanl' or commits.cmt_author_email = 'iam@smartic.us')
GROUP BY commits.cmt_author_email, commits.cmt_author_name, commits.repo_id, contributors.cntrb_canonical, contributors.cntrb_full_name;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

#cntrb_canonical, cntrb_full_name 
#                           FROM contributors

      cmt_author_name                cmt_author_email  repo_id  \
0              bryanl            bryanliles@gmail.com    28037   
1              lilesb            bryanliles@gmail.com    28037   
2              bryanl            bryanliles@gmail.com    28037   
3         Bryan Liles  bryan@Bryans-MacBook-Pro.local    28037   
4         Bryan Liles            bryanliles@gmail.com    28037   
...               ...                             ...      ...   
35225          lilesb               lilesb@vmware.com    28037   
35226          lilesb               lilesb@vmware.com    28037   
35227     Bryan Liles            bryanliles@gmail.com    28037   
35228     Bryan Liles            bryanliles@gmail.com    28037   
35229     Bryan Liles            bryanliles@gmail.com    28037   

                       cntrb_canonical              cntrb_full_name  
0                   tezhang@pivotal.io                zhangteng5513  
1                  ouduobiao@gmail.com                         oudb

In [None]:
get_id_query = f"""
SELECT cntrb_canonical, canonical_full_name 
FROM commits 
WHERE repo_id = 28037 and (cmt_author_name like '%%iles%%' or cmt_author_name like 'bryanl' or cmt_author_email = 'iam@smartic.us')
GROUP BY cmt_author_email, cmt_author_name, repo_id;
"""
get_id_desc = pd.read_sql_query(get_id_query, con=engine)
print(get_id_desc)

In [3]:
repo_info_query = f"""
SELECT
   *
FROM
   repo
WHERE
   repo_name = 'harbor';
    """
repo_info = pd.read_sql_query(repo_info_query, con=engine)
print(repo_info)

   repo_id  repo_group_id                            repo_git  \
0    27050          25172   https://github.com/pivotal/harbor   
1    28150          25184  https://github.com/goharbor/harbor   

              repo_path repo_name          repo_added repo_status repo_type  \
0   github.com/pivotal/    harbor 2020-02-08 12:58:14    Complete             
1  github.com/goharbor/    harbor 2020-04-02 07:35:11    Complete             

    url owner_id description primary_language created_at forked_from  \
0  None     None        None             None       None        None   
1  None     None        None             None       None        None   

  updated_at tool_source tool_version data_source data_collection_date  
0       None         CLI          1.0         Git  2020-02-08 12:58:14  
1       None         CLI          1.0         Git  2020-04-02 07:35:11  


In [21]:
    repo_list_commits_query = f"""
            SELECT COUNT(DISTINCT commits.cmt_commit_hash), repo.repo_path, repo.repo_group_id from repo, commits
            WHERE 
                repo.repo_id = commits.repo_id
                AND commits.cmt_author_timestamp >= '2020-01-01'
                AND commits.cmt_author_timestamp <= '2020-04-15'
            GROUP BY repo.repo_group_id, repo.repo_path
            ORDER BY COUNT(DISTINCT commits.cmt_commit_hash);
            """
    repo_list_commits = pd.read_sql_query(repo_list_commits_query, con=engine)


In [22]:
repo_list_commits

Unnamed: 0,count,repo_path,repo_group_id
0,3,github.com/appsuite/,25177
1,4,github.com/pivotal-gss/,25176
2,5,github.com/pivotaltracker/,25165
3,6,github.com/Pivotal-sg/,25173
4,8,github.com/pivotalsoftware/,25174
5,9,github.com/Pivotal-Field-Engineering/,25170
6,24,github.com/pivotal-cloudops/,25167
7,35,github.com/Pivotal-Data-Engineering/,25163
8,81,github.com/gemfire/,25178
9,95,github.com/pivotalservices/,25171


In [48]:
all_commits_query = f"""
        SELECT DISTINCT(commits.cmt_commit_hash), repo.repo_id, repo.repo_group_id, repo.repo_name, repo.repo_path, 
            commits.cmt_author_email, commits.cmt_author_timestamp from repo, commits
        WHERE 
            repo.repo_id = commits.repo_id
            AND commits.cmt_author_email NOT LIKE 'containers%%'
            AND commits.cmt_author_email NOT LIKE 'k8s%%'
            AND commits.cmt_author_email NOT LIKE 'cf-%%'
        GROUP BY repo.repo_id, commits.cmt_commit_hash, commits.cmt_author_email, commits.cmt_author_timestamp
        ORDER BY repo.repo_id;
        """
all_commits = pd.read_sql_query(all_commits_query, con=engine)
by_repo = all_commits.loc[all_commits.groupby('repo_id').cmt_author_timestamp.idxmax()].sort_values('cmt_author_timestamp')

In [49]:
by_repo

Unnamed: 0,cmt_commit_hash,repo_id,repo_group_id,repo_name,repo_path,cmt_author_email,cmt_author_timestamp
39434,26e84dc10d4bf442660bb0212b3d428640c1f5b7,25455,25155,rabbithub,github.com/rabbitmq/,tonyg@rabbitmq.com,2010-09-05 22:15:13+00:00
1031636,c009a0a48a66d6fb0c6ccd3b51b20200fdb58655,27839,25181,pyvco,github.com/vmware/,yann.hodique@gmail.com,2010-11-07 15:55:02+00:00
39617,ec16a25cc6175f4cdf10421f4ac488090d6a0696,25456,25155,rabbitmq-ha,github.com/rabbitmq/,matthew@lshift.net,2010-12-16 14:38:01+00:00
160753,b99de99785ce2bd1e8b88839d3e9a62e34dff659,25689,25157,cloudfoundry-sandbox,github.com/Spring-projects/,peter@cacoethes.co.uk,2011-06-07 15:13:23+00:00
164104,cf988c19af52baf5c87b74805a6080bbcd70ccea,25702,25157,protocol-fallback-demo,github.com/Spring-projects/,ozhurakousky@pivotal.io,2011-08-04 13:13:17+00:00
...,...,...,...,...,...,...,...
1157591,013e151266b2239dfdfe6756c8952612bed69594,28003,25181,validation-app-engine,github.com/vmware/,pradeepsinghbiet@gmail.com,2020-04-13 18:10:38+00:00
923417,4170bccbe3835072a3cb5fc53c1ae10f8064a90e,27510,25179,docs-tiledev,github.com/pivotal-cf/,jncd@pivotal.io,2020-04-13 18:24:48+00:00
622680,36b99dba1f77bc92b240d9ad7c7c620fefadd1ec,26978,25172,LicenseFinder,github.com/pivotal/,slattanzio@pivotal.io,2020-04-13 18:29:52+00:00
926459,5a2e73a8d0c149ffe1f00da85c095090d2969cd5,27513,25179,docs-pks,github.com/pivotal-cf/,vfedzkovich@pivotal.io,2020-04-13 18:41:51+00:00


In [50]:
by_org = all_commits.loc[all_commits.groupby('repo_group_id').cmt_author_timestamp.idxmax()].sort_values('cmt_author_timestamp')
for index, row in by_org.iterrows():
    print 
    top_contribs = all_commits.loc[all_commits['repo_group_id'] == row.repo_group_id].cmt_author_email.value_counts()
    print(row.repo_path, top_contribs.index[0], top_contribs[0], top_contribs.index[1], top_contribs[1])

github.com/GemXD/ swale@snappydata.io 10 gmlewis@google.com 1
github.com/pcfdev-forks/ suraci.alex@gmail.com 4908 justin@specialbusservice.com 3074
github.com/cfmobile/ david_syer@hotmail.com 921 hone02@gmail.com 723
github.com/pivotal-education/ droberts@gopivotal.com 110 scottyfred@gmail.com 78
github.com/Pivotal-DataFabric/ jpatel-pivotal@pivotal.io 255 jgronline@gmail.com 171
github.com/pivotal-gss/ bruce@momjian.us 8781 tgl@sss.pgh.pa.us 7844
github.com/pivotalsoftware/ haoyuan@cs.berkeley.edu 1073 Florian.Schoppmann@emc.com 550
github.com/pivotaltracker/ KrauseFx@gmail.com 8712 oliver@cocoanetics.com 2225
github.com/Pivotal-sg/ raymond.penners@intenct.nl 846 asim@chuhnk.me 224
github.com/Pivotal-Field-Engineering/ bruce@momjian.us 7033 tgl@sss.pgh.pa.us 4924
github.com/Pivotal-Data-Engineering/ ben@gimbo.net 1516 randy@mathysphere.com 331
github.com/appsuite/ bdussault@pivotal.io 228 kparikh@gopivotal.com 177
github.com/gemfire/ tomaz@tomaz.me 2390 pquerna@apache.org 357
github.c