In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Connection settings are read from config.json rather than hard-coded here.
# Keys used below: user, password, host, port, database.
with open("config.json") as config_file:
    config = json.load(config_file)

# "postgresql" is the dialect name SQLAlchemy registers. The older "postgres" alias was
# deprecated and then removed in SQLAlchemy 1.4, where it fails with NoSuchModuleError.
# NOTE(review): user/password are interpolated verbatim -- values containing characters
# such as "@", ":" or "/" must be percent-encoded for the URL to parse correctly.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'], config['password'], config['host'], config['port'], config['database'])

# search_path is passed as a connection option so unqualified table names in later
# queries resolve inside this schema.
dbschema = 'augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [2]:
    # Fetch one row per (repo, commit hash, author email, author timestamp) combination.
    # DISTINCT applies to the whole select list, not only to the parenthesised
    # cmt_commit_hash column. Rows are ordered by repo_id only; the order of commits
    # within a repo is left to the database.
    all_commits_query = """
            SELECT DISTINCT(commits.cmt_commit_hash), repo.repo_id, repo.repo_name, repo.repo_path, 
                commits.cmt_author_email, commits.cmt_author_timestamp from repo, commits
            WHERE 
                repo.repo_id = commits.repo_id
            GROUP BY repo.repo_id, commits.cmt_commit_hash, commits.cmt_author_email, commits.cmt_author_timestamp
            ORDER BY repo.repo_id;
            """
    # read_sql_query builds and returns the DataFrame itself, so no empty placeholder
    # frame is created beforehand.
    all_commits = pd.read_sql_query(all_commits_query, con=engine)
    all_commits

Unnamed: 0,cmt_commit_hash,repo_id,repo_name,repo_path,cmt_author_email,cmt_author_timestamp
0,0097eea9b9cfa0605916fac225513c14a808c910,25430,rmq-0mq,github.com/rabbitmq/,sustrik@250bpm.com,2010-09-09 08:49:09+00:00
1,04b6ed16843b336ce6ab18a693c94bbaf669dcee,25430,rmq-0mq,github.com/rabbitmq/,mikeb@squaremobius.net,2010-08-13 11:35:42+00:00
2,0a43d1cb58bab9162f50bd4c75bed5bd17bd51ae,25430,rmq-0mq,github.com/rabbitmq/,mikeb@squaremobius.net,2010-11-13 18:14:44+00:00
3,0e937a3370247b8f65e7a3eed0f66aea7f255453,25430,rmq-0mq,github.com/rabbitmq/,mikeb@squaremobius.net,2010-09-15 11:26:31+00:00
4,1436532e5e2e7381e184b52356dfb3dc3740b636,25430,rmq-0mq,github.com/rabbitmq/,mikeb@squaremobius.net,2010-09-10 16:56:12+00:00
...,...,...,...,...,...,...
1235945,e67bcfb3e20425f8c4ce9698122aa107155f5c4a,28141,boarding-pass,github.com/concourse/,suraci.alex@gmail.com,2020-01-14 00:41:15+00:00
1235946,eeae26add4a9b42101a56ad246342ecb94ee42b5,28141,boarding-pass,github.com/concourse/,suraci.alex@gmail.com,2020-01-22 14:53:45+00:00
1235947,ef8c628ff1f15455c6f46aacfe3af9f43f499b85,28141,boarding-pass,github.com/concourse/,suraci.alex@gmail.com,2020-01-14 02:29:16+00:00
1235948,f50afe58b89f7f252a0541a78593bda9e93b6537,28141,boarding-pass,github.com/concourse/,suraci.alex@gmail.com,2020-01-09 21:55:11+00:00


In [3]:
    # For every repo_id keep the single row carrying its largest cmt_author_timestamp
    # (idxmax returns that row's index label), then order the repos by that timestamp,
    # earliest first.
    by_repo = (
        all_commits
        .loc[all_commits.groupby('repo_id')['cmt_author_timestamp'].idxmax()]
        .sort_values('cmt_author_timestamp')
    )
    by_repo

Unnamed: 0,cmt_commit_hash,repo_id,repo_name,repo_path,cmt_author_email,cmt_author_timestamp
39241,26e84dc10d4bf442660bb0212b3d428640c1f5b7,25455,rabbithub,github.com/rabbitmq/,tonygarnockjones@gmail.com,2010-09-05 22:15:13+00:00
1036433,c009a0a48a66d6fb0c6ccd3b51b20200fdb58655,27839,pyvco,github.com/vmware/,yann.hodique@gmail.com,2010-11-07 15:55:02+00:00
39424,ec16a25cc6175f4cdf10421f4ac488090d6a0696,25456,rabbitmq-ha,github.com/rabbitmq/,matthew@rabbitmq.com,2010-12-16 14:38:01+00:00
160261,b99de99785ce2bd1e8b88839d3e9a62e34dff659,25689,cloudfoundry-sandbox,github.com/Spring-projects/,peter@cacoethes.co.uk,2011-06-07 15:13:23+00:00
163602,cf988c19af52baf5c87b74805a6080bbcd70ccea,25702,protocol-fallback-demo,github.com/Spring-projects/,ozhurakousky@pivotal.io,2011-08-04 13:13:17+00:00
...,...,...,...,...,...,...
553815,ab2de82048fa54218a727d0b7b07d056665084f6,26392,aws-pcf-quickstart,github.com/cf-platform-eng/,ramonmakkelie@gmail.com,2020-03-25 16:11:49+00:00
1168232,b9b5d173b552d4516275ad92d8f55004fede7133,28037,octant,github.com/vmware-tanzu/,wayne@riotousliving.com,2020-03-25 16:13:47+00:00
767525,a14cf3dce6fc6f4312b356e26423016505356b12,27061,paving,github.com/pivotal/,jarchie@pivotal.io,2020-03-25 16:19:23+00:00
741386,f30932095c70a32c0f2c1df9d7af592ce537d2e2,27043,kpack,github.com/pivotal/,mmcnew@pivotal.io,2020-03-25 16:24:19+00:00


In [40]:
    # For the first MAX_REPOS rows of by_repo, print the repo's latest-commit row and then
    # its one or two most frequent author emails with their commit counts.
    # The same values could be written to a CSV file instead of being printed.
    MAX_REPOS = 50
    for row in by_repo.head(MAX_REPOS).itertuples(index=False):
        print(row.repo_path, row.repo_name, row.cmt_author_timestamp, row.cmt_author_email)
        # value_counts sorts by count, descending, so head(2) is the top two authors.
        top_contribs = (
            all_commits.loc[all_commits['repo_id'] == row.repo_id, 'cmt_author_email']
            .value_counts()
            .head(2)
        )
        if top_contribs.empty:
            # value_counts drops missing emails, so a repo can end up with no entries.
            print('None')
        else:
            # Flatten (email, count) pairs into "email count [email count]". Iterating
            # items() avoids integer keys on a string-indexed Series, whose positional
            # fallback is deprecated in pandas 2.x.
            print(*(field for pair in top_contribs.items() for field in pair))

github.com/rabbitmq/ rabbithub 2010-09-05 22:15:13+00:00 tonygarnockjones@gmail.com
tonyg@lshift.net 47 tonyg@kcbbs.gen.nz 23
github.com/vmware/ pyvco 2010-11-07 15:55:02+00:00 yann.hodique@gmail.com
yann.hodique@gmail.com 50
github.com/rabbitmq/ rabbitmq-ha 2010-12-16 14:38:01+00:00 matthew@rabbitmq.com
matthew@rabbitmq.com 118
github.com/Spring-projects/ cloudfoundry-sandbox 2011-06-07 15:13:23+00:00 peter@cacoethes.co.uk
peter@cacoethes.co.uk 1 markfisher@vmware.com 1
github.com/Spring-projects/ protocol-fallback-demo 2011-08-04 13:13:17+00:00 ozhurakousky@pivotal.io
ozhurakousky@pivotal.io 12 bruce.snyder@gmail.com 3
github.com/Spring-projects/ nio-sandbox 2011-08-22 13:41:24+00:00 jon@jbrisbin.com
jon@jbrisbin.com 11
github.com/rabbitmq/ rabbitmq-heroku-example 2011-08-31 15:23:30+00:00 david@rabbitmq.com
marek@popcount.org 2 david@rabbitmq.com 2
github.com/Spring-projects/ spring-build 2011-09-07 15:11:18+00:00 bhale@vmware.com
ben.hale@springsource.com 18 bhale@vmware.com 11
git