In [4]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read the database credentials from a local JSON file so they are never
# written into the notebook itself. The file must provide the keys
# 'user', 'password', 'host', 'port' and 'database'.
with open("config.json") as config_file:
    config = json.load(config_file)

# The dialect name must be "postgresql": SQLAlchemy 1.4 removed the old
# "postgres" alias, and "postgresql" is accepted by earlier releases too.
# NOTE: the values are interpolated verbatim, so a user name or password
# containing characters such as "@", ":" or "/" must already be URL-encoded
# in config.json.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(
    config['user'],
    config['password'],
    config['host'],
    config['port'],
    config['database'],
)

# Every connection opened by this engine sets search_path to the schema below,
# which lets the queries in this notebook use unqualified table names.
dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [5]:
# Look up the repo_id of each project of interest. The match is on repo_name
# only, so the same name can come back for several organisations; check
# repo_path before copying an ID into the report configuration.
repo_list_query = """
SELECT repo_id, repo_name, repo_path FROM repo
WHERE repo_name IN ('concourse', 'postfacto', 'clarity', 'gpdb',
                    'kpack', 'rabbitmq-server', 'sonobuoy');
    """
repo_list = pd.read_sql_query(repo_list_query, con=engine)
print(repo_list)

   repo_id        repo_name                              repo_path
0    26235        concourse               github.com/pcfdev-forks/
1    28051        concourse                  github.com/concourse/
2    28030         sonobuoy               github.com/vmware-tanzu/
3    27913          clarity                     github.com/vmware/
4    26983        postfacto                    github.com/pivotal/
5    27169             gpdb                github.com/pivotal-gss/
6    25857             gpdb               github.com/greenplum-db/
7    25432  rabbitmq-server                   github.com/rabbitmq/
8    26600             gpdb  github.com/Pivotal-Field-Engineering/
9    27043            kpack                    github.com/pivotal/


In [6]:
## List of repository IDs for the report
# NOTE: despite its name, repo_dict is a set of repo_id values (the braces
# below are set literals). Keep exactly one assignment uncommented; the
# alternatives are presets for other projects.
#repo_dict = {25760, 25663} #spring-boot & spring-framework
#repo_dict = {28051} # concourse
#repo_dict = {26983} #postfacto
repo_dict = {25432} #rabbitmq-server
#repo_dict = {25663} #spring-framework
#repo_dict = {28030} #sonobuoy

In [7]:
# Build the reporting window as single-quoted date strings that can be dropped
# straight into the SQL text of later queries.
import datetime

# Today's date (printed for reference).
current = datetime.date.today()
today = f"'{current}'"
print(today)

# End of the window: the last day of the previous calendar month, obtained by
# stepping one day back from the first day of the current month.
first_current = current.replace(day=1)
last_month = first_current - datetime.timedelta(days=1)
end_date = f"'{last_month}'"
print(end_date)

# Start of the window: 365 days before the end date. This is a fixed day count,
# not a calendar year, so the start day can differ by one around leap years.
start = last_month - datetime.timedelta(days=365)
year_ago = f"'{start}'"
print(year_ago)

'2020-03-17'
'2020-02-29'
'2019-03-01'


In [8]:
#Commit data - from humans excluding bots
# Loads one row per commit authored inside the reporting window for ALL
# repositories listed in repo_dict. Authors are mapped to their canonical
# e-mail / full name through the contributors table when a match exists;
# otherwise the raw commit author e-mail / name is used.
#
# Reads:    repo_dict, year_ago, end_date, engine (defined in earlier cells)
# Produces: commitsDF
if not repo_dict:
    raise ValueError("repo_dict is empty: select at least one repo_id for the report")

# A single IN (...) filter covers every selected repository. Previously a query
# string was rebuilt per repository but only the last one was executed.
# int() guarantees that only numeric IDs are interpolated into the SQL text.
repo_id_list = ", ".join(str(int(value)) for value in sorted(repo_dict))

# "%%" is kept as in the original query so that a literal "%" reaches the LIKE
# patterns after the driver's parameter formatting.
commitsquery = f"""
                    SELECT
                        CASE WHEN contributors.cntrb_canonical IS NOT NULL THEN contributors.cntrb_canonical ELSE cmt_author_email END AS cntrb_canonical,
                        CASE WHEN canonical_full_names.cntrb_full_name IS NOT NULL THEN canonical_full_names.cntrb_full_name ELSE cmt_author_name END AS canonical_full_name,
                        cmt_author_name, cmt_author_email, repo_id, cmt_id, cmt_author_timestamp
                    FROM commits
                        LEFT OUTER JOIN contributors ON cntrb_email = cmt_author_email
                        LEFT OUTER JOIN (
                            SELECT cntrb_canonical, cntrb_full_name
                            FROM contributors
                            WHERE cntrb_canonical = cntrb_email
                        ) canonical_full_names
                        ON canonical_full_names.cntrb_canonical = contributors.cntrb_canonical
                    WHERE
                        repo_id IN ({repo_id_list})
                        AND cmt_author_name NOT LIKE 'snyk%%'
                        AND cmt_author_name NOT LIKE '%%bot'
                        AND cmt_author_name != 'Spring Operator'
                        AND cmt_author_timestamp >= {year_ago}
                        -- end_date is a bare date, i.e. midnight at the START of that day;
                        -- compare against the following day so the last day is included.
                        AND cmt_author_timestamp < CAST({end_date} AS date) + 1
                    ORDER BY
                        cntrb_canonical;
                    """

commitsDF = pd.read_sql_query(commitsquery, con=engine)
display(commitsDF)

Unnamed: 0,cntrb_canonical,canonical_full_name,cmt_author_name,cmt_author_email,repo_id,cmt_id,cmt_author_timestamp
0,acogoluegnes@gmail.com,Arnaud Cogoluègnes,Arnaud Cogoluègnes,acogoluegnes@gmail.com,25432,37374956,2019-05-29 08:26:52+00:00
1,acogoluegnes@gmail.com,Arnaud Cogoluègnes,Arnaud Cogoluègnes,acogoluegnes@gmail.com,25432,37383294,2019-09-25 09:00:16+00:00
2,acogoluegnes@gmail.com,Arnaud Cogoluègnes,Arnaud Cogoluègnes,acogoluegnes@gmail.com,25432,37376784,2020-01-20 09:51:56+00:00
3,acogoluegnes@gmail.com,Arnaud Cogoluègnes,Arnaud Cogoluègnes,acogoluegnes@gmail.com,25432,37373718,2019-11-26 10:40:17+00:00
4,acogoluegnes@gmail.com,Arnaud Cogoluègnes,Arnaud Cogoluègnes,acogoluegnes@gmail.com,25432,37373719,2019-11-26 10:40:17+00:00
...,...,...,...,...,...,...,...
1205,yxszyn@163.com,tomyouyou,tomyouyou,yxszyn@163.com,25432,37387125,2019-09-04 03:51:08+00:00
1206,yxszyn@163.com,tomyouyou,tomyouyou,yxszyn@163.com,25432,37369616,2019-09-07 08:18:19+00:00
1207,yxszyn@163.com,tomyouyou,tomyouyou,yxszyn@163.com,25432,37380689,2020-01-11 03:06:53+00:00
1208,yxszyn@163.com,tomyouyou,tomyouyou,yxszyn@163.com,25432,37377752,2019-12-26 06:32:07+00:00


In [9]:
# Denominator for the per-author percentages: the number of distinct commit
# IDs in commitsDF (distinct values, not rows).
total_commits = commitsDF['cmt_id'].nunique()
print(total_commits)

1210


In [15]:
# Commits per author keyed by canonical e-mail address. value_counts() returns
# the tallies largest-first; reset_index() turns the e-mail index into a column.
authorDF = (
    commitsDF['cntrb_canonical']
    .value_counts()
    .reset_index()
)
# Columns are renamed by position, so this does not depend on the labels that
# reset_index() happens to generate.
authorDF.columns = ['email', 'commits']
# Each author's share of all distinct commits, as a fraction between 0 and 1.
authorDF['percent'] = authorDF['commits'] / total_commits
print(authorDF.head(10))

                               email  commits   percent
0           michael@clojurewerkz.org      584  0.482645
1  jean-sebastien.pedron@dumbbell.fr      196  0.161983
2                knilsson@pivotal.io      105  0.086777
3                     luke@bakken.io       67  0.055372
4                 diana@rabbitmq.com       66  0.054545
5           dparracorbacho@piotal.io       56  0.046281
6                 hairyhum@gmail.com       31  0.025620
7             acogoluegnes@gmail.com       26  0.021488
8                 gerhard@lazu.co.uk       17  0.014050
9                kjnilsson@gmail.com       10  0.008264


In [16]:
# Commits per author keyed by canonical full name. This rebinds authorDF; the
# resulting 'name' / 'percent' / 'commits' columns are the ones the risk
# assessment cell reads.
authorDF = (
    commitsDF['canonical_full_name']
    .value_counts()
    .reset_index()
)
# Columns are renamed by position, so this does not depend on the labels that
# reset_index() happens to generate.
authorDF.columns = ['name', 'commits']
# Each author's share of all distinct commits, as a fraction between 0 and 1.
authorDF['percent'] = authorDF['commits'] / total_commits
print(authorDF.head(10))

                    name  commits   percent
0        Michael Klishin      584  0.482645
1  Jean-Sébastien Pédron      196  0.161983
2              kjnilsson      105  0.086777
3            Luke Bakken       67  0.055372
4         Diana Corbacho       67  0.055372
5              dcorbacho       51  0.042149
6         Daniil Fedotov       31  0.025620
7     Arnaud Cogoluègnes       26  0.021488
8           Gerhard Lazu       23  0.019008
9           Karl Nilsson       10  0.008264


In [17]:
# Contributor risk assessment.
# Walk authorDF in its existing row order (largest commit count first, as
# produced by value_counts()) and collect authors until their combined share
# of commits exceeds 50%. Fewer than 3 such authors is reported as "AT RISK".
cum_percent = 0
people_list = []

for row in authorDF.itertuples(index=False):
    cum_percent += row.percent
    people_list.append([row.name, row.percent, row.commits])
    # Stop as soon as the collected authors account for a majority of commits.
    if cum_percent > .50:
        break

num_people = len(people_list)

# Headline verdict. The "AT RISK" branch leaves the line open with "Only " so
# that the summary sentence printed next continues it.
print("Contributor Risk Metric Assessment: ", end = '')
if num_people >= 3:
    print('Healthy\n')
else:
    print('AT RISK \n\nOnly ', end = '')

print(num_people, "people make up", "{:.0%}".format(cum_percent), "of the commits in the past year:")

# One line per collected author: name, share of commits, absolute commit count.
for name, percent, commits in people_list:
    print(name, "{:.0%}".format(percent), "-", commits, "commits")

print("\nA healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.")

Contributor Risk Metric Assessment: AT RISK 

Only 2 people make up 64% of the commits in the past year:
Michael Klishin 48% - 584 commits
Jean-Sébastien Pédron 16% - 196 commits

A healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.
