In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read the connection settings (user, password, host, port, database) from config.json.
with open("config.json") as config_file:
    config = json.load(config_file)

# "postgresql" is the dialect name SQLAlchemy registers. The legacy "postgres"
# alias was deprecated and is rejected by SQLAlchemy 1.4+ (NoSuchModuleError).
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

# Pass "-csearch_path=<schema>" to the server at connect time so that
# unqualified table names in later queries resolve inside this schema.
dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [2]:
# Fetch id, name and path for every repo whose name is one of the projects
# listed in the WHERE clause. Nothing is interpolated into the SQL text,
# so a plain (non-f) string literal is sufficient.
repo_list_query = """
SELECT repo_id, repo_name, repo_path from repo
WHERE repo_name = 'concourse' OR repo_name = 'postfacto' or repo_name = 'clarity' or repo_name = 'gpdb' or
      repo_name = 'kpack' or repo_name = 'rabbitmq-server' or repo_name = 'sonobuoy';
    """
repo_list = pd.read_sql_query(sql=repo_list_query, con=engine)
print(repo_list)

   repo_id        repo_name                              repo_path
0    26235        concourse               github.com/pcfdev-forks/
1    28051        concourse                  github.com/concourse/
2    28030         sonobuoy               github.com/vmware-tanzu/
3    27913          clarity                     github.com/vmware/
4    26983        postfacto                    github.com/pivotal/
5    27169             gpdb                github.com/pivotal-gss/
6    25857             gpdb               github.com/greenplum-db/
7    25432  rabbitmq-server                   github.com/rabbitmq/
8    26600             gpdb  github.com/Pivotal-Field-Engineering/
9    27043            kpack                    github.com/pivotal/


In [3]:
## Repository IDs to include in the report.
## NOTE: despite its name, repo_dict is a set literal of integer repo_id values, not a dict.
## Alternative targets are kept commented out; the uncommented assignment is the active one.
#repo_dict = {25760, 25663} #spring-boot & spring-framework
#repo_dict = {28051} # concourse
#repo_dict = {26983} #postfacto
#repo_dict = {25432} #rabbitmq-server
repo_dict = {25663} #spring-framework
#repo_dict = {28030} #sonobuoy

In [4]:
import datetime

# Reporting window: the 365 days ending on the last day of the previous month.
# today / end_date / year_ago are the same dates rendered as ISO strings wrapped
# in single quotes, so they can be placed into SQL text as quoted literals.
current = datetime.date.today()
today = f"'{current}'"
print(today)

# Last day of the previous month = the day before the first of this month.
first_current = current.replace(day=1)
last_month = first_current - datetime.timedelta(days=1)
end_date = f"'{last_month}'"
print(end_date)

# The window start is a fixed 365 days before the window end (not a calendar
# year), so it can land one day off the same calendar date around leap days.
start = last_month - datetime.timedelta(days=365)
year_ago = f"'{start}'"
print(year_ago)

'2020-03-15'
'2020-02-29'
'2019-03-01'


In [16]:
#Commit data - from humans excluding bots
# A single query with "repo_id IN (...)" is used so that every id in repo_dict
# contributes rows. Building the SQL once per repo but executing it only once
# would keep just one repo's commits.
if not repo_dict:
    raise ValueError("repo_dict is empty: add at least one repo_id to report on")

# int() ensures only integer ids can reach the interpolated SQL text.
repo_id_list = ", ".join(str(int(repo_id)) for repo_id in sorted(repo_dict))

# Each commit row is matched to a contributor by author e-mail; when a canonical
# e-mail / full name is known it replaces the raw author e-mail / name.
# NOTE(review): the doubled '%%' in the LIKE patterns is preserved from the
# original text; presumably it escapes '%' for the DB driver's parameter
# formatting -- confirm for the installed SQLAlchemy/pandas versions.
# NOTE(review): end_date is a bare date string; compared against a timestamp
# column it likely excludes commits made during that final day -- confirm.
commitsquery = f"""
                    SELECT
                        CASE WHEN contributors.cntrb_canonical IS NOT NULL THEN contributors.cntrb_canonical ELSE cmt_author_email END AS cntrb_canonical,
                        CASE WHEN canonical_full_names.cntrb_full_name IS NOT NULL THEN canonical_full_names.cntrb_full_name ELSE cmt_author_name END AS canonical_full_name,
                        cmt_author_name, cmt_author_email, repo_id, cmt_id, cmt_author_timestamp 
                    FROM commits 
                        LEFT OUTER JOIN contributors ON cntrb_email = cmt_author_email
                        LEFT OUTER JOIN (
                            SELECT cntrb_canonical, cntrb_full_name 
                            FROM contributors
                            WHERE cntrb_canonical = cntrb_email
                        ) canonical_full_names
                        ON canonical_full_names.cntrb_canonical = contributors.cntrb_canonical
                    WHERE
                        repo_id IN ({repo_id_list})
                        AND cmt_author_name NOT LIKE 'snyk%%'
                        AND cmt_author_name NOT LIKE '%%bot'
                        AND cmt_author_name != 'Spring Operator'
                        AND cmt_author_timestamp >= {year_ago}
                        AND cmt_author_timestamp <= {end_date}
                    ORDER BY
                        cntrb_canonical;
                    """

commitsDF = pd.read_sql_query(commitsquery, con=engine)
display(commitsDF)

Unnamed: 0,cntrb_canonical,canonical_full_name,cmt_author_name,cmt_author_email,repo_id,cmt_id,cmt_author_timestamp
0,1956944+crewmanmud@users.noreply.github.com,Andrew McCallum,Andrew McCallum,1956944+crewmanmud@users.noreply.github.com,25663,37663565,2019-05-09 21:47:37+00:00
1,21066051+rustytheclone@users.noreply.github.com,RustyTheClone,RustyTheClone,21066051+rustytheclone@users.noreply.github.com,25663,37683188,2019-06-15 18:36:07+00:00
2,21066051+rustytheclone@users.noreply.github.com,RustyTheClone,RustyTheClone,21066051+rustytheclone@users.noreply.github.com,25663,37683187,2019-06-15 18:36:07+00:00
3,21066051+rustytheclone@users.noreply.github.com,RustyTheClone,RustyTheClone,21066051+rustytheclone@users.noreply.github.com,25663,37683186,2019-06-15 18:36:07+00:00
4,21066051+rustytheclone@users.noreply.github.com,RustyTheClone,RustyTheClone,21066051+rustytheclone@users.noreply.github.com,25663,37683185,2019-06-15 18:36:07+00:00
...,...,...,...,...,...,...,...
18913,zhangt2333@gmail.com,zhangt2333,ZhangT,zhangt2333@gmail.com,25663,41874407,2020-02-26 11:29:09+00:00
18914,zheng.ren01@mljr.com,zheng.ren01@mljr.com,zheng.ren01@mljr.com,zheng.ren01@mljr.com,25663,37645458,2019-07-20 09:02:36+00:00
18915,zhouyanming@gmail.com,Yanming Zhou,Yanming Zhou,zhouyanming@gmail.com,25663,37705150,2019-04-18 02:20:13+00:00
18916,zhuzhuman@hotmail.com,zhuzhuman978,zhuzhuman978,zhuzhuman@hotmail.com,25663,37663476,2019-08-10 12:03:18+00:00


In [17]:
# Number of distinct cmt_id values in the commit query result.
total_commits = commitsDF["cmt_id"].nunique()
print(total_commits)

18918


In [19]:
# Tally rows per canonical author name (value_counts sorts most frequent first)
# and express each tally as a fraction of total_commits.
authorDF = (
    commitsDF["canonical_full_name"]
    .value_counts()
    .rename_axis("name")
    .reset_index(name="commits")
)
authorDF["percent"] = authorDF["commits"] / total_commits
print(authorDF.head(10))

                name  commits   percent
0       Phillip Webb     7232  0.382281
1        Sam Brannen     6634  0.350671
2    Juergen Hoeller     1369  0.072365
3  Rossen Stoyanchev     1332  0.070409
4  Sebastien Deleuze      704  0.037213
5       Brian Clozel      360  0.019029
6      Arjen Poutsma      286  0.015118
7          Rob Winch      132  0.006977
8    Stephane Nicoll      121  0.006396
9          stsypanov       95  0.005022


In [20]:
# Walk authorDF rows in their existing order, collecting [name, percent, commits]
# entries until the running share of commits exceeds one half.
cum_percent = 0
people_list = []

for name, percent, commits in zip(authorDF['name'], authorDF['percent'], authorDF['commits']):
    cum_percent += percent
    people_list.append([name, percent, commits])
    if cum_percent > .50:
        break

# Fewer than 3 people covering the majority of commits is reported as AT RISK.
num_people = len(people_list)
enough_people = not (num_people < 3)

print("Contributor Risk Metric Assessment: ", end = '')
if enough_people:
    print('Healthy\n')
else:
    print('AT RISK \n\nOnly ', end = '')

# Shared formatter: renders a fraction as a whole-number percentage.
as_percent = "{:.0%}".format

print(num_people, "people make up", as_percent(cum_percent), "of the commits in the past year:")

for name, percent, commits in people_list:
    print(name, as_percent(percent), "-", commits, "commits")

print("\nA healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.")

Contributor Risk Metric Assessment: AT RISK 

Only 2 people make up 73% of the commits in the past year:
Phillip Webb 38% - 7232 commits
Sam Brannen 35% - 6634 commits

A healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.
