In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Read the database credentials from a local JSON file so they are not
# hard-coded in the notebook. Expected keys: user, password, host, port, database.
with open("config.json") as config_file:
    config = json.load(config_file)

# Use the "postgresql" dialect name: the legacy "postgres" alias is rejected by
# SQLAlchemy 1.4 and later, while "postgresql" works on every version.
# NOTE(review): credentials are interpolated verbatim; a password containing
# characters such as '@' or '/' presumably needs URL-escaping - confirm.
database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

# Set the connection's search_path so unqualified table names resolve in this schema.
dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [2]:
# Look up the repo_id of each project of interest. The same repo_name can
# appear under several repo_path values, so the path is selected as well to
# tell the rows apart.
repo_list_query = """
SELECT repo_id, repo_name, repo_path from repo
WHERE repo_name = 'concourse' OR repo_name = 'postfacto' or repo_name = 'clarity' or repo_name = 'gpdb' or
      repo_name = 'kpack' or repo_name = 'rabbitmq-server' or repo_name = 'sonobuoy';
    """
repo_list = pd.read_sql_query(sql=repo_list_query, con=engine)
print(repo_list)

   repo_id        repo_name                              repo_path
0    26235        concourse               github.com/pcfdev-forks/
1    28051        concourse                  github.com/concourse/
2    28030         sonobuoy               github.com/vmware-tanzu/
3    27913          clarity                     github.com/vmware/
4    26983        postfacto                    github.com/pivotal/
5    27169             gpdb                github.com/pivotal-gss/
6    25857             gpdb               github.com/greenplum-db/
7    25432  rabbitmq-server                   github.com/rabbitmq/
8    26600             gpdb  github.com/Pivotal-Field-Engineering/
9    27043            kpack                    github.com/pivotal/


In [182]:
## Repository IDs covered by the report.
## Despite its name, repo_dict is a set of repo_id values.
## Alternatives that can be swapped in:
#   {25760, 25663}  spring-boot & spring-framework
#   {28051}         concourse
#   {26983}         postfacto
#   {25432}         rabbitmq-server
#   {28030}         sonobuoy
repo_dict = {
    25663,  # spring-framework
}

In [183]:
# Reporting window: from 365 days before today up to today.
# Both bounds are kept as single-quoted date strings.
import datetime

current = datetime.date.today()
today = f"'{current}'"
print(today)

start = current - datetime.timedelta(days=365)
year_ago = f"'{start}'"
print(year_ago)

'2020-03-11'
'2019-03-12'


In [184]:
# Commit data - from humans, excluding bots.
# One query is run per repository in repo_dict and the results are combined
# into commitsDF. The query has to execute inside the loop: otherwise only the
# last repository's rows are fetched, and an empty repo_dict leaves
# commitsquery undefined.
commit_frames = []
for value in repo_dict:
    # '%%' is kept doubled so the driver receives a literal '%' LIKE wildcard.
    commitsquery = f"""
                    SELECT
                        cmt_author_name, repo_id, cmt_id, cmt_author_timestamp from commits
                    WHERE
                        repo_id = {value}
                        AND cmt_author_name NOT LIKE 'snyk%%'
                        AND cmt_author_name NOT LIKE '%%bot'
                        AND cmt_author_name != 'Spring Operator'
                        AND cmt_author_timestamp > {year_ago}
                    ORDER BY
                        cmt_author_timestamp;
                    """
    commit_frames.append(pd.read_sql_query(commitsquery, con=engine))

if commit_frames:
    # Rows stay ordered by timestamp within each repository.
    commitsDF = pd.concat(commit_frames, ignore_index=True)
else:
    # No repositories selected: keep the expected columns so later cells still run.
    commitsDF = pd.DataFrame(
        columns=['cmt_author_name', 'repo_id', 'cmt_id', 'cmt_author_timestamp'])
                        

In [185]:
# Preview the fetched commit rows; pandas abbreviates long frames when printed.
print(commitsDF)

         cmt_author_name  repo_id    cmt_id      cmt_author_timestamp
0           Jorg Heymans    25663  37711225 2019-03-12 07:20:10+00:00
1        Stephane Nicoll    25663  37693704 2019-03-12 08:09:34+00:00
2        Stephane Nicoll    25663  37648310 2019-03-12 08:10:29+00:00
3        Stephane Nicoll    25663  37772900 2019-03-12 08:10:49+00:00
4         Сергей Цыпанов    25663  37705778 2019-03-12 12:15:59+00:00
...                  ...      ...       ...                       ...
18541  Rossen Stoyanchev    25663  41876374 2020-03-04 19:28:05+00:00
18542      Arjen Poutsma    25663  41876366 2020-03-05 14:31:20+00:00
18543      Arjen Poutsma    25663  41876365 2020-03-05 14:31:20+00:00
18544        Qimiao Chen    25663  41876345 2020-03-06 11:34:00+00:00
18545    Sviatoslav Hryb    25663  41892890 2020-03-07 19:40:46+00:00

[18546 rows x 4 columns]


In [186]:
# Number of distinct cmt_id values among the fetched rows; used as the
# denominator for each author's share.
# NOTE(review): presumably cmt_id identifies one commit - confirm it is not a
# per-file row id in the commits table.
total_commits = commitsDF['cmt_id'].nunique()
print(total_commits)

18546


In [187]:
# Per-author summary: one row per cmt_author_name with its row count
# ('commits') and that count as a fraction of total_commits ('percent').
# value_counts() orders authors from most to fewest commits.
authorDF = (
    commitsDF['cmt_author_name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='commits')
    .assign(percent=lambda frame: frame['commits'] / total_commits)
)
print(authorDF.head(10))

                name  commits   percent
0       Phillip Webb     6531  0.352151
1        Sam Brannen     6526  0.351882
2  Rossen Stoyanchev     1342  0.072361
3    Juergen Hoeller     1184  0.063841
4  Sebastien Deleuze      693  0.037367
5          Phil Webb      671  0.036180
6       Brian Clozel      359  0.019357
7      Arjen Poutsma      245  0.013210
8          Rob Winch      132  0.007117
9    Stephane Nicoll      119  0.006416


In [188]:
# Contributor risk metric: walk the authors in authorDF row order and collect
# them until their combined share of commits exceeds 50%. Fewer than three
# people in that group is reported as "AT RISK".
cum_percent = 0
people_list = []

for author, share, count in zip(authorDF['name'], authorDF['percent'], authorDF['commits']):
    cum_percent += share
    people_list.append([author, share, count])
    if cum_percent > .50:
        break

num_people = len(people_list)

# The "AT RISK" verdict deliberately ends with "Only " so it runs into the
# summary sentence printed next.
verdict = 'AT RISK \n\nOnly ' if num_people < 3 else 'Healthy\n\n'
print("Contributor Risk Metric Assessment: ", verdict, sep = '', end = '')

print(f"{num_people} people make up {cum_percent:.0%} of the commits in the past year:")

for author, share, count in people_list:
    print(f"{author} {share:.0%} - {count} commits")

print("\nA healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.")

Contributor Risk Metric Assessment: AT RISK 

Only 2 people make up 70% of the commits in the past year:
Phillip Webb 35% - 6531 commits
Sam Brannen 35% - 6526 commits

A healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.
