In [1]:
import psycopg2
import pandas as pd 
# from sqlalchemy.types import Integer, Text, String, DateTime
import sqlalchemy as s
import matplotlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

with open("config.json") as config_file:
    config = json.load(config_file)

database_connection_string = 'postgres+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

dbschema='augur_data'
engine = s.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

In [2]:
repo_list = pd.DataFrame()
repo_list_query = f"""
SELECT repo_id, repo_name, repo_path from repo
WHERE repo_name = 'concourse' OR repo_name = 'postfacto' or repo_name = 'clarity' or repo_name = 'gpdb' or
      repo_name = 'kpack' or repo_name = 'rabbitmq-server' or repo_name = 'sonobuoy';
    """
repo_list = pd.read_sql_query(repo_list_query, con=engine)
print(repo_list)

   repo_id        repo_name                              repo_path
0    26235        concourse               github.com/pcfdev-forks/
1    28051        concourse                  github.com/concourse/
2    28030         sonobuoy               github.com/vmware-tanzu/
3    27913          clarity                     github.com/vmware/
4    26983        postfacto                    github.com/pivotal/
5    27169             gpdb                github.com/pivotal-gss/
6    25857             gpdb               github.com/greenplum-db/
7    25432  rabbitmq-server                   github.com/rabbitmq/
8    26600             gpdb  github.com/Pivotal-Field-Engineering/
9    27043            kpack                    github.com/pivotal/


In [3]:
## List of repository IDs for the report
#repo_dict = {25760, 25663} #spring-boot & spring-framework
#repo_dict = {28051} # concourse
#repo_dict = {26983} #postfacto
#repo_dict = {25432} #rabbitmq-server
repo_dict = {25663} #spring-framework
#repo_dict = {28030} #sonobuoy

In [18]:
#from datetime import date
import datetime 

current = datetime.date.today()
today = "'" + str(current) + "'"
print(today)

first_current = current.replace(day=1)
last_month = first_current - datetime.timedelta(days=1)
end_date = "'" + str(last_month) + "'"
print(end_date)

print

start = last_month - datetime.timedelta(days=365)
year_ago = "'" + str(start) + "'"
print(year_ago)

'2020-03-11'
'2020-02-29'
'2019-03-01'


In [19]:
#Commit data - from humans excluding bots
commitsDF = pd.DataFrame()
for value in repo_dict: 
    commitsquery = f"""
                    SELECT
                        cmt_author_name, cmt_author_email, repo_id, cmt_id, cmt_author_timestamp from commits
                    WHERE
                        repo_id = {value}
                        AND cmt_author_name NOT LIKE 'snyk%%'
                        AND cmt_author_name NOT LIKE '%%bot'
                        AND cmt_author_name != 'Spring Operator'
                        AND cmt_author_timestamp >= {year_ago}
                        AND cmt_author_timestamp <= {end_date}
                    ORDER BY
                        cmt_author_timestamp;
                    """

commitsDF = pd.read_sql_query(commitsquery, con=engine)
                        

In [20]:
print(commitsDF)

         cmt_author_name        cmt_author_email  repo_id    cmt_id  \
0            Sam Brannen     sbrannen@pivotal.io    25663  37761439   
1            Sam Brannen     sbrannen@pivotal.io    25663  37761427   
2            Sam Brannen     sbrannen@pivotal.io    25663  37761442   
3            Sam Brannen     sbrannen@pivotal.io    25663  37761441   
4            Sam Brannen     sbrannen@pivotal.io    25663  37761428   
...                  ...                     ...      ...       ...   
18913  Rossen Stoyanchev  rstoyanchev@pivotal.io    25663  41874397   
18914                陈其苗  chenqimiao1994@126.com    25663  41874356   
18915  Rossen Stoyanchev  rstoyanchev@pivotal.io    25663  41876347   
18916  Rossen Stoyanchev  rstoyanchev@pivotal.io    25663  41876346   
18917  Rossen Stoyanchev  rstoyanchev@pivotal.io    25663  41876348   

           cmt_author_timestamp  
0     2019-03-01 09:17:31+00:00  
1     2019-03-01 09:17:31+00:00  
2     2019-03-01 09:17:31+00:00  
3     2019-

In [21]:
total_commits = commitsDF.cmt_id.nunique()
print(total_commits)

18918


In [22]:
authorDF = pd.DataFrame()
authorDF = commitsDF.cmt_author_name.value_counts()
authorDF = authorDF.reset_index()
authorDF.columns = ['name', 'commits']
authorDF['percent'] = authorDF['commits'] / total_commits
print(authorDF.head(10))

                name  commits   percent
0        Sam Brannen     6634  0.350671
1       Phillip Webb     6561  0.346813
2    Juergen Hoeller     1369  0.072365
3  Rossen Stoyanchev     1332  0.070409
4  Sebastien Deleuze      704  0.037213
5          Phil Webb      671  0.035469
6       Brian Clozel      360  0.019029
7      Arjen Poutsma      286  0.015118
8          Rob Winch      132  0.006977
9    Stephane Nicoll      121  0.006396


In [23]:
cum_percent = 0
people_list = []

for item in authorDF.iterrows():
    name = item[1]['name']
    percent = item[1]['percent']
    commits = item[1]['commits']
    
    cum_percent += percent
    
    people_list.append([name, percent, commits])
    if cum_percent > .50:
        break

print("Contributor Risk Metric Assessment: ", end = '')
num_people = len(people_list)
if num_people < 3:
    print('AT RISK \n\nOnly ', end = '')
else:
    print('Healthy\n')
    
print(num_people, "people make up", "{:.0%}".format(cum_percent), "of the commits in the past year:")

for person in people_list:
    name = person[0]
    percent = person[1]
    commits = person[2]
    print(name, "{:.0%}".format(percent), "-", commits, "commits")
    
print("\nA healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.")

Contributor Risk Metric Assessment: AT RISK 

Only 2 people make up 70% of the commits in the past year:
Sam Brannen 35% - 6634 commits
Phillip Webb 35% - 6561 commits

A healthy project should have at a minimum 3 people who combined account for the majority of the commits. The higher this number is, the more likely your project would succeed if a leading contributor suddenly left the project.
