# Step 3.1: GitHub Status Analysis

In [1]:
import os
from os import listdir
from os.path import isfile, join
from collections import Counter
import pandas as pd
import json

In [2]:
# Load the per-project results table from the status-checker CSV.
results_csv = 'GitHubStatusCheckerResults.csv'
df = pd.read_csv(results_csv)
df

Unnamed: 0,Project,Buildable,Success,Fail,No,TotalCommits,% SUCCESS
0,Hystrix,2108,1115,994,0,2109,52.89
1,Jest,1146,457,690,0,1147,39.88
2,Twitter4J,2318,851,1467,0,2318,36.71
3,assertj-core,2855,2435,420,0,2855,85.29
4,aws-sdk-java,2496,2231,265,0,2496,89.38
5,bookkeeper,2230,250,2008,0,2258,11.21
6,commons-lang,5723,4943,781,0,5724,86.37
7,connectbot,1727,0,1791,0,1791,0.0
8,docker-java,1982,977,1006,0,1983,49.29
9,elasticsearch-hadoop,1983,0,1983,0,1983,0.0


In [3]:
# Summary statistics of the '% SUCCESS' column across all rows of df.
success_pct = df['% SUCCESS']
success_pct.describe()

count    37.000000
mean     43.040000
std      29.895102
min       0.000000
25%      23.090000
50%      46.390000
75%      56.950000
max      92.460000
Name: % SUCCESS, dtype: float64

In [4]:
# Summary statistics of 'TotalCommits', kept as a one-column DataFrame.
df.loc[:, ['TotalCommits']].describe()

Unnamed: 0,TotalCommits
count,37.0
mean,3111.135135
std,1825.441243
min,1147.0
25%,1791.0
50%,2401.0
75%,3693.0
max,7847.0


In [5]:
# Column-wise totals over every row of df.
count_columns = ["Buildable", "Success", "Fail", "No", "TotalCommits"]
df[count_columns].sum()

Buildable       110594
Success          52580
Fail             62532
No                   0
TotalCommits    115112
dtype: int64

In [6]:
# Column-wise means over every row of df (the '% SUCCESS' mean is therefore
# an unweighted average of the per-row percentages).
mean_columns = ["Success", "Fail", "No", "TotalCommits", "% SUCCESS"]
df[mean_columns].mean()

Success         1421.081081
Fail            1690.054054
No                 0.000000
TotalCommits    3111.135135
% SUCCESS         43.040000
dtype: float64

In [7]:
# First quartile of 'TotalCommits'; used below as the short/medium boundary.
commits_per_row = df['TotalCommits']
q1 = commits_per_row.quantile(0.25)
q1

1791.0

In [8]:
# Third quartile of 'TotalCommits'; used below as the medium/large boundary.
commits_per_row = df['TotalCommits']
q3 = commits_per_row.quantile(0.75)
q3

3693.0

### Short projects

In [9]:
# Short projects: TotalCommits strictly below the first quartile (< 1791 here).
is_short = df['TotalCommits'] < q1
short_df = df[is_short]
short_df['% SUCCESS'].describe()

count     9.000000
mean     36.174444
std      26.634741
min       0.000000
25%      28.780000
50%      38.050000
75%      39.880000
max      87.360000
Name: % SUCCESS, dtype: float64

In [14]:
# Column-wise totals restricted to the short projects.
count_columns = ["Buildable", "Success", "Fail", "No", "TotalCommits"]
short_df[count_columns].sum()

Buildable       11168
Success          3569
Fail             9588
No                  0
TotalCommits    13157
dtype: int64

### Medium projects

In [10]:
# Medium projects: q1 <= TotalCommits < q3 (i.e. [1791, 3693) for this dataset).
# Compare against q1/q3 directly instead of formatting them with %d into a
# query string: %d truncates a fractional quantile to an integer, which would
# make this bucket overlap the short bucket (< q1) and leave a gap before the
# large bucket (>= q3). A boolean mask also matches how those buckets are built.
is_medium = (df['TotalCommits'] >= q1) & (df['TotalCommits'] < q3)
medium_df = df[is_medium]
medium_df['% SUCCESS'].describe()

count    18.000000
mean     43.555556
std      30.770324
min       0.000000
25%      14.517500
50%      50.020000
75%      60.842500
max      89.380000
Name: % SUCCESS, dtype: float64

In [11]:
# Column-wise totals restricted to the medium projects.
count_columns = ["Buildable", "Success", "Fail", "No", "TotalCommits"]
medium_df[count_columns].sum()

Buildable       43507
Success         20267
Fail            24641
No                  0
TotalCommits    44908
dtype: int64

### Large projects

In [12]:
# Large projects: TotalCommits at or above the third quartile (>= 3693 here).
is_large = df['TotalCommits'] >= q3
large_df = df[is_large]
large_df['% SUCCESS'].describe()

count    10.000000
mean     48.291000
std      32.826117
min       0.000000
25%      23.120000
50%      50.295000
75%      78.895000
max      92.460000
Name: % SUCCESS, dtype: float64

In [13]:
# Column-wise totals restricted to the large projects.
count_columns = ["Buildable", "Success", "Fail", "No", "TotalCommits"]
large_df[count_columns].sum()

Buildable       55919
Success         28744
Fail            28303
No                  0
TotalCommits    57047
dtype: int64

## Check number of builds of each Build System

In [11]:
# Walk every entry under the results root and, for each file found in its
# experiment_1/build_files/ directory, record the 'build_system' value stored
# in that JSON file (presumably one file per analysed commit — TODO confirm).
results_root = "/home/results/GitHub/"
projects = listdir(results_root)
build_systems = []
for project_name in projects:
    build_dir = join(results_root, project_name, "experiment_1/build_files/")
    for file_name in listdir(build_dir):
        with open(join(build_dir, file_name)) as handle:
            build_systems.append(json.load(handle)['build_system'])

In [12]:
# Tally how many collected build files reported each build system.
counter = Counter()
counter.update(build_systems)
counter

Counter({'Maven': 83951, 'Gradle': 11416, 'NOT_FOUND': 4518, 'Ant': 15227})

In [13]:
# Share of each build system relative to the sum of 'TotalCommits' in df.
# Only Maven, Gradle and Ant are reported; 'NOT_FOUND' entries are not printed.
total_commits = df['TotalCommits'].sum()
maven_share = counter['Maven'] * 100 / total_commits
gradle_share = counter['Gradle'] * 100 / total_commits
ant_share = counter['Ant'] * 100 / total_commits
print("Maven: {:.2f}%".format(maven_share))
print("Gradle: {:.2f}%".format(gradle_share))
print("Ant: {:.2f}%".format(ant_share))

Maven: 72.93%
Gradle: 9.92%
Ant: 13.23%
