## Task Duration and Network Centrality

In [1]:
from research_utils.database.database import Database

from lifelines import CoxPHFitter
import pandas as pd
from patsy import dmatrices
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats

### Read in Data

Read in the data for the regression model from the `reqs_prioritization` table in Postgres. This table is a compilation of all the issues opened in 2018 for networks for which fewer than 50% of the issues were submitted by non-contributors.

In [2]:
# Instantiate the project's Database helper; the query cell below reads
# through its `connection` attribute.
database = Database()

In [284]:
# Pull every issue (excluding the 'salt' package) that carries at least one
# bug- or feature-style label. The label array is lower-cased once and tested
# with the Postgres array-overlap operator (&&) instead of repeating the same
# cast in six separate `= ANY(...)` clauses; the selected rows are the same.
sql = """
SELECT *
FROM open_source.reqs_prioritization x
WHERE package <> 'salt'
AND lower(labels::text)::text[] && ARRAY[
    'bug', 'feature', 'feature request',
    'change', 'suggestion', 'enhancement'
]
"""
# Disabled idea: also drop issues labelled invalid / duplicate / question /
# wontfix. Note that `'invalid' <> ANY(labels)` is true as soon as ANY other
# label is present, so it would not exclude them; the working forms are
# `NOT ('invalid' = ANY(labels))` or `'invalid' <> ALL(labels)`.
df = pd.read_sql(sql, database.connection)

### Exploratory Data Analysis

Some simple analysis to get a feel for patterns in the data.

In [285]:
# Summary statistics for the numeric columns returned by the query.
df.describe()

Unnamed: 0,duration,gini_coefficient,avg_clustering,avg_min_path,total_stakeholders,betweenness_centrality,commit_pct,adj_duration,closed
count,2063.0,3017.0,3017.0,3017.0,3017.0,3017.0,3017.0,3017.0,3017.0
mean,41.502666,0.545507,0.555924,2.151024,236.50116,0.18157,0.113625,154.549553,0.683792
std,71.349,0.052324,0.139873,0.223417,148.827077,0.331294,0.25432,185.572037,0.465072
min,0.0,0.133333,0.0,1.333333,12.0,0.0,0.0,0.0,0.0
25%,0.0,0.520255,0.477933,2.014563,126.0,0.0,0.0,3.0,0.0
50%,8.0,0.554637,0.568562,2.084162,227.0,0.0,0.0,41.0,1.0
75%,48.0,0.574429,0.66133,2.307071,310.0,0.125783,0.029061,310.0,1.0
max,471.0,0.663207,0.916667,3.052027,726.0,0.999034,1.0,570.0,1.0


In [286]:
@np.vectorize
def compute_adj_duration(duration, max_value):
    """Return ``duration``, or ``max_value`` when the duration is missing.

    Parameters
    ----------
    duration : float or None
        Recorded duration of one issue; ``None`` or NaN means no duration
        was recorded.
    max_value : float
        Stand-in value used for a missing duration.

    Returns
    -------
    float
        ``duration`` when present, otherwise ``max_value``.
    """
    # Test for None first: np.isnan(None) raises TypeError, so checking NaN
    # first (as before) made the None branch unreachable.
    if duration is None or np.isnan(duration):
        return max_value
    return duration

In [287]:
# Issues with no recorded duration get the longest recorded duration as a
# stand-in. Series.fillna does this in one vectorized step and, unlike a
# np.vectorize call without otypes, also works on an empty frame.
# NOTE(review): the describe() output above shows the table already supplies
# an `adj_duration` column (max 570 vs. a `duration` max of 471); this
# assignment overwrites it -- confirm that is intended.
df['adj_duration'] = df['duration'].fillna(df['duration'].max())

In [288]:
@np.vectorize
def create_label_indicator(labels, label):
    """Flag whether ``label`` appears in ``labels``, ignoring case.

    Parameters
    ----------
    labels : iterable of str
        Labels attached to one issue.
    label : str
        Label to look for.

    Returns
    -------
    int
        1 when a case-insensitive match exists, otherwise 0.
    """
    lowered = [entry.lower() for entry in labels]
    return int(label.lower() in lowered)

In [289]:
# Add one 0/1 indicator column per label of interest; spaces in a label become
# underscores so the column name can be used in a model formula.
# NOTE(review): the SQL filter also selects on 'feature request', which gets
# no indicator column here -- confirm whether that is intentional.
default_labels = ['bug', 'feature', 'enhancement', 'suggestion', 'change']
for label in default_labels:
    column_name = label.replace(' ', '_')
    df[column_name] = create_label_indicator(df['labels'], label)

In [290]:
# Boolean flag: True where the row's commit_pct is strictly positive.
df['contributor'] = df['commit_pct'].gt(0)

In [291]:
# Count rows with commit_pct > 0 (True) versus the rest (False).
df['contributor'].value_counts()

False    1682
True     1335
Name: contributor, dtype: int64

In [292]:
# Boolean flag: True where the row's betweenness_centrality is strictly positive.
df['central'] = df['betweenness_centrality'].gt(0)

In [293]:
# Count rows with betweenness_centrality > 0 (True) versus the rest (False).
df['central'].value_counts()

False    1633
True     1384
Name: central, dtype: int64

In [294]:
# Map each package name to the number of issue rows it has in df.
issues_per_package = df['package'].value_counts()
package_counts = dict(issues_per_package)

In [295]:
@np.vectorize
def add_total_issues(package, package_counts):
    """Look up how many issues ``package`` has.

    Parameters
    ----------
    package : hashable
        Package name to look up.
    package_counts : dict
        Mapping from package name to issue count.

    Returns
    -------
    int
        The count stored for ``package``, or 0 when it has no entry.
    """
    return package_counts.get(package, 0)

In [296]:
# Attach to every row the issue count of its package, looked up in the
# package_counts mapping built from this same frame.
df['total_issues'] = add_total_issues(df['package'], package_counts)

In [297]:
# Keep only rows whose package has at least 5 issues in df.
count_df = df.loc[df['total_issues'].ge(5)]

In [298]:
# Number of distinct packages left after the minimum-issue filter
# (dropna=False keeps the count equal to len(unique())).
count_df['package'].nunique(dropna=False)

127

#### Histogram of the duration of issues

In [299]:
#plt.figure(figsize=(12, 7))
#sns.distplot(df['adj_duration'], kde=False, color='blue', bins=30)
#plt.title('Duration of Issues', fontsize=18)
#plt.xlabel('Time', fontsize=16)
#plt.ylabel('Count', fontsize=16)
# plt.xlim([0,.7])

In [300]:
#plt.figure(figsize=(12, 7))
#sns.distplot(df['gini_coefficient'], kde=False, color='blue', bins=30)
#plt.title('Gini Coefficient', fontsize=18)
#plt.xlabel('Gini Coefficient', fontsize=16)
#plt.ylabel('Count', fontsize=16)

### OLS Sanity Check

Before getting into our more complicated model, we run an OLS model as a sanity check, just to get a feel for how the variables relate to issue duration.

In [316]:
# Build the response vector and design matrix for the OLS sanity check from
# the filtered frame (packages with at least 5 issues).
y, X = dmatrices(
    'adj_duration ~ avg_clustering + avg_min_path + gini_coefficient + '
    'central + contributor + bug + pull_request',
    data=count_df,
    return_type='dataframe',
)

In [317]:
# Ordinary least squares of the response on the design matrix built above.
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()

In [318]:
# Plain-text regression table for the OLS fit above.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           adj_duration   R-squared:                       0.152
Model:                            OLS   Adj. R-squared:                  0.150
Method:                 Least Squares   F-statistic:                     72.24
Date:                Thu, 25 Jul 2019   Prob (F-statistic):           2.05e-96
Time:                        23:23:20   Log-Likelihood:                -18850.
No. Observations:                2828   AIC:                         3.772e+04
Df Residuals:                    2820   BIC:                         3.776e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              207.0908 

### Cox Proportional Hazard Model

Now that we have a better idea about the relationships between the variables, let's train a model that better accounts for the lifetime of the issue. Specifically, the Cox proportional hazards model leaves the baseline hazard unspecified and assumes that each covariate scales the hazard of an issue being closed by a constant factor over time. It accounts for issues that have not yet been closed by treating them as right-censored observations.

In [282]:
# Columns handed to the survival model: the duration and event columns first,
# then the covariates.
# NOTE(review): this selects from `df`, whereas the OLS check used `count_df`
# (packages with at least 5 issues) -- confirm which sample is intended.
keep_cols = [
    'adj_duration',
    'closed',
    'avg_clustering',
    'avg_min_path',
    'gini_coefficient',
    'central',
    'contributor',
    'bug',
]
df_regress = df.loc[:, keep_cols]

In [283]:
# Fit the Cox model on df_regress: `adj_duration` is the duration column and
# `closed` is the event column (whether the close event was observed).
# NOTE(review): this cell's execution count (283) precedes the query cell
# (284), and the saved output reports 2349 observations while df.describe()
# shows 3017 rows -- the output below looks stale; re-run the notebook top to
# bottom to confirm.
cph = CoxPHFitter()
cph.fit(df_regress, duration_col='adj_duration', event_col='closed', show_progress=True)

cph.print_summary()  # access the results using cph.summary

Iteration 5: norm_delta = 0.00000, step_size = 1.0000, ll = -10658.56993, newton_decrement = 0.00000, seconds_since_start = 0.2Convergence completed after 5 iterations.
<lifelines.CoxPHFitter: fitted with 2349 observations, 879 censored>
      duration col = 'adj_duration'
         event col = 'closed'
number of subjects = 2349
  number of events = 1470
partial log-likelihood = -10658.57
  time fit was run = 2019-07-26 03:20:46 UTC

---
                  coef exp(coef)  se(coef)  coef lower 95%  coef upper 95% exp(coef) lower 95% exp(coef) upper 95%
avg_clustering   -1.37      0.25      0.20           -1.77           -0.97                0.17                0.38
avg_min_path     -0.79      0.45      0.14           -1.07           -0.52                0.34                0.59
gini_coefficient  5.40    221.67      0.58            4.25            6.55               70.43              697.70
central          -0.22      0.80      0.07           -0.36           -0.09                0.70     