## Task Duration and Network Centrality

In [4]:
from research_utils.database.database import Database

from lifelines import CoxPHFitter
import pandas as pd
from patsy import dmatrices
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats

### Read in Data

Read in the data for the regression model from the `reqs_prioritization` table in Postgres. This table is a compilation of all the issues opened in 2018 for networks for which fewer than 50% of the issues were submitted by non-contributors.

In [35]:
# Project database helper; its `.connection` attribute is what gets passed to
# `pd.read_sql` when the query is executed.
database = Database()

In [36]:
# Select every closed issue (excluding the 'salt' package) whose labels include
# at least one of: bug, feature, feature request, change, suggestion, enhancement.
# Labels are lower-cased inside the query, so the match is case-insensitive.
# `priority_order` numbers the issues within each (organization, package) by
# ascending `closed_at`, i.e. 1 is the earliest-closed issue of that package.
sql = """
SELECT *, ROW_NUMBER() OVER (PARTITION BY organization, package ORDER BY closed_at ASC) as priority_order
FROM open_source.reqs_prioritization x
WHERE closed_at IS NOT NULL
AND package <> 'salt'
AND ('bug' = ANY(lower(labels::text)::text[])
OR 'feature' = ANY(lower(labels::text)::text[])
OR 'feature request' = ANY(lower(labels::text)::text[])
OR 'change' = ANY(lower(labels::text)::text[])
OR 'suggestion' = ANY(lower(labels::text)::text[])
OR 'enhancement' = ANY(lower(labels::text)::text[]))
"""
# Disabled filter, kept for reference.
# NOTE(review): `<> ANY(labels)` is true as soon as any single label differs, so
# re-enabling these lines as written would not exclude such issues; the
# exclusion form is `<> ALL(labels)`.
# AND 'invalid' <> ANY(labels) AND 'duplicate' <> ANY(labels)
# AND 'question' <> ANY(labels) AND 'wontfix' <> ANY(labels)
df = pd.read_sql(sql, database.connection)

### Exploratory Data Analysis

Some simple analysis to get a feel for patterns in the data.

In [38]:
def _has_label(labels, label):
    """Return 1 if ``label`` occurs in ``labels`` (case-insensitive), else 0.

    Parameters
    ----------
    labels : iterable of str or None
        The labels attached to a single issue. ``None`` is treated as
        "no labels" instead of raising.
    label : str
        The label to look for.
    """
    if labels is None:
        return 0
    wanted = label.lower()
    return int(any(candidate.lower() == wanted for candidate in labels))


# Element-wise version of `_has_label`: pass a 1-D object array / Series whose
# elements are per-issue label collections, plus a scalar label.
# `otypes` is fixed because np.vectorize otherwise infers the output type by
# calling the function on the first element, which raises on zero-length input.
create_label_indicator = np.vectorize(_has_label, otypes=[int])

In [39]:
# Add one 0/1 indicator column per tracked label. Any space in a label is
# replaced by an underscore in the resulting column name.
default_labels = ['bug', 'feature', 'enhancement', 'suggestion', 'change']
for label in default_labels:
    indicator_col = label.replace(' ', '_')
    df[indicator_col] = create_label_indicator(df['labels'], label)

In [40]:
# Flag rows whose `commit_pct` is strictly positive.
df['contributor'] = df['commit_pct'].gt(0)

In [41]:
# Show how many issues fall on each side of the contributor flag.
df['contributor'].value_counts()

True     1278
False    1161
Name: contributor, dtype: int64

In [42]:
# Flag rows whose `betweenness_centrality` is strictly positive.
df['central'] = df['betweenness_centrality'].gt(0)

In [43]:
# Show how many issues fall on each side of the centrality flag.
df['central'].value_counts()

True     1236
False    1203
Name: central, dtype: int64

In [44]:
# Mapping of package name -> number of rows for that package in `df`.
issues_per_package = df['package'].value_counts()
package_counts = dict(zip(issues_per_package.index, issues_per_package.values))

In [45]:
@np.vectorize
def add_total_issues(package, package_counts):
    """Look up the issue count for ``package``; unknown packages count as 0."""
    return package_counts.get(package, 0)

In [46]:
# Attach each row's per-package issue count. `Series.map` does the dictionary
# lookup directly (no np.vectorize over a dict argument) and also works when
# the frame is empty; packages absent from the mapping get 0, as before.
df['total_issues'] = df['package'].map(package_counts).fillna(0).astype(int)

In [47]:
# Number of distinct packages in the full data set (a missing value, if any,
# counts as one distinct entry).
df['package'].nunique(dropna=False)

189

In [48]:
# Closing rank expressed as a fraction of the package's issue count.
# NOTE(review): `priority_order` is ranked per (organization, package) while
# `total_issues` is counted per package name only — confirm package names are
# unique across organizations.
df['priority_pct'] = df['priority_order'].div(df['total_issues'])

In [52]:
# Summary statistics of the per-row package issue count.
df['total_issues'].describe()

count    2439.000000
mean       61.663387
std        67.320498
min         1.000000
25%        13.000000
50%        33.000000
75%        83.000000
max       213.000000
Name: total_issues, dtype: float64

In [88]:
# Regression sample: packages with at least 20 issues, restricted to the
# first 20 issues closed in each.
has_enough_issues = df['total_issues'] >= 20
among_first_closed = df['priority_order'] <= 20
reg_df = df[has_enough_issues & among_first_closed]

In [89]:
# Number of distinct packages left in the regression sample.
reg_df['package'].nunique(dropna=False)

31

### OLS Sanity Check

Before getting into our more complicated model, we want to run an OLS model just to get a feel for how the model is working.

In [90]:
# Patsy formula: closing rank regressed on the network-level metrics plus the
# issue-level covariates.
ols_formula = (
    'priority_order ~ '
    'avg_clustering + avg_min_path + gini_coefficient + '
    ' betweenness_centrality + pull_request + commit_pct + bug'
)
y, X = dmatrices(ols_formula, data=reg_df, return_type='dataframe')

In [91]:
# Ordinary least squares on the design matrices built from the formula.
mod = sm.OLS(y, X)
res = mod.fit()

In [92]:
# Print the plain-text regression table.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:         priority_order   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.433
Date:                Sun, 28 Jul 2019   Prob (F-statistic):              0.189
Time:                        18:15:49   Log-Likelihood:                -1961.0
No. Observations:                 620   AIC:                             3938.
Df Residuals:                     612   BIC:                             3973.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  8

### Cox Proportional Hazard Model

Now that we have a better idea about the relationships between the variables, let's train a model that better accounts for the lifetime of the issue. Specifically, the Cox proportional hazards model assumes that each issue's hazard of being closed is a shared baseline hazard scaled by an exponential function of the covariates, and it accounts for issues that have never been closed through the use of right censoring.

In [282]:
# Columns handed to the Cox model: duration and event indicator first, then
# the covariates. The fitter is given the whole frame, so only these columns
# are kept.
keep_cols = [
    'adj_duration', 'closed',
    'avg_clustering', 'avg_min_path', 'gini_coefficient',
    'central', 'contributor', 'bug',
]
# Reduced alternative specification:
#keep_cols = ['duration', 'closed', 'central', 'contributor']
df_regress = df.loc[:, keep_cols]

In [283]:
# Fit a Cox proportional hazards model with `adj_duration` as the time axis
# and `closed` as the event indicator; every other column in `df_regress`
# enters as a covariate.
# NOTE(review): presumably `closed` is 1/True for issues that were closed and
# 0/False for still-open (censored) ones — confirm where the column is built.
cph = CoxPHFitter()
cph.fit(df_regress, duration_col='adj_duration', event_col='closed', show_progress=True)

cph.print_summary()  # access the results using cph.summary

Iteration 5: norm_delta = 0.00000, step_size = 1.0000, ll = -10658.56993, newton_decrement = 0.00000, seconds_since_start = 0.2Convergence completed after 5 iterations.
<lifelines.CoxPHFitter: fitted with 2349 observations, 879 censored>
      duration col = 'adj_duration'
         event col = 'closed'
number of subjects = 2349
  number of events = 1470
partial log-likelihood = -10658.57
  time fit was run = 2019-07-26 03:20:46 UTC

---
                  coef exp(coef)  se(coef)  coef lower 95%  coef upper 95% exp(coef) lower 95% exp(coef) upper 95%
avg_clustering   -1.37      0.25      0.20           -1.77           -0.97                0.17                0.38
avg_min_path     -0.79      0.45      0.14           -1.07           -0.52                0.34                0.59
gini_coefficient  5.40    221.67      0.58            4.25            6.55               70.43              697.70
central          -0.22      0.80      0.07           -0.36           -0.09                0.70     