In [1]:
from research_utils.database.database import Database

from lifelines import CoxPHFitter
import pandas as pd
import numpy as np

In [2]:
# Create the project's Database handle; its `connection` attribute is what
# pandas uses further down to run the SQL query.
database = Database()

In [3]:
def create_indicator(item, value):
    """Return 1 when ``item == value`` and 0 otherwise.

    After the rebinding below, the public ``create_indicator`` broadcasts this
    scalar comparison element-wise over array-like arguments and returns an
    integer ndarray.

    Parameters
    ----------
    item : scalar or array-like
        Value(s) to test.
    value : scalar or array-like
        Value(s) to compare against (broadcast against ``item``).

    Returns
    -------
    numpy.ndarray of int
        1 where the elements are equal, 0 elsewhere.
    """
    return 1 if item == value else 0


# otypes is given explicitly: without it np.vectorize infers the output dtype
# by calling the function on the first element, which raises ValueError for
# empty inputs.
create_indicator = np.vectorize(create_indicator, otypes=[int])

In [33]:
# Rows of reqs_prioritization2 (pull requests excluded, total_stakeholders > 30)
# joined to issue_contributors.commit_pct on package, organization and user_id,
# plus two 0/1 flags derived from betweenness_centrality and commit_pct.
#
# Only x.* and y.commit_pct are selected. `SELECT *` over the join returned
# package, organization and user_id from both sides, which gave the DataFrame
# duplicate column labels (df['package'] was then a two-column frame rather
# than a Series). The join condition makes the dropped y columns equal to
# their x counterparts, so the DISTINCT row set is unchanged.
sql = """
SELECT DISTINCT x.*,
       y.commit_pct,
       CASE WHEN betweenness_centrality > 0 THEN 1 ELSE 0 END as central,
       CASE WHEN commit_pct > 0 THEN 1 ELSE 0 END as contributor
FROM open_source.reqs_prioritization2 x
INNER JOIN(
SELECT package, organization, user_id, commit_pct
FROM open_source.issue_contributors a
INNER JOIN (
	SELECT DISTINCT package_name as package, org_name as organization, id
	FROM open_source.packages
) b
ON (a.package_id = b.id)
) y
ON (x.package=y.package AND x.organization=y.organization AND x.user_id = y.user_id)
WHERE x.pull_request IS FALSE AND total_stakeholders > 30
"""
df = pd.read_sql(sql, database.connection)

In [34]:
# Number of rows returned by the query.
df.shape[0]

25221

In [35]:
# Boolean flag: True where commit_pct is strictly positive. This replaces the
# 0/1 `contributor` column that came back from the query.
df['contributor'] = df['commit_pct'].gt(0)

In [36]:
# Tally rows by the contributor flag.
contributor_counts = df['contributor'].value_counts()
contributor_counts

False    20903
True      4318
Name: contributor, dtype: int64

In [37]:
# Tally rows by the central flag (1 when betweenness_centrality > 0).
central_counts = df['central'].value_counts()
central_counts

0    17698
1     7523
Name: central, dtype: int64

In [38]:
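# Disabled: would add one 0/1 indicator column per unique package value,
# built with create_indicator.
#packages = list(df['package'].unique())
#for package in packages:
#    df[package] = create_indicator(df['package'], package)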

In [39]:
# Interaction term: non-zero only for rows flagged both central and contributor.
df['central_contributor'] = df['central'].mul(df['contributor'])
# Betweenness centrality rescaled by a factor of 100.
df['betweenness_centrality_100'] = df['betweenness_centrality'].mul(100)

In [42]:
# Columns handed to the survival model: the duration/event pair plus covariates.
# Smaller alternative set: ['duration', 'closed', 'central', 'contributor']
keep_cols = [
    'duration',
    'closed',
    'avg_clustering',
    'avg_min_path',
    'gini_coefficient',
    'central',
    'contributor',
]
df_regress = df.loc[:, keep_cols]

In [43]:
# Fit a Cox proportional-hazards model on the selected columns, with
# 'duration' as the time column and 'closed' as the event indicator.
cph = CoxPHFitter()
cph.fit(
    df_regress,
    duration_col='duration',
    event_col='closed',
    show_progress=True,
)

# Print the fitted-model report; the same results are available as cph.summary.
cph.print_summary()

Iteration 5: norm_delta = 0.00000, step_size = 1.0000, ll = -174406.61775, newton_decrement = 0.00000, seconds_since_start = 0.2Convergence completed after 5 iterations.
<lifelines.CoxPHFitter: fitted with 25221 observations, 6978 censored>
      duration col = 'duration'
         event col = 'closed'
number of subjects = 25221
  number of events = 18243
partial log-likelihood = -174406.62
  time fit was run = 2019-07-24 12:38:26 UTC

---
                  coef exp(coef)  se(coef)  coef lower 95%  coef upper 95% exp(coef) lower 95% exp(coef) upper 95%
avg_clustering   -0.60      0.55      0.05           -0.70           -0.49                0.49                0.61
avg_min_path     -0.88      0.41      0.03           -0.94           -0.83                0.39                0.44
gini_coefficient  4.00     54.77      0.13            3.75            4.25               42.58               70.45
central          -0.32      0.73      0.02           -0.35           -0.28                0.70   