## Work Notebook for Requirements Engineering Journal Article

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from patsy import dmatrices
import pyDEA
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats.diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor

import research_utils.utils as utils
from research_utils.database.database import Database
from research_utils.analytics.beta_regression import Beta
from research_utils.analytics.lda import TopicModel
from research_utils.analytics.dea import DEA

In [2]:
# Create the project Database wrapper; its `connection` attribute is what
# pd.read_sql uses in the next cell.
database = Database()

In [None]:
# Run utils.INPUT_QUERY against the database and load the result into a
# DataFrame. It is merged with the topic averages on package/organization below.
data = pd.read_sql(utils.INPUT_QUERY, database.connection)

In [None]:
# Load the topic model. NOTE(review): 25 is presumably the number of topics
# (later cells read tm.num_topics) -- confirm against TopicModel.
tm = TopicModel(25, load=True)

In [None]:
# Topic model results; later cells read its 'topics', 'organization' and
# 'package' columns.
df = tm.load_topic_model_results()

In [None]:
# Preview the first rows of the topic model results.
df.head()

In [None]:
# Spread each row's topic vector into one column per topic (NaN when the row
# has no topics) and add the squared value next to it.
for i in range(tm.num_topics):
    topic_col = 'topic_{}'.format(i)
    df[topic_col] = [x[i] if x else np.nan for x in df['topics']]
    df[topic_col + '_2'] = df[topic_col]**2

In [None]:
# Column-name lists for every topic except the last one.
# NOTE(review): the last topic is presumably left out to avoid perfect
# collinearity between topic shares -- confirm.
topic_ids = range(tm.num_topics - 1)

simple_reg_topics = ['topic_{}'.format(i) for i in topic_ids]
simple_reg_topics_2 = ['topic_{}_2'.format(i) for i in topic_ids]

# Interleaved as topic_0, topic_0_2, topic_1, topic_1_2, ...
all_topics = [col
              for pair in zip(simple_reg_topics, simple_reg_topics_2)
              for col in pair]

# Every ordered pair (i, j), including i == j and both (i, j) and (j, i).
reg_topics = ["topic_{}*topic_{}".format(i, j)
              for i in topic_ids
              for j in topic_ids]

In [None]:
# Average every topic column per (organization, package).
# The topic columns are selected *before* aggregating so that mean() never sees
# non-numeric columns (such as the raw 'topics' lists); on pandas >= 2.0
# aggregating those raises a TypeError instead of silently dropping them.
mean_topics = df.groupby(['organization', 'package'])[all_topics].mean()

In [None]:
# Join the per-project topic averages onto the input data. merge() defaults to
# an inner join, so only (package, organization) pairs present in both are kept.
all_data = mean_topics.merge(data, on=['package', 'organization'])

In [None]:
# Derived columns used by the regressions below, followed by a CSV snapshot.
crowd_share = all_data['crowd_pct']
total_issues = all_data['total_issues']

# NOTE: despite the "_sq" suffix this column holds the square ROOT of crowd_pct;
# the squared value lives in crowd_pct_2.
all_data['crowd_pct_sq'] = np.sqrt(crowd_share)
all_data['crowd_pct_2'] = crowd_share**2
all_data['issues_over_time'] = (total_issues / all_data['project_age'])*90

# Interaction of each network measure with the crowd share.
for measure in ('avg_clustering', 'avg_min_path', 'gini_coefficient'):
    all_data['{}Xcrowd_pct'.format(measure)] = all_data[measure] * crowd_share

# The 1e-7 offset keeps log() finite when a duration is exactly zero.
for source, target in (('duration_median', 'log_duration'),
                       ('duration_mean', 'log_duration_mean')):
    all_data[target] = np.log(all_data[source] + 1e-7)

# Share of all issues counted in each under_<days> column.
for days in (30, 60, 90):
    all_data['pct_under_{}'.format(days)] = all_data['under_{}'.format(days)] / total_issues

all_data['issues_per_user'] = total_issues / all_data['num_users']
all_data.to_csv('/home/matt/research_data.csv', index=False)

In [None]:
# Covariates shared by the baseline regressions: crowd share (linear and
# squared), the three network measures, their crowd-share interactions, and
# two project-level controls.
network_columns = ['avg_clustering', 'avg_min_path', 'gini_coefficient']
base_columns = (['crowd_pct', 'crowd_pct_2']
                + network_columns
                + [col + 'Xcrowd_pct' for col in network_columns]
                + ['total_contributors', 'project_age'])

# Right-hand side of a patsy formula, e.g. "crowd_pct + crowd_pct_2 + ...".
base_features = ' + '.join(base_columns)

In [None]:
def glm_marginal_effect(variable, res, X, all_data):
    """Computes the average marginal effect of a variable in a log-link GLM.

    For every observation the effect is ``prediction * coefficient``. The
    coefficient is ``res.params[variable]`` plus, when the fitted model has a
    ``<variable>Xcrowd_pct`` interaction term, ``crowd_pct`` times that
    interaction coefficient. Scaling by the prediction is the derivative of
    the mean only when the model uses a log link, which is how the GLMs in
    this notebook are fit.

    Parameters
    ----------
    variable : str
        the variable for which we would like to calculate the marginal effect
    res : fitted model results
        must expose ``params`` (a label-indexed Series) and ``predict``
    X : pd.DataFrame
        the design matrix the model was fit on; passed to ``res.predict``
    all_data : pd.DataFrame
        the full set of input data; only its index and ``crowd_pct`` column
        are read, and it is never modified

    Returns
    -------
    marginal_effect : float
        mean of the per-observation effects (NaN rows are skipped)
    """
    coefficient = res.params[variable]
    cross_term = '{}Xcrowd_pct'.format(variable)

    # Collect just the two columns needed on all_data's index rather than
    # deep-copying the whole data set on every call.
    effects = pd.DataFrame(index=all_data.index)
    if cross_term in res.params:
        effects['effect'] = coefficient + all_data['crowd_pct'] * res.params[cross_term]
    else:
        effects['effect'] = coefficient

    # Assigning into the frame aligns the predictions with all_data's index, so
    # rows that are missing from X become NaN and drop out of the mean.
    effects['prediction'] = res.predict(X)
    return (effects['effect'] * effects['prediction']).mean()

### Exploratory Data Analysis

In [None]:
# Scatter of crowd share (y) against average clustering (x).
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(x='avg_clustering', y='crowd_pct', data=all_data, color='blue', ax=ax)
ax.set_title('Avg Clustering vs Crowd Pct', fontsize=18)
ax.set_xlabel('Avg Clustering', fontsize=16)
ax.set_ylabel('Crowd Pct', fontsize=16)

In [None]:
# Scatter of crowd share (y) against the Gini coefficient (x).
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(x='gini_coefficient', y='crowd_pct', data=all_data, color='blue', ax=ax)
ax.set_title('Gini Coefficient vs Crowd Pct', fontsize=18)
ax.set_xlabel('Gini Coefficient', fontsize=16)
ax.set_ylabel('Crowd Pct', fontsize=16)

In [None]:
# Scatter of crowd share (y) against average minimum path length (x).
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(x='avg_min_path', y='crowd_pct', data=all_data, color='blue', ax=ax)
ax.set_title('Avg Min Path vs Crowd Pct', fontsize=18)
ax.set_xlabel('Avg Min Path', fontsize=16)
ax.set_ylabel('Crowd Pct', fontsize=16)

### Regression on Average Comments

In [None]:
# Histogram of the average comment activity column. The title and x label
# describe the plotted variable (avg_comments), not requirement durations.
plt.figure(figsize=(12, 5))
sns.distplot(all_data['avg_comments'], kde=False, color='blue', bins=100)
plt.title('Comment Activity in Open Source Projects', fontsize=18)
plt.xlabel('Average Comment Activity', fontsize=16)
plt.ylabel('Count', fontsize=16)

In [None]:
# Summary statistics of the response variable for this section.
all_data['avg_comments'].describe()

In [None]:
# Baseline OLS of average comment activity on the shared base covariates.
# `yhat` keeps this baseline model's in-sample predictions.
base_formula = 'avg_comments ~ ' + base_features
y, X = dmatrices(base_formula, data=all_data, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()
yhat = res.predict(X)

print(res.summary())

In [None]:
# Extended comment-activity model: the crowd/network covariates and their
# interactions, project_age, main effects for topic_0..topic_23, and a fixed
# subset of pairwise topic interactions (how the subset was chosen is not shown
# in this notebook). Unlike base_features, total_contributors is not included.
formula = """
avg_comments ~ crowd_pct + crowd_pct_2 + avg_clustering + 
    avg_min_path + gini_coefficient + avg_clusteringXcrowd_pct + 
    avg_min_pathXcrowd_pct + gini_coefficientXcrowd_pct + project_age + 
    topic_0 + topic_1 + topic_2 + topic_3 + topic_4 + topic_5 + 
    topic_6 + topic_7 + topic_8 + topic_9 + topic_10 + topic_11 + 
    topic_12 + topic_13 + topic_14 + topic_15 + topic_16 + topic_17 + 
    topic_18 + topic_19 + topic_20 + topic_21 + topic_22 + topic_23 + 
    topic_0:topic_1 + topic_0:topic_7 + topic_0:topic_9 + topic_0:topic_10 + 
    topic_0:topic_18 + topic_0:topic_19 + topic_0:topic_21 + 
    topic_1:topic_2 + topic_1:topic_9 + topic_1:topic_10 + topic_1:topic_13 + 
    topic_1:topic_14 + topic_1:topic_21 + topic_2:topic_11 + 
    topic_2:topic_12 + topic_2:topic_18 + topic_2:topic_20 + 
    topic_2:topic_22 + topic_3:topic_5 + topic_3:topic_11 + topic_3:topic_21 + 
    topic_4:topic_8 + topic_4:topic_18 + topic_5:topic_7 + topic_5:topic_9 + 
    topic_5:topic_11 + topic_5:topic_15 + topic_5:topic_17 + 
    topic_5:topic_22 + topic_6:topic_7 + topic_6:topic_8 + topic_6:topic_12 + 
    topic_6:topic_14 + topic_6:topic_16 + topic_6:topic_17 + 
    topic_6:topic_18 + topic_6:topic_19 + topic_7:topic_8 + topic_7:topic_16 + 
    topic_7:topic_23 + topic_8:topic_9 + topic_8:topic_15 + topic_9:topic_10 + 
    topic_9:topic_16 + topic_9:topic_19 + topic_9:topic_20 + 
    topic_9:topic_22 + topic_10:topic_12 + topic_10:topic_16 + 
    topic_10:topic_20 + topic_10:topic_22 + topic_10:topic_23 + 
    topic_11:topic_13 + topic_11:topic_14 + topic_11:topic_15 + 
    topic_11:topic_16 + topic_11:topic_17 + topic_11:topic_22 + 
    topic_14:topic_17 + topic_14:topic_20 + topic_14:topic_22 + 
    topic_15:topic_16 + topic_15:topic_18 + topic_15:topic_21 + 
    topic_16:topic_19 + topic_16:topic_22 + topic_17:topic_18 + 
    topic_17:topic_20 + topic_17:topic_22 + topic_18:topic_23 + 
    topic_19:topic_20 + topic_19:topic_21 + topic_19:topic_23 + 
    topic_20:topic_21 + topic_21:topic_22 + topic_3:topic_8 + 
    topic_7:topic_18 + topic_5:topic_12

"""
# Build the response vector and design matrix from the formula, then fit OLS.
# y and X are reused by the Gamma GLM cell further down.
y, X = dmatrices(formula,
                 data=all_data, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

In [None]:
# Histogram of the OLS residuals, clipped to [-2, 2] on the x axis.
fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(res.resid, kde=False, color='blue', bins=50, ax=ax)
ax.set_title('OLS Residuals', fontsize=18)
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
ax.set_xlim([-2.0, 2.0])

In [None]:
# OLS residuals (x) against the observed comment activity (y).
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(x=res.resid, y=all_data['avg_comments'], color='blue', ax=ax)
ax.set_title('Residuals vs Actuals', fontsize=18)
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Comment Activity', fontsize=16)
ax.set_ylim([0, 8])
ax.set_xlim([-2.0, 2.0])

In [None]:
# Refit the same y / X as a Gamma GLM with a log link.
gamma_log_family = sm.families.Gamma(link=sm.families.links.log)
mod = sm.GLM(y, X, family=gamma_log_family)
res = mod.fit()

# Share of the null deviance explained by the model, then the parameter count.
deviance_explained = 1 - (res.deviance/res.null_deviance)
print(deviance_explained)
print(len(res.params))
print(res.summary())

In [None]:
# Average marginal effect of avg_clustering under the currently fitted `res`.
glm_marginal_effect('avg_clustering', res, X, all_data)

In [None]:
# Average marginal effect of avg_min_path under the currently fitted `res`.
glm_marginal_effect('avg_min_path', res, X, all_data)

In [None]:
# Average marginal effect of gini_coefficient under the currently fitted `res`.
glm_marginal_effect('gini_coefficient', res, X, all_data)

In [None]:
# Store the current model's in-sample predictions alongside the input data.
all_data['comment_prediction'] = res.predict(X)

In [None]:
# Absolute prediction error (x) against observed comment activity (y).
# The error is taken from all_data['comment_prediction'], i.e. the predictions
# stored in the cell above, rather than from `yhat`, which is only ever
# assigned by the baseline OLS cell and therefore describes a different model.
glm_abs_error = abs(all_data['comment_prediction'] - all_data['avg_comments'])

plt.figure(figsize=(12, 5))
sns.scatterplot(x=glm_abs_error, y=all_data['avg_comments'], color='blue')
plt.title('Residuals vs Actuals', fontsize=18)
plt.xlabel('Absolute Residuals', fontsize=16)
plt.ylabel('Comment Activity', fontsize=16)
plt.ylim([0, 8])
plt.xlim([0, 1.5])

In [None]:
# Average marginal effect of crowd_pct on comment activity at the observed data.
crowd_pct = all_data['crowd_pct']
avg_clustering = all_data['avg_clustering']
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']
coef = res.params

# Rebuild every crowd-dependent column on a copy of the design matrix.
effects_data = X.copy(deep=True)
effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
effects_data['crowd_pct'] = crowd_pct
effects_data['crowd_pct_2'] = crowd_pct**2

# Predictions must be taken before any helper columns are added to the frame.
predictions = res.predict(effects_data)
effects_data['predictions'] = predictions
effects_data['crowd_pct_2_effect'] = effects_data['predictions'] * coef['crowd_pct_2']

# Derivative of the linear predictor w.r.t. crowd_pct, minus the quadratic part.
effects_data['crowd_pct_param'] = (coef['crowd_pct']
                                   + avg_clustering * coef['avg_clusteringXcrowd_pct']
                                   + gini_coefficient * coef['gini_coefficientXcrowd_pct']
                                   + avg_min_path * coef['avg_min_pathXcrowd_pct'])
effects_data['total_effect'] = effects_data['predictions'] * (2 * coef['crowd_pct_2'] * crowd_pct
                                                              + effects_data['crowd_pct_param'])

avg_effect = effects_data['total_effect'].mean()
print(avg_effect)

In [None]:
# Average marginal effect of crowd_pct on comment activity, evaluated at
# crowd_pct = 0.00, 0.01, ..., 0.99 with the network measures at their
# observed values.
plt.figure(figsize=(12, 5))

marginal_effects = {'crowd_pct': [], 'effect': []}
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']
avg_clustering = all_data['avg_clustering']

# Derivative of the linear predictor w.r.t. crowd_pct, excluding the quadratic
# term. It does not depend on the swept value, so it is computed once.
crowd_pct_param = (res.params['crowd_pct']
                   + avg_clustering * res.params['avg_clusteringXcrowd_pct']
                   + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                   + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

for i in range(100):
    effects_data = X.copy(deep=True)
    crowd_pct = i/100
    effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
    effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
    effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
    effects_data['crowd_pct'] = crowd_pct
    effects_data['crowd_pct_2'] = crowd_pct**2

    # Effect on the mean = prediction * derivative of the linear predictor.
    effects_data['predictions'] = res.predict(effects_data)
    effects_data['crowd_pct_param'] = crowd_pct_param
    effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                  + effects_data['crowd_pct_param'])

    avg_effect = effects_data['total_effect'].mean()
    marginal_effects['crowd_pct'].append(crowd_pct)
    marginal_effects['effect'].append(avg_effect)

marginal_effects = pd.DataFrame(marginal_effects)
sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'])

# No legend: the single line carries no label, so plt.legend() would have
# nothing to show.
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Comment Activity', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on comment activity, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of average clustering.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']

for avg_clustering in [0.4, 0.5, 0.6, 0.7, 0.8]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Clustering: {} '.format(avg_clustering)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term; constant across the inner sweep.
    crowd_pct_param = (res.params['crowd_pct']
                       + avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'avg_clustering' main-effect column copied from X keeps its observed
        # values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Comment Activity', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on comment activity, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of average minimum path.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_clustering = all_data['avg_clustering']
gini_coefficient = all_data['gini_coefficient']

for avg_min_path in [2, 2.25, 2.5, 2.75, 3]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Min Path: {} '.format(avg_min_path)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term; constant across the inner sweep.
    crowd_pct_param = (res.params['crowd_pct']
                       + avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'avg_min_path' main-effect column copied from X keeps its observed
        # values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Comment Activity', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on comment activity, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of the Gini coefficient.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_clustering = all_data['avg_clustering']
avg_min_path = all_data['avg_min_path']

for gini_coefficient in [0.4, 0.5, 0.6, 0.7]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Gini Coefficient: {} '.format(gini_coefficient)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term; constant across the inner sweep.
    crowd_pct_param = (res.params['crowd_pct']
                       + avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'gini_coefficient' main-effect column copied from X keeps its
        # observed values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Comment Activity', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

### Regression on Issues Submitted Over Time

In [None]:
# Histogram of issues_over_time (total_issues / project_age * 90). The x label
# names the plotted variable; it is not a requirement duration.
plt.figure(figsize=(12, 5))
sns.distplot(all_data['issues_over_time'], kde=False, color='blue', bins=100)
plt.title('Issue Volume in Open Source Projects', fontsize=18)
plt.xlabel('Issues over Time', fontsize=16)
plt.ylabel('Count', fontsize=16)

In [None]:
# Summary statistics of the response variable for this section.
all_data['issues_over_time'].describe()

In [None]:
# Baseline OLS of issue volume on the shared base covariates.
# `yhat` keeps this baseline model's in-sample predictions.
base_formula = 'issues_over_time ~ ' + base_features
y, X = dmatrices(base_formula, data=all_data, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()
yhat = res.predict(X)

print(res.summary())

In [None]:
# Extended issue-volume model: crowd/network covariates, total_contributors,
# project_age, main effects for topic_0..topic_23, and a fixed subset of
# pairwise topic interactions (how the subset was chosen is not shown in this
# notebook). Note that this formula has no crowd_pct main-effect term; crowd_pct
# enters only through crowd_pct_2 and the *Xcrowd_pct interactions.
formula = """
issues_over_time ~ crowd_pct_2 + avg_clustering + 
    gini_coefficient + avg_clusteringXcrowd_pct + avg_min_pathXcrowd_pct + 
    gini_coefficientXcrowd_pct + total_contributors + project_age + 
    topic_0 + topic_1 + topic_2 + topic_3 + topic_4 + topic_5 + 
    topic_6 + topic_7 + topic_8 + topic_9 + topic_10 + topic_11 + 
    topic_12 + topic_13 + topic_14 + topic_15 + topic_16 + topic_17 + 
    topic_18 + topic_19 + topic_20 + topic_21 + topic_22 + topic_23 + 
    avg_min_path + topic_0:topic_1 + topic_0:topic_2 + topic_0:topic_3 + 
    topic_0:topic_5 + topic_0:topic_6 + topic_0:topic_7 + topic_0:topic_9 + 
    topic_0:topic_10 + topic_0:topic_11 + topic_0:topic_13 + 
    topic_0:topic_14 + topic_0:topic_15 + topic_0:topic_16 + 
    topic_0:topic_17 + topic_0:topic_18 + topic_0:topic_20 + 
    topic_0:topic_22 + topic_0:topic_23 + topic_1:topic_2 + topic_1:topic_4 + 
    topic_1:topic_5 + topic_1:topic_6 + topic_1:topic_10 + topic_1:topic_12 + 
    topic_1:topic_13 + topic_1:topic_14 + topic_1:topic_16 + 
    topic_1:topic_17 + topic_1:topic_18 + topic_1:topic_20 + 
    topic_2:topic_5 + topic_2:topic_16 + topic_2:topic_17 + topic_2:topic_19 + 
    topic_2:topic_22 + topic_2:topic_23 + topic_3:topic_7 + topic_3:topic_8 + 
    topic_3:topic_9 + topic_3:topic_11 + topic_3:topic_17 + topic_3:topic_18 + 
    topic_3:topic_22 + topic_4:topic_7 + topic_4:topic_10 + topic_4:topic_11 + 
    topic_4:topic_12 + topic_4:topic_14 + topic_4:topic_15 + 
    topic_4:topic_16 + topic_4:topic_18 + topic_4:topic_22 + 
    topic_4:topic_23 + topic_5:topic_6 + topic_5:topic_8 + topic_5:topic_11 + 
    topic_5:topic_14 + topic_5:topic_16 + topic_5:topic_17 + 
    topic_5:topic_18 + topic_6:topic_8 + topic_6:topic_10 + topic_6:topic_11 + 
    topic_6:topic_12 + topic_6:topic_14 + topic_6:topic_16 + 
    topic_6:topic_17 + topic_6:topic_18 + topic_6:topic_21 + 
    topic_6:topic_22 + topic_7:topic_9 + topic_7:topic_10 + topic_7:topic_11 + 
    topic_7:topic_17 + topic_7:topic_19 + topic_7:topic_20 + 
    topic_7:topic_22 + topic_8:topic_9 + topic_8:topic_10 + topic_8:topic_14 + 
    topic_8:topic_15 + topic_8:topic_17 + topic_8:topic_19 + 
    topic_8:topic_21 + topic_8:topic_22 + topic_9:topic_13 + 
    topic_9:topic_16 + topic_9:topic_18 + topic_9:topic_19 + 
    topic_9:topic_21 + topic_9:topic_22 + topic_9:topic_23 + 
    topic_10:topic_12 + topic_10:topic_15 + topic_10:topic_17 + 
    topic_10:topic_18 + topic_10:topic_19 + topic_10:topic_21 + 
    topic_10:topic_22 + topic_11:topic_13 + topic_11:topic_14 + 
    topic_11:topic_16 + topic_11:topic_17 + topic_11:topic_21 + 
    topic_11:topic_23 + topic_12:topic_13 + topic_12:topic_15 + 
    topic_12:topic_19 + topic_12:topic_22 + topic_12:topic_23 + 
    topic_13:topic_16 + topic_13:topic_17 + topic_13:topic_18 + 
    topic_13:topic_19 + topic_13:topic_20 + topic_14:topic_16 + 
    topic_14:topic_17 + topic_14:topic_20 + topic_14:topic_23 + 
    topic_15:topic_16 + topic_15:topic_17 + topic_15:topic_18 + 
    topic_15:topic_20 + topic_15:topic_21 + topic_16:topic_17 + 
    topic_16:topic_22 + topic_16:topic_23 + topic_17:topic_19 + 
    topic_17:topic_20 + topic_17:topic_21 + topic_17:topic_23 + 
    topic_18:topic_19 + topic_18:topic_20 + topic_18:topic_22 + 
    topic_18:topic_23 + topic_19:topic_23 + topic_20:topic_21 + 
    topic_20:topic_23 + topic_21:topic_22 + topic_22:topic_23
"""
# Build the response vector and design matrix from the formula, then fit OLS.
# y and X are reused by the Gamma GLM cell further down.
y, X = dmatrices(formula,
                 data=all_data, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

In [None]:
# Histogram of the OLS residuals for the issue-volume model.
fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(res.resid, kde=False, color='blue', bins=50, ax=ax)
ax.set_title('OLS Residuals', fontsize=18)
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Count', fontsize=16)

In [None]:
# OLS residuals (x) against the observed issue volume (y); axis limits are
# left to matplotlib.
fig, ax = plt.subplots(figsize=(12, 5))
sns.scatterplot(x=res.resid, y=all_data['issues_over_time'], color='blue', ax=ax)
ax.set_title('Residuals vs Actuals', fontsize=18)
ax.set_xlabel('Residuals', fontsize=16)
ax.set_ylabel('Issues over Time', fontsize=16)

In [None]:
# Refit the same y / X as a Gamma GLM with a log link.
gamma_log_family = sm.families.Gamma(link=sm.families.links.log)
mod = sm.GLM(y, X, family=gamma_log_family)
res = mod.fit()

# Share of the null deviance explained by the model, then the parameter count.
deviance_explained = 1 - (res.deviance/res.null_deviance)
print(deviance_explained)
print(len(res.params))
print(res.summary())

In [None]:
# Average marginal effect of avg_clustering under the currently fitted `res`.
glm_marginal_effect('avg_clustering', res, X, all_data)

In [None]:
# Average marginal effect of avg_min_path under the currently fitted `res`.
glm_marginal_effect('avg_min_path', res, X, all_data)

In [None]:
# Average marginal effect of gini_coefficient under the currently fitted `res`.
glm_marginal_effect('gini_coefficient', res, X, all_data)

In [None]:
# Average marginal effect of crowd_pct on issue volume at the observed data.
avg_clustering = all_data['avg_clustering']
avg_min_path = all_data['avg_min_path']
# gini_coefficient must be re-read here: an earlier cell uses the same name as
# a loop variable and leaves a scalar behind, which would otherwise be picked
# up silently in the expressions below.
gini_coefficient = all_data['gini_coefficient']
crowd_pct = all_data['crowd_pct']

effects_data = X.copy(deep=True)

effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient

effects_data['crowd_pct_2'] = crowd_pct**2

effects_data['predictions'] = res.predict(effects_data)
# This model has no crowd_pct main effect, so the derivative of the linear
# predictor consists of the interaction terms plus the quadratic term.
effects_data['crowd_pct_param'] = (avg_clustering * res.params['avg_clusteringXcrowd_pct']
                                    + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                                    + avg_min_path * res.params['avg_min_pathXcrowd_pct'])
effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                              + effects_data['crowd_pct_param'])

avg_effect = effects_data['total_effect'].mean()
print(avg_effect)

In [None]:
# Average marginal effect of crowd_pct on issue volume, evaluated at
# crowd_pct = 0.00, 0.01, ..., 0.99 with the network measures at their
# observed values.
plt.figure(figsize=(12, 5))

marginal_effects = {'crowd_pct': [], 'effect': []}
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']
avg_clustering = all_data['avg_clustering']

# Derivative of the linear predictor w.r.t. crowd_pct, excluding the quadratic
# term (this model has no crowd_pct main effect). It does not depend on the
# swept value, so it is computed once.
crowd_pct_param = (avg_clustering * res.params['avg_clusteringXcrowd_pct']
                   + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                   + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

for i in range(100):
    effects_data = X.copy(deep=True)
    crowd_pct = i/100
    effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
    effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
    effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
    effects_data['crowd_pct_2'] = crowd_pct**2

    # Effect on the mean = prediction * derivative of the linear predictor.
    effects_data['predictions'] = res.predict(effects_data)
    effects_data['crowd_pct_param'] = crowd_pct_param
    effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                  + effects_data['crowd_pct_param'])

    avg_effect = effects_data['total_effect'].mean()
    marginal_effects['crowd_pct'].append(crowd_pct)
    marginal_effects['effect'].append(avg_effect)

marginal_effects = pd.DataFrame(marginal_effects)
sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'])

# No legend: the single line carries no label, so plt.legend() would have
# nothing to show.
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Issue Volume', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on issue volume, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of average clustering.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']

for avg_clustering in [0.4, 0.5, 0.6, 0.7, 0.8]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Clustering: {} '.format(avg_clustering)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term (no crowd_pct main effect in this model); constant across
    # the inner sweep.
    crowd_pct_param = (avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'avg_clustering' main-effect column copied from X keeps its observed
        # values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Issue Volume', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on issue volume, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of average minimum path.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_clustering = all_data['avg_clustering']
gini_coefficient = all_data['gini_coefficient']

for avg_min_path in [2, 2.25, 2.5, 2.75, 3]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Min Path: {} '.format(avg_min_path)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term (no crowd_pct main effect in this model); constant across
    # the inner sweep.
    crowd_pct_param = (avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'avg_min_path' main-effect column copied from X keeps its observed
        # values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Issue Volume', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# Average marginal effect of crowd_pct on issue volume, swept over
# crowd_pct = 0.00 .. 0.99, one line per fixed level of the Gini coefficient.
plt.figure(figsize=(12, 5))

# The other network measures stay at their observed per-row values.
avg_clustering = all_data['avg_clustering']
avg_min_path = all_data['avg_min_path']

for gini_coefficient in [0.4, 0.5, 0.6, 0.7]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Gini Coefficient: {} '.format(gini_coefficient)

    # Derivative of the linear predictor w.r.t. crowd_pct, excluding the
    # quadratic term (no crowd_pct main effect in this model); constant across
    # the inner sweep.
    crowd_pct_param = (avg_clustering * res.params['avg_clusteringXcrowd_pct']
                       + gini_coefficient * res.params['gini_coefficientXcrowd_pct']
                       + avg_min_path * res.params['avg_min_pathXcrowd_pct'])

    for i in range(100):
        # NOTE(review): only the interaction column uses the fixed level; the
        # 'gini_coefficient' main-effect column copied from X keeps its
        # observed values in the prediction -- confirm this is intended.
        effects_data = X.copy(deep=True)
        crowd_pct = i/100
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['avg_min_pathXcrowd_pct'] = crowd_pct*avg_min_path
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient
        effects_data['crowd_pct_2'] = crowd_pct**2

        effects_data['predictions'] = res.predict(effects_data)
        effects_data['crowd_pct_param'] = crowd_pct_param
        effects_data['total_effect'] = effects_data['predictions'] * (2 * res.params['crowd_pct_2'] * crowd_pct
                                                                      + effects_data['crowd_pct_param'])

        avg_effect = effects_data['total_effect'].mean()
        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(avg_effect)

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on Issue Volume', fontsize=16)
# The x axis is the swept crowd percentage, not the effect itself.
plt.xlabel('Crowd Pct', fontsize=16)

### Percentage of Requirements Closed Out in Under 30 Days

In [None]:
# Histogram of the share of issues counted in under_30, on a fixed [0, 1] x axis.
fig, ax = plt.subplots(figsize=(12, 5))
sns.distplot(all_data['pct_under_30'], kde=False, color='blue', bins=50, ax=ax)
ax.set_title('Close Out Time for Requirements', fontsize=18)
ax.set_xlabel('Percentage of Requirements Closed in 30 Days', fontsize=16)
ax.set_ylabel('Count', fontsize=16)
ax.set_xlim([0,1])

In [None]:
# Summary statistics of the response variable for this section.
all_data['pct_under_30'].describe()

In [None]:
# Baseline OLS: pct_under_30 regressed on the shared base feature set.
base_formula = 'pct_under_30 ~ ' + base_features
y, X = dmatrices(base_formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# In-sample fitted values.
yhat = res.predict(X)
fit_report = res.summary()
print(fit_report)

In [None]:
# Formula for the pct_under_30 model, handed to dmatrices below.
# Right-hand side: crowd/network covariates (including the pre-computed
# avg_min_pathXcrowd_pct product column), project_age, topic_0..topic_23 main
# effects, and a selected subset of pairwise topic interactions.
formula = """
 pct_under_30 ~ crowd_pct_2 + avg_clustering + gini_coefficient + 
    avg_min_pathXcrowd_pct + project_age + topic_0 + topic_1 + 
    topic_2 + topic_3 + topic_4 + topic_5 + topic_6 + topic_7 + 
    topic_8 + topic_9 + topic_10 + topic_11 + topic_12 + topic_13 + 
    topic_14 + topic_15 + topic_16 + topic_17 + topic_18 + topic_19 + 
    topic_20 + topic_21 + topic_22 + topic_23 + topic_0:topic_6 + 
    topic_0:topic_7 + topic_0:topic_8 + topic_0:topic_18 + topic_0:topic_20 + 
    topic_0:topic_22 + topic_0:topic_23 + topic_1:topic_10 + 
    topic_1:topic_11 + topic_1:topic_13 + topic_1:topic_14 + 
    topic_1:topic_17 + topic_1:topic_20 + topic_1:topic_22 + 
    topic_2:topic_4 + topic_2:topic_10 + topic_2:topic_16 + topic_2:topic_17 + 
    topic_2:topic_18 + topic_2:topic_22 + topic_3:topic_4 + topic_3:topic_9 + 
    topic_3:topic_10 + topic_3:topic_13 + topic_3:topic_15 + 
    topic_3:topic_16 + topic_3:topic_21 + topic_3:topic_22 + 
    topic_4:topic_6 + topic_4:topic_9 + topic_4:topic_10 + topic_4:topic_12 + 
    topic_4:topic_14 + topic_4:topic_19 + topic_4:topic_23 + 
    topic_5:topic_6 + topic_5:topic_9 + topic_5:topic_15 + topic_5:topic_18 + 
    topic_5:topic_19 + topic_5:topic_23 + topic_6:topic_10 + 
    topic_6:topic_17 + topic_6:topic_22 + topic_7:topic_8 + topic_7:topic_9 + 
    topic_7:topic_10 + topic_7:topic_12 + topic_7:topic_14 + 
    topic_7:topic_15 + topic_7:topic_16 + topic_7:topic_20 + 
    topic_8:topic_9 + topic_8:topic_12 + topic_8:topic_14 + topic_8:topic_17 + 
    topic_8:topic_18 + topic_9:topic_13 + topic_9:topic_15 + 
    topic_9:topic_17 + topic_9:topic_19 + topic_9:topic_22 + 
    topic_9:topic_23 + topic_10:topic_15 + topic_10:topic_16 + 
    topic_10:topic_18 + topic_10:topic_19 + topic_10:topic_21 + 
    topic_11:topic_15 + topic_11:topic_17 + topic_12:topic_18 + 
    topic_12:topic_19 + topic_12:topic_20 + topic_13:topic_14 + 
    topic_13:topic_16 + topic_13:topic_19 + topic_13:topic_22 + 
    topic_13:topic_23 + topic_14:topic_18 + topic_14:topic_19 + 
    topic_14:topic_22 + topic_15:topic_20 + topic_15:topic_21 + 
    topic_15:topic_23 + topic_16:topic_18 + topic_16:topic_23 + 
    topic_17:topic_19 + topic_17:topic_23 + topic_18:topic_21 + 
    topic_18:topic_22 + topic_18:topic_23 + topic_19:topic_20 + 
    topic_20:topic_22 + topic_21:topic_23 + topic_2:topic_19 + 
    topic_0:topic_10 + topic_1:topic_3
"""
# Build the design matrices for `formula` and fit them by ordinary least squares.
y, X = dmatrices(formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
fit_report = res.summary()
print(fit_report)

In [None]:
# Distribution of the residuals of the fit currently held in `res`.
plt.figure(figsize=(12, 5))
residuals = res.resid
sns.distplot(residuals, bins=50, color='blue', kde=False)
plt.xlabel('Residuals', fontsize=16)
plt.title('OLS Residuals', fontsize=18)
plt.ylabel('Count', fontsize=16)

In [None]:
# Residuals of the pct_under_30 fit against the observed response.
plt.figure(figsize=(12, 5))
# Use the response column returned by dmatrices: it is the variable this model
# was fitted on and shares its index with res.resid.  The original plotted
# issues_over_time here, a leftover from a different model.
sns.scatterplot(x=res.resid, y=y['pct_under_30'], color='blue')
plt.title('Residuals vs Actuals', fontsize=18)
plt.xlabel('Residuals', fontsize=16)
plt.ylabel('Pct Closed in 30 Days', fontsize=16)
# The response is a proportion, so the axis runs over [0, 1].
plt.ylim([0, 1])
plt.xlim([-0.2, 0.2])

In [None]:
# Hard-coded coefficients keyed by design-matrix term name.
# NOTE(review): presumably copied from a model fitted outside this notebook
# (they are exponentiated in the cells below) -- confirm the source.
params = dict(
    crowd_pct_2=1.45256931291647,
    avg_min_pathXcrowd_pct=-1.0979488383932,
    avg_clustering=-1.94602300853525,
    gini_coefficient=1.99828032440915,
)

In [None]:
# Print every hard-coded coefficient on the exponentiated scale.
for term, coef in params.items():
    print('{}: {}'.format(term, np.exp(coef)))

In [None]:
# Exponentiated mean of crowd_pct scaled by the interaction coefficient.
scaled_crowd_pct = all_data['crowd_pct'] * params['avg_min_pathXcrowd_pct']
np.exp(np.mean(scaled_crowd_pct))

In [None]:
# Average, over the modelled rows, of exp(d(linear predictor)/d(crowd_pct))
# evaluated at each row's observed crowd_pct and avg_min_path.
#
# Both coefficients come from the hard-coded `params`.  The original took
# crowd_pct_2 from `params` but avg_min_pathXcrowd_pct from the OLS fit
# (res.params), mixing two different models inside one exponent.
crowd_sq_coef = params['crowd_pct_2']
path_crowd_coef = params['avg_min_pathXcrowd_pct']

# Restrict to the rows present in the design matrix X so the average covers
# the same observations as before (dmatrices may have dropped incomplete rows).
avg_min_path = all_data.loc[X.index, 'avg_min_path']
crowd_pct = all_data.loc[X.index, 'crowd_pct']

# d(linear predictor)/d(crowd_pct) = 2*b_sq*crowd_pct + b_int*avg_min_path
total_effect = np.exp(2 * crowd_sq_coef * crowd_pct
                      + path_crowd_coef * avg_min_path)

# Nothing is plotted here, so no figure is opened (the original left an empty one).
avg_effect = total_effect.mean()
print(avg_effect)

In [None]:
# Curve of the averaged effect expression over a crowd_pct grid of 0.00..0.99,
# using each row's observed avg_min_path.
plt.figure(figsize=(12, 5))

# Both coefficients come from the hard-coded `params`; the original mixed in
# the OLS estimate res.params['avg_min_pathXcrowd_pct'] for the interaction.
crowd_sq_coef = params['crowd_pct_2']
path_crowd_coef = params['avg_min_pathXcrowd_pct']

# Only rows present in the design matrix X take part in the average.
avg_min_path = all_data.loc[X.index, 'avg_min_path']
# This term does not depend on the grid value, so compute it once.
path_term = path_crowd_coef * avg_min_path

marginal_effects = {'crowd_pct': [], 'effect': []}
for i in range(100):
    crowd_pct = i/100
    total_effect = np.exp(2 * crowd_sq_coef * crowd_pct + path_term)
    marginal_effects['crowd_pct'].append(crowd_pct)
    marginal_effects['effect'].append(total_effect.mean())

marginal_effects = pd.DataFrame(marginal_effects)
sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'])

# No plt.legend(): the single curve has no label, so the call only produced a
# "no labelled artists" warning.
plt.title('Marginal Effect', fontsize=18)
# Labels corrected: this section models pct_under_30 and the x-axis is crowd_pct.
plt.ylabel('Effect on Pct Closed in 30 Days', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# One effect curve per fixed avg_min_path value, over crowd_pct = 0.00..0.99.
plt.figure(figsize=(12, 5))

# Both coefficients come from the hard-coded `params`; the original mixed in
# the OLS estimate res.params['avg_min_pathXcrowd_pct'] for the interaction.
crowd_sq_coef = params['crowd_pct_2']
path_crowd_coef = params['avg_min_pathXcrowd_pct']
crowd_grid = [i/100 for i in range(100)]

for path_length in [2, 2.25, 2.5, 2.75, 3]:
    label = 'Avg Min Path: {} '.format(path_length)
    # With avg_min_path fixed the expression is identical for every row, so
    # its mean is the value itself; no copy of the design matrix is needed.
    effects = [np.exp(2 * crowd_sq_coef * crowd_pct + path_crowd_coef * path_length)
               for crowd_pct in crowd_grid]
    marginal_effects = pd.DataFrame({'crowd_pct': crowd_grid, 'effect': effects})
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
# Labels corrected: this section models pct_under_30 and the x-axis is crowd_pct.
plt.ylabel('Effect on Pct Closed in 30 Days', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

### Avg First Comment Time

In [None]:
# Summary statistics for the avg_first_comment response.
all_data.loc[:, 'avg_first_comment'].describe()

In [None]:
# Histogram of avg_first_comment across the rows of all_data.
plt.figure(figsize=(12, 5))
first_response = all_data['avg_first_comment']
sns.distplot(first_response, bins=100, color='blue', kde=False)
plt.ylabel('Count', fontsize=16)
plt.xlabel('Avg. Initial Response (Days)', fontsize=16)
plt.title('Initial Response to Issues', fontsize=18)
# Show only the 0-150 range of the x-axis.
plt.xlim([0, 150])

In [None]:
# Baseline OLS: avg_first_comment regressed on the shared base feature set.
base_formula = 'avg_first_comment ~ ' + base_features
y, X = dmatrices(base_formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# In-sample fitted values.
yhat = res.predict(X)
fit_report = res.summary()
print(fit_report)

In [None]:
# Formula for the avg_first_comment model, handed to dmatrices below.
# Right-hand side: crowd/network covariates (crowd_pct, its square, and the
# pre-computed avg_clusteringXcrowd_pct product column), total_contributors,
# project_age, topic_0..topic_23 main effects, and a selected subset of
# pairwise topic interactions.
formula = """
avg_first_comment ~ crowd_pct + crowd_pct_2 + avg_clustering + 
    avg_min_path + gini_coefficient + avg_clusteringXcrowd_pct + 
    total_contributors + project_age + topic_0 + topic_1 + topic_2 + 
    topic_3 + topic_4 + topic_5 + topic_6 + topic_7 + topic_8 + 
    topic_9 + topic_10 + topic_11 + topic_12 + topic_13 + topic_14 + 
    topic_15 + topic_16 + topic_17 + topic_18 + topic_19 + topic_20 + 
    topic_21 + topic_22 + topic_23 + topic_0:topic_5 + topic_0:topic_11 + 
    topic_0:topic_12 + topic_0:topic_13 + topic_0:topic_15 + 
    topic_0:topic_21 + topic_1:topic_3 + topic_1:topic_4 + topic_1:topic_7 + 
    topic_1:topic_8 + topic_1:topic_11 + topic_1:topic_12 + topic_1:topic_17 + 
    topic_1:topic_19 + topic_1:topic_21 + topic_1:topic_22 + 
    topic_2:topic_6 + topic_2:topic_7 + topic_2:topic_12 + topic_2:topic_13 + 
    topic_2:topic_14 + topic_2:topic_15 + topic_2:topic_18 + 
    topic_2:topic_20 + topic_2:topic_22 + topic_3:topic_6 + topic_3:topic_11 + 
    topic_3:topic_13 + topic_3:topic_14 + topic_3:topic_15 + 
    topic_3:topic_16 + topic_3:topic_19 + topic_3:topic_21 + 
    topic_3:topic_22 + topic_4:topic_5 + topic_4:topic_7 + topic_4:topic_8 + 
    topic_4:topic_9 + topic_4:topic_12 + topic_4:topic_14 + topic_4:topic_18 + 
    topic_4:topic_22 + topic_5:topic_11 + topic_5:topic_12 + 
    topic_5:topic_22 + topic_6:topic_7 + topic_6:topic_9 + topic_6:topic_13 + 
    topic_6:topic_17 + topic_6:topic_18 + topic_6:topic_19 + 
    topic_6:topic_21 + topic_6:topic_22 + topic_7:topic_9 + topic_7:topic_12 + 
    topic_7:topic_22 + topic_8:topic_11 + topic_8:topic_12 + 
    topic_8:topic_13 + topic_8:topic_14 + topic_8:topic_15 + 
    topic_8:topic_18 + topic_8:topic_21 + topic_9:topic_12 + 
    topic_9:topic_14 + topic_9:topic_18 + topic_9:topic_20 + 
    topic_9:topic_21 + topic_9:topic_22 + topic_10:topic_12 + 
    topic_10:topic_20 + topic_10:topic_21 + topic_11:topic_14 + 
    topic_11:topic_15 + topic_11:topic_17 + topic_11:topic_19 + 
    topic_11:topic_20 + topic_11:topic_22 + topic_12:topic_14 + 
    topic_12:topic_17 + topic_12:topic_19 + topic_12:topic_20 + 
    topic_12:topic_21 + topic_13:topic_15 + topic_13:topic_18 + 
    topic_13:topic_19 + topic_14:topic_15 + topic_14:topic_16 + 
    topic_14:topic_18 + topic_14:topic_20 + topic_14:topic_21 + 
    topic_15:topic_16 + topic_15:topic_17 + topic_15:topic_20 + 
    topic_16:topic_19 + topic_17:topic_18 + topic_17:topic_22 + 
    topic_18:topic_22 + topic_18:topic_23 + topic_19:topic_20 + 
    topic_21:topic_23 + topic_0:topic_1 + topic_1:topic_10 + 
    topic_9:topic_19
"""
# Build the design matrices for `formula` and fit them by ordinary least squares.
y, X = dmatrices(formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# Number of estimated coefficients, followed by the full fit report.
n_params = len(res.params)
print(n_params)
print(res.summary())

In [None]:
# Keep the current model's fitted values next to the raw data.
fitted_first_comment = res.predict(X)
all_data['p_avg_first_comment'] = fitted_first_comment

In [None]:
# Distribution of the residuals of the fit currently held in `res`.
plt.figure(figsize=(12, 5))
residuals = res.resid
sns.distplot(residuals, bins=50, color='blue', kde=False)
plt.xlabel('Residuals', fontsize=16)
plt.title('OLS Residuals', fontsize=18)
plt.ylabel('Count', fontsize=16)

In [None]:
# Residuals of the avg_first_comment fit against the observed response.
plt.figure(figsize=(12, 5))
# Use the response column returned by dmatrices: it is the variable this model
# was fitted on and shares its index with res.resid.  The original plotted
# issues_over_time here, a leftover from a different model.
sns.scatterplot(x=res.resid, y=y['avg_first_comment'], color='blue')
plt.title('Residuals vs Actuals', fontsize=18)
plt.xlabel('Residuals', fontsize=16)
plt.ylabel('Avg. Initial Response (Days)', fontsize=16)

In [None]:
# Fit a Gamma GLM with a log link on the current y / X.
# NOTE(review): newer statsmodels releases appear to expect a link *instance*
# rather than the link class -- confirm against the pinned version.
gamma_family = sm.families.Gamma(link=sm.families.links.log)
mod = sm.GLM(y, X, family=gamma_family)
res = mod.fit()
# Share of the null deviance removed by the model.
deviance_explained = 1 - (res.deviance/res.null_deviance)
print(deviance_explained)
print(len(res.params))
print(res.summary())

In [None]:
# Marginal effect of avg_clustering for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_clustering', res, X, all_data)

In [None]:
# Marginal effect of avg_min_path for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_min_path', res, X, all_data)

In [None]:
# Marginal effect of gini_coefficient for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('gini_coefficient', res, X, all_data)

In [None]:
# Average marginal effect of crowd_pct at the observed covariate values.
marginal_effects = {'crowd_pct': [], 'effect': []}
crowd_pct = all_data['crowd_pct']
avg_clustering = all_data['avg_clustering']

# Work on a copy of the design matrix so X itself is left untouched.
effects_data = X.copy(deep=True)
effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
effects_data['crowd_pct'] = crowd_pct
effects_data['crowd_pct_2'] = crowd_pct**2

coefs = res.params
effects_data['predictions'] = res.predict(effects_data)
effects_data['crowd_pct_2_effect'] = effects_data['predictions'] * coefs['crowd_pct_2']
# Coefficient-weighted terms that are linear in crowd_pct.
effects_data['crowd_pct_param'] = (coefs['crowd_pct']
                                   + avg_clustering * coefs['avg_clusteringXcrowd_pct'])
# Prediction times the derivative of the linear predictor w.r.t. crowd_pct.
linear_slope = 2 * coefs['crowd_pct_2'] * crowd_pct + effects_data['crowd_pct_param']
effects_data['total_effect'] = effects_data['predictions'] * linear_slope

avg_effect = effects_data['total_effect'].mean()
print(avg_effect)

In [None]:
# Average marginal effect of crowd_pct on the prediction, traced over a
# crowd_pct grid of 0.00..0.99 with avg_clustering at its observed values.
plt.figure(figsize=(12, 5))

marginal_effects = {'crowd_pct': [], 'effect': []}
avg_clustering = all_data['avg_clustering']
coefs = res.params
# Part of d(linear predictor)/d(crowd_pct) that does not depend on the grid
# value; hoisted out of the loop.
crowd_pct_param = coefs['crowd_pct'] + avg_clustering * coefs['avg_clusteringXcrowd_pct']

for i in range(100):
    crowd_pct = i/100
    # Counterfactual design matrix: every row is given the same crowd_pct.
    effects_data = X.copy(deep=True)
    effects_data['crowd_pct'] = crowd_pct
    effects_data['crowd_pct_2'] = crowd_pct**2
    effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering

    predictions = res.predict(effects_data)
    # Prediction times the derivative of the linear predictor w.r.t. crowd_pct.
    total_effect = predictions * (2 * coefs['crowd_pct_2'] * crowd_pct + crowd_pct_param)

    marginal_effects['crowd_pct'].append(crowd_pct)
    marginal_effects['effect'].append(total_effect.mean())

marginal_effects = pd.DataFrame(marginal_effects)
sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'])

# No plt.legend(): the single curve has no label, so the call only produced a
# "no labelled artists" warning.
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on First Comment', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# One marginal-effect curve per fixed avg_clustering value, over
# crowd_pct = 0.00..0.99.
plt.figure(figsize=(12, 5))

coefs = res.params
for clustering_level in [0.4, 0.5, 0.6, 0.7, 0.8]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Clustering: {} '.format(clustering_level)
    # Part of d(linear predictor)/d(crowd_pct) that is constant along the grid.
    crowd_pct_param = (coefs['crowd_pct']
                       + clustering_level * coefs['avg_clusteringXcrowd_pct'])

    effects_data = X.copy(deep=True)
    # Hold avg_clustering at the scenario value in its main-effect column too.
    # The original only overrode the interaction column, so predictions mixed
    # the scenario value with each row's observed clustering.
    effects_data['avg_clustering'] = clustering_level

    for i in range(100):
        crowd_pct = i/100
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*clustering_level

        # Predictions stay in a separate variable so effects_data keeps exactly
        # the model's columns between iterations.
        predictions = res.predict(effects_data)
        total_effect = predictions * (2 * coefs['crowd_pct_2'] * crowd_pct + crowd_pct_param)

        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(total_effect.mean())

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
plt.ylabel('Effect on First Comment', fontsize=16)
# Label corrected: the x-axis is the crowd_pct grid.
plt.xlabel('Crowd Pct', fontsize=16)

### Issues Per User

In [None]:
# Summary statistics for the issues_per_user response.
all_data.loc[:, 'issues_per_user'].describe()

In [None]:
# Histogram of issues_per_user across the rows of all_data.
plt.figure(figsize=(12, 5))
issues_per_user = all_data['issues_per_user']
sns.distplot(issues_per_user, bins=100, color='blue', kde=False)
plt.ylabel('Count', fontsize=16)
plt.xlabel('Issues Per User', fontsize=16)
plt.title('Issues Per User', fontsize=18)
# Show only the 0-40 range of the x-axis.
plt.xlim([0, 40])

In [None]:
# Baseline OLS: issues_per_user regressed on the shared base feature set.
base_formula = 'issues_per_user ~ ' + base_features
y, X = dmatrices(base_formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# In-sample fitted values.
yhat = res.predict(X)
fit_report = res.summary()
print(fit_report)

In [None]:
# Formula for the issues_per_user model, handed to dmatrices in the next cell.
# Right-hand side: crowd/network covariates (crowd_pct, its square, and the
# pre-computed avg_clusteringXcrowd_pct / gini_coefficientXcrowd_pct product
# columns), topic_0..topic_23 main effects, and a selected subset of pairwise
# topic interactions.
formula = """
issues_per_user ~ crowd_pct + crowd_pct_2 + avg_clustering + 
    avg_min_path + gini_coefficient + avg_clusteringXcrowd_pct + 
    gini_coefficientXcrowd_pct + topic_0 + topic_1 + topic_2 + 
    topic_3 + topic_4 + topic_5 + topic_6 + topic_7 + topic_8 + 
    topic_9 + topic_10 + topic_11 + topic_12 + topic_13 + topic_14 + 
    topic_15 + topic_16 + topic_17 + topic_18 + topic_19 + topic_20 + 
    topic_21 + topic_22 + topic_23 + topic_0:topic_2 + topic_0:topic_7 + 
    topic_0:topic_15 + topic_1:topic_4 + topic_1:topic_7 + topic_1:topic_13 + 
    topic_1:topic_14 + topic_1:topic_17 + topic_2:topic_3 + topic_2:topic_8 + 
    topic_2:topic_9 + topic_2:topic_12 + topic_3:topic_4 + topic_3:topic_7 + 
    topic_3:topic_14 + topic_3:topic_18 + topic_4:topic_5 + topic_4:topic_9 + 
    topic_4:topic_11 + topic_4:topic_13 + topic_4:topic_15 + 
    topic_4:topic_17 + topic_4:topic_18 + topic_4:topic_19 + 
    topic_5:topic_9 + topic_5:topic_10 + topic_5:topic_12 + topic_5:topic_17 + 
    topic_6:topic_12 + topic_7:topic_10 + topic_7:topic_17 + 
    topic_7:topic_20 + topic_7:topic_21 + topic_7:topic_23 + 
    topic_8:topic_10 + topic_8:topic_12 + topic_8:topic_17 + 
    topic_8:topic_18 + topic_8:topic_19 + topic_8:topic_20 + 
    topic_9:topic_14 + topic_9:topic_16 + topic_9:topic_18 + 
    topic_10:topic_11 + topic_10:topic_12 + topic_10:topic_15 + 
    topic_10:topic_18 + topic_10:topic_20 + topic_12:topic_20 + 
    topic_13:topic_14 + topic_13:topic_15 + topic_13:topic_17 + 
    topic_13:topic_18 + topic_14:topic_16 + topic_15:topic_17 + 
    topic_15:topic_21 + topic_16:topic_22 + topic_17:topic_20 + 
    topic_17:topic_21 + topic_18:topic_20 + topic_18:topic_23 + 
    topic_7:topic_19 + topic_8:topic_11 + topic_7:topic_22 + 
    topic_1:topic_6 + topic_11:topic_22 + topic_0:topic_22 + 
    topic_2:topic_20
"""

In [None]:
# Build the design matrices for `formula` and fit them by ordinary least squares.
y, X = dmatrices(formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# Number of estimated coefficients, followed by the full fit report.
n_params = len(res.params)
print(n_params)
print(res.summary())

In [None]:
# Distribution of the residuals of the fit currently held in `res`.
plt.figure(figsize=(12, 5))
residuals = res.resid
sns.distplot(residuals, bins=50, color='blue', kde=False)
plt.xlabel('Residuals', fontsize=16)
plt.title('OLS Residuals', fontsize=18)
plt.ylabel('Count', fontsize=16)

In [None]:
# Residuals of the issues_per_user fit against the observed response.
plt.figure(figsize=(12, 5))
# Use the response column returned by dmatrices: it is the variable this model
# was fitted on and shares its index with res.resid.  The original plotted
# issues_over_time here, a leftover from a different model.
sns.scatterplot(x=res.resid, y=y['issues_per_user'], color='blue')
plt.title('Residuals vs Actuals', fontsize=18)
plt.xlabel('Residuals', fontsize=16)
plt.ylabel('Issues Per User', fontsize=16)

In [None]:
# Fit a Gamma GLM with a log link on the current y / X.
# NOTE(review): newer statsmodels releases appear to expect a link *instance*
# rather than the link class -- confirm against the pinned version.
gamma_family = sm.families.Gamma(link=sm.families.links.log)
mod = sm.GLM(y, X, family=gamma_family)
res = mod.fit()
# Share of the null deviance removed by the model.
deviance_explained = 1 - (res.deviance/res.null_deviance)
print(deviance_explained)
print(len(res.params))
print(res.summary())

In [None]:
# Marginal effect of avg_clustering for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_clustering', res, X, all_data)

In [None]:
# Marginal effect of avg_min_path for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_min_path', res, X, all_data)

In [None]:
# Marginal effect of gini_coefficient for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('gini_coefficient', res, X, all_data)

In [None]:
# Average marginal effect of crowd_pct at the observed covariate values.
marginal_effects = {'crowd_pct': [], 'effect': []}
crowd_pct = all_data['crowd_pct']
gini_coefficient = all_data['gini_coefficient']
avg_clustering = all_data['avg_clustering']

# Work on a copy of the design matrix so X itself is left untouched.
effects_data = X.copy(deep=True)
effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
effects_data['crowd_pct'] = crowd_pct
effects_data['crowd_pct_2'] = crowd_pct**2

coefs = res.params
effects_data['predictions'] = res.predict(effects_data)
effects_data['crowd_pct_2_effect'] = effects_data['predictions'] * coefs['crowd_pct_2']
# Coefficient-weighted terms that are linear in crowd_pct.
effects_data['crowd_pct_param'] = (coefs['crowd_pct']
                                   + gini_coefficient * coefs['gini_coefficientXcrowd_pct']
                                   + avg_clustering * coefs['avg_clusteringXcrowd_pct'])
# Prediction times the derivative of the linear predictor w.r.t. crowd_pct.
linear_slope = 2 * coefs['crowd_pct_2'] * crowd_pct + effects_data['crowd_pct_param']
effects_data['total_effect'] = effects_data['predictions'] * linear_slope

avg_effect = effects_data['total_effect'].mean()
print(avg_effect)

In [None]:
# Average marginal effect of crowd_pct on the prediction, traced over a
# crowd_pct grid of 0.00..0.99 with the network covariates at observed values.
plt.figure(figsize=(12, 5))

marginal_effects = {'crowd_pct': [], 'effect': []}
gini_coefficient = all_data['gini_coefficient']
avg_clustering = all_data['avg_clustering']
coefs = res.params
# Part of d(linear predictor)/d(crowd_pct) that does not depend on the grid
# value; hoisted out of the loop.
crowd_pct_param = (coefs['crowd_pct']
                   + avg_clustering * coefs['avg_clusteringXcrowd_pct']
                   + gini_coefficient * coefs['gini_coefficientXcrowd_pct'])

for i in range(100):
    crowd_pct = i/100
    # Counterfactual design matrix: every row is given the same crowd_pct.
    effects_data = X.copy(deep=True)
    effects_data['crowd_pct'] = crowd_pct
    effects_data['crowd_pct_2'] = crowd_pct**2
    effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
    effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient

    predictions = res.predict(effects_data)
    # Prediction times the derivative of the linear predictor w.r.t. crowd_pct.
    total_effect = predictions * (2 * coefs['crowd_pct_2'] * crowd_pct + crowd_pct_param)

    marginal_effects['crowd_pct'].append(crowd_pct)
    marginal_effects['effect'].append(total_effect.mean())

marginal_effects = pd.DataFrame(marginal_effects)
sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'])

# No plt.legend(): the single curve has no label, so the call only produced a
# "no labelled artists" warning.
plt.title('Marginal Effect', fontsize=18)
# Label corrected: the model in this section predicts issues_per_user.
plt.ylabel('Effect on Issues Per User', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# One marginal-effect curve per fixed avg_clustering value, over
# crowd_pct = 0.00..0.99; gini_coefficient stays at its observed values.
plt.figure(figsize=(12, 5))

coefs = res.params
gini_coefficient = all_data['gini_coefficient']

for clustering_level in [0.4, 0.5, 0.6, 0.7, 0.8]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Avg Clustering: {} '.format(clustering_level)
    # Part of d(linear predictor)/d(crowd_pct) that is constant along the grid.
    crowd_pct_param = (coefs['crowd_pct']
                       + clustering_level * coefs['avg_clusteringXcrowd_pct']
                       + gini_coefficient * coefs['gini_coefficientXcrowd_pct'])

    effects_data = X.copy(deep=True)
    # Hold avg_clustering at the scenario value in its main-effect column too.
    # The original only overrode the interaction column, so predictions mixed
    # the scenario value with each row's observed clustering.
    effects_data['avg_clustering'] = clustering_level

    for i in range(100):
        crowd_pct = i/100
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*clustering_level
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_coefficient

        # Predictions stay in a separate variable so effects_data keeps exactly
        # the model's columns between iterations.
        predictions = res.predict(effects_data)
        total_effect = predictions * (2 * coefs['crowd_pct_2'] * crowd_pct + crowd_pct_param)

        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(total_effect.mean())

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
# Labels corrected: the model predicts issues_per_user and the x-axis is crowd_pct.
plt.ylabel('Effect on Issues Per User', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

In [None]:
# One marginal-effect curve per fixed gini_coefficient value, over
# crowd_pct = 0.00..0.99; avg_clustering stays at its observed values.
plt.figure(figsize=(12, 5))

coefs = res.params
avg_clustering = all_data['avg_clustering']

for gini_level in [0.4, 0.5, 0.6, 0.7]:
    marginal_effects = {'crowd_pct': [], 'effect': []}
    label = 'Gini Coefficient: {} '.format(gini_level)
    # Part of d(linear predictor)/d(crowd_pct) that is constant along the grid.
    crowd_pct_param = (coefs['crowd_pct']
                       + avg_clustering * coefs['avg_clusteringXcrowd_pct']
                       + gini_level * coefs['gini_coefficientXcrowd_pct'])

    effects_data = X.copy(deep=True)
    # Hold gini_coefficient at the scenario value in its main-effect column too.
    # The original only overrode the interaction column, so predictions mixed
    # the scenario value with each row's observed Gini coefficient.
    effects_data['gini_coefficient'] = gini_level

    for i in range(100):
        crowd_pct = i/100
        effects_data['crowd_pct'] = crowd_pct
        effects_data['crowd_pct_2'] = crowd_pct**2
        effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
        effects_data['gini_coefficientXcrowd_pct'] = crowd_pct*gini_level

        # Predictions stay in a separate variable so effects_data keeps exactly
        # the model's columns between iterations.
        predictions = res.predict(effects_data)
        total_effect = predictions * (2 * coefs['crowd_pct_2'] * crowd_pct + crowd_pct_param)

        marginal_effects['crowd_pct'].append(crowd_pct)
        marginal_effects['effect'].append(total_effect.mean())

    marginal_effects = pd.DataFrame(marginal_effects)
    sns.lineplot(y=marginal_effects['effect'], x=marginal_effects['crowd_pct'], label=label)

plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Marginal Effect', fontsize=18)
# Labels corrected: the model predicts issues_per_user and the x-axis is crowd_pct.
plt.ylabel('Effect on Issues Per User', fontsize=16)
plt.xlabel('Crowd Pct', fontsize=16)

### DEA Analysis

In [None]:
# Constant column of ones; it is the only DEA input selected below.
all_data['lhs'] = 1

In [None]:
# Replace each outcome by its percentile bucket (pd.qcut with 100 quantiles,
# integer labels) so all DEA outputs are on a common scale.
for outcome in ['pct_under_30', 'avg_comments', 'issues_over_time', 'issues_per_user']:
    all_data[outcome + '_pct'] = pd.qcut(all_data[outcome], 100, labels=False)

# First-response time is reversed (100 - bucket) so shorter delays score higher.
# Fix: rank avg_first_comment itself; the original ranked issues_over_time
# here, making this output a mirror image of issues_over_time_pct.
all_data['avg_first_comment_pct'] = 100 - pd.qcut(all_data['avg_first_comment'], 100, labels=False)

In [None]:
# DEA input: the single constant column.  DEA outputs: the five percentile scores.
input_columns = ['lhs']
output_columns = [
    'pct_under_30_pct',
    'avg_comments_pct',
    'issues_over_time_pct',
    'avg_first_comment_pct',
    'issues_per_user_pct',
]
inputs = all_data[input_columns]
outputs = all_data[output_columns]

In [None]:
# Set up the DEA problem on the selected inputs and outputs.
# NOTE(review): 'ccr' presumably selects the CCR formulation -- confirm
# against research_utils.analytics.dea.
dea = DEA(inputs=inputs, outputs=outputs, model='ccr')

In [None]:
# Run the DEA solver.
# NOTE(review): presumably this populates dea.eff_scores, read below -- confirm.
dea.solve()

In [None]:
# Display the efficiency scores held by the DEA object.
dea.eff_scores

In [None]:
# Attach the DEA efficiency scores to all_data as a new column.
all_data['eff_scores'] = dea.eff_scores

In [None]:
# Summary statistics for the DEA efficiency scores.
all_data.loc[:, 'eff_scores'].describe()

In [None]:
# Rows whose efficiency score is exactly 1, with their network/crowd profile,
# ordered by crowd_pct.
is_efficient = all_data['eff_scores'] == 1
profile_columns = ['package',
                   'organization',
                   'gini_coefficient',
                   'avg_clustering',
                   'avg_min_path',
                   'crowd_pct']
all_data.loc[is_efficient, profile_columns].sort_values('crowd_pct')

In [None]:
# Baseline OLS: eff_scores regressed on the shared base feature set.
base_formula = 'eff_scores ~ ' + base_features
y, X = dmatrices(base_formula, return_type='dataframe', data=all_data)
mod = sm.OLS(endog=y, exog=X)
res = mod.fit()
# In-sample fitted values.
yhat = res.predict(X)
fit_report = res.summary()
print(fit_report)

In [None]:
# Fit a Gamma GLM with a log link on the current y / X.
# NOTE(review): newer statsmodels releases appear to expect a link *instance*
# rather than the link class -- confirm against the pinned version.
gamma_family = sm.families.Gamma(link=sm.families.links.log)
mod = sm.GLM(y, X, family=gamma_family)
res = mod.fit()
# Share of the null deviance removed by the model.
deviance_explained = 1 - (res.deviance/res.null_deviance)
print(deviance_explained)
print(len(res.params))
print(res.summary())

In [None]:
# Marginal effect of avg_clustering for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_clustering', res, X, all_data)

In [None]:
# Marginal effect of avg_min_path for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('avg_min_path', res, X, all_data)

In [None]:
# Marginal effect of gini_coefficient for the fitted GLM.
# NOTE(review): glm_marginal_effect is defined earlier in the notebook; its
# exact output is not visible from this cell.
glm_marginal_effect('gini_coefficient', res, X, all_data)

In [None]:
# Average marginal effect of crowd_pct at the observed covariate values.
marginal_effects = {'crowd_pct': [], 'effect': []}
crowd_pct = all_data['crowd_pct']
avg_min_path = all_data['avg_min_path']
gini_coefficient = all_data['gini_coefficient']
avg_clustering = all_data['avg_clustering']

# Work on a copy of the design matrix so X itself is left untouched.
effects_data = X.copy(deep=True)
effects_data['avg_clusteringXcrowd_pct'] = crowd_pct*avg_clustering
effects_data['crowd_pct'] = crowd_pct
effects_data['crowd_pct_2'] = crowd_pct**2

coefs = res.params
effects_data['predictions'] = res.predict(effects_data)
effects_data['crowd_pct_2_effect'] = effects_data['predictions'] * coefs['crowd_pct_2']
# Coefficient-weighted terms that are linear in crowd_pct.
effects_data['crowd_pct_param'] = (coefs['crowd_pct']
                                   + gini_coefficient * coefs['gini_coefficientXcrowd_pct']
                                   + avg_min_path * coefs['avg_min_pathXcrowd_pct']
                                   + avg_clustering * coefs['avg_clusteringXcrowd_pct'])
# Prediction times the derivative of the linear predictor w.r.t. crowd_pct.
linear_slope = 2 * coefs['crowd_pct_2'] * crowd_pct + effects_data['crowd_pct_param']
effects_data['total_effect'] = effects_data['predictions'] * linear_slope

avg_effect = effects_data['total_effect'].mean()
print(avg_effect)