# DiD New Metrics

In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import os

# Directory containing the aligned files
# aligned_dir = "aligned_did_outputs"
# metrics = ['stars', 'forks', 'commits', 'prs', 've_contributors']
# metric_column_map = {
#     "stars": "cumulative_stars",
#     "forks": "cumulative_forks",
#     "commits": "cumulative_commits",
#     "prs": "cumulative_prs",
#     "ve_contributors": "cumulative_ve_contributors"
# }

aligned_dir = "aligned_did_outputs_new"
metrics = ['new_stars', 'new_forks', 'new_commits', 'new_prs', 'active_contributors']
metric_column_map = {
    "new_stars": "new_stars",
    "new_forks": "new_forks",
    "new_commits": "new_commits",
    "new_prs": "new_prs",
    "active_contributors": "active_contributors"
}

results = {}

for metric in metrics:
    file_path = os.path.join(aligned_dir, f"aligned_did_{metric}.csv")
    df = pd.read_csv(file_path)

    if df.empty:
        print(f"[!] Skipping empty file: {file_path}")
        continue

    # Prep binary indicators
    df['treatment'] = (df['repo_type'] == 'treatment').astype(int)
    df['post'] = df['post_treatment'].astype(int)

    # DiD interaction term is treatment * post
    df['did'] = df['treatment'] * df['post']

    # Choose outcome metric
    outcome = metric_column_map[metric]

    # Run DiD regression
    model = smf.ols(formula=f"{outcome} ~ treatment + post + did", data=df).fit(cov_type='HC3')

    print(f"\n===== DiD Result: {outcome} =====")
    print(model.summary())
    results[metric] = model


===== DiD Result: new_stars =====
                            OLS Regression Results                            
Dep. Variable:              new_stars   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     9.800
Date:                Fri, 11 Apr 2025   Prob (F-statistic):           2.08e-06
Time:                        02:32:38   Log-Likelihood:                -10131.
No. Observations:                1640   AIC:                         2.027e+04
Df Residuals:                    1636   BIC:                         2.029e+04
Df Model:                           3                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.

# DiD cumulative metrics

In [5]:
import pandas as pd
import statsmodels.formula.api as smf
import os

# Directory containing the aligned files
aligned_dir = "aligned_did_outputs"
metrics = ['stars', 'forks', 'commits', 'prs', 've_contributors']
metric_column_map = {
    "stars": "cumulative_stars",
    "forks": "cumulative_forks",
    "commits": "cumulative_commits",
    "prs": "cumulative_prs",
    "ve_contributors": "cumulative_ve_contributors"
}

results = {}

for metric in metrics:
    file_path = os.path.join(aligned_dir, f"aligned_did_{metric}.csv")
    df = pd.read_csv(file_path)

    if df.empty:
        print(f"[!] Skipping empty file: {file_path}")
        continue

    # Prep binary indicators
    df['treatment'] = (df['repo_type'] == 'treatment').astype(int)
    df['post'] = df['post_treatment'].astype(int)

    # DiD interaction term is treatment * post
    df['did'] = df['treatment'] * df['post']

    # Choose outcome metric
    outcome = metric_column_map[metric]

    # Run DiD regression
    model = smf.ols(formula=f"{outcome} ~ treatment + post + did", data=df).fit(cov_type='HC3')

    print(f"\n===== DiD Result: {outcome} =====")
    print(model.summary())
    results[metric] = model



===== DiD Result: cumulative_stars =====
                            OLS Regression Results                            
Dep. Variable:       cumulative_stars   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     22.32
Date:                Fri, 11 Apr 2025   Prob (F-statistic):           3.45e-14
Time:                        02:51:33   Log-Likelihood:                -13940.
No. Observations:                1816   AIC:                         2.789e+04
Df Residuals:                    1812   BIC:                         2.791e+04
Df Model:                           3                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept 

# further analysis test

lagstars

In [None]:
# for lagged value you need to sort first
df = df.sort_values(by=['repo_full_name', 'month'])
df['lag_stars'] = df.groupby('repo_full_name')['cumulative_stars'].shift(1)

model = smf.ols("cumulative_stars ~ lag_stars + treatment + post_treatment + did", data=df)\
           .fit(cov_type='HC3')
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:       cumulative_stars   R-squared:                       0.980
Model:                            OLS   Adj. R-squared:                  0.980
Method:                 Least Squares   F-statistic:                     3747.
Date:                Fri, 11 Apr 2025   Prob (F-statistic):               0.00
Time:                        02:57:02   Log-Likelihood:                -22055.
No. Observations:                2993   AIC:                         4.412e+04
Df Residuals:                    2988   BIC:                         4.415e+04
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -2.1028      0.533     -3.

In [10]:
from pandas.tseries.offsets import DateOffset

# Convert to datetime just in case
df['hn_submission_date'] = pd.to_datetime(df['hn_submission_date'])
df['month'] = pd.to_datetime(df['month'])

# Subtract 3 months row-by-row to make placebo HN date
df['placebo_date'] = df['hn_submission_date'].apply(lambda d: d - DateOffset(months=3))

# Define placebo post-treatment and interaction term
df['post_placebo'] = (df['month'] >= df['placebo_date']).astype(int)
df['placebo_did'] = df['treatment'] * df['post_placebo']

# Run placebo DiD regression
model = smf.ols("cumulative_stars ~ treatment + post_placebo + placebo_did", data=df.dropna())\
           .fit(cov_type='HC3')
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:       cumulative_stars   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                 1.326e+16
Date:                Fri, 11 Apr 2025   Prob (F-statistic):               0.00
Time:                        03:05:22   Log-Likelihood:                -13966.
No. Observations:                1451   AIC:                         2.794e+04
Df Residuals:                    1448   BIC:                         2.795e+04
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept    -5.715e+14   4.96e+06  -1.15e+08   



In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import os
from pandas.tseries.offsets import DateOffset

# ===== Configuration =====
aligned_dirs = {
    "cumulative": {
        "dir": "aligned_did_outputs",
        "metrics": ['stars', 'forks', 'commits', 'prs', 've_contributors'],
        "prefix": "cumulative_"
    },
    "non_cumulative": {
        "dir": "aligned_did_outputs_new",
        "metrics": ['new_stars', 'new_forks', 'new_commits', 'new_prs', 'active_contributors'],
        "prefix": ""
    }
}

# ===== Main Analysis =====
for data_type, config in aligned_dirs.items():
    print(f"\n{'='*50}\nAnalyzing {data_type} metrics\n{'='*50}")

    for metric in config['metrics']:
        # Load data
        file_path = os.path.join(config['dir'], f"aligned_did_{metric}.csv")
        try:
            df = pd.read_csv(file_path)
            if df.empty:
                print(f"[!] Skipping empty file: {file_path}")
                continue
        except FileNotFoundError:
            print(f"[!] File not found: {file_path}")
            continue

        # Prep variables
        outcome = f"{config['prefix']}{metric}" if config['prefix'] else metric
        df['treatment'] = (df['repo_type'] == 'treatment').astype(int)
        df['post'] = df['post_treatment'].astype(int)
        df['did'] = df['treatment'] * df['post']

        # ===== 1. Original DiD =====
        model_did = smf.ols(f"{outcome} ~ treatment + post + did", data=df).fit(cov_type='HC3')
        print(f"\n[Original DiD] {outcome}:")
        print(f"DID coefficient: {model_did.params['did']:.4f} (p={model_did.pvalues['did']:.4f})")

        # ===== 2. Lagged Metric Analysis =====
        df = df.sort_values(by=['repo_full_name', 'month'])
        df['lagged_outcome'] = df.groupby('repo_full_name')[outcome].shift(1)

        if df['lagged_outcome'].notnull().any():  # Only run if lag exists
            model_lag = smf.ols(
                f"{outcome} ~ lagged_outcome + treatment + post + did",
                data=df.dropna(subset=['lagged_outcome'])
            ).fit(cov_type='HC3')
            print(f"\n[Lagged Model] {outcome}:")
            print(f"DID coefficient with lag: {model_lag.params['did']:.4f} (p={model_lag.pvalues['did']:.4f})")

        # ===== 3. Placebo Test =====
        # df['hn_submission_date'] = pd.to_datetime(df['hn_submission_date'])
        # df['month'] = pd.to_datetime(df['month'])
        # df['placebo_date'] = df['hn_submission_date'].apply(lambda d: d - DateOffset(months=3))
        # df['post_placebo'] = (df['month'] >= df['placebo_date']).astype(int)
        df['post_placebo'] = (df['relative_month'] >= -3).astype(int)
        df['placebo_did'] = df['treatment'] * df['post_placebo']

        model_placebo = smf.ols(
            f"{outcome} ~ treatment + post_placebo + placebo_did",
            data=df.dropna()
        ).fit(cov_type='HC3')
        print(f"\n[Placebo Test] {outcome}:")
        print(f"Placebo DID coefficient: {model_placebo.params['placebo_did']:.4f} (p={model_placebo.pvalues['placebo_did']:.4f})")
        print("-"*80)


Analyzing cumulative metrics

[Original DiD] cumulative_stars:
DID coefficient: 299.6716 (p=0.0000)

[Lagged Model] cumulative_stars:
DID coefficient with lag: 30.1163 (p=0.0141)

[Placebo Test] cumulative_stars:
Placebo DID coefficient: -47566844174619.1641 (p=0.0000)
--------------------------------------------------------------------------------

[Original DiD] cumulative_forks:
DID coefficient: 55.1177 (p=0.0000)

[Lagged Model] cumulative_forks:
DID coefficient with lag: 2.8698 (p=0.0415)

[Placebo Test] cumulative_forks:
Placebo DID coefficient: 24.3053 (p=0.0000)
--------------------------------------------------------------------------------

[Original DiD] cumulative_commits:
DID coefficient: 35.4383 (p=0.0000)

[Lagged Model] cumulative_commits:
DID coefficient with lag: 0.7125 (p=0.6645)

[Placebo Test] cumulative_commits:
Placebo DID coefficient: 21.6388 (p=0.0000)
--------------------------------------------------------------------------------

[Original DiD] cumulative_p

# not sure

In [None]:
# import pandas as pd
# import statsmodels.formula.api as smf
# import os

# # Settings
# aligned_dir = "aligned_did_outputs"
# metrics = ['stars', 'forks', 'commits', 'prs', 've_contributors']
# metric_column_map = {
#     "stars": "cumulative_stars",
#     "forks": "cumulative_forks",
#     "commits": "cumulative_commits",
#     "prs": "cumulative_prs",
#     "ve_contributors": "cumulative_ve_contributors"
# }

# results = {}

# for metric in metrics:
#     file_path = os.path.join(aligned_dir, f"aligned_did_{metric}.csv")
#     df = pd.read_csv(file_path)

#     if df.empty:
#         print(f"[!] Skipping empty file: {file_path}")
#         continue

#     # Prep indicators
#     df['treatment'] = (df['repo_type'] == 'treatment').astype(int)
#     df['post'] = df['post_treatment'].astype(int)
#     df['did'] = df['treatment'] * df['post']

#     # Convert to categorical for fixed effects
#     df['repo_full_name'] = df['repo_full_name'].astype('category')
#     df['month'] = pd.to_datetime(df['month']).dt.to_period("M").astype(str)

#     # Regression with repo and month fixed effects
#     outcome = metric_column_map[metric]
#     formula = f"{outcome} ~ did + C(repo_full_name) + C(month)"

#     model = smf.ols(formula=formula, data=df).fit(cov_type='HC3')

#     print(f"\n===== DiD with Fixed Effects: {outcome} =====")
#     print(model.summary())
#     results[metric] = model



===== DiD with Fixed Effects: cumulative_stars =====
                            OLS Regression Results                            
Dep. Variable:       cumulative_stars   R-squared:                       0.482
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     36.39
Date:                Tue, 08 Apr 2025   Prob (F-statistic):               0.00
Time:                        11:21:17   Log-Likelihood:                -13406.
No. Observations:                1816   AIC:                         2.712e+04
Df Residuals:                    1663   BIC:                         2.796e+04
Df Model:                         152                                         
Covariance Type:                  HC3                                         
                                                                                        coef    std err          z      P>|z|      [0.025      0.975]
------

# placebo test

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
import os
import numpy as np
from linearmodels import PanelOLS  # For more robust estimation

# Directory containing the aligned files
aligned_dir = "aligned_did_outputs"
metrics = ['stars', 'forks', 'commits', 'prs', 've_contributors']
metric_column_map = {
    "stars": "cumulative_stars",
    "forks": "cumulative_forks",
    "commits": "cumulative_commits",
    "prs": "cumulative_prs",
    "ve_contributors": "cumulative_ve_contributors"
}

results = {}
placebo_results = {}

for metric in metrics:
    file_path = os.path.join(aligned_dir, f"aligned_did_{metric}.csv")
    df = pd.read_csv(file_path, parse_dates=['month', 'hn_submission_date'])

    if df.empty:
        print(f"[!] Skipping empty file: {file_path}")
        continue

    # ========== DATA PREP ==========
    # Create proper time variables
    df['time_period'] = df['month'].astype('str')  # For fixed effects
    df['month_diff'] = df['relative_month']  # Already appears to be months from treatment

    # Treatment indicators
    df['treatment'] = (df['repo_type'] == 'treatment').astype(int)
    df['post'] = df['post_treatment'].astype(int)
    df['did'] = df['treatment'] * df['post']
    outcome = metric_column_map[metric]

    # ========== IMPROVED DiD ANALYSIS ==========
    print(f"\n===== Enhanced DiD Analysis: {outcome} =====")

    # Option 1: Two-way fixed effects (more robust)
    try:
        df_panel = df.set_index(['repo_full_name', 'month'])
        model_twfe = PanelOLS.from_formula(
            f"{outcome} ~ treatment + post + did + EntityEffects + TimeEffects",
            data=df_panel
        ).fit(cov_type='clustered', cluster_entity=True)

        print("\nTwo-way FE model results:")
        print(model_twfe.summary)
    except Exception as e:
        print(f"PanelOLS failed: {str(e)}")

    # Option 2: OLS with repo clusters
    model_ols = smf.ols(
        formula=f"{outcome} ~ treatment + post + did + C(pair_id)",
        data=df
    ).fit(cov_type='cluster', cov_kwds={'groups': df['repo_full_name']})

    print("\nOLS with repo clustering:")
    print(model_ols.summary())

    # ========== DYNAMIC EFFECTS ==========
    print(f"\n----- Dynamic Effects for {outcome} -----")

    # Create event study indicators (avoiding the forbidden comparison)
    event_bins = [-6, -4, -2, 0, 2, 4, 6]
    df['event_window'] = pd.cut(df['month_diff'], bins=event_bins)

    # Run event study model
    event_model = smf.ols(
        f"{outcome} ~ C(event_window, Treatment(reference='[-6, -4)'))*treatment",
        data=df
    ).fit(cov_type='cluster', cov_kwds={'groups': df['repo_full_name']})

    print(event_model.summary().tables[1])

    # ========== PLACEBO TESTS ==========
    print(f"\n----- Placebo Tests for {outcome} -----")

    # Pre-treatment placebos (should show no effect)
    for months_before in [2, 4]:
        df[f'placebo_{months_before}'] = (df['month_diff'] == -months_before).astype(int)
        df[f'did_placebo_{months_before}'] = df['treatment'] * df[f'placebo_{months_before}']

        placebo_model = smf.ols(
            f"{outcome} ~ treatment + placebo_{months_before} + did_placebo_{months_before} + C(pair_id)",
            data=df
        ).fit(cov_type='cluster', cov_kwds={'groups': df['repo_full_name']})

        print(f"\nPlacebo {months_before} months before treatment:")
        print(placebo_model.summary().tables[1])


===== Enhanced DiD Analysis: cumulative_stars =====
PanelOLS failed: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.

The following variables or variable combinations have been fully absorbed
or have become perfectly collinear after effects are removed:

          treatment

Set drop_absorbed=True to automatically drop absorbed variables.


OLS with repo clustering:
                            OLS Regression Results                            
Dep. Variable:       cumulative_stars   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.258
Method:                 Least Squares   F-statistic:                     15.43
Date:                Wed, 09 Apr 2025   Prob (F-statistic):           3.56e-37
Time:                        16:31:44   Log-Likelihood:      

PatsyError: specified level '[-6, -4)' not found