In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from dowhy import CausalModel
from econml.dml import CausalForestDML

# Load and prepare data
def load_and_prepare_data(metrics_files, hn_submission_file=None):
    """
    Load and prepare data from multiple CSV files.

    Parameters:
    metrics_files: Dictionary mapping metric names to file paths
    hn_submission_file: Path to HackerNews submission data (None for non-HN repos)

    Returns:
    DataFrame ready for causal inference
    """
    # Load metrics data
    metrics_data = {}
    for metric_name, file_path in metrics_files.items():
        metrics_data[metric_name] = pd.read_csv(file_path)

    # Start with one metric as base and get the repo list
    base_metric = list(metrics_data.keys())[0]
    repo_list = metrics_data[base_metric]['repo_url'].tolist()

    # Convert wide format to long format for each metric
    all_data = []

    for metric_name, df in metrics_data.items():
        # Melt the dataframe to convert from wide to long format
        id_vars = ['repo_url']
        value_vars = [col for col in df.columns if col != 'repo_url']

        melted_df = pd.melt(df,
                            id_vars=id_vars,
                            value_vars=value_vars,
                            var_name='month',
                            value_name=metric_name)

        # Filter out data points where the repo didn't exist yet (-1.0 values)
        melted_df = melted_df[(melted_df[metric_name] != -1.0) & (melted_df[metric_name] != -2.0)]

        if len(all_data) == 0:
            all_data = melted_df
        else:
            # Merge with existing data
            all_data = pd.merge(all_data, melted_df, on=['repo_url', 'month'])

    # Convert month string to datetime
    all_data['date'] = pd.to_datetime(all_data['month'], format='%Y-%m')
    all_data.drop('month', axis=1, inplace=True)

    # Add HN submission information if available
    if hn_submission_file is not None:
        # Load HN submission data
        hn_data = pd.read_csv(hn_submission_file)

        # Convert Unix timestamp to datetime
        hn_data['submission_date'] = pd.to_datetime(hn_data['date'], unit='s')
        hn_data = hn_data[['url', 'submission_date']]
        hn_data.rename(columns={'url': 'repo_url'}, inplace=True)

        # Merge with metrics data
        all_data = pd.merge(all_data, hn_data, on='repo_url', how='left')

        # Add treatment indicator
        all_data['hn_submitted'] = all_data['submission_date'].notna().astype(int)

        # Create post-treatment indicator
        all_data['post_treatment'] = 0
        mask = (all_data['submission_date'].notna()) & (all_data['date'] >= all_data['submission_date'])
        all_data.loc[mask, 'post_treatment'] = 1

        # Create treatment variable
        all_data['treatment'] = all_data['hn_submitted'] * all_data['post_treatment']
    else:
        # For non-HN repos
        all_data['hn_submitted'] = 0
        all_data['submission_date'] = np.nan
        all_data['post_treatment'] = 0
        all_data['treatment'] = 0

    # Create time period column (months since start)
    min_date = all_data['date'].min()
    all_data['time_period'] = ((all_data['date'].dt.year - min_date.year) * 12 +
                              (all_data['date'].dt.month - min_date.month))

    return all_data

# Combine HN and non-HN repo data
def combine_datasets(hn_repos_data, non_hn_repos_data):
    """
    Combine datasets from HN-submitted and non-HN-submitted repos.

    Parameters:
    hn_repos_data: DataFrame with data for repos submitted to HackerNews
    non_hn_repos_data: DataFrame with data for repos not submitted to HackerNews

    Returns:
    Combined DataFrame
    """
    # Make sure the columns match
    common_cols = set(hn_repos_data.columns).intersection(set(non_hn_repos_data.columns))

    # Add any missing columns to non_hn_repos_data
    for col in hn_repos_data.columns:
        if col not in common_cols:
            if col in ['submission_date']:
                non_hn_repos_data[col] = np.nan
            elif col in ['hn_submitted', 'post_treatment', 'treatment']:
                non_hn_repos_data[col] = 0

    # Combine datasets
    combined_data = pd.concat([hn_repos_data, non_hn_repos_data], ignore_index=True)

    return combined_data

# Perform DiD analysis using DoWhy
def did_analysis(df, outcome_var='PRs'):
    """
    Perform Difference-in-Differences analysis using DoWhy.

    Parameters:
    df: Prepared DataFrame
    outcome_var: Outcome variable to measure (default: PRs)

    Returns:
    Causal effect estimate
    """
    # Define causal graph
    graph = """
    digraph {
        time_period -> %s;
        repo_url -> %s;
        treatment -> %s;
        repo_url -> treatment;
    }
    """ % (outcome_var, outcome_var, outcome_var)

    # Specify variables
    treatment_var = "treatment"
    common_causes = ["repo_url", "time_period"]

    # Create causal model
    model = CausalModel(
        data=df,
        treatment=treatment_var,
        outcome=outcome_var,
        graph=graph,
        common_causes=common_causes
    )

    # Identify causal effect
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

    # Estimate causal effect using DiD estimator
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.econml.dml.DML",
        control_value=0,
        treatment_value=1,
        target_units="att",  # Average Treatment Effect on the Treated
        method_params={
            "init_params": {
                "model_y": LassoCV(),
                "model_t": LassoCV(),
                "model_final": RandomForestRegressor(n_estimators=100, min_samples_leaf=10)
            }
        }
    )

    return estimate

# Heterogeneous treatment effects
def heterogeneous_effects(df, outcome_var='PRs'):
    """
    Analyze heterogeneous treatment effects using CausalForest.

    Parameters:
    df: Prepared DataFrame
    outcome_var: Outcome variable to measure (default: PRs)

    Returns:
    CausalForest model and DataFrame with CATE estimates
    """
    # Select numeric features (excluding the outcome and treatment variables)
    feature_cols = [col for col in df.columns if df[col].dtype in [np.int64, np.float64]]
    feature_cols = [col for col in feature_cols if col not in
                    [outcome_var, 'treatment', 'hn_submitted', 'post_treatment']]

    # Prepare features
    features = df[feature_cols]
    treatment = df['treatment']
    outcome = df[outcome_var]

    # Drop rows with NaN values
    mask = ~(features.isna().any(axis=1) | treatment.isna() | outcome.isna())
    features = features[mask]
    treatment = treatment[mask]
    outcome = outcome[mask]

    # Fit causal forest model
    cf_model = CausalForestDML(
        model_y=None,  # Will use default models
        model_t=None,
        n_estimators=100,
        min_samples_leaf=10
    )
    cf_model.fit(features, treatment, outcome)

    # Calculate conditional treatment effects
    cate_estimates = cf_model.effect(features)

    # Add CATE estimates to DataFrame
    df_with_cate = df[mask].copy()
    df_with_cate['cate'] = cate_estimates

    return cf_model, df_with_cate

# Validate results with placebo tests
def placebo_test(df, outcome_var='PRs', num_placebos=50):
    """
    Perform placebo tests by randomizing treatment assignment.

    Parameters:
    df: Prepared DataFrame
    outcome_var: Outcome variable to measure (default: PRs)
    num_placebos: Number of placebo tests to run

    Returns:
    Array of placebo estimates and p-value
    """
    # Run actual analysis
    real_estimate = did_analysis(df, outcome_var)
    placebo_estimates = []

    print(f"Real effect estimate: {real_estimate.value}")

    for i in range(num_placebos):
        # Create copy of DataFrame
        placebo_df = df.copy()

        # Randomize treatment assignment
        placebo_df['treatment'] = np.random.permutation(placebo_df['treatment'].values)

        # Run DiD analysis
        try:
            placebo_estimate = did_analysis(placebo_df, outcome_var)
            placebo_estimates.append(placebo_estimate.value)
        except Exception as e:
            print(f"Placebo test {i} failed: {str(e)}")
            continue

    # Calculate p-value
    p_value = sum(abs(pe) >= abs(real_estimate.value) for pe in placebo_estimates) / len(placebo_estimates)

    return real_estimate, placebo_estimates, p_value

# Visualize parallel trends assumption
def check_parallel_trends(df, outcome_var='PRs'):
    """
    Check the parallel trends assumption by plotting pre-treatment trends.

    Parameters:
    df: Prepared DataFrame
    outcome_var: Outcome variable to measure (default: PRs)
    """
    # Get unique repos that were submitted to HN
    hn_repos = df[df['hn_submitted'] == 1]['repo_url'].unique()

    # Filter for pre-treatment periods for these repos
    pre_treatment = df[(df['repo_url'].isin(hn_repos)) & (df['post_treatment'] == 0)]
    control_group = df[df['hn_submitted'] == 0]

    # Aggregate by time period
    treated_pre = pre_treatment.groupby('time_period')[outcome_var].mean().reset_index()
    control_all = control_group.groupby('time_period')[outcome_var].mean().reset_index()

    # Get the earliest treatment period
    min_treatment_period = df[df['post_treatment'] == 1]['time_period'].min()

    # Plot
    plt.figure(figsize=(12, 6))
    plt.plot(treated_pre['time_period'], treated_pre[outcome_var], 'b-',
             label='Pre-treatment (HN Repos)')
    plt.plot(control_all['time_period'], control_all[outcome_var], 'r-',
             label='Control (Non-HN Repos)')
    plt.axvline(x=min_treatment_period, color='green', linestyle='--',
                label='First Treatment')
    plt.xlabel('Time Period (Months since start)')
    plt.ylabel(f'Average {outcome_var}')
    plt.title(f'Parallel Trends Check: {outcome_var}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig(f'parallel_trends_{outcome_var}.png')
    plt.close()

# Visualize treatment effect
def visualize_treatment_effect(df, outcome_var='PRs'):
    """
    Visualize the treatment effect over time.

    Parameters:
    df: Prepared DataFrame
    outcome_var: Outcome variable to measure (default: PRs)
    """
    # Aggregate data by time period and treatment status
    treatment_data = df[df['hn_submitted'] == 1].copy()
    control_data = df[df['hn_submitted'] == 0].copy()

    # For repos that were submitted to HN, mark pre and post treatment periods
    treatment_pre = treatment_data[treatment_data['post_treatment'] == 0].groupby('time_period')[outcome_var].mean().reset_index()
    treatment_post = treatment_data[treatment_data['post_treatment'] == 1].groupby('time_period')[outcome_var].mean().reset_index()
    control_all = control_data.groupby('time_period')[outcome_var].mean().reset_index()

    # Plot
    plt.figure(figsize=(14, 7))
    plt.plot(treatment_pre['time_period'], treatment_pre[outcome_var], 'b-',
             label='Pre-treatment (HN Repos)')
    plt.plot(treatment_post['time_period'], treatment_post[outcome_var], 'g-',
             label='Post-treatment (HN Repos)')
    plt.plot(control_all['time_period'], control_all[outcome_var], 'r-',
             label='Control (Non-HN Repos)')

    # Add a vertical line at the first treatment
    first_treatment = treatment_data[treatment_data['post_treatment'] == 1]['time_period'].min()
    plt.axvline(x=first_treatment, color='black', linestyle='--',
                label='First HN Submission')

    plt.xlabel('Time Period (Months since start)')
    plt.ylabel(f'Average {outcome_var}')
    plt.title(f'Effect of HackerNews Submission on {outcome_var}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig(f'treatment_effect_{outcome_var}.png')
    plt.close()

# Main function to run the analysis
def main():
    # File paths (replace with your actual file paths)
    hn_metrics_files = {
        'stars': 'hn_stars_metrics.csv',
        'forks': 'hn_forks_metrics.csv',
        'commits': 'hn_commits_metrics.csv',
        'PRs': 'hn_prs_metrics.csv',
        'contributors': 'hn_contributors_metrics.csv'
    }

    non_hn_metrics_files = {
        'stars': 'non_hn_stars_metrics.csv',
        'forks': 'non_hn_forks_metrics.csv',
        'commits': 'non_hn_commits_metrics.csv',
        'PRs': 'non_hn_prs_metrics.csv',
        'contributors': 'non_hn_contributors_metrics.csv'
    }

    hn_submission_file = 'rq1_stories_github_valid_projs_only_292.csv'

    # Load and prepare data
    print("Loading HackerNews repo data...")
    hn_repos_data = load_and_prepare_data(hn_metrics_files, hn_submission_file)

    print("Loading non-HackerNews repo data...")
    non_hn_repos_data = load_and_prepare_data(non_hn_metrics_files)

    print("Combining datasets...")
    all_repos_data = combine_datasets(hn_repos_data, non_hn_repos_data)

    # Drop rows with NaN values in the outcome variable
    outcomes = ['PRs', 'stars', 'forks', 'commits', 'contributors']

    results = {}

    for outcome_var in outcomes:
        print(f"\nAnalyzing causal effect on {outcome_var}...")

        # Drop NaN values for this specific outcome
        analysis_data = all_repos_data.dropna(subset=[outcome_var])

        # Check parallel trends assumption
        print("Checking parallel trends assumption...")
        check_parallel_trends(analysis_data, outcome_var)

        # Run DiD analysis
        print("Running DiD analysis...")
        estimate = did_analysis(analysis_data, outcome_var)

        print(f"Estimated causal effect on {outcome_var}: {estimate.value}")
        if hasattr(estimate, "confidence_intervals") and estimate.confidence_intervals is not None:
            lower, upper = estimate.confidence_intervals[0]
            print(f"95% Confidence Interval: ({lower}, {upper})")
        else:
            print("Confidence interval not available.")


        # Visualize treatment effect
        print("Visualizing treatment effect...")
        visualize_treatment_effect(analysis_data, outcome_var)

        # Run placebo tests
        print("Running placebo tests...")
        real_estimate, placebo_estimates, p_value = placebo_test(analysis_data, outcome_var)

        print(f"Placebo test p-value: {p_value}")

        # Store results
        results[outcome_var] = {
            'estimate': estimate.value,
            'stderr': estimate.stderr,
            'p_value': p_value
        }

        # Optional: Analyze heterogeneous effects
        print("Analyzing heterogeneous effects...")
        try:
            cf_model, df_with_cate = heterogeneous_effects(analysis_data, outcome_var)

            # Save the top and bottom repos by treatment effect
            top_repos = df_with_cate.sort_values('cate', ascending=False).head(10)
            bottom_repos = df_with_cate.sort_values('cate', ascending=True).head(10)

            print(f"Top 10 repos with highest treatment effect on {outcome_var}:")
            print(top_repos[['repo_url', 'cate']].drop_duplicates('repo_url').head())

            print(f"Bottom 10 repos with lowest treatment effect on {outcome_var}:")
            print(bottom_repos[['repo_url', 'cate']].drop_duplicates('repo_url').head())

        except Exception as e:
            print(f"Heterogeneous effects analysis failed: {str(e)}")

    # Summary of results
    print("\nSummary of Results:")
    print("-" * 50)
    for outcome, result in results.items():
        print(f"Outcome: {outcome}")
        print(f"  Causal Effect: {result['estimate']:.4f} ± {result['stderr']:.4f}")
        print(f"  p-value: {result['p_value']:.4f}")
        if result['p_value'] < 0.05:
            print("  Result is statistically significant at p < 0.05")
        else:
            print("  Result is not statistically significant at p < 0.05")
        print("-" * 50)

    return all_repos_data, results

all_repos_data, results = main()

Loading HackerNews repo data...
Loading non-HackerNews repo data...
Combining datasets...

Analyzing causal effect on PRs...
Checking parallel trends assumption...


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Running DiD analysis...


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.


Estimated causal effect on PRs: -3.346403150464679
Confidence interval not available.
Visualizing treatment effect...
Running placebo tests...


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.


Real effect estimate: -25.297756621863037


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
The final mo

KeyboardInterrupt: 