# Model — Naive TWFE DiD (Sophistication)

Load `data/processed/panel_sophistication.csv`, convert grouping/time to categorical, run naive TWFE DiD models for `nkill_mean` and `weaptype_score`, and save results to `reports/tables/`.

In [1]:
from pathlib import Path
import os
import pandas as pd
import statsmodels.formula.api as smf

def find_repo_root(start=None):
    """Walk upward from ``start`` to the first directory that looks like the repo root.

    A directory qualifies when it contains an entry named ``notebooks``,
    ``.git`` or ``README.md``.

    Parameters
    ----------
    start : str or os.PathLike, optional
        Directory to begin the search from. When omitted, the current working
        directory *at call time* is used (a ``Path.cwd()`` default in the
        signature would be frozen when the function is defined).

    Returns
    -------
    pathlib.Path
        The first matching directory, checking ``start`` itself and then each
        of its ancestors in turn.

    Raises
    ------
    FileNotFoundError
        If the filesystem root is reached without finding any marker.
    """
    # Make the start absolute so a relative path is walked through its real
    # ancestors instead of stopping at '.'.
    origin = Path.cwd() if start is None else Path(start).absolute()
    markers = ('notebooks', '.git', 'README.md')
    for candidate in (origin, *origin.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    raise FileNotFoundError(f'Could not find repository root from {origin}')

ROOT = find_repo_root()
# Regression summaries are written under reports/tables, so create that
# folder (and any missing parents) up front; a no-op when it already exists.
ROOT.joinpath('reports', 'tables').mkdir(parents=True, exist_ok=True)
print('Repository root:', ROOT)

Repository root: /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis


In [2]:
# Read the processed sophistication panel from disk
data_path = ROOT.joinpath('data', 'processed', 'panel_sophistication.csv')
print('Loading:', data_path)
if not data_path.exists():
    raise FileNotFoundError(f'Expected dataset not found at {data_path!s}. Run notebooks/01_Data_Cleaning_and_Aggregation.ipynb first to produce it.')
df = pd.read_csv(data_path.as_posix(), parse_dates=['date'])

# Conservative repair of known column-name typos: a misspelled column is
# renamed only when the expected name is absent and the misspelling is present
for _expected, _typo in (('nkill_mean', 'nkill_mear'), ('weaptype_score', 'weaptype_post')):
    if _expected not in df.columns and _typo in df.columns:
        df = df.rename(columns={_typo: _expected})

# Coerce `date` to datetime (unparseable values become NaT), then derive a
# categorical year-month label from it
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['time_period'] = df['date'].dt.to_period('M').astype(str).astype('category')

# `group_name` is mandatory; store it as a categorical as well
if 'group_name' not in df.columns:
    raise KeyError('Missing column: group_name in panel_sophistication.csv')
df['group_name'] = df['group_name'].astype('category')

print('Loaded dataframe shape:', df.shape)
df.head()

Loading: /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/data/processed/panel_sophistication.csv
Loaded dataframe shape: (672, 9)


Unnamed: 0,group_name,date,nkill_mean,suicide_prop,weaptype_score,post,treated,treated_post,time_period
0,hard,1990-01-31,1.371901,0.0,1.983471,0,1,0,1990-01
1,hard,1990-02-28,1.884211,0.0,2.021053,0,1,0,1990-02
2,hard,1990-03-31,1.649007,0.0,1.993421,0,1,0,1990-03
3,hard,1990-04-30,1.629032,0.0,2.032,0,1,0,1990-04
4,hard,1990-05-31,1.136691,0.0,2.0,0,1,0,1990-05


In [4]:
# Outcome variable -> text file that receives its regression summary
# (suicide_prop is intentionally omitted per request)
models = {
    'nkill_mean': ROOT / 'reports' / 'tables' / 'naive_sophistication_nkill.txt',
    'weaptype_score': ROOT / 'reports' / 'tables' / 'naive_sophistication_weapon.txt'
}

for y_var, out_path in models.items():
    print(f'Running TWFE model for {y_var}...')
    # Two-way fixed effects: dummies for every group and every month, plus the
    # treated_post term whose coefficient is the DiD estimate
    formula = f'{y_var} ~ treated_post + C(group_name) + C(time_period)'
    # HC1 requests heteroskedasticity-robust standard errors
    model = smf.ols(formula=formula, data=df).fit(cov_type='HC1')
    # Make sure the target folder exists so this cell does not rely on another
    # cell having created it
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Save regression summary; the encoding is pinned so the file does not
    # depend on the platform default
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(model.summary().as_text())
    print(f'✅ Saved: {out_path}')

print('\n🎯 Models completed: nkill_mean and weaptype_score')

Running TWFE model for nkill_mean...
✅ Saved: /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/reports/tables/naive_sophistication_nkill.txt
Running TWFE model for weaptype_score...




✅ Saved: /Users/kanishkraghavendra/Documents/Project/data-minds-causal-analysis/reports/tables/naive_sophistication_weapon.txt

🎯 Models completed: nkill_mean and weaptype_score




### Note on `suicide_prop`

We do not use `suicide_prop` in the sophistication models because it is rarely observed (very low prevalence) and shows near-zero variance in the monthly panel, which makes OLS estimates unstable and uninformative. If you want, we can run a separate descriptive check on the `suicide_prop` distribution or consider a different modelling approach (e.g., rare-event methods or zero-inflated models).