In [2]:
import pandas as pd

# Load the dataset: read the Stata .dta file into a pandas DataFrame.
# The path is relative, so it is resolved against the kernel's current
# working directory at the time this cell runs.
data_path = "karlan_list_2007.dta"
df = pd.read_stata(data_path)

# Human-readable label for each variable name, keyed by column name.
# Insertion order is the order in which the labels are printed below.
descriptions = {
    # Experimental arm and match-offer design
    "treatment": "Treatment",
    "control": "Control",
    "ratio": "Match ratio",
    "ratio2": "2:1 match ratio",
    "ratio3": "3:1 match ratio",
    "size": "Match threshold",
    "size25": "$25,000 match threshold",
    "size50": "$50,000 match threshold",
    "size100": "$100,000 match threshold",
    "sizeno": "Unstated match threshold",
    # Suggested donation amount
    "ask": "Suggested donation amount",
    "askd1": "Suggested donation was highest previous contribution",
    "askd2": "Suggested donation was 1.25 x highest previous contribution",
    "askd3": "Suggested donation was 1.50 x highest previous contribution",
    "ask1": "Highest previous contribution (for suggestion)",
    "ask2": "1.25 x highest previous contribution (for suggestion)",
    "ask3": "1.50 x highest previous contribution (for suggestion)",
    # Donation outcomes
    "amount": "Dollars given",
    "gave": "Gave anything",
    "amountchange": "Change in amount given",
    # Prior giving history
    "hpa": "Highest previous contribution",
    "ltmedmra": "Small prior donor: last gift was less than median $35",
    "freq": "Number of prior donations",
    "years": "Number of years since initial donation",
    "year5": "At least 5 years since initial donation",
    "mrm2": "Number of months since last donation",
    "dormant": "Already donated in 2005",
    # Donor characteristics
    "female": "Female",
    "couple": "Couple",
    # State-level variables
    "state50one": "State tag: 1 for one observation of each of 50 states; 0 otherwise",
    "nonlit": "Nonlitigation",
    "cases": "Court cases from state in 2004-5 in which organization was involved",
    "statecnt": "Percent of sample from state",
    "stateresponse": "Proportion of sample from the state who gave",
    "stateresponset": "Proportion of treated sample from the state who gave",
    "stateresponsec": "Proportion of control sample from the state who gave",
    "stateresponsetminc": "stateresponset - stateresponsec",
    # Political leaning of state / county
    "perbush": "State vote share for Bush",
    "close25": "State vote share for Bush between 47.5% and 52.5%",
    "red0": "Red state",
    "blue0": "Blue state",
    "redcty": "Red county",
    "bluecty": "Blue county",
    # Zip-code level demographics
    "pwhite": "Proportion white within zip code",
    "pblack": "Proportion black within zip code",
    "page18_39": "Proportion age 18-39 within zip code",
    "ave_hh_sz": "Average household size within zip code",
    "median_hhincome": "Median household income within zip code",
    "powner": "Proportion house owner within zip code",
    "psch_atlstba": "Proportion who finished college within zip code",
    "pop_propurban": "Proportion of population urban within zip code",
}

# Emit the heading, then one bullet per variable in dictionary order.
print("\nVariable Descriptions:")
print("\n".join(f" - `{name}`: {label}" for name, label in descriptions.items()))
# Partition the columns by dtype: numeric columns vs. pandas categoricals.
numerical_vars = df.select_dtypes(include="number").columns.tolist()
categorical_vars = df.select_dtypes(include="category").columns.tolist()

# describe() with its default arguments; stored so the table can be reused.
summary_stats = df.describe()


def _print_bullets(heading, names):
    """Print ``heading`` followed by one `` - name`` line per entry of ``names``."""
    print(heading)
    for name in names:
        print(f" - {name}")


# Report which columns landed in each group, then the summary table.
_print_bullets("Numerical Variables:", numerical_vars)
_print_bullets("\nCategorical Variables:", categorical_vars)

print("\nSummary Statistics for Numerical Variables:")
print(summary_stats)





Variable Descriptions:
 - `treatment`: Treatment
 - `control`: Control
 - `ratio`: Match ratio
 - `ratio2`: 2:1 match ratio
 - `ratio3`: 3:1 match ratio
 - `size`: Match threshold
 - `size25`: $25,000 match threshold
 - `size50`: $50,000 match threshold
 - `size100`: $100,000 match threshold
 - `sizeno`: Unstated match threshold
 - `ask`: Suggested donation amount
 - `askd1`: Suggested donation was highest previous contribution
 - `askd2`: Suggested donation was 1.25 x highest previous contribution
 - `askd3`: Suggested donation was 1.50 x highest previous contribution
 - `ask1`: Highest previous contribution (for suggestion)
 - `ask2`: 1.25 x highest previous contribution (for suggestion)
 - `ask3`: 1.50 x highest previous contribution (for suggestion)
 - `amount`: Dollars given
 - `gave`: Gave anything
 - `amountchange`: Change in amount given
 - `hpa`: Highest previous contribution
 - `ltmedmra`: Small prior donor: last gift was less than median $35
 - `freq`: Number of prior donat

In [4]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy import stats

# Load dataset
df = pd.read_stata("projects/HW1/karlan_list_2007.dta")

# Keep only the two columns used here and drop rows where either is missing.
df_mrm2 = df.loc[:, ['mrm2', 'treatment']].dropna()

# Split mrm2 by experimental arm using boolean masks on `treatment`.
is_treated = df_mrm2['treatment'] == 1
is_control = df_mrm2['treatment'] == 0
treat = df_mrm2.loc[is_treated, 'mrm2']
control = df_mrm2.loc[is_control, 'mrm2']

# Per-group sample mean, sample standard deviation (ddof=1) and size.
x1 = treat.mean()
x2 = control.mean()
s1 = treat.std(ddof=1)
s2 = control.std(ddof=1)
n1 = len(treat)
n2 = len(control)

# Welch's t-test by hand: t = (x1 - x2) / sqrt(s1^2/n1 + s2^2/n2).
# The two s^2/n terms are named because the degrees of freedom reuse them.
var_over_n1 = s1**2 / n1
var_over_n2 = s2**2 / n2
se_diff = np.sqrt(var_over_n1 + var_over_n2)
t_stat = (x1 - x2) / se_diff

# Degrees of freedom (Welch–Satterthwaite):
#   (v1 + v2)^2 / (v1^2/(n1-1) + v2^2/(n2-1)),  where v_i = s_i^2 / n_i
df_num = (var_over_n1 + var_over_n2)**2
df_denom = (var_over_n1**2 / (n1 - 1)) + (var_over_n2**2 / (n2 - 1))
df_welch = df_num / df_denom

# Two-sided p-value: twice the upper-tail probability of |t|.
p_val_ttest = 2 * stats.t.sf(np.abs(t_stat), df_welch)

# Same comparison as an OLS regression of mrm2 on the treatment indicator.
model = smf.ols('mrm2 ~ treatment', data=df_mrm2).fit()

# Show results: Welch figures first, then the regression figures for `treatment`.
report_lines = [
    f"Welch's t-statistic: {t_stat:.3f}",
    f"Welch's p-value: {p_val_ttest:.3f}",
    f"Regression coefficient (treatment effect): {model.params['treatment']:.3f}",
    f"Regression t-statistic: {model.tvalues['treatment']:.3f}",
    f"Regression p-value: {model.pvalues['treatment']:.3f}",
]
print(*report_lines, sep="\n")


Welch's t-statistic: 0.120
Welch's p-value: 0.905
Regression coefficient (treatment effect): 0.014
Regression t-statistic: 0.119
Regression p-value: 0.905
