# Coding Quiz Analysis
This notebook runs the full analysis for:

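- Regression: OLS of `Y` on `X1`, `X2`, `X3` (coefficients and t-statistics)
- Nearest-neighbor matching on `Z`: the largest distance between a treated unit and its closest control
- Radius matching on `Z`: the estimated effect, and how many times control units are reused as matches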

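Data: `homework_1.1.csv` (regression) and `homework_1.2.csv` (matching; `X` is the treatment indicator, `Z` the matching variable, `Y` the outcome).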

In [None]:

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.neighbors import NearestNeighbors
from collections import Counter


In [None]:

# Paths of the two homework datasets (relative to the working directory).
file1, file2 = "homework_1.1.csv", "homework_1.2.csv"

# Read each CSV into its own DataFrame: df1 feeds the regression,
# df2 feeds the matching sections further down.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Preview the first rows of both frames as the cell output.
df1.head(), df2.head()


## 1️⃣ Regression: Y ~ X1 + X2 + X3

In [None]:

# Design matrix: the three regressors plus an intercept column.
X = sm.add_constant(df1[['X1', 'X2', 'X3']])
y = df1['Y']

# Ordinary least squares fit of Y on the design matrix.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

# Keep the coefficient estimates and their t-statistics for later inspection.
coeffs = model.params
t_stats = model.tvalues

t_stats, coeffs


## 2️⃣ Nearest Neighbor match distance

In [None]:

# Split the sample on the treatment indicator X.
treated = df2.loc[df2['X'] == 1]
control = df2.loc[df2['X'] == 0]

# 2-D (n_rows, 1) arrays of the matching variable Z, as scikit-learn expects.
treated_z = treated[['Z']].values
control_z = control[['Z']].values

# Index the controls, then look up the single closest control for each treated unit.
nn = NearestNeighbors(n_neighbors=1).fit(control_z)
distances, indices = nn.kneighbors(treated_z)

# The worst match is the largest treated-to-control distance.
max_distance = distances.max()
print(f"Farthest match distance: {max_distance:.4f}")


## 3️⃣ Radius Neighbors effect & duplicates

In [None]:

# Radius matching: every control unit whose Z lies within `radius` of a treated
# unit's Z counts as a match for that treated unit.
radius = 0.2
nbrs = NearestNeighbors(radius=radius)
nbrs.fit(control_z)

# One array of matched control row positions per treated unit (may be empty).
radius_neighbors = nbrs.radius_neighbors(treated_z, return_distance=False)

treated_Y = treated['Y'].values
control_Y = control['Y'].values

group_effects = []
all_matched_indices = []
unmatched_treated = 0

# The loop variable is deliberately NOT named `indices`: that name holds the
# nearest-neighbor result from the previous section and must not be clobbered.
for i, matched in enumerate(radius_neighbors):
    if len(matched) == 0:
        # No control within the radius: this treated unit cannot contribute.
        unmatched_treated += 1
        continue
    # Treated outcome minus the mean outcome of all its matched controls.
    group_effect = treated_Y[i] - control_Y[matched].mean()
    group_effects.append(group_effect)
    all_matched_indices.extend(matched)

# Average the per-treated-unit effects; report NaN (without a NumPy warning)
# when no treated unit found a match.
effect = np.mean(group_effects) if group_effects else float('nan')
print(f"Estimated effect: {effect:.4f}")

# Be explicit about treated units that were dropped from the estimate.
if unmatched_treated:
    print(f"Treated units with no control within radius {radius}: {unmatched_treated} (excluded)")

# Duplicates: every use of a control unit beyond its first one.
control_counts = Counter(all_matched_indices)
duplicates = sum(count - 1 for count in control_counts.values() if count > 1)
print(f"Number of duplicates (all but first use): {duplicates}")
