In [13]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [15]:
file_path = '/Users/yarden/Documents/Atuda/spring_2024/causal_inference/Causal-Inference-Project-Effect-of-Age-on-Graduating/code/data/processed_data.csv'
data = pd.read_csv(file_path)

# First, encode the target variable into binary (1 for Graduate, 0 for Dropout)
data['Target_binary'] = data['Target'].apply(lambda x: 1 if x == 'Graduate' else 0)

data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Mother's qualification,Father's qualification,Mother's occupation,...,Displaced,Educational special needs,Gender,Scholarship holder,Unemployment rate,Inflation rate,GDP,Target,Adult,Target_binary
0,1,17,5,171,1,Complete Secondary Education,122.0,Incomplete Secondary Education,Complete Secondary Education,Administrative,...,1,0,1,0,10.8,1.4,1.74,Dropout,0,0
1,1,15,1,9254,1,Complete Secondary Education,160.0,Complete Secondary Education,Higher Education - Undergraduate,Professionals,...,1,0,1,0,13.9,-0.3,0.79,Graduate,0,1
2,1,1,5,9070,1,Complete Secondary Education,122.0,Other Specific Qualifications,Other Specific Qualifications,Plant and Machine Operators,...,1,0,1,0,10.8,1.4,1.74,Dropout,0,0
3,1,17,2,9773,1,Complete Secondary Education,122.0,Other Specific Qualifications,Other Specific Qualifications,Administrative,...,1,0,0,0,9.4,-0.8,-3.12,Graduate,0,1
4,2,39,1,8014,0,Complete Secondary Education,100.0,Other Specific Qualifications,Other Specific Qualifications,Plant and Machine Operators,...,0,0,0,0,13.9,-0.3,0.79,Graduate,1,1


# Covariate Adjustment

## S-learner

In [16]:
# Choose X (features) and t (treatment variable)
# We'll use a subset of features for simplicity
X = data[['Marital status', 'Application mode', 'Daytime/evening attendance', 'Admission grade',
          'Previous qualification (grade)', 'Unemployment rate', 'Inflation rate', 'GDP']]
t = data['Adult']

# Combine features and treatment variable to create the full feature set
X_full = pd.concat([X, t], axis=1)

# Define the target variable (binary target)
y = data['Target_binary']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.3, random_state=42)

# Fit a Random Forest Classifier model to predict the target variable
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.5929411764705882

In [22]:
# Make predictions for all data points
y_pred_all = model.predict(X_full)

# Filter predictions where Adult = 1 and Adult = 0
treated_predictions_sum = sum(y_pred_all[data['Adult'] == 1])
control_predictions_sum = sum(y_pred_all[data['Adult'] == 0])

# Calculate the number of data points (n)
n = len(data)

# Calculate the desired value
ATE = (treated_predictions_sum - control_predictions_sum) / n

ATE

-0.21252059308072488

## T-learner

In [24]:
X_1 = X[X_full['Adult'] == 1]
X_0 = X[X_full['Adult'] == 0]

y_1 = data[data['Adult'] == 1]['Target_binary']
y_0 = data[data['Adult'] == 0]['Target_binary']

# Split the dataset into training and testing sets for each subgroup
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.3, random_state=42)
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0, y_0, test_size=0.3, random_state=42)

# Fit a Random Forest Classifier model to predict the target variable for each subgroup
model_1 = RandomForestClassifier(random_state=42)
model_1.fit(X_train_1, y_train_1)

model_0 = RandomForestClassifier(random_state=42)
model_0.fit(X_train_0, y_train_0)

# Predict on the test set for each subgroup
y_pred_1 = model_1.predict(X_test_1)
y_pred_0 = model_0.predict(X_test_0)

# Calculate accuracy of the model for each subgroup
accuracy_1 = accuracy_score(y_test_1, y_pred_1)
accuracy_0 = accuracy_score(y_test_0, y_pred_0)

accuracy_1, accuracy_0

(0.6052631578947368, 0.6002691790040376)

In [26]:
y_pred_all_1 = model_1.predict(X)
y_pred_all_0 = model_0.predict(X)

treated_predictions_sum = sum(y_pred_all_1)
control_predictions_sum = sum(y_pred_all_0)

ATE = (treated_predictions_sum - control_predictions_sum) / n

ATE

-0.2704165686043775