In [None]:
from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


# Locations of the pre-split nvss_* CSV files, keyed by split name.
_SPLIT_URLS = {
    "train": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_train.csv",
    "val": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_val.csv",
    "test": "https://media.githubusercontent.com/media/leahhkim/final_project_analyticsII/refs/heads/main/Data/nvss_test.csv",
}

# Name of the label column present in every split.
_TARGET = "infant_death"


def _features_and_label(frame):
    """Return ``(X, y)`` for one split without modifying *frame*.

    ``y`` is the target column cast to int; ``X`` is the frame with the
    target column dropped.  A missing target column raises ``KeyError``.
    """
    label = frame[_TARGET].astype(int)
    features = frame.drop(columns=[_TARGET], errors="ignore")
    return features, label


# low_memory=False: parse each file in one pass instead of in chunks, so column
# dtypes are inferred from the whole column.
train_df = pd.read_csv(_SPLIT_URLS["train"], low_memory=False)
val_df = pd.read_csv(_SPLIT_URLS["val"], low_memory=False)
test_df = pd.read_csv(_SPLIT_URLS["test"], low_memory=False)

# Feature matrix and label vector for each split.
X_train, y_train = _features_and_label(train_df)
X_val, y_val = _features_and_label(val_df)
X_test, y_test = _features_and_label(test_df)

# One seed for every stochastic component (CV shuffling and the SVM solver),
# so that rerunning this cell reproduces the same folds, fit and scores.
RANDOM_STATE = 42

# NOTE(review): every column of X_train is passed straight to StandardScaler
# and LinearSVC, so the pipeline presumably expects purely numeric features
# with no missing values -- confirm against the CSVs. If that does not hold,
# add per-dtype preprocessing (imputation / one-hot encoding) ahead of "sc".

# StandardScaler: rescales each feature to mean = 0, std = 1.
# LinearSVC: linear support vector machine classifier (fast for large datasets).
model = Pipeline([
    ("sc", StandardScaler()),
    # random_state seeds the data shuffling done by LinearSVC's dual
    # coordinate-descent solver; without it repeated fits can differ.
    ("svm", LinearSVC(max_iter=20000, random_state=RANDOM_STATE)),
])

# Hyperparameters tuned by cross-validation:
#   C            -- regularization parameter of the SVM.
#   class_weight -- compare no weighting (None) against "balanced", which
#                   weights classes inversely proportional to their frequency;
#                   this helps when infant death is a rare event
#                   (class imbalance).
param_grid = {
    "svm__C": [0.01, 0.1, 1, 10],
    "svm__class_weight": [None, "balanced"],
}

# Stratified cross-validation keeps the class proportions (death vs alive)
# similar in each fold, which is important for rare outcomes.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Grid search: fit one model per parameter combination and fold, and keep the
# combination with the best mean F1 score. scoring="f1" balances precision and
# recall for the positive class, which is more meaningful than accuracy when
# the positive class is rare.
grid = GridSearchCV(model, param_grid, scoring="f1", cv=cv, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

def _show_split_metrics(header, y_true, y_pred):
    """Print *header*, then the confusion matrix and per-class report for one split."""
    print(header)
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4))


# Winning hyperparameter combination from the cross-validated grid search.
print("Best params:", grid.best_params_)

# Held-out validation split: a first check on data not seen during fitting.
val_pred = grid.predict(X_val)
_show_split_metrics("\nValidation", y_val, val_pred)

# Final evaluation on the test split, which should stay untouched until now.
test_pred = grid.predict(X_test)
_show_split_metrics("\nTest", y_test, test_pred)

# Plain accuracy, for reference only: it can look very high even when the model
# misses many infant death cases, because the "alive" class dominates.
for label, truth, predicted in (
    ("VAL accuracy:", y_val, val_pred),
    ("TEST accuracy:", y_test, test_pred),
):
    print(label, accuracy_score(truth, predicted))