In [35]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from part2.shared import load_processed_data, load_train_with_validation_data
from constants import numeric_features, categorical_features

In [36]:
# Load the processed dataset and keep only the two outcomes modelled here.
# The explicit .copy() makes `df` an independent frame, so adding the "y"
# column below does not assign into a filtered view of the loaded data
# (the pattern that triggers pandas' SettingWithCopyWarning).
df = load_processed_data()
df = df[df["Target"].isin(["Graduate", "Dropout"])].copy()

# Binary label: 1 = "Graduate", 0 = "Dropout".
df["y"] = (df["Target"] == "Graduate").astype(int)

# Remove the original label columns so they cannot leak into the features.
# errors='ignore' tolerates frames where one of these columns is absent.
df = df.drop(columns=["Target", "Target encoded"], errors='ignore')

# Feature frame and label vector used by the cells below.
X = df.drop(columns=["y"])
y = df["y"].values

# Keep only the configured feature names that are actually present in X,
# preserving the order in which `constants` lists them.
num_features = [name for name in numeric_features if name in X.columns]
cat_features = [name for name in categorical_features if name in X.columns]

# Numeric branch: mean-impute missing values, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        # ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

# The shared helper returns the raw train / validation / test features
# followed by the matching label arrays.
(
    X_train_raw, X_val_raw, X_test_raw,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training split only; the validation and test
# splits are transformed with the already-fitted pipeline.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val, X_test = (
    full_pipeline.transform(part) for part in (X_val_raw, X_test_raw)
)

# Oversampling of the training split is currently disabled:
# sm = SMOTE(random_state=0)
# X_train, y_train = sm.fit_resample(X_train, y_train)


In [37]:
# Base learners for the stack. Both are seeded: the liblinear solver shuffles
# the data using `random_state`, so leaving it unset lets the L1 logistic
# regression (and therefore the stacked predictions) vary between runs.
log_clf = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=1.0,
    max_iter=3000,
    random_state=42,
)
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
)

# Stack the two base learners, with a logistic regression as the final
# estimator that combines their predictions.
stacking = StackingClassifier(
    estimators=[
        ('logreg', log_clf),
        ('rf', rf_clf),
    ],
    final_estimator=LogisticRegression(max_iter=1000)
)
stacking.fit(X_train, y_train)


In [38]:
# Predict on the test split with the fitted stack and print the per-class
# precision / recall / F1 table to four decimal places.
y_pred = stacking.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=y_pred, digits=4)
)


              precision    recall  f1-score   support

           0     0.9331    0.8838    0.9078       284
           1     0.9279    0.9594    0.9434       443

    accuracy                         0.9298       727
   macro avg     0.9305    0.9216    0.9256       727
weighted avg     0.9300    0.9298    0.9295       727

