In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from constants import numeric_features, categorical_features
from sklearn.linear_model import LogisticRegression
from part2.shared import load_processed_data
from part2.shared import load_train_with_validation_data
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier

In [12]:

# Load the processed dataset and reduce it to a binary problem: keep only rows
# whose Target is "Graduate" or "Dropout", then add y = 1 for "Graduate" and
# y = 0 for "Dropout".
#
# Written as one chain so that `y` is added with `.assign` on the filtered
# frame instead of by column assignment on a boolean-indexed slice, which is
# the pattern pandas can flag with SettingWithCopyWarning.
df = (
    load_processed_data()
    .loc[lambda frame: frame["Target"].isin(["Graduate", "Dropout"])]
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    # errors="ignore": columns that are absent are skipped instead of raising.
    .drop(columns=["Target", "Target encoded"], errors="ignore")
)

In [13]:
# Separate the label vector (as a NumPy array) from the feature frame.
y = df["y"].to_numpy()
X = df.drop(columns="y")

In [14]:
# Restrict the configured feature lists to the columns actually present in X,
# keeping the order in which `constants` declares them.
present_columns = X.columns
num_features = [name for name in numeric_features if name in present_columns]
cat_features = [name for name in categorical_features if name in present_columns]

In [15]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [16]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on categories that were not seen while fitting.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [17]:
# Route each feature group through its own preprocessing branch and
# concatenate the results. No `remainder` is given, so ColumnTransformer's
# default applies to columns that are in neither list.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [18]:
# Unpack the six arrays returned by the project split helper
# (train / validation / test by name — the split itself is defined in part2.shared).
(
    X_train_raw, X_val_raw, X_test_raw,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer for the validation and test splits.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(X_test_raw)

In [19]:
# RandomForestClassifier is randomized; without a fixed seed the fitted
# ensembles, and therefore the reported metrics, change from run to run.
RANDOM_STATE = 42

log_clf = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Both ensembles use the same pair of base learners, so the list is defined
# once. scikit-learn ensembles fit clones of the estimators they are given, so
# sharing these unfitted instances between the two ensembles is safe.
base_estimators = [
    ("logreg", log_clf),
    ("rf", rf_clf),
]

# Soft voting: combine the base learners' predicted class probabilities.
voting = VotingClassifier(estimators=list(base_estimators), voting="soft")
voting.fit(X_train, y_train)
y_pred_voting = voting.predict(X_test)

# Stacking: a logistic-regression meta-learner on top of the base learners.
# Only constructed here; it is fitted in the next cell.
stacking = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=LogisticRegression(max_iter=1000),
)

In [20]:
from sklearn.metrics import classification_report

# fit() returns the fitted estimator, so training and test-set prediction chain.
y_pred_stacking = stacking.fit(X_train, y_train).predict(X_test)

def evaluate_model(name, y_true, y_pred):
    """Print a titled classification report for one model.

    Args:
        name: Label shown in the "<name> Results" heading.
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.

    Returns:
        None. The heading and the report (4 decimal digits) go to stdout.
    """
    heading = f"\n{name} Results"
    print(heading)
    report = classification_report(y_true, y_pred, digits=4)
    print(report)

# Report test-set performance for each ensemble, in the same order as before.
for model_name, predictions in (
    ("VotingClassifier", y_pred_voting),
    ("StackingClassifier", y_pred_stacking),
):
    evaluate_model(model_name, y_test, predictions)


VotingClassifier Results
              precision    recall  f1-score   support

           0     0.9321    0.8697    0.8998       284
           1     0.9199    0.9594    0.9392       443

    accuracy                         0.9243       727
   macro avg     0.9260    0.9145    0.9195       727
weighted avg     0.9247    0.9243    0.9238       727


StackingClassifier Results
              precision    recall  f1-score   support

           0     0.9231    0.8873    0.9048       284
           1     0.9295    0.9526    0.9409       443

    accuracy                         0.9271       727
   macro avg     0.9263    0.9200    0.9229       727
weighted avg     0.9270    0.9271    0.9268       727

