In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from constants import numeric_features, categorical_features
from part2.shared import load_processed_data
from part3.Mixture import SimpleMixtureOfExperts
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from part2.shared import load_train_with_validation_data

In [68]:

# Build the binary-classification frame in a single chained expression.
# The original filtered `df` and then assigned `df["y"]` on the filtered
# result, which can raise pandas' SettingWithCopyWarning; chaining with
# `.assign` avoids assigning into a slice.
df = (
    load_processed_data()
    # Keep only rows whose Target is "Graduate" or "Dropout".
    .loc[lambda frame: frame["Target"].isin(["Graduate", "Dropout"])]
    # Binary label: 1 where Target == "Graduate", otherwise 0.
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    # Remove the original label column(s); "Target encoded" is dropped only
    # if it is present (errors="ignore").
    .drop(columns=["Target", "Target encoded"], errors="ignore")
)

In [69]:
# Separate the label from the predictors: `y` is the label column as a NumPy
# array, `X` is every remaining column of `df`.
y = df["y"].values
X = df.drop("y", axis=1)

In [70]:
# Restrict the configured feature lists to the columns actually present in X,
# preserving the order in which they are listed in `constants`.
num_features = list(filter(lambda col: col in X.columns, numeric_features))
cat_features = list(filter(lambda col: col in X.columns, categorical_features))

In [71]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [72]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time (handle_unknown="ignore") instead of raising.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=cat_steps)

In [73]:
# Route each feature group through its own preprocessing branch. Despite the
# name, this object is a ColumnTransformer, not a Pipeline.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [74]:
# Split into train / validation / test partitions (the split logic lives in
# part2.shared.load_train_with_validation_data).
(
    X_train_raw, X_val_raw, X_test_raw,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training partition only, then reuse the fitted
# transformer for the held-out partitions.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(X_test_raw)

In [75]:
from sklearn.svm import SVC

# Experts combined by the mixture model. The random forest and the SVC's
# probability calibration (probability=True) are stochastic, so both get a
# fixed random_state; without it the report below changes on every
# Restart & Run All.
experts = [
    LogisticRegression(max_iter=1000),
    RandomForestClassifier(random_state=42),
    SVC(kernel='linear', probability=True, random_state=42),
]
moe = SimpleMixtureOfExperts(experts=experts)

# Train on the preprocessed training partition and score on the test partition.
moe.fit(X_train, y_train)
y_pred = moe.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9074    0.8627    0.8845       284
           1     0.9147    0.9436    0.9289       443

    accuracy                         0.9120       727
   macro avg     0.9110    0.9031    0.9067       727
weighted avg     0.9118    0.9120    0.9115       727



In [76]:
# Baseline: logistic regression alone, fitted on the same training partition
# and scored on the same test partition as the mixture model.
logistic = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_logistic = logistic.predict(X_test)
logistic_report = classification_report(y_test, y_pred_logistic, digits=4)
print("Logistic Regression Metrics:", logistic_report, sep="\n")

Logistic Regression Metrics:
              precision    recall  f1-score   support

           0     0.9135    0.8556    0.8836       284
           1     0.9111    0.9481    0.9292       443

    accuracy                         0.9120       727
   macro avg     0.9123    0.9019    0.9064       727
weighted avg     0.9120    0.9120    0.9114       727



In [77]:
# Baseline: random forest alone. A fixed random_state makes the bootstrap
# sampling and feature sub-sampling repeatable, so this report is reproducible
# across kernel restarts.
randomForest = RandomForestClassifier(random_state=42)
randomForest.fit(X_train, y_train)
y_pred_rf = randomForest.predict(X_test)
print("Random Forest Metrics:")
print(classification_report(y_test, y_pred_rf, digits=4))

Random Forest Metrics:
              precision    recall  f1-score   support

           0     0.9453    0.8521    0.8963       284
           1     0.9108    0.9684    0.9387       443

    accuracy                         0.9230       727
   macro avg     0.9281    0.9103    0.9175       727
weighted avg     0.9243    0.9230    0.9222       727



In [78]:
# Baseline: linear-kernel SVC alone, built with the same kernel/probability
# arguments as the SVC in the experts list.
svc = SVC(probability=True, kernel='linear').fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
svc_report = classification_report(y_test, y_pred_svc, digits=4)
print("SVC Metrics:", svc_report, sep="\n")

SVC Metrics:
              precision    recall  f1-score   support

           0     0.9225    0.8380    0.8782       284
           1     0.9019    0.9549    0.9276       443

    accuracy                         0.9092       727
   macro avg     0.9122    0.8964    0.9029       727
weighted avg     0.9100    0.9092    0.9083       727

