In [37]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from part2.shared import load_processed_data
from sklearn.model_selection import KFold


In [38]:
# Number of full passes over the training data; used further down as the
# default value of the `epochs` argument of `logistic_gd`.
NUMBER_OF_EPOCHS = 250

In [39]:

# Build the modelling frame in a single chain so that the new `y` column is
# created on a fresh DataFrame rather than on a boolean-filtered slice of the
# loaded frame (assigning a column on such a slice triggers pandas'
# SettingWithCopyWarning).
#   1. keep only rows whose Target is "Graduate" or "Dropout";
#   2. add the binary label y (1 = Graduate, 0 = Dropout);
#   3. drop the original target columns ("Target encoded" may be absent,
#      hence errors="ignore").
df_raw = load_processed_data()
df = (
    df_raw.loc[df_raw["Target"].isin(["Graduate", "Dropout"])]
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    .drop(columns=["Target", "Target encoded"], errors='ignore')
)

In [40]:
# Separate the label vector (as a NumPy array) from the feature table.
label_column = "y"
y = df[label_column].values
X = df.drop(columns=[label_column])

In [41]:
# Keep only the configured feature names that are actually present in X,
# in the order in which the configuration lists them.
available = X.columns
num_features = list(filter(lambda col: col in available, numeric_features))
cat_features = list(filter(lambda col: col in available, categorical_features))

In [42]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [43]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" stops the encoder from raising
# on categories that were not seen when it was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [44]:
# Route each group of columns through its own preprocessing branch and
# concatenate the results (numeric block first, then the one-hot block).
preprocessing_branches = [
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
]
full_pipeline = ColumnTransformer(transformers=preprocessing_branches)

In [45]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The textbook expression evaluates ``exp(-z)``, which overflows (and emits
    a RuntimeWarning) for large negative ``z``. Here the exponential is only
    ever taken of a non-positive number, so it always lies in [0, 1] and can
    never overflow. Both branches are algebraically the same function:

    * ``z >= 0``:  ``1 / (1 + exp(-z))``
    * ``z <  0``:  ``exp(z) / (1 + exp(z))``

    Parameters
    ----------
    z : array-like or scalar
        Logits.

    Returns
    -------
    np.ndarray
        Element-wise sigmoid with the same shape as ``z`` (a 0-d array for a
        scalar input). NaN inputs propagate as NaN.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp of a non-positive value: no overflow possible
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$

In [46]:
def cross_entropy(y_true, y_prob):
    """Return the average binary cross-entropy loss.

    ``y_prob`` is clipped to ``[1e-15, 1 - 1e-15]`` before the logarithms are
    taken so that ``log(0)`` is never evaluated.
    """
    tiny = 1e-15
    clipped = np.clip(y_prob, tiny, 1 - tiny)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)

$$
\mathcal{L} = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(\hat{y}^{(i)}) + (1 - y^{(i)}) \log(1 - \hat{y}^{(i)}) \right]
$$

In [47]:
def logistic_gd(X, y, lr=0.05, epochs=NUMBER_OF_EPOCHS, batch_size=64, lambda_l2=0.001, seed=0,
                verbose=True, log_every=20):
    """Fit binary logistic regression with mini-batch gradient descent.

    Parameters
    ----------
    X : array of shape (m, n)
        Feature matrix; rows are indexed positionally with integer arrays.
    y : array-like of shape (m,)
        Binary labels (0/1). Converted with ``np.asarray`` so that a pandas
        Series is indexed by position rather than by label.
    lr : float
        Learning rate.
    epochs : int
        Number of passes over the data.
    batch_size : int
        Mini-batch size; must be positive. The last batch of an epoch may be
        smaller.
    lambda_l2 : float
        L2 penalty strength. It is applied to the weights ``w`` only, not to
        the bias ``b``.
    seed : int
        Seed of the generator used for weight initialisation and for the
        per-epoch shuffling.
    verbose : bool
        When True (default), print loss and accuracy on the first epoch and
        every ``log_every`` epochs.
    log_every : int
        Logging period in epochs; must be positive.

    Returns
    -------
    w : np.ndarray of shape (n,)
    b : float
    history : list of float
        Cross-entropy on the full ``X`` after each epoch (the L2 penalty is
        not included in this value).

    Raises
    ------
    ValueError
        If ``batch_size`` or ``log_every`` is not positive, or if ``y`` is not
        one-dimensional with one entry per row of ``X``.
    """
    if batch_size <= 0:
        # A negative step would make the batch loop empty and silently skip training.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if log_every <= 0:
        raise ValueError(f"log_every must be positive, got {log_every}")

    y = np.asarray(y)
    m, n = X.shape
    if y.ndim != 1 or y.shape[0] != m:
        raise ValueError(f"y must have shape ({m},), got {y.shape}")

    rng = np.random.default_rng(seed)
    w = rng.normal(scale=0.01, size=n)  # small random initial weights
    b = 0.0
    history = []

    for ep in range(epochs):
        idx = rng.permutation(m)  # fresh shuffle every epoch
        for start in range(0, m, batch_size):
            ii = idx[start:start + batch_size]
            Xb, yb = X[ii], y[ii]
            p = sigmoid(Xb @ w + b)  # predicted probabilities for the batch
            error = p - yb
            # Gradient of the mean cross-entropy plus the L2 term on the weights.
            grad_w = (Xb.T @ error) / len(yb) + lambda_l2 * w
            grad_b = error.mean()
            w -= lr * grad_w
            b -= lr * grad_b

        # Track the loss on the full data once per epoch.
        p_all = sigmoid(X @ w + b)
        loss = cross_entropy(y, p_all)
        history.append(loss)

        if verbose and ((ep + 1) % log_every == 0 or ep == 0):
            pred = (p_all >= 0.5).astype(int)
            acc = (pred == y).mean()
            print(f"Epoch {ep+1:3d}/{epochs}  |  loss={loss:.4f}  |  acc={acc:.3f}")

    return w, b, history

In [48]:
from sklearn.model_selection import StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (accuracy_score,
                             precision_recall_fscore_support)
import numpy as np

def _to_design_matrix(features):
    """Return `features` as a dense array with a leading column of ones.

    ColumnTransformer returns a sparse matrix or a dense array depending on
    the density of its output, so `.toarray()` is only called when it exists.
    """
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    # NOTE(review): logistic_gd already learns a separate bias `b`, so this
    # constant column acts as a second (L2-regularised) intercept. It is kept
    # so the reported metrics stay comparable; consider removing it.
    return np.hstack([np.ones((dense.shape[0], 1)), dense])


# Outer loop: 3 stratified folds, each providing a held-out test set.
kf_outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
fold_results = []

for fold, (train_val_idx, test_idx) in enumerate(kf_outer.split(X, y), 1):
    X_train_val_raw, X_test_raw = X.iloc[train_val_idx], X.iloc[test_idx]
    y_train_val, y_test = y[train_val_idx], y[test_idx]

    # Inner split: hold out 20% of the non-test rows as a validation set.
    X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(
        X_train_val_raw, y_train_val,
        test_size=0.20, stratify=y_train_val, random_state=fold
    )

    # Fit the preprocessing on the training rows only, then apply the fitted
    # transformer to the validation and test rows.
    X_tr = _to_design_matrix(full_pipeline.fit_transform(X_tr_raw))
    X_val = _to_design_matrix(full_pipeline.transform(X_val_raw))
    X_tst = _to_design_matrix(full_pipeline.transform(X_test_raw))

    w, b, hist = logistic_gd(X_tr, y_tr)

    # Validation accuracy at a 0.5 decision threshold.
    val_probs = sigmoid(X_val @ w + b)
    y_val_pred = (val_probs >= 0.5).astype(int)
    val_acc = accuracy_score(y_val, y_val_pred)

    # Test metrics at the same threshold.
    tst_probs = sigmoid(X_tst @ w + b)
    y_tst_pred = (tst_probs >= 0.5).astype(int)

    tst_acc = accuracy_score(y_test, y_tst_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_tst_pred, average="binary", zero_division=0
    )

    fold_results.append({"fold": fold,
                         "val_acc": val_acc,
                         "test_acc": tst_acc,
                         "precision": prec,
                         "recall": rec,
                         "f1": f1})

    print(f"Fold {fold}:  val_acc={val_acc:.3f} | "
          f"test_acc={tst_acc:.3f}  prec={prec:.3f}  "
          f"rec={rec:.3f}  f1={f1:.3f}")



Epoch   1/250  |  loss=0.4328  |  acc=0.825
Epoch  20/250  |  loss=0.2501  |  acc=0.910
Epoch  40/250  |  loss=0.2292  |  acc=0.918
Epoch  60/250  |  loss=0.2194  |  acc=0.921
Epoch  80/250  |  loss=0.2142  |  acc=0.924
Epoch 100/250  |  loss=0.2107  |  acc=0.924
Epoch 120/250  |  loss=0.2083  |  acc=0.926
Epoch 140/250  |  loss=0.2064  |  acc=0.926
Epoch 160/250  |  loss=0.2052  |  acc=0.926
Epoch 180/250  |  loss=0.2040  |  acc=0.925
Epoch 200/250  |  loss=0.2032  |  acc=0.926
Epoch 220/250  |  loss=0.2025  |  acc=0.927
Epoch 240/250  |  loss=0.2019  |  acc=0.927
Fold 1:  val_acc=0.917 | test_acc=0.908  prec=0.898  rec=0.958  f1=0.927
Epoch   1/250  |  loss=0.4370  |  acc=0.818
Epoch  20/250  |  loss=0.2548  |  acc=0.906
Epoch  40/250  |  loss=0.2329  |  acc=0.914
Epoch  60/250  |  loss=0.2229  |  acc=0.920
Epoch  80/250  |  loss=0.2170  |  acc=0.922
Epoch 100/250  |  loss=0.2131  |  acc=0.921
Epoch 120/250  |  loss=0.2104  |  acc=0.923
Epoch 140/250  |  loss=0.2082  |  acc=0.924
Epo