In [188]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from data_process import process_data
from constants import numeric_features, categorical_features
from sklearn.linear_model import LogisticRegression
from ai_models.shared import load_processed_data
from ai_models.shared import load_train_with_validation_data


In [189]:
# Shared iteration budget: used as the default `epochs` of logistic_gd and as
# `max_iter` of the scikit-learn LogisticRegression baseline further down.
NUMBER_OF_EPOCHS = 250

In [190]:

# Keep only the two outcomes this binary model separates and encode them as
# y = 1 (Graduate) / y = 0 (Dropout). The frame is built in a single chain so
# the new column is added to a fresh frame instead of a boolean-filtered slice
# (which triggers pandas' SettingWithCopyWarning) and so that re-running the
# cell always starts again from the loaded data.
df = (
    load_processed_data()
    .loc[lambda frame: frame["Target"].isin(["Graduate", "Dropout"])]
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    # errors="ignore" tolerates a missing "Target encoded" column.
    .drop(columns=["Target", "Target encoded"], errors="ignore")
)

In [191]:
# Separate the binary label vector from the feature frame.
y = df["y"].to_numpy()
X = df.drop(columns="y")

In [192]:
# Narrow the configured feature lists down to the columns actually present in
# X, keeping the order in which they are listed in `constants`.
num_features, cat_features = (
    [name for name in candidates if name in X.columns]
    for candidates in (numeric_features, categorical_features)
)

In [193]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [194]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [195]:
# Route each column group through its own preprocessing branch. Columns that
# appear in neither list are not passed to either branch.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [196]:
# Split first, then fit the preprocessing on the training part only so that no
# statistics from the validation/test parts leak into the imputers or scaler.
# (load_train_with_validation_data is a project helper; its split proportions
# are not visible in this notebook.)
(
    X_train_raw, X_val_raw, X_test_raw,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

X_train = full_pipeline.fit_transform(X_train_raw)
X_test, X_val = (
    full_pipeline.transform(part) for part in (X_test_raw, X_val_raw)
)

In [197]:
def _with_bias_column(features):
    """Return ``features`` as a dense 2-D array with a leading column of ones.

    ColumnTransformer returns either a scipy sparse matrix or a dense ndarray
    depending on the density of its output, so ``.toarray()`` is only called
    when the object actually provides it; a dense input is used as-is.

    Args:
        features: 2-D sparse matrix or array-like of shape (n_samples, n_features).

    Returns:
        Dense ndarray of shape (n_samples, n_features + 1); column 0 is all ones.
    """
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    return np.hstack([np.ones((dense.shape[0], 1)), dense])


# NOTE(review): logistic_gd also learns a separate scalar intercept `b`, so the
# ones column added here acts as a second intercept whose weight is subject to
# the L2 penalty. Kept unchanged so that array shapes stay the same for any
# later cells; consider dropping one of the two intercepts.
X_train_bias = _with_bias_column(X_train)
X_test_bias = _with_bias_column(X_test)
X_val_bias = _with_bias_column(X_val)

In [198]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive form evaluates ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) for large negative ``z``. Here only ``exp(-|z|)`` is
    evaluated, which always lies in (0, 1]:

    * ``z >= 0``: ``1 / (1 + exp(-z))``  (same expression as the naive form)
    * ``z <  0``: ``exp(z) / (1 + exp(z))``  (algebraically identical)

    Args:
        z: Scalar or array-like of logits.

    Returns:
        Probabilities in [0, 1] with the same shape as ``z`` (a NumPy scalar
        for scalar input).
    """
    decay = np.exp(-np.abs(z))  # exp(-|z|) never overflows
    numerator = np.where(np.asarray(z) >= 0, 1.0, decay)
    return numerator / (1.0 + decay)

$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$

In [199]:
def cross_entropy(y_true, y_prob):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y_true: Array of 0/1 labels.
        y_prob: Array of predicted probabilities for the positive class.

    Returns:
        The average negative log-likelihood over all samples.
    """
    tiny = 1e-15
    # Keep probabilities strictly inside (0, 1) so neither log() sees zero.
    p = np.clip(y_prob, tiny, 1 - tiny)
    log_likelihood = y_true * np.log(p) + (1 - y_true) * np.log(1 - p)
    return -np.mean(log_likelihood)

$$
\mathcal{L} = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(\hat{y}^{(i)}) + (1 - y^{(i)}) \log(1 - \hat{y}^{(i)}) \right]
$$

In [200]:
def logistic_gd(X, y, lr=0.05, epochs=NUMBER_OF_EPOCHS, batch_size=64, lambda_l2=0.001, seed=0, log_every=20):
    """Fit binary logistic regression with mini-batch gradient descent.

    Each epoch shuffles the samples, walks over them in mini-batches and takes
    one gradient step per batch. The L2 penalty is applied to the weight
    vector ``w`` only; the scalar intercept ``b`` is not penalised.

    Args:
        X: Feature matrix of shape (m, n); a dense array-like or an object
            exposing ``toarray()`` (e.g. a scipy sparse matrix).
        y: Binary labels (0/1), length m. Converted to a flat NumPy array so
            that pandas Series and column vectors are indexed positionally.
        lr: Learning rate.
        epochs: Number of passes over the data.
        batch_size: Samples per gradient step (the last batch may be smaller).
        lambda_l2: L2 regularisation strength on ``w``.
        seed: Seed for weight initialisation and per-epoch shuffling.
        log_every: Progress is printed after the first epoch and after every
            ``log_every``-th epoch; pass ``0`` or ``None`` to print nothing.

    Returns:
        Tuple ``(w, b, history)`` where ``w`` has shape (n,), ``b`` is the
        intercept and ``history`` holds one full-data cross-entropy value per
        epoch (the L2 term is not included in the recorded loss).

    Raises:
        ValueError: If ``X`` is not 2-D, is empty, does not match ``y`` in
            length, or if ``batch_size`` is smaller than 1.
    """
    X = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    y = np.asarray(y).ravel()
    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got ndim={X.ndim}")
    m, n = X.shape
    if m == 0:
        raise ValueError("X must contain at least one sample")
    if y.shape[0] != m:
        raise ValueError(f"X has {m} samples but y has {y.shape[0]}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    rng = np.random.default_rng(seed)
    w = rng.normal(scale=0.01, size=n)  # small random start
    b = 0.0
    history = []

    for ep in range(epochs):
        order = rng.permutation(m)  # fresh shuffle every epoch
        for start in range(0, m, batch_size):
            batch = order[start:start + batch_size]
            Xb, yb = X[batch], y[batch]
            error = sigmoid(Xb @ w + b) - yb  # predicted probability minus label
            # Gradient of the mean cross-entropy plus the L2 term on w.
            grad_w = (Xb.T @ error) / len(yb) + lambda_l2 * w
            grad_b = error.mean()
            w -= lr * grad_w
            b -= lr * grad_b

        # Track the loss on the full data set after each epoch.
        p_all = sigmoid(X @ w + b)
        loss = cross_entropy(y, p_all)
        history.append(loss)

        if log_every and ((ep + 1) % log_every == 0 or ep == 0):
            pred = (p_all >= 0.5).astype(int)
            acc = (pred == y).mean()
            print(f"Epoch {ep+1:3d}/{epochs}  |  loss={loss:.4f}  |  acc={acc:.3f}")

    return w, b, history

In [201]:
# Train the from-scratch model with logistic_gd's default hyperparameters.
# NOTE(review): X_train_bias already contains a column of ones while
# logistic_gd also learns a separate scalar intercept, so the fitted model
# carries two intercept terms.
w, b, loss_hist = logistic_gd(X_train_bias, y_train)


Epoch   1/250  |  loss=0.4196  |  acc=0.831
Epoch  20/250  |  loss=0.2559  |  acc=0.908
Epoch  40/250  |  loss=0.2389  |  acc=0.913
Epoch  60/250  |  loss=0.2320  |  acc=0.915
Epoch  80/250  |  loss=0.2282  |  acc=0.917
Epoch 100/250  |  loss=0.2258  |  acc=0.917
Epoch 120/250  |  loss=0.2242  |  acc=0.916
Epoch 140/250  |  loss=0.2231  |  acc=0.918
Epoch 160/250  |  loss=0.2222  |  acc=0.918
Epoch 180/250  |  loss=0.2216  |  acc=0.919
Epoch 200/250  |  loss=0.2210  |  acc=0.920
Epoch 220/250  |  loss=0.2206  |  acc=0.919
Epoch 240/250  |  loss=0.2203  |  acc=0.921


In [202]:
def evaluate(X_set, y_set, label="SET", weights=None, bias=None):
    """Print accuracy, F1 and ROC-AUC of a logistic model on one data split.

    Args:
        X_set: Feature matrix whose column count matches ``weights``.
        y_set: True binary labels for ``X_set``.
        label: Name of the split, used in the printed header.
        weights: Weight vector to score with. Defaults to the notebook-level
            ``w`` so existing three-argument calls keep working.
        bias: Intercept to score with. Defaults to the notebook-level ``b``.
    """
    # Fall back to the globally trained parameters only when none are given,
    # so the function can also score any other (w, b) pair explicitly.
    if weights is None:
        weights = w
    if bias is None:
        bias = b

    probability = sigmoid(X_set @ weights + bias)
    pred = (probability >= 0.5).astype(int)  # 0.5 decision threshold
    print(f"\n{label} METRICS")
    print("Accuracy :", accuracy_score(y_set, pred))
    print("F1-score :", f1_score(y_set, pred))
    print("AUC-ROC  :", roc_auc_score(y_set, probability))

In [203]:
# Report the from-scratch model on every split, in train/test/validation order.
for split_name, split_X, split_y in (
    ("TRAIN", X_train_bias, y_train),
    ("TEST", X_test_bias, y_test),
    ("VALIDATION", X_val_bias, y_val),
):
    evaluate(split_X, split_y, split_name)


TRAIN METRICS
Accuracy : 0.9208972845336482
F1-score : 0.9374805598755832
AUC-ROC  : 0.9614175279552455

TEST METRICS
Accuracy : 0.9192660550458716
F1-score : 0.9345238095238095
AUC-ROC  : 0.9614195796335843

VALIDATION METRICS
Accuracy : 0.9117647058823529
F1-score : 0.9300291545189504
AUC-ROC  : 0.9599546870574909


In [204]:
# Reference model: scikit-learn's L2-penalised logistic regression. max_iter
# reuses NUMBER_OF_EPOCHS as the solver's iteration cap; no regularisation
# strength is passed, so scikit-learn's default applies rather than lambda_l2.
model_sklearn = LogisticRegression(penalty="l2", max_iter=NUMBER_OF_EPOCHS)

In [205]:
# Fit the baseline on the preprocessed training matrix (no manual ones
# column), then keep hard predictions for the train and test splits.
model_sklearn.fit(X_train, y_train)
y_pred_train, y_pred_test = (
    model_sklearn.predict(part) for part in (X_train, X_test)
)

In [206]:
def evaluate_sklearn(model, X_set, y_set, label="SET"):
    """Print accuracy, F1 and ROC-AUC of a fitted classifier on one split.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_set: Feature matrix accepted by ``model``.
        y_set: True binary labels for ``X_set``.
        label: Name of the split, used in the printed header.
    """
    positive_scores = model.predict_proba(X_set)[:, 1]  # second predict_proba column
    hard_labels = model.predict(X_set)

    print(f"\n{label} METRICS")
    # Each metric is computed right before it is printed, one line per metric.
    report = (
        ("Accuracy :", accuracy_score, hard_labels),
        ("F1-score :", f1_score, hard_labels),
        ("AUC-ROC  :", roc_auc_score, positive_scores),
    )
    for caption, metric, estimate in report:
        print(caption, metric(y_set, estimate))

# Report the scikit-learn baseline on every split, in the same order as the
# from-scratch model so the two printouts can be compared side by side.
for split_name, split_X, split_y in (
    ("TRAIN", X_train, y_train),
    ("TEST", X_test, y_test),
    ("VALIDATION", X_val, y_val),
):
    evaluate_sklearn(model_sklearn, split_X, split_y, split_name)


TRAIN METRICS
Accuracy : 0.9208972845336482
F1-score : 0.9372463315641586
AUC-ROC  : 0.9635510488508555

TEST METRICS
Accuracy : 0.9155963302752294
F1-score : 0.9313432835820895
AUC-ROC  : 0.9615460267505902

VALIDATION METRICS
Accuracy : 0.90625
F1-score : 0.9253294289897511
AUC-ROC  : 0.960096290002832
