In [26]:
import numpy as np
import pandas as pd
from constants import numeric_features, categorical_features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from data_process import process_data
from sklearn.linear_model import LogisticRegression

In [27]:
df = pd.read_csv("../data/students.csv", sep=";")
df = process_data(df)

In [28]:
df = df[df["Target"].isin(["Graduate", "Dropout"])]
df["y"] = (df["Target"] == "Graduate").astype(float)
df = df.drop(columns=["Target", "Target encoded"], errors='ignore')

In [29]:
num_cols = [c for c in numeric_features if c in df.columns]
X_num = df[num_cols].copy().fillna(df[num_cols].mean())
X_num = (X_num - X_num.mean()) / X_num.std(ddof=0)

In [30]:
cat_cols = [c for c in categorical_features if c in df.columns]
X_cat = pd.get_dummies(df[cat_cols], drop_first=True)

In [31]:
X = pd.concat([X_num, X_cat], axis=1).astype(np.float64).reset_index(drop=True)
y = df["y"].reset_index(drop=True).to_numpy(np.float64)
X = X.to_numpy()

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=0)

In [33]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-z))

def cross_entropy(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    eps = 1e-15
    y_prob = np.clip(y_prob, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

In [34]:
def logistic_gd(
    X_mat, y_vec,
    lr=0.05,
    epochs=250,
    batch_size=64,
    lambda_l2=0.001,
    seed=0,
):
    m, n = X_mat.shape
    rng = np.random.default_rng(seed)
    w = rng.normal(scale=0.01, size=n)
    b = 0.0
    history = []

    for ep in range(epochs):
        idx = rng.permutation(m)
        for start in range(0, m, batch_size):
            ii = idx[start:start + batch_size]
            Xb, yb = X_mat[ii], y_vec[ii]

            p = sigmoid(Xb @ w + b)
            error = p - yb

            grad_w = (Xb.T @ error) / len(yb) + lambda_l2 * w
            grad_b = error.mean()

            w -= lr * grad_w
            b -= lr * grad_b

        p_all = sigmoid(X_mat @ w + b)
        loss  = cross_entropy(y_vec, p_all) + (lambda_l2 / 2) * np.sum(w ** 2)
        history.append(loss)

        if (ep + 1) % 20 == 0 or ep == 0:
            pred = (p_all >= 0.5).astype(int)
            acc  = (pred == y_vec).mean()
            print(f"Epoch {ep+1:3d}/{epochs}  |  loss={loss:.4f}  |  acc={acc:.3f}")

    return w, b, history

In [35]:
w, b, loss_hist = logistic_gd(
    X_train, y_train,
    lr=0.05,
    epochs=250,
    batch_size=128,
    lambda_l2=0.001,
)

Epoch   1/250  |  loss=0.4721  |  acc=0.845
Epoch  20/250  |  loss=0.2825  |  acc=0.895
Epoch  40/250  |  loss=0.2730  |  acc=0.898
Epoch  60/250  |  loss=0.2481  |  acc=0.913
Epoch  80/250  |  loss=0.2521  |  acc=0.909
Epoch 100/250  |  loss=0.2387  |  acc=0.919
Epoch 120/250  |  loss=0.2403  |  acc=0.917
Epoch 140/250  |  loss=0.2373  |  acc=0.919
Epoch 160/250  |  loss=0.2334  |  acc=0.921
Epoch 180/250  |  loss=0.2324  |  acc=0.923
Epoch 200/250  |  loss=0.2325  |  acc=0.920
Epoch 220/250  |  loss=0.2339  |  acc=0.920
Epoch 240/250  |  loss=0.2345  |  acc=0.914


In [36]:
def evaluate(X_set, y_set, label="SET"):
    proba = sigmoid(X_set @ w + b)
    pred  = (proba >= 0.5).astype(int)
    print(f"\n{label} METRICS")
    print("-" * 30)
    print("Accuracy :", accuracy_score(y_set, pred))
    print("F1-score :", f1_score(y_set, pred))
    print("AUC-ROC  :", roc_auc_score(y_set, proba))

In [37]:
evaluate(X_train, y_train, "TRAIN")
evaluate(X_test,  y_test,  "TEST")


TRAIN METRICS
------------------------------
Accuracy : 0.9194214876033058
F1-score : 0.9368932038834952
AUC-ROC  : 0.9600025683410159

TEST METRICS
------------------------------
Accuracy : 0.9118457300275482
F1-score : 0.9302832244008714
AUC-ROC  : 0.9598336626091389


In [38]:
model_sklearn = LogisticRegression(
    penalty='l2',
    C=1 / 0.001,
    max_iter=250,
    solver='lbfgs'
)

In [39]:
model_sklearn.fit(X_train, y_train)
y_pred_train = model_sklearn.predict(X_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
print("\nSKLEARN TRAIN METRICS")
print("-" * 30)
print("Accuracy :", accuracy_score(y_train, y_pred_train))
print("F1-score :", f1_score(y_train, y_pred_train))
print("AUC-ROC  :", roc_auc_score(y_train, model_sklearn.predict_proba(X_train)[:,1]))



SKLEARN TRAIN METRICS
------------------------------
Accuracy : 0.9294077134986226
F1-score : 0.9435728048444811
AUC-ROC  : 0.9665179915772352


In [41]:
y_pred_test  = model_sklearn.predict(X_test)
print("\nSKLEARN TEST METRICS")
print("-" * 30)
print("Accuracy :", accuracy_score(y_test, y_pred_test))
print("F1-score :", f1_score(y_test, y_pred_test))
print("AUC-ROC  :", roc_auc_score(y_test, model_sklearn.predict_proba(X_test)[:,1]))


SKLEARN TEST METRICS
------------------------------
Accuracy : 0.9022038567493113
F1-score : 0.9206703910614525
AUC-ROC  : 0.9532534573959595
