In [1]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from sklearn.linear_model import LogisticRegression
from part2.shared import load_processed_data
from part2.shared import load_train_with_validation_data


In [2]:
# Default number of passes over the training set for the hand-written
# gradient descent (logistic_gd); also reused as max_iter for the
# scikit-learn LogisticRegression baseline further down.
NUMBER_OF_EPOCHS = 250

In [3]:

# Load the processed table and keep only rows whose Target is "Graduate" or
# "Dropout", turning the task into a binary classification problem.
df = load_processed_data()

# .copy() detaches the filtered rows from the loaded frame, so the column
# assignment below writes to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning / ambiguous view-vs-copy behaviour).
df = df[df["Target"].isin(["Graduate", "Dropout"])].copy()

# Binary label: 1 for "Graduate", 0 for "Dropout".
df["y"] = (df["Target"] == "Graduate").astype(int)

# Drop the original label columns so they cannot leak into the features;
# errors='ignore' tolerates "Target encoded" being absent.
df = df.drop(columns=["Target", "Target encoded"], errors='ignore')

In [4]:
# Separate the binary label vector from the feature table.
y = df["y"].to_numpy()
X = df.drop(columns="y")

In [5]:
def _columns_present(candidates):
    """Return the names from ``candidates`` that are columns of ``X``, in order."""
    return [name for name in candidates if name in X.columns]


# Restrict the configured feature lists to the columns that actually exist in X.
num_features = _columns_present(numeric_features)
cat_features = _columns_present(categorical_features)

In [6]:
# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [7]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were not
# seen during fit encode as all zeros instead of raising at transform time.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [8]:
# Route each column group through its own preprocessing pipeline and
# concatenate the results into a single feature matrix.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [9]:
# The loader hands back six objects: train / validation / test features
# followed by the matching label arrays.
(X_train_raw, X_val_raw, X_test_raw,
 y_train, y_val, y_test) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training part only, then apply the fitted
# transformer unchanged to the held-out parts so no statistics leak from them.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val, X_test = (full_pipeline.transform(part) for part in (X_val_raw, X_test_raw))

In [10]:
def _add_bias_column(features):
    """Return ``features`` as a dense array with a leading column of ones.

    Accepts both sparse matrices (anything exposing ``.toarray()``) and dense
    array-likes: ColumnTransformer only returns a sparse matrix when the
    stacked output is sparse enough, so calling ``.toarray()`` unconditionally
    would raise AttributeError on a dense result.
    """
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    return np.hstack([np.ones((dense.shape[0], 1)), dense])


# NOTE(review): logistic_gd also learns its own scalar bias ``b``, so this
# ones-column acts as a second intercept whose weight is L2-regularised.
X_train_bias = _add_bias_column(X_train)
X_test_bias = _add_bias_column(X_test)
X_val_bias = _add_bias_column(X_val)

In [11]:
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    The naive formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. Here only ``exp(-|z|)`` is computed, which never exceeds 1,
    and the algebraically equivalent form ``exp(z) / (1 + exp(z))`` is used
    for negative inputs.

    Parameters
    ----------
    z : array-like or scalar
        Logits.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Element-wise probabilities with the same shape as ``z``.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # bounded by 1, so it cannot overflow
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps 0-d results to a scalar (as the
    # plain formula would return) and leaves n-d arrays unchanged.
    return out[()]

$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$

In [12]:
def cross_entropy(y_true, y_prob):
    """Average binary cross-entropy loss.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_prob : array-like
        Predicted probabilities of the positive class.

    Returns
    -------
    float
        Mean of ``-[y*log(p) + (1-y)*log(1-p)]`` over all elements.
    """
    # Coerce to arrays so plain Python lists work too; ``1 - y_true`` would
    # otherwise raise TypeError for a list.
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    eps = 1e-15
    y_prob = np.clip(y_prob, eps, 1 - eps)  # keep probabilities away from 0 and 1 to avoid log(0)
    return -np.mean(y_true * np.log(y_prob) + (1 - y_true) * np.log(1 - y_prob))

$$
\mathcal{L} = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log(\hat{y}^{(i)}) + (1 - y^{(i)}) \log(1 - \hat{y}^{(i)}) \right]
$$

In [13]:
def logistic_gd(X, y, lr=0.05, epochs=NUMBER_OF_EPOCHS, batch_size=64, lambda_l2=0.001, seed=0):
    """Fit a binary logistic regression with mini-batch gradient descent.

    Weights start as small Gaussian noise (std 0.01) and the bias at zero.
    Every epoch the rows are reshuffled and visited in batches of
    ``batch_size``; after the epoch the loss on the full ``X`` is recorded.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Binary labels aligned with the rows of ``X``.
    lr : float
        Learning rate.
    epochs : int
        Number of passes over the data.
    batch_size : int
        Rows per gradient step (the last batch of an epoch may be smaller).
    lambda_l2 : float
        L2 penalty strength; applied to ``w`` only, never to ``b``.
    seed : int
        Seed for both the weight initialisation and the shuffling.

    Returns
    -------
    tuple
        ``(w, b, history)``: weight vector, scalar bias and the list of
        per-epoch cross-entropy losses on the full data.
    """
    n_samples, n_features = X.shape
    rng = np.random.default_rng(seed)
    w = rng.normal(scale=0.01, size=n_features)
    b = 0.0
    history = []

    for epoch in range(1, epochs + 1):
        shuffled = rng.permutation(n_samples)
        for lo in range(0, n_samples, batch_size):
            batch = shuffled[lo:lo + batch_size]
            X_batch = X[batch]
            y_batch = y[batch]

            # Gradient of the mean cross-entropy w.r.t. the logits is (p - y).
            residual = sigmoid(X_batch @ w + b) - y_batch
            step_w = (X_batch.T @ residual) / len(y_batch) + lambda_l2 * w
            step_b = residual.mean()

            w -= lr * step_w
            b -= lr * step_b

        # Track the loss on the whole data set once per epoch.
        probs = sigmoid(X @ w + b)
        epoch_loss = cross_entropy(y, probs)
        history.append(epoch_loss)

        # Progress line on the first epoch and then on every 20th.
        if epoch == 1 or epoch % 20 == 0:
            hard_labels = (probs >= 0.5).astype(int)
            accuracy = (hard_labels == y).mean()
            print(f"Epoch {epoch:3d}/{epochs}  |  loss={epoch_loss:.4f}  |  acc={accuracy:.3f}")

    return w, b, history

In [14]:
# Train the hand-written model on the bias-augmented training matrix,
# leaving every hyper-parameter at its default.
w, b, loss_hist = logistic_gd(X=X_train_bias, y=y_train)


Epoch   1/250  |  loss=0.4189  |  acc=0.832
Epoch  20/250  |  loss=0.2558  |  acc=0.907
Epoch  40/250  |  loss=0.2390  |  acc=0.914
Epoch  60/250  |  loss=0.2320  |  acc=0.915
Epoch  80/250  |  loss=0.2283  |  acc=0.917
Epoch 100/250  |  loss=0.2259  |  acc=0.917
Epoch 120/250  |  loss=0.2243  |  acc=0.916
Epoch 140/250  |  loss=0.2231  |  acc=0.917
Epoch 160/250  |  loss=0.2223  |  acc=0.919
Epoch 180/250  |  loss=0.2216  |  acc=0.919
Epoch 200/250  |  loss=0.2211  |  acc=0.920
Epoch 220/250  |  loss=0.2207  |  acc=0.917
Epoch 240/250  |  loss=0.2204  |  acc=0.920


In [15]:
def evaluate(X_set, y_set, label="SET"):
    """Print accuracy, F1 and ROC-AUC of the hand-written model on one split.

    Predictions come from the module-level ``w`` and ``b`` produced by
    ``logistic_gd``; a sample is labelled positive when its predicted
    probability is at least 0.5.

    Parameters
    ----------
    X_set : numpy.ndarray
        Feature matrix laid out like the one used for training.
    y_set : array-like
        True binary labels for ``X_set``.
    label : str
        Split name shown in the report header.
    """
    scores = sigmoid(X_set @ w + b)
    hard_labels = (scores >= 0.5).astype(int)

    print(f"\n{label} METRICS")
    # AUC is computed from the raw probabilities, the other two from hard labels.
    for caption, metric, estimate in (
        ("Accuracy :", accuracy_score, hard_labels),
        ("F1-score :", f1_score, hard_labels),
        ("AUC :", roc_auc_score, scores),
    ):
        print(caption, metric(y_set, estimate))

In [16]:
# Report the hand-written model on every split, in the same order as before.
for split_name, split_X, split_y in (
    ("TRAIN", X_train_bias, y_train),
    ("TEST", X_test_bias, y_test),
    ("VALIDATION", X_val_bias, y_val),
):
    evaluate(split_X, split_y, split_name)


TRAIN METRICS
Accuracy : 0.9177165354330709
F1-score : 0.9346262120738192
AUC : 0.9614695340501791

TEST METRICS
Accuracy : 0.9202200825309491
F1-score : 0.9358407079646017
AUC : 0.9606635297109974

VALIDATION METRICS
Accuracy : 0.9146005509641874
F1-score : 0.9309576837416481
AUC : 0.9616717057193247


In [17]:
# Reference model: scikit-learn's L2-penalised logistic regression, with the
# solver's iteration cap set to the same number used for the manual training.
model_sklearn = LogisticRegression(max_iter=NUMBER_OF_EPOCHS, penalty="l2")

In [18]:
# Fit the baseline on the preprocessed training matrix (without the manual
# bias column), then store hard predictions for the train and test splits.
model_sklearn.fit(X_train, y_train)
y_pred_train, y_pred_test = (model_sklearn.predict(part) for part in (X_train, X_test))

In [19]:
def evaluate_sklearn(model, X_set, y_set, label="SET"):
    """Print accuracy, F1 and ROC-AUC of a fitted classifier on one split.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_set : array-like or sparse matrix
        Features in the layout the model was trained on.
    y_set : array-like
        True binary labels for ``X_set``.
    label : str
        Split name shown in the report header.
    """
    positive_scores = model.predict_proba(X_set)[:, 1]  # column 1 is the positive class
    hard_labels = model.predict(X_set)

    print(f"\n{label} METRICS")
    # AUC is computed from the probabilities, the other two from hard labels.
    for caption, metric, estimate in (
        ("Accuracy :", accuracy_score, hard_labels),
        ("F1-score :", f1_score, hard_labels),
        ("AUC  :", roc_auc_score, positive_scores),
    ):
        print(caption, metric(y_set, estimate))

# Report the scikit-learn baseline on every split, in the same order as before.
for split_name, split_X, split_y in (
    ("TRAIN", X_train, y_train),
    ("TEST", X_test, y_test),
    ("VALIDATION", X_val, y_val),
):
    evaluate_sklearn(model_sklearn, split_X, split_y, split_name)


TRAIN METRICS
Accuracy : 0.9204724409448819
F1-score : 0.936875
AUC  : 0.9635490387748452

TEST METRICS
Accuracy : 0.9119669876203577
F1-score : 0.9292035398230089
AUC  : 0.9607589101198615

VALIDATION METRICS
Accuracy : 0.9118457300275482
F1-score : 0.9288888888888889
AUC  : 0.9625220458553791
