In [209]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from sklearn.linear_model import LogisticRegression
from part2.shared import load_processed_data
from part2.shared import load_train_with_validation_data


In [210]:
# Epoch count passed as `epochs=` to every CustomLogisticRegression trained in
# this notebook, so all runs share the same training length.
NUMBER_OF_EPOCHS = 250

In [211]:

# Load the processed dataset and keep only the rows whose outcome is one of the
# two classes modelled here ("Graduate" / "Dropout").
df = load_processed_data()
# .copy() makes the filtered frame independent of the loaded one, so adding the
# "y" column below is a plain assignment instead of a write into a slice
# (which raises SettingWithCopyWarning on pandas without copy-on-write).
df = df[df["Target"].isin(["Graduate", "Dropout"])].copy()
# Binary label: 1 for "Graduate", 0 for "Dropout".
df["y"] = (df["Target"] == "Graduate").astype(int)
# Drop the original label columns so they cannot end up among the features;
# errors="ignore" tolerates "Target encoded" being absent.
df = df.drop(columns=["Target", "Target encoded"], errors="ignore")

In [212]:
# Separate the label vector (as a NumPy array) from the feature frame.
y = df["y"].values
X = df.drop(columns="y")

In [213]:
def _present_columns(candidates, frame):
    """Return the entries of `candidates` that are column labels of `frame`, in their original order."""
    return [name for name in candidates if name in frame.columns]


# Keep only the configured feature names that actually exist in X.
num_features = _present_columns(numeric_features, X)
cat_features = _present_columns(categorical_features, X)

In [214]:
# Numeric branch: mean-impute missing values, then apply StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [215]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" keeps transform() from failing on
# categories that were not present when the encoder was fitted.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [216]:
# Route each column group through its own preprocessing branch. Columns that
# are in neither list are not passed to either branch.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [217]:
# The loader's result is unpacked as train / validation / test features
# followed by the matching label arrays.
(
    X_train_raw,
    X_val_raw,
    X_test_raw,
    y_train,
    y_val,
    y_test,
) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training rows only; the validation and test rows
# are transformed with the already-fitted pipeline.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(X_test_raw)

In [218]:
def _with_bias_column(features):
    """Return `features` as a dense array with a leading column of ones.

    ColumnTransformer returns a sparse matrix only when the stacked output is
    sparse enough (its `sparse_threshold`); otherwise it returns a dense
    ndarray, which has no `.toarray()`. Both cases are handled here.

    Parameters
    ----------
    features : scipy sparse matrix or array-like, 2-D
        Transformed feature matrix.

    Returns
    -------
    numpy.ndarray
        Dense matrix with one extra first column filled with 1.0 (the
        intercept term).
    """
    dense = features.toarray() if hasattr(features, "toarray") else np.asarray(features)
    return np.hstack([np.ones((dense.shape[0], 1)), dense])


X_train_bias = _with_bias_column(X_train)
X_test_bias = _with_bias_column(X_test)
X_val_bias = _with_bias_column(X_val)

In [219]:
from Logistic import CustomLogisticRegression
from sklearn.metrics import classification_report
import numpy as np

# Baseline: custom logistic regression trained on the training split as-is
# (no resampling).
# NOTE(review): this run uses batch_size=256 while the SMOTE and undersampling
# runs further down use 64 - confirm the difference is intentional, since it
# changes more than the sampling strategy between the compared models.
baseline_params = dict(lr=0.05, epochs=NUMBER_OF_EPOCHS, batch_size=256)
baseline_clf = CustomLogisticRegression(**baseline_params)

# NOTE(review): the test split is passed to fit() together with the validation
# split - confirm it is only used for monitoring and not for fitting.
baseline_clf.fit(X_train_bias, y_train, X_val_bias, y_val, X_test_bias, y_test)

# Per-class precision / recall / F1 on the held-out test split.
y_pred = baseline_clf.predict(X_test_bias)
baseline_report = classification_report(y_test, y_pred, digits=4)
print(baseline_report)


              precision    recall  f1-score   support

           0     0.9254    0.8732    0.8986       284
           1     0.9216    0.9549    0.9379       443

    accuracy                         0.9230       727
   macro avg     0.9235    0.9140    0.9182       727
weighted avg     0.9231    0.9230    0.9225       727



In [220]:
# Class balance of the training labels before any resampling.
unique, counts = np.unique(y_train, return_counts=True)
class_sizes = {label: count for label, count in zip(unique, counts)}
print("Liczebność klas:", class_sizes)

Liczebność klas: {np.int64(0): np.int64(990), np.int64(1): np.int64(1550)}


In [221]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the seed is fixed so the synthetic
# rows are the same on every run.
# NOTE(review): the resampled matrix contains the bias column and the one-hot
# encoded categorical columns; SMOTE presumably interpolates those as well, so
# synthetic rows may hold fractional one-hot values - confirm this is
# acceptable or consider SMOTENC.
smote = SMOTE(random_state=1)
X_train_os, y_train_os = smote.fit_resample(X_train_bias, y_train)

# Class balance after oversampling.
unique_os, counts_os = np.unique(y_train_os, return_counts=True)
class_sizes_os = {label: count for label, count in zip(unique_os, counts_os)}
print("Liczebność klas (po SMOTE):", class_sizes_os)


Liczebność klas (po SMOTE): {np.int64(0): np.int64(1550), np.int64(1): np.int64(1550)}


In [222]:
from Logistic import CustomLogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

# Same model class as the baseline, trained on the SMOTE-resampled training
# data; validation and test splits are left unresampled.
os_params = {
    "lr": 0.05,
    "epochs": NUMBER_OF_EPOCHS,
    "batch_size": 64,
    "verbose": True,
}
model_os = CustomLogisticRegression(**os_params)
model_os.fit(
    X_train_os, y_train_os,
    X_val_bias, y_val,
    X_test_bias, y_test,
)

# Per-class metrics on the untouched test split.
y_pred_os = model_os.predict(X_test_bias)
report_os = classification_report(y_test, y_pred_os, digits=4)
print(report_os)


Epoch   1/250  train_loss=0.4197  val_loss=0.3914
Epoch  20/250  train_loss=0.2598  val_loss=0.2452
Epoch  40/250  train_loss=0.2421  val_loss=0.2334
Epoch  60/250  train_loss=0.2342  val_loss=0.2231
Epoch  80/250  train_loss=0.2304  val_loss=0.2152
Epoch 100/250  train_loss=0.2273  val_loss=0.2146
Epoch 120/250  train_loss=0.2256  val_loss=0.2114
Epoch 140/250  train_loss=0.2242  val_loss=0.2120
Epoch 160/250  train_loss=0.2232  val_loss=0.2116
Epoch 180/250  train_loss=0.2226  val_loss=0.2121
Epoch 200/250  train_loss=0.2219  val_loss=0.2109
Epoch 220/250  train_loss=0.2214  val_loss=0.2098
Epoch 240/250  train_loss=0.2210  val_loss=0.2109
              precision    recall  f1-score   support

           0     0.8905    0.8873    0.8889       284
           1     0.9279    0.9300    0.9290       443

    accuracy                         0.9133       727
   macro avg     0.9092    0.9087    0.9089       727
weighted avg     0.9133    0.9133    0.9133       727



In [223]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training split; the seed is fixed so the same rows
# are kept on every run.
rus = RandomUnderSampler(random_state=1)
X_train_us, y_train_us = rus.fit_resample(X_train_bias, y_train)

# Class balance after undersampling.
unique_us, counts_us = np.unique(y_train_us, return_counts=True)
class_sizes_us = {label: count for label, count in zip(unique_us, counts_us)}
print("Liczebność klas (po undersamplingu):", class_sizes_us)


Liczebność klas (po undersamplingu): {np.int64(0): np.int64(990), np.int64(1): np.int64(990)}


In [224]:
# Same model class and hyperparameters as the SMOTE run, trained on the
# undersampled training data; validation and test splits are left unresampled.
us_params = {
    "lr": 0.05,
    "epochs": NUMBER_OF_EPOCHS,
    "batch_size": 64,
    "verbose": True,
}
model_us = CustomLogisticRegression(**us_params)
model_us.fit(
    X_train_us, y_train_us,
    X_val_bias, y_val,
    X_test_bias, y_test,
)

# Per-class metrics on the untouched test split.
y_pred_us = model_us.predict(X_test_bias)
report_us = classification_report(y_test, y_pred_us, digits=4)
print(report_us)



Epoch   1/250  train_loss=0.4580  val_loss=0.4381
Epoch  20/250  train_loss=0.2787  val_loss=0.2635
Epoch  40/250  train_loss=0.2588  val_loss=0.2465
Epoch  60/250  train_loss=0.2500  val_loss=0.2366
Epoch  80/250  train_loss=0.2450  val_loss=0.2304
Epoch 100/250  train_loss=0.2418  val_loss=0.2269
Epoch 120/250  train_loss=0.2395  val_loss=0.2236
Epoch 140/250  train_loss=0.2378  val_loss=0.2231
Epoch 160/250  train_loss=0.2366  val_loss=0.2217
Epoch 180/250  train_loss=0.2355  val_loss=0.2201
Epoch 200/250  train_loss=0.2347  val_loss=0.2190
Epoch 220/250  train_loss=0.2340  val_loss=0.2189
Epoch 240/250  train_loss=0.2334  val_loss=0.2192
              precision    recall  f1-score   support

           0     0.8832    0.9049    0.8939       284
           1     0.9381    0.9233    0.9306       443

    accuracy                         0.9161       727
   macro avg     0.9106    0.9141    0.9123       727
weighted avg     0.9166    0.9161    0.9163       727



In [225]:
from sklearn.metrics import recall_score

import pandas as pd

# Predict once per model and reuse the result for every metric; previously each
# model's predict() ran three times on the same test matrix.
# (Assumes predict() is deterministic for a fitted model - TODO confirm.)
test_predictions = {
    "Oryginalne dane": baseline_clf.predict(X_test_bias),
    "SMOTE": model_os.predict(X_test_bias),
    "Undersampling": model_us.predict(X_test_bias),
}

# One row per model. Recall and F1 are reported for class 0 (pos_label=0),
# which is the smaller class in the training split.
results_summary = {
    "Model": list(test_predictions),
    "Accuracy": [
        accuracy_score(y_test, predicted)
        for predicted in test_predictions.values()
    ],
    "Recall (klasa 0)": [
        recall_score(y_test, predicted, pos_label=0)
        for predicted in test_predictions.values()
    ],
    "F1 (klasa 0)": [
        f1_score(y_test, predicted, pos_label=0)
        for predicted in test_predictions.values()
    ],
}

summary_df = pd.DataFrame(results_summary)
print(summary_df)


             Model  Accuracy  Recall (klasa 0)  F1 (klasa 0)
0  Oryginalne dane  0.922971          0.873239      0.898551
1            SMOTE  0.913343          0.887324      0.888889
2    Undersampling  0.916094          0.904930      0.893913
