In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from sklearn.linear_model import LogisticRegression
from part2.shared import load_processed_data
from part2.shared import load_train_with_validation_data


In [None]:
# Epoch budget for CustomLogisticRegression training (passed as `epochs=`).
NUMBER_OF_EPOCHS: int = 250

In [None]:

# Build the binary-classification frame in a single chain. Assigning a column on
# a filtered slice (the previous form) triggers pandas' SettingWithCopyWarning;
# `.assign` always returns a fresh frame, so nothing is mutated in place.
df = (
    load_processed_data()
    # Keep only the two outcomes being modelled.
    .loc[lambda frame: frame["Target"].isin(["Graduate", "Dropout"])]
    # Binary label: 1 for "Graduate", 0 for "Dropout".
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    # Remove the original label columns; errors="ignore" tolerates a frame in
    # which "Target encoded" (or "Target") is not present.
    .drop(columns=["Target", "Target encoded"], errors="ignore")
)

In [None]:
# Label vector as a NumPy array; every other column of `df` is a feature.
y = df["y"].to_numpy()
X = df.drop("y", axis=1)

In [None]:
# Keep only the configured feature names that actually exist in X, preserving
# the order given in `constants`.
available_columns = set(X.columns)
num_features = [name for name in numeric_features if name in available_columns]
cat_features = [name for name in categorical_features if name in available_columns]

In [None]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

In [None]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. Categories not seen during fit are ignored at transform time.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=cat_steps)

In [None]:
# Route numeric and categorical columns through their respective branches and
# concatenate the results into one feature matrix.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [None]:
# Split into train / validation / test partitions.
splits = load_train_with_validation_data(X, y)
X_train_raw, X_val_raw, X_test_raw, y_train, y_val, y_test = splits

# Fit the preprocessing on the training partition only, then apply the fitted
# transformer to the held-out partitions.
X_train = full_pipeline.fit_transform(X_train_raw)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(X_test_raw)

In [None]:
from Logistic import CustomLogisticRegression

# X_train / X_val / X_test were already produced by `full_pipeline` in the
# previous cell, so the pipeline is not refitted here.
#
# ColumnTransformer returns a SciPy sparse matrix only when the stacked output
# is sparse enough (see its `sparse_threshold`); otherwise it returns a dense
# ndarray, which has no `.toarray()`. Densify in a way that accepts both.
X_train_dense, X_val_dense, X_test_dense = (
    part.toarray() if hasattr(part, "toarray") else np.asarray(part)
    for part in (X_train, X_val, X_test)
)

# Prepend a column of ones to each design matrix (named "bias" here).
X_train_bias = np.hstack([np.ones((X_train_dense.shape[0], 1)), X_train_dense])
X_val_bias = np.hstack([np.ones((X_val_dense.shape[0], 1)), X_val_dense])
X_test_bias = np.hstack([np.ones((X_test_dense.shape[0], 1)), X_test_dense])

# Baseline model on the plain preprocessed features. The epoch count comes from
# the shared notebook constant instead of a duplicated literal.
baseline_clf = CustomLogisticRegression(
    lr=0.05,
    epochs=NUMBER_OF_EPOCHS,
    batch_size=64,
    verbose=False,
)

# Validation and test partitions are passed alongside the training data; the
# model's `history` is read by the plotting cell that follows.
baseline_clf.fit(X_train_bias, y_train, X_val_bias, y_val, X_test_bias, y_test)


In [None]:
import matplotlib.pyplot as plt

# Convergence curves of the baseline model.
fig, ax = plt.subplots()
ax.plot(baseline_clf.history["train"], label="Train loss")
# This curve is the history stored under "test"; it used to be labelled
# "Train loss" as well, which made the two legend entries indistinguishable.
ax.plot(baseline_clf.history["test"], label="Test loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Log loss")
ax.set_title("Zbieżność CustomLogisticRegression")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Numeric branch with degree-2 polynomial expansion: impute with the mean,
# expand (no constant column), then standardise the expanded features.
num_poly_pipeline = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="mean")),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
        ("sc", StandardScaler()),
    ]
)

# Same categorical branch as the baseline preprocessing.
poly_pipeline = ColumnTransformer(
    transformers=[
        ("num_poly", num_poly_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)


In [None]:
# Fit the polynomial preprocessing on the training partition only, then apply
# it to the validation and test partitions.
X_train_poly = poly_pipeline.fit_transform(X_train_raw)
X_val_poly = poly_pipeline.transform(X_val_raw)
X_test_poly = poly_pipeline.transform(X_test_raw)

# ColumnTransformer may return either a dense ndarray or a SciPy sparse matrix
# (depending on output density vs. its `sparse_threshold`). `np.hstack` cannot
# stack a sparse matrix with a dense column, so densify first; dense input is
# passed through unchanged.
X_train_poly, X_val_poly, X_test_poly = (
    part.toarray() if hasattr(part, "toarray") else np.asarray(part)
    for part in (X_train_poly, X_val_poly, X_test_poly)
)

# Prepend a column of ones to each design matrix (named "bias" here).
X_train_poly_bias = np.hstack([np.ones((X_train_poly.shape[0], 1)), X_train_poly])
X_val_poly_bias = np.hstack([np.ones((X_val_poly.shape[0], 1)), X_val_poly])
X_test_poly_bias = np.hstack([np.ones((X_test_poly.shape[0], 1)), X_test_poly])

# Same hyper-parameters as the baseline run, but trained on the expanded features.
model_poly = CustomLogisticRegression(
    lr=0.05,
    epochs=NUMBER_OF_EPOCHS,
    batch_size=64,
    verbose=True,
)

model_poly.fit(X_train_poly_bias, y_train, X_val_poly_bias, y_val, X_test_poly_bias, y_test)

In [None]:
# Convergence curves of the polynomial-feature model (train vs. validation).
fig_poly, ax_poly = plt.subplots()
ax_poly.plot(model_poly.history["train"], label="Train loss (poly)")
ax_poly.plot(model_poly.history["val"], label="Validation loss (poly)")
ax_poly.set_xlabel("Epoch")
ax_poly.set_ylabel("Log loss")
ax_poly.set_title("Zbieżność – model z PolynomialFeatures")
ax_poly.legend()
ax_poly.grid(True)
plt.show()


In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

# Score the polynomial-feature model on the test partition and print the
# per-class precision / recall / F1 table with four decimals.
y_pred_poly = model_poly.predict(X_test_poly_bias)
report_poly = classification_report(y_test, y_pred_poly, digits=4)
print(report_poly)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Numeric-only preprocessing with univariate feature selection: impute with the
# mean, keep the 5 features with the highest ANOVA F-score against the label,
# then standardise the survivors.
select_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("selector", SelectKBest(score_func=f_classif, k=5)),
    ("scaler", StandardScaler())
])

# The selector is supervised, so the labels are supplied at fit time; fitting
# uses the training partition only.
X_train_selected = select_pipeline.fit_transform(X_train_raw[num_features], y_train)
X_val_selected = select_pipeline.transform(X_val_raw[num_features])
X_test_selected = select_pipeline.transform(X_test_raw[num_features])

# Prepend a column of ones to each design matrix (named "bias" here).
X_train_sel_bias = np.hstack([np.ones((X_train_selected.shape[0], 1)), X_train_selected])
X_val_sel_bias = np.hstack([np.ones((X_val_selected.shape[0], 1)), X_val_selected])
X_test_sel_bias = np.hstack([np.ones((X_test_selected.shape[0], 1)), X_test_selected])

# New model trained on the selected features. The epoch count comes from the
# shared notebook constant instead of a duplicated literal.
model_sel = CustomLogisticRegression(
    lr=0.05,
    epochs=NUMBER_OF_EPOCHS,
    batch_size=64,
    verbose=True,
)
model_sel.fit(X_train_sel_bias, y_train, X_val_sel_bias, y_val, X_test_sel_bias, y_test)

In [None]:
# Convergence curves of the feature-selection model on all three partitions.
fig_sel, ax_sel = plt.subplots()
for split_key, curve_label in (
    ("train", "Train loss (selected features)"),
    ("val", "Val loss (selected features)"),
    ("test", "Test loss (selected features)"),
):
    ax_sel.plot(model_sel.history[split_key], label=curve_label)
ax_sel.set_title("Zbieżność – po selekcji 5 najlepszych cech")
ax_sel.set_xlabel("Epoch")
ax_sel.set_ylabel("Log loss")
ax_sel.legend()
ax_sel.grid(True)
plt.show()

In [26]:
# Score the feature-selection model on the test partition and print the
# per-class precision / recall / F1 table with four decimals.
y_pred_sel = model_sel.predict(X_test_sel_bias)
report_sel = classification_report(y_test, y_pred_sel, digits=4)
print(report_sel)

              precision    recall  f1-score   support

           0     0.9065    0.7852    0.8415       284
           1     0.8732    0.9481    0.9091       443

    accuracy                         0.8845       727
   macro avg     0.8898    0.8666    0.8753       727
weighted avg     0.8862    0.8845    0.8827       727

