In [28]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from constants import numeric_features, categorical_features
from part2.shared import load_processed_data
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support
from Logistic import CustomLogisticRegression

In [29]:
# Epoch budget for model training; equals the `epochs` value given to
# CustomLogisticRegression in the cross-validation cell.
NUMBER_OF_EPOCHS = 250

In [30]:

# Load the processed dataset and reduce it to a binary problem:
#   * keep only rows whose Target is "Graduate" or "Dropout",
#   * add y = 1 for "Graduate" and y = 0 otherwise (i.e. "Dropout" after the filter),
#   * drop the textual label and, if present, its pre-encoded twin.
# Built as one chain on a fresh frame so that no column is ever assigned onto a
# filtered slice (the pattern that raises pandas' SettingWithCopyWarning).
students_all = load_processed_data()
has_binary_outcome = students_all["Target"].isin(["Graduate", "Dropout"])
df = (
    students_all.loc[has_binary_outcome]
    .assign(y=lambda frame: (frame["Target"] == "Graduate").astype(int))
    # NOTE(review): "Target encoded" is presumably a numeric copy of the label and
    # would leak it into the features - confirm against load_processed_data().
    .drop(columns=["Target", "Target encoded"], errors="ignore")
)

In [31]:
# Separate the binary label (as a NumPy array) from the feature frame.
label_column = "y"
y = df[label_column].values
X = df.drop(columns=[label_column])

In [32]:
# Restrict the configured feature lists to columns that actually exist in X,
# preserving the order in which the constants module lists them.
present_columns = set(X.columns)
num_features = [name for name in numeric_features if name in present_columns]
cat_features = [name for name in categorical_features if name in present_columns]

In [33]:
# Numeric branch: fill missing values with the column mean, then apply StandardScaler.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [34]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; handle_unknown="ignore" lets transform() accept categories
# that were not seen during fit instead of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

In [35]:
# Route each column group through its own preprocessing branch and concatenate
# the results into a single feature matrix.
column_branches = [
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
]
full_pipeline = ColumnTransformer(transformers=column_branches)

In [36]:
# Outer 3-fold stratified cross-validation. For each fold:
#   1. hold out the fold's test indices,
#   2. split the remainder 80/20 (stratified) into train / validation,
#   3. fit the preprocessing on the training part only and apply it to all three parts,
#   4. train CustomLogisticRegression and report accuracy plus test precision/recall/F1.
kf_outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# One metrics dict per outer fold, appended at the end of each iteration so the
# numbers remain available (e.g. for averaging) after the loop has printed them.
fold_results = []

for fold, (train_val_idx, test_idx) in enumerate(kf_outer.split(X, y), 1):
    X_train_val_raw, X_test_raw = X.iloc[train_val_idx], X.iloc[test_idx]
    y_train_val, y_test = y[train_val_idx], y[test_idx]

    # Seeding the inner split with the fold number keeps it reproducible while
    # still giving each fold its own train/validation partition.
    X_train_raw, X_val_raw, y_train, y_val = train_test_split(
        X_train_val_raw, y_train_val,
        test_size=0.20, stratify=y_train_val, random_state=fold
    )

    # Preprocessing statistics are learned from the training split only;
    # validation and test data are merely transformed with them.
    X_train = full_pipeline.fit_transform(X_train_raw)
    X_val = full_pipeline.transform(X_val_raw)
    X_test = full_pipeline.transform(X_test_raw)

    # Epoch count comes from the notebook-level constant instead of a repeated literal.
    model = CustomLogisticRegression(lr=0.05, epochs=NUMBER_OF_EPOCHS, batch_size=64)

    # NOTE(review): the test split is passed into fit(). Presumably it is used only
    # for per-epoch monitoring; confirm in Logistic.py that it never influences
    # weight updates or early stopping, otherwise the test metrics below are biased.
    model.fit(X_train, y_train, X_val, y_val, X_test, y_test)
    train_acc = model.score(X_train, y_train)
    val_acc = model.score(X_val, y_val)
    y_tst_pred = model.predict(X_test)
    tst_acc = model.score(X_test, y_test)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_tst_pred, average="binary")

    fold_results.append({
        "fold": fold,
        "train_acc": train_acc,
        "val_acc": val_acc,
        "test_acc": tst_acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    })

    print(f"Fold {fold}:  train_acc={train_acc:.3f} | val_acc={val_acc:.3f} | "
          f"test_acc={tst_acc:.3f}  prec={prec:.3f}  "
          f"rec={rec:.3f}  f1={f1:.3f}")


Fold 1:  train_acc=0.928 | val_acc=0.921 | test_acc=0.907  prec=0.898  rec=0.957  f1=0.926
Fold 2:  train_acc=0.923 | val_acc=0.921 | test_acc=0.901  prec=0.895  rec=0.948  f1=0.921
Fold 3:  train_acc=0.921 | val_acc=0.909 | test_acc=0.909  prec=0.900  rec=0.957  f1=0.928
