In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from time import time
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from part2.shared import load_processed_data
from part2.shared import load_train_with_validation_data

In [2]:
# Load the processed data, keep only rows whose Target is "Graduate" or
# "Dropout", and encode the outcome as a binary label y (1 = Graduate).
df = load_processed_data()
# .copy() detaches the filtered rows from the loaded frame so that the column
# assignment below writes to an independent frame instead of a possible view
# (avoids pandas' SettingWithCopyWarning).
df = df[df["Target"].isin(["Graduate", "Dropout"])].copy()
df["y"] = (df["Target"] == "Graduate").astype(int)
# errors="ignore": listed columns that are absent are skipped instead of raising.
df = df.drop(columns=["Target", "Target encoded"], errors="ignore")

In [3]:
# Separate the binary label (as a NumPy array) from the predictor columns.
y = df["y"].values
X = df.drop(columns="y")

In [4]:
# Keep only the configured feature names that are actually columns of X;
# the order of the configured lists is preserved.
num_features = list(filter(lambda col: col in X.columns, numeric_features))
cat_features = list(filter(lambda col: col in X.columns, categorical_features))

In [5]:
# Numeric branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [6]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from failing
# on categories that were not seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [7]:
# Route each group of columns through its own preprocessing branch.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [8]:
# Split first, then fit the preprocessing on the training portion only; the
# validation and test portions are transformed with the already-fitted pipeline.
(
    X_train_raw,
    X_val_raw,
    X_test_raw,
    y_train,
    y_val,
    y_test,
) = load_train_with_validation_data(X, y)

X_train = full_pipeline.fit_transform(X_train_raw)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(X_test_raw)

In [9]:
def to_tensor(x, y, device):
    """Convert a feature matrix and its labels to float32 tensors on `device`.

    Args:
        x: Feature data. If it exposes ``toarray()`` (as sparse matrices do),
            it is densified first.
        y: Label data accepted by ``torch.tensor``.
        device: Target torch device for both tensors.

    Returns:
        Tuple ``(features, labels)`` of float32 tensors located on `device`.
    """
    dense = x.toarray() if hasattr(x, "toarray") else x
    features = torch.tensor(dense, dtype=torch.float32).to(device)
    labels = torch.tensor(y, dtype=torch.float32).to(device)
    return features, labels


In [10]:
class LogisticRegressionModel(nn.Module):
    """Logistic regression: a single linear unit followed by a sigmoid."""

    def __init__(self, input_dim):
        """Create the linear layer mapping `input_dim` features to one output."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return sigmoid(linear(x)); the last dimension of the result has size 1."""
        logits = self.linear(x)
        return logits.sigmoid()

In [11]:
def train_model(X_train, y_train, device, epochs=250, batch_size=256, lr=0.05):
    """Fit a logistic-regression model with mini-batch SGD and return it in eval mode.

    Args:
        X_train: 2-D feature matrix (dense array-like, or an object exposing
            ``toarray()``); ``X_train.shape[1]`` sets the model's input size.
        y_train: Labels aligned with the rows of ``X_train``; ``nn.BCELoss``
            requires them to lie in [0, 1].
        device: torch device the model and the whole training set are moved to.
        epochs: Number of full passes over the training data.
        batch_size: Mini-batch size used by the DataLoader.
        lr: SGD learning rate.

    Returns:
        The trained ``LogisticRegressionModel``, switched to evaluation mode.
    """
    X_train_tensor, y_train_tensor = to_tensor(X_train, y_train, device)
    model = LogisticRegressionModel(X_train.shape[1]).to(device)
    loss_fn = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Nothing inside the loop changes the mode, so set training mode once.
    model.train()
    for _ in range(epochs):
        for xb, yb in loader:
            # Clear gradients: PyTorch accumulates them across backward() calls.
            optimizer.zero_grad()
            # Drop only the trailing size-1 dimension. A bare squeeze() would
            # also collapse the batch dimension when the last batch holds a
            # single sample, and BCELoss would then reject the shape mismatch
            # between a 0-d prediction and a 1-d target.
            pred = model(xb).squeeze(-1)
            loss = loss_fn(pred, yb)
            loss.backward()  # compute gradients via backpropagation
            optimizer.step()

    return model.eval()

In [12]:
def evaluate_model(model, X, y, device):
    """Score `model` on (X, y) and return accuracy, F1 and ROC-AUC.

    The model output is thresholded at 0.5 to obtain hard labels for accuracy
    and F1; the raw outputs are used as scores for ROC-AUC.

    Args:
        model: Callable applied to the feature tensor; its squeezed output is
            treated as one score per row.
        X: Feature matrix accepted by ``to_tensor``.
        y: Ground-truth labels accepted by ``to_tensor``.
        device: torch device on which the forward pass runs.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"`` and ``"auc"``.
    """
    features, targets = to_tensor(X, y, device)
    with torch.no_grad():
        scores = model(features).squeeze()
    hard_labels = (scores > 0.5).float()

    # The metric functions run on the host, so bring everything back to the CPU.
    y_true = targets.cpu()
    y_hat = hard_labels.cpu()
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat),
        "auc": roc_auc_score(y_true, scores.cpu()),
    }

In [13]:
def run_and_report(device_label, device):
    """Train on `device`, report the training time, and print the metrics.

    Reads the notebook-level variables ``X_train``/``y_train``,
    ``X_test``/``y_test`` and ``X_val``/``y_val``.

    Args:
        device_label: Text used in the printed headers (e.g. "CPU").
        device: torch device used for training and evaluation.
    """
    print(f"\n=== {device_label} ===")

    start_time = time()
    model = train_model(X_train, y_train, device)
    # CUDA work is queued asynchronously; wait for it to finish so the
    # measured duration covers the training that was actually executed.
    if torch.device(device).type == "cuda":
        torch.cuda.synchronize(device)
    duration = time() - start_time

    print(f"{device_label} TRAINING TIME: {duration:.2f} seconds")

    # Evaluate all splits first, then print them in the same fixed order.
    reports = [
        ("TRAIN", evaluate_model(model, X_train, y_train, device)),
        ("TEST", evaluate_model(model, X_test, y_test, device)),
        ("VALIDATION", evaluate_model(model, X_val, y_val, device)),
    ]
    for split_name, result in reports:
        print(f"\n{device_label} {split_name} METRICS:")
        print(f"Accuracy: {result['accuracy']:.3f} | F1-score: {result['f1']:.3f} | AUC: {result['auc']:.3f}")

# Always run the CPU baseline; repeat on the GPU only when CUDA is usable.
run_and_report("CPU", torch.device("cpu"))

if not torch.cuda.is_available():
    print("\nGPU not available. Skipping GPU training.")
else:
    run_and_report("GPU", torch.device("cuda"))



=== CPU ===
CPU TRAINING TIME: 16.60 seconds

CPU TRAIN METRICS:
Accuracy: 0.916 | F1-score: 0.933 | AUC: 0.959

CPU TEST METRICS:
Accuracy: 0.922 | F1-score: 0.937 | AUC: 0.959

CPU VALIDATION METRICS:
Accuracy: 0.923 | F1-score: 0.938 | AUC: 0.959

=== GPU ===


KeyboardInterrupt: 