In [150]:
import torch
import torch.nn as nn
import torch.optim as optim
from time import time
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from constants import numeric_features, categorical_features
from ai_models.shared import load_processed_data
from ai_models.shared import load_train_with_validation_data

In [151]:
# Load the processed dataset and keep only the two outcomes being modelled.
df = load_processed_data()
# Take an explicit copy of the filtered rows: assigning a new column to the
# result of a boolean filter otherwise triggers pandas' SettingWithCopyWarning.
df = df[df["Target"].isin(["Graduate", "Dropout"])].copy()
# Binary label: 1 for "Graduate", 0 for "Dropout".
df["y"] = (df["Target"] == "Graduate").astype(int)
# Drop the original target columns; errors='ignore' tolerates either being absent.
df = df.drop(columns=["Target", "Target encoded"], errors='ignore')

In [152]:
# Separate the binary label vector from the feature frame.
y = df["y"].values
X = df.drop(columns="y")

In [153]:
# Keep only the configured feature names that are actually present in X,
# preserving the order in which they are listed in the constants module.
available_columns = X.columns
num_features = [name for name in numeric_features if name in available_columns]
cat_features = [name for name in categorical_features if name in available_columns]

In [154]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [155]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [156]:
# Route each column group through its own preprocessing branch.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

In [157]:
# The loader's six return values are unpacked as raw train / validation / test
# features followed by the matching label arrays.
(
    X_train_raw, X_val_raw, X_test_raw,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

# Fit the preprocessing on the training split only, then reuse the fitted
# transformer unchanged on the other two splits.
X_train = full_pipeline.fit_transform(X_train_raw)
X_test = full_pipeline.transform(X_test_raw)
X_val = full_pipeline.transform(X_val_raw)

In [158]:
def to_tensor(x, y, device):
    """Convert features and labels to float32 tensors placed on ``device``.

    Args:
        x: Feature data accepted by ``torch.tensor``; an object exposing a
            ``toarray`` method is converted through that method first.
        y: Label data accepted by ``torch.tensor``.
        device: Target device passed to ``Tensor.to``.

    Returns:
        A ``(features, labels)`` tuple of float32 tensors on ``device``.
    """
    dense = x.toarray() if hasattr(x, "toarray") else x
    features = torch.tensor(dense, dtype=torch.float32).to(device)
    labels = torch.tensor(y, dtype=torch.float32).to(device)
    return features, labels


In [159]:
class LogisticRegressionModel(nn.Module):
    """Logistic regression: a single affine layer followed by a sigmoid."""

    def __init__(self, input_dim):
        """Create the affine layer mapping ``input_dim`` features to one output."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the affine layer's output for ``x``."""
        logits = self.linear(x)
        return logits.sigmoid()

In [160]:
def train_model(X_train, y_train, device, epochs=250, batch_size=256, lr=0.05):
    """Fit a ``LogisticRegressionModel`` with mini-batch SGD on BCE loss.

    Args:
        X_train: 2-D feature matrix (dense, or any object exposing ``toarray``);
            its second dimension sets the model's input width.
        y_train: Labels, one per row of ``X_train``; converted to float32.
        device: Torch device on which the data and the model are placed.
        epochs: Number of full passes over the training data.
        batch_size: Mini-batch size used by the shuffling ``DataLoader``.
        lr: SGD learning rate.

    Returns:
        The trained model, switched to evaluation mode.
    """
    X_train_tensor, y_train_tensor = to_tensor(X_train, y_train, device)
    model = LogisticRegressionModel(X_train.shape[1]).to(device)
    loss_fn = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model.train()
    for _ in range(epochs):
        for xb, yb in loader:
            optimizer.zero_grad()
            # Squeeze only the trailing feature axis. A bare squeeze() would
            # also drop the batch axis when the last mini-batch holds a single
            # sample, and BCELoss then rejects the input/target shape mismatch.
            pred = model(xb).squeeze(1)
            loss = loss_fn(pred, yb)
            loss.backward()
            optimizer.step()

    return model.eval()

In [161]:
def evaluate_model(model, X, y, device):
    """Score ``model`` on ``(X, y)`` and return accuracy, F1 and ROC AUC.

    Args:
        model: Module whose forward pass returns one probability per row,
            shaped ``(n_rows, 1)``.
        X: Feature matrix (dense, or any object exposing ``toarray``).
        y: Ground-truth binary labels, one per row of ``X``.
        device: Torch device on which inference is run.

    Returns:
        Dict with keys ``"accuracy"``, ``"f1"`` and ``"auc"``. Accuracy and F1
        use labels thresholded at 0.5; AUC uses the raw probabilities.
    """
    X_tensor, y_tensor = to_tensor(X, y, device)
    with torch.no_grad():
        # Squeeze only the trailing axis so a single-row input still yields a
        # 1-D prediction vector, not a 0-d scalar the metrics would reject.
        pred = model(X_tensor).squeeze(1)
    pred_labels = (pred > 0.5).float()

    # Move everything to host memory once and hand plain arrays to sklearn.
    y_true = y_tensor.cpu().numpy()
    y_score = pred.cpu().numpy()
    y_hat = pred_labels.cpu().numpy()

    accuracy = accuracy_score(y_true, y_hat)
    f1 = f1_score(y_true, y_hat)
    auc = roc_auc_score(y_true, y_score)
    return {"accuracy": accuracy, "f1": f1, "auc": auc}

In [162]:
# Train on the CPU and measure the wall-clock duration of the call.
device = torch.device("cpu")

start_time = time()
model_cpu = train_model(X_train, y_train, device)
duration = time() - start_time

print(f"\nCPU TRAINING TIME: {duration:.2f} seconds")

# Score the test and validation splits with the CPU-trained model.
test_result = evaluate_model(model_cpu, X_test, y_test, device)
val_result = evaluate_model(model_cpu, X_val, y_val, device)

for header, scores in (
    ("\nTEST METRICS:", test_result),
    ("\nVALIDATION METRICS:", val_result),
):
    print(header)
    print(f"Accuracy: {scores['accuracy']:.3f} | F1-score: {scores['f1']:.3f} | AUC: {scores['auc']:.3f}")



CPU TRAINING TIME: 4.88 seconds

TEST METRICS:
Accuracy: 0.930 | F1-score: 0.943 | AUC: 0.960

VALIDATION METRICS:
Accuracy: 0.914 | F1-score: 0.931 | AUC: 0.958


In [163]:
# Repeat the training/evaluation run on the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")

    # CUDA kernels are launched asynchronously, so the host clock alone does
    # not bound the GPU work. Synchronising before starting the timer also
    # forces CUDA initialisation, keeping that one-off start-up cost out of the
    # measurement; synchronising before stopping it waits for all queued
    # kernels to finish.
    torch.cuda.synchronize()
    start_time = time()
    model_gpu = train_model(X_train, y_train, device)
    torch.cuda.synchronize()
    duration = time() - start_time

    print(f"\nGPU TRAINING TIME: {duration:.2f} seconds")

    # Score the test and validation splits with the GPU-trained model.
    test_result = evaluate_model(model_gpu, X_test, y_test, device)
    val_result = evaluate_model(model_gpu, X_val, y_val, device)

    print("\nTEST METRICS (GPU):")
    print(f"Accuracy: {test_result['accuracy']:.3f} | F1-score: {test_result['f1']:.3f} | AUC: {test_result['auc']:.3f}")

    print("\nVALIDATION METRICS (GPU):")
    print(f"Accuracy: {val_result['accuracy']:.3f} | F1-score: {val_result['f1']:.3f} | AUC: {val_result['auc']:.3f}")
else:
    print("\nGPU not available. Skipping GPU training.")



GPU TRAINING TIME: 6.12 seconds

TEST METRICS (GPU):
Accuracy: 0.936 | F1-score: 0.948 | AUC: 0.960

VALIDATION METRICS (GPU):
Accuracy: 0.914 | F1-score: 0.931 | AUC: 0.958
