In [18]:
# Core Libraries for Data Manipulation and System Interaction
import pandas as pd
import numpy as np
import os

# Scikit-learn for Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Scikit-learn for Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

print("Libraries imported successfully.")

Libraries imported successfully.


In [19]:
# --- Define the parameters for this single, perfect run ---

# 1. Data Parameters
DATASET_PATH = os.path.join("..", "data", "processed", "baseline.csv")
TARGET_VARIABLE = "Result"

# 2. Model Parameters
MODEL_CHOICE = "LogisticRegression"
RANDOM_STATE = 1

print(f"Blueprint configured for a single run:")
print(f" -> Dataset: {os.path.basename(DATASET_PATH)}")
print(f" -> Model: {MODEL_CHOICE}")
print(f" -> Run ID (Random State): {RANDOM_STATE}")

Blueprint configured for a single run:
 -> Dataset: baseline.csv
 -> Model: LogisticRegression
 -> Run ID (Random State): 1


In [20]:
# Load the specified dataset
try:
    df = pd.read_csv(DATASET_PATH)
    print("Dataset loaded successfully.")
    print(f"Shape of the dataset: {df.shape}")
    print("\nFirst 5 rows:")
    display(df.head())
    print("\nTarget variable distribution:")
    print(df[TARGET_VARIABLE].value_counts(normalize=True))
except FileNotFoundError:
    print(f"ERROR: Dataset not found at {DATASET_PATH}. Please check the path.")

Dataset loaded successfully.
Shape of the dataset: (1511, 20)

First 5 rows:


Unnamed: 0,index,Gender,Age,Hemoglobin(g/dl),Neutrophils(%),Lymphocytes(%),Monocytes(%),Eosinophils(%),RBC,HCT(%),MCV(fl),MCH(pg),MCHC(g/dl),RDW-CV(%),Total Platelet Count(/cumm),MPV(fl),PDW(%),PCT(%),Total WBC count(/cumm),Result
0,0,Male,21,14.8,48,47,3,2,5,48.0,96.0,29.6,30.8,11.6,112000,10.7,15.4,0.12,5100,positive
1,1,Male,30,15.0,47,49,6,3,5,49.8,96.1,28.4,29.5,11.8,96000,10.6,15.8,0.121,4500,positive
2,2,Male,51,16.3,41,48,4,5,5,50.1,93.5,31.3,32.7,13.5,184000,10.4,16.4,0.13,6000,negative
3,3,Female,26,12.3,46,49,7,5,5,44.0,90.0,30.5,30.5,14.7,167000,8.1,17.1,0.11,5000,negative
4,4,Male,35,16.1,45,46,4,4,5,50.53,91.0,29.12,29.2,15.2,155000,10.52,12.34,0.15,4600,negative



Target variable distribution:
Result
positive    0.684977
negative    0.315023
Name: proportion, dtype: float64


In [21]:
# 1. Separate Features (X) and Target (y)
X = df.drop(columns=[TARGET_VARIABLE])
y = df[TARGET_VARIABLE]

# --- Encode string target labels to numeric ---
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)  # Now y is numeric (e.g., 0/1)
print(f"Target classes: {list(le.classes_)}")

print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")

# --- One-hot encode categorical features ---
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns (object or category dtype)
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
if categorical_cols:
    print(f"Categorical columns detected: {categorical_cols}")
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    X_encoded = encoder.fit_transform(X[categorical_cols])
    encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
    X_encoded_df = pd.DataFrame(X_encoded, columns=encoded_feature_names, index=X.index)

    # Drop original categorical columns and concat encoded columns
    X = pd.concat([X.drop(columns=categorical_cols), X_encoded_df], axis=1)
    print(f"After encoding, features shape: {X.shape}")
else:
    print("No categorical columns detected.")

# 2. First Split: Create the Training Set and a Temporary "Holding" Set (Test + Validation)
# We split 70% for training, leaving 30% in the holding set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
)

# 3. Second Split: Split the "Holding" Set into Validation and Test Sets
# The holding set is 30% of the original data. We split it in half (50/50) to get
# two sets that are each 15% of the original data.
# test_size=0.5 means 50% of the 30% holding set -> 15% of the total
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

# 4. Report the shapes of the final sets to verify the architecture
print("\n--- Data Splitting Complete ---")
print(f"Training set shape:   {X_train.shape} (~70%)")
print(f"Validation set shape: {X_val.shape} (~15%)")
print(f"Test set shape:       {X_test.shape} (~15%)")
print("-" * 30)
original_rows = len(df)
print(f"Total rows in Train:   {len(X_train)} ({len(X_train)/original_rows:.2%})")
print(f"Total rows in Val:     {len(X_val)} ({len(X_val)/original_rows:.2%})")
print(f"Total rows in Test:    {len(X_test)} ({len(X_test)/original_rows:.2%})")

# 5. Scale Features
# We fit the scaler ONLY on the training data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(
    "\nFeatures for all three sets scaled successfully (scaler fit on training data only)."
)

Target classes: ['negative', 'positive']
Features (X) shape: (1511, 19)
Target (y) shape: (1511,)
Categorical columns detected: ['Gender']
After encoding, features shape: (1511, 20)

--- Data Splitting Complete ---
Training set shape:   (1057, 20) (~70%)
Validation set shape: (227, 20) (~15%)
Test set shape:       (227, 20) (~15%)
------------------------------
Total rows in Train:   1057 (69.95%)
Total rows in Val:     227 (15.02%)
Total rows in Test:    227 (15.02%)

Features for all three sets scaled successfully (scaler fit on training data only).


In [22]:
# Initialize the model based on our parameter
if MODEL_CHOICE == "LogisticRegression":
    model = LogisticRegression(random_state=RANDOM_STATE)
elif MODEL_CHOICE == "KNN":
    model = KNeighborsClassifier()
elif MODEL_CHOICE == "RandomForest":
    model = RandomForestClassifier(random_state=RANDOM_STATE)
else:
    raise ValueError("Invalid MODEL_CHOICE specified in the parameters.")

print(f"Training {MODEL_CHOICE} model...")

# Train the model on the scaled training data
model.fit(X_train_scaled, y_train)

print("Model training complete.")

Training LogisticRegression model...
Model training complete.


In [23]:
def evaluate_model(X_data, y_true, set_name):
    """A helper function to evaluate the model on a given dataset."""
    y_pred = model.predict(X_data)
    y_pred_proba = model.predict_proba(X_data)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_pred_proba),
    }

    print(f"\n--- Model Performance on {set_name} Set ---")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name.replace('_', ' ').title()}: {metric_value:.4f}")

    return metrics


# Evaluate on the Validation Set
val_metrics = evaluate_model(X_val_scaled, y_val, "Validation")

# Evaluate on the Test Set (The final, unbiased evaluation)
test_metrics = evaluate_model(X_test_scaled, y_test, "Test")


--- Model Performance on Validation Set ---
Accuracy: 0.7489
Precision: 0.7579
Recall: 0.9290
F1: 0.8348
Roc Auc: 0.7107

--- Model Performance on Test Set ---
Accuracy: 0.7181
Precision: 0.7255
Recall: 0.9487
F1: 0.8222
Roc Auc: 0.5872
