In [14]:
# Core Libraries for Data Manipulation and System Interaction
import pandas as pd
import numpy as np
import os

# Scikit-learn for Modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Scikit-learn for Evaluation
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

print("Libraries imported successfully.")

Libraries imported successfully.


In [15]:
# --- Define the parameters for this single, perfect run ---

# 1. Data Parameters
DATASET_PATH = os.path.join("..", "data", "processed", "baseline.csv")
TARGET_VARIABLE = "Result"

# 2. Model Parameters
MODEL_CHOICE = (
    "LogisticRegression"  # Options: 'LogisticRegression', 'KNN', 'RandomForest'
)
RANDOM_STATE = 1  # We use this to ensure our data split and model are reproducible. Corresponds to run_id=1.

print(f"Blueprint configured for a single run:")
print(f" -> Dataset: {os.path.basename(DATASET_PATH)}")
print(f" -> Model: {MODEL_CHOICE}")
print(f" -> Run ID (Random State): {RANDOM_STATE}")

Blueprint configured for a single run:
 -> Dataset: baseline.csv
 -> Model: LogisticRegression
 -> Run ID (Random State): 1


In [16]:
# Load the specified dataset
try:
    df = pd.read_csv(DATASET_PATH)
    print("Dataset loaded successfully.")
    print(f"Shape of the dataset: {df.shape}")
except FileNotFoundError:
    print(f"ERROR: Dataset not found at {DATASET_PATH}. Please check the path.")
    # Stop execution if file not found
    assert False, "Execution halted."

# --- Target Variable Encoding ---
# We map the positive class to 1 and the negative class to 0.
print(f"\nOriginal target values: {df[TARGET_VARIABLE].unique()}")
df[TARGET_VARIABLE] = df[TARGET_VARIABLE].map({"positive": 1, "negative": 0})
print(f"Encoded target values: {df[TARGET_VARIABLE].unique()}")

# --- Inspection (Post-Encoding) ---
print("\nFirst 5 rows (after target encoding):")
display(df.head())
print("\nTarget variable distribution:")
print(df[TARGET_VARIABLE].value_counts(normalize=True))

Dataset loaded successfully.
Shape of the dataset: (1511, 20)

Original target values: ['positive' 'negative']
Encoded target values: [1 0]

First 5 rows (after target encoding):


Unnamed: 0,index,Gender,Age,Hemoglobin(g/dl),Neutrophils(%),Lymphocytes(%),Monocytes(%),Eosinophils(%),RBC,HCT(%),MCV(fl),MCH(pg),MCHC(g/dl),RDW-CV(%),Total Platelet Count(/cumm),MPV(fl),PDW(%),PCT(%),Total WBC count(/cumm),Result
0,0,Male,21,14.8,48,47,3,2,5,48.0,96.0,29.6,30.8,11.6,112000,10.7,15.4,0.12,5100,1
1,1,Male,30,15.0,47,49,6,3,5,49.8,96.1,28.4,29.5,11.8,96000,10.6,15.8,0.121,4500,1
2,2,Male,51,16.3,41,48,4,5,5,50.1,93.5,31.3,32.7,13.5,184000,10.4,16.4,0.13,6000,0
3,3,Female,26,12.3,46,49,7,5,5,44.0,90.0,30.5,30.5,14.7,167000,8.1,17.1,0.11,5000,0
4,4,Male,35,16.1,45,46,4,4,5,50.53,91.0,29.12,29.2,15.2,155000,10.52,12.34,0.15,4600,0



Target variable distribution:
Result
1    0.684977
0    0.315023
Name: proportion, dtype: float64


In [17]:
from sklearn.preprocessing import OneHotEncoder

# 1. Separate Features (X) and Target (y)
X = df.drop(columns=[TARGET_VARIABLE])
y = df[TARGET_VARIABLE]

print(f"Initial Features (X) shape: {X.shape}")
print(f"Initial Target (y) shape: {y.shape}")

# 2. First Split: Create the Training Set and a Temporary "Holding" Set (Test + Validation)
# We split 70% for training, leaving 30% in the holding set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y
)

# 3. Second Split: Split the "Holding" Set into Validation and Test Sets
# The holding set is 30% of the original data. We split it in half (50/50) to get
# two sets that are each 15% of the original data.
# test_size=0.5 means 50% of the 30% holding set -> 15% of the total.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE, stratify=y_temp
)

# --- Isolate Numerical and Categorical Columns ---
categorical_features = ["Gender"]  # Add other categorical columns here if any
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()

print(f"Identified Categorical Features: {categorical_features}")
print(f"Identified Numerical Features: {numerical_features}")

# --- One-Hot Encode Categorical Features ---
# We fit the encoder ONLY on the training data.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_cat_encoded = ohe.fit_transform(X_train[categorical_features])
X_val_cat_encoded = ohe.transform(X_val[categorical_features])
X_test_cat_encoded = ohe.transform(X_test[categorical_features])

# Create DataFrames with new feature names
ohe_feature_names = ohe.get_feature_names_out(categorical_features)
X_train_cat_df = pd.DataFrame(
    X_train_cat_encoded, index=X_train.index, columns=ohe_feature_names
)
X_val_cat_df = pd.DataFrame(
    X_val_cat_encoded, index=X_val.index, columns=ohe_feature_names
)
X_test_cat_df = pd.DataFrame(
    X_test_cat_encoded, index=X_test.index, columns=ohe_feature_names
)


# 3. Scale Numerical Features
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train[numerical_features])
X_val_num_scaled = scaler.transform(X_val[numerical_features])
X_test_num_scaled = scaler.transform(X_test[numerical_features])

# Create DataFrames for scaled numerical data
X_train_num_df = pd.DataFrame(
    X_train_num_scaled, index=X_train.index, columns=numerical_features
)
X_val_num_df = pd.DataFrame(
    X_val_num_scaled, index=X_val.index, columns=numerical_features
)
X_test_num_df = pd.DataFrame(
    X_test_num_scaled, index=X_test.index, columns=numerical_features
)


# 4. Combine Processed Numerical and Categorical Features
X_train_processed = pd.concat([X_train_num_df, X_train_cat_df], axis=1)
X_val_processed = pd.concat([X_val_num_df, X_val_cat_df], axis=1)
X_test_processed = pd.concat([X_test_num_df, X_test_cat_df], axis=1)

print("\n--- Preprocessing Complete ---")
print(f"Shape of final processed training data: {X_train_processed.shape}")
print(f"Columns: {X_train_processed.columns.tolist()}")

Initial Features (X) shape: (1511, 19)
Initial Target (y) shape: (1511,)
Identified Categorical Features: ['Gender']
Identified Numerical Features: ['index', 'Age', 'Hemoglobin(g/dl)', 'Neutrophils(%)', 'Lymphocytes(%)', 'Monocytes(%)', 'Eosinophils(%)', 'RBC', 'HCT(%)', 'MCV(fl)', 'MCH(pg)', 'MCHC(g/dl)', 'RDW-CV(%)', 'Total Platelet Count(/cumm)', 'MPV(fl)', 'PDW(%)', 'PCT(%)', 'Total WBC count(/cumm)']

--- Preprocessing Complete ---
Shape of final processed training data: (1057, 20)
Columns: ['index', 'Age', 'Hemoglobin(g/dl)', 'Neutrophils(%)', 'Lymphocytes(%)', 'Monocytes(%)', 'Eosinophils(%)', 'RBC', 'HCT(%)', 'MCV(fl)', 'MCH(pg)', 'MCHC(g/dl)', 'RDW-CV(%)', 'Total Platelet Count(/cumm)', 'MPV(fl)', 'PDW(%)', 'PCT(%)', 'Total WBC count(/cumm)', 'Gender_Female', 'Gender_Male']


In [18]:
# Initialize the model based on our parameter
if MODEL_CHOICE == "LogisticRegression":
    model = LogisticRegression(random_state=RANDOM_STATE)
elif MODEL_CHOICE == "KNN":
    model = KNeighborsClassifier()  # KNN doesn't have a random_state for initialization
elif MODEL_CHOICE == "RandomForest":
    model = RandomForestClassifier(random_state=RANDOM_STATE)
else:
    raise ValueError("Invalid MODEL_CHOICE specified in the parameters.")

print(f"Training {MODEL_CHOICE} model...")

# Train the model on the scaled training data
model.fit(X_train_processed, y_train)

print("Model training complete.")

Training LogisticRegression model...
Model training complete.


In [19]:
def evaluate_model(X_data, y_true, set_name):
    """A helper function to evaluate the model on a given dataset."""
    y_pred = model.predict(X_data)
    y_pred_proba = model.predict_proba(X_data)[:, 1]

    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_pred_proba),
    }

    print(f"\n--- Model Performance on {set_name} Set ---")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name.replace('_', ' ').title()}: {metric_value:.4f}")

    return metrics


# Evaluate on the Validation Set
val_metrics = evaluate_model(X_val_processed, y_val, "Validation")

# Evaluate on the Test Set (The final, unbiased evaluation)
test_metrics = evaluate_model(X_test_processed, y_test, "Test")


--- Model Performance on Validation Set ---
Accuracy: 0.7489
Precision: 0.7579
Recall: 0.9290
F1 Score: 0.8348
Roc Auc: 0.7111

--- Model Performance on Test Set ---
Accuracy: 0.7181
Precision: 0.7255
Recall: 0.9487
F1 Score: 0.8222
Roc Auc: 0.5872
