# Baseline Model 2: Classical ML (Petrozziello et al., 2018)

**Objective:** Reproduce Classical Machine Learning baselines (Logistic Regression, Random Forest) using handcrafted features.

**Reference:** Petrozziello, A., et al. (2018). *Multimedia Information Retrieval for Multimodal Fetal Monitoring.*

## 1. Setup & Imports

In [1]:
import os
import subprocess
import sys

import joblib
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# --- GitHub & Colab Setup ---
# Purpose: when running in Colab, clone the project repo (once), configure git,
# install dependencies and expose BASE_DIR; when running locally, derive BASE_DIR
# from the notebook's location. In both cases the scripts folder is put on
# sys.path so `data_ingestion` can be imported.

# Probe for Colab with a narrow try/except so an ImportError raised by any
# other setup step is not silently treated as "running locally".
try:
    from google.colab import userdata
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if IN_COLAB:
    # 1. Clone Repo using Secret Token
    # NOTE(review): the token is embedded in the clone URL, which git keeps as the
    # remote URL of the clone. The later `git push` relies on that, so it is left
    # as-is here; consider a credential helper if the clone is ever shared.
    token = userdata.get('GITHUB_AUTH_TOKEN')
    repo_name = "NeuroFetal-AI"
    username = "Krishna200608"
    repo_url = f"https://{token}@github.com/{username}/{repo_name}.git"

    # A previous run of this cell leaves the kernel's cwd inside the clone.
    # Resolve the repo location up front so a re-run neither clones a nested
    # copy nor fails on a relative chdir.
    if os.path.basename(os.getcwd()) == repo_name:
        repo_dir = os.getcwd()
    else:
        repo_dir = os.path.abspath(repo_name)

    if not os.path.exists(repo_dir):
        print(f"Cloning {repo_name}...")
        get_ipython().system(f'git clone {repo_url} "{repo_dir}"')

    # 2. Configure Git
    os.chdir(repo_dir)
    get_ipython().system('git config --global user.email "krishnasikheriya001@gmail.com"')
    get_ipython().system('git config --global user.name "Krishna200608"')

    # 3. Install Dependencies
    # %pip (rather than a shell `pip`) installs into the running kernel's environment.
    get_ipython().run_line_magic('pip', 'install wfdb')

    BASE_DIR = os.getcwd()
    scripts_dir = os.path.join(BASE_DIR, "Code", "scripts")
    run_mode_message = "Running in Colab (GitHub Integration Active)"
else:
    # Local Fallback: paths are relative to the notebook's working directory.
    BASE_DIR = os.path.abspath(os.path.join("..", ".."))
    scripts_dir = os.path.abspath(os.path.join("..", "scripts"))
    run_mode_message = "Running Locally"

# Avoid stacking duplicate entries on sys.path when the cell is re-executed.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
print(run_mode_message)

import data_ingestion


Running in Colab (GitHub Integration Active)


## 2. Load Tabular Data
The cell below loads `X_tabular.npy` and `y.npy` from the processed-data folder; if either file is missing, it runs `data_ingestion.py` first to generate them.

In [2]:
# Locate the pre-extracted tabular features and labels under BASE_DIR.
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "Datasets", "processed")
X_path = os.path.join(PROCESSED_DATA_DIR, "X_tabular.npy")
y_path = os.path.join(PROCESSED_DATA_DIR, "y.npy")

# Check if data exists, if not, try ingestion
if not (os.path.exists(X_path) and os.path.exists(y_path)):
    print("Tabular data not found. Running ingestion to extract features...")
    # Address the script through BASE_DIR so it is found regardless of the
    # current working directory (the local fallback does not run from the repo
    # root), and launch it with the same interpreter as the kernel.
    ingestion_script = os.path.join(BASE_DIR, "Code", "scripts", "data_ingestion.py")
    get_ipython().system(f'"{sys.executable}" "{ingestion_script}"')

try:
    X_tabular = np.load(X_path)
    y = np.load(y_path)
except FileNotFoundError:
    print("Features not found. Ensure ingestion ran successfully.")
    raise

# Fail early with a clear message instead of deep inside cross-validation.
if len(X_tabular) != len(y):
    raise ValueError(
        f"Feature/label length mismatch: X_tabular has {len(X_tabular)} rows, y has {len(y)}"
    )

print(f"Loaded tabular features: {X_tabular.shape}")

Loaded tabular features: (2546, 18)


## 3. Train & Evaluate Models

In [3]:
# 5-fold stratified cross-validation of the classical baselines.
# Reports per-model AUC / accuracy and persists the estimators fitted on the
# final fold (plus the scaler they were trained with) for later inspection.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
}

scores = {name: {'auc': [], 'acc': []} for name in models}
# Missing feature values are replaced by 0 before modelling.
X = np.nan_to_num(X_tabular, nan=0.0)

# Output location is loop-invariant: create it once, before training.
model_dir = os.path.join(BASE_DIR, "Code", "Baseline", "Models")
os.makedirs(model_dir, exist_ok=True)

last_fold_models = {}
last_fold_scaler = None

print("Starting Cross-Validation...")

for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
    print(f"Fold {fold+1}")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # The scaler is fit on the training split only, so no validation
    # statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    last_fold_scaler = scaler

    for name, model in models.items():
        # Train a fresh, unfitted copy per fold rather than refitting the
        # shared template instance.
        clf = clone(model)

        # The Random Forest is trained on raw features; every other model
        # gets the standardized ones.
        if name == "Random Forest":
            input_train, input_val = X_train, X_val
        else:
            input_train, input_val = X_train_scaled, X_val_scaled
        clf.fit(input_train, y_train)

        if hasattr(clf, "predict_proba"):
            y_pred_prob = clf.predict_proba(input_val)[:, 1]
        else:
            y_pred_prob = clf.predict(input_val)

        acc = accuracy_score(y_val, (y_pred_prob > 0.5).astype(int))
        auc = roc_auc_score(y_val, y_pred_prob)

        scores[name]['auc'].append(auc)
        scores[name]['acc'].append(acc)

        last_fold_models[name] = clf

# Save Model: only the estimators from the last fold are kept (for checking),
# so write them once after the loop instead of overwriting the files each fold.
for name, clf in last_fold_models.items():
    model_path = os.path.join(model_dir, f"baseline_paper4_{name.replace(' ', '_')}.pkl")
    joblib.dump(clf, model_path)

# The Logistic Regression pickle expects standardized inputs; store the matching
# scaler next to it so the saved model can be applied to raw features later.
if last_fold_scaler is not None:
    joblib.dump(last_fold_scaler, os.path.join(model_dir, "baseline_paper4_scaler.pkl"))

# Keep `models` holding the last-fold fitted estimators, as code that reads
# this dict after cross-validation may rely on them being fitted.
models.update(last_fold_models)

print("\n=== Results ===")
for name in models:
    print(f"{name} AUC: {np.mean(scores[name]['auc']):.4f} +/- {np.std(scores[name]['auc']):.4f}")
    print(f"{name} Acc: {np.mean(scores[name]['acc']):.4f}")

print(f"Models saved to {model_dir}")

Starting Cross-Validation...
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5

=== Results ===
Logistic Regression AUC: 0.6764 +/- 0.0166
Logistic Regression Acc: 0.6473
Random Forest AUC: 0.8373 +/- 0.0251
Random Forest Acc: 0.8256
Models saved to /content/NeuroFetal-AI/Code/Baseline/Models


In [4]:
# 4. Save Results & Push to GitHub
# Writes the cross-validation summary to a text file and, when running in
# Colab, commits the results and saved models and pushes them to `main`.
results_path = os.path.join(BASE_DIR, "Code", "Baseline", "baseline_paper4_results.txt")

with open(results_path, "w") as f:
    f.write("Ref: Paper 4 (Petrozziello 2018) - Classical ML Baseline (Colab Run)\n")
    for name in models:
        mean_auc = np.mean(scores[name]['auc'])
        std_auc = np.std(scores[name]['auc'])
        mean_acc = np.mean(scores[name]['acc'])
        f.write(f"{name}: AUC {mean_auc:.4f} +/- {std_auc:.4f}, Acc {mean_acc:.4f}\n")
print(f"Results saved to {results_path}")


def _run_git(command, check=True):
    """Run a fixed git shell command inside BASE_DIR and echo its output.

    Args:
        command: Shell command line to execute (constant strings only).
        check: When True, a non-zero exit status raises
            subprocess.CalledProcessError.

    Returns:
        The command's exit status.
    """
    result = subprocess.run(command, shell=True, cwd=BASE_DIR, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="")
    if check:
        result.check_returncode()
    return result.returncode


# Git Commit & Push
# `get_ipython().system` does not raise when a command fails, so failures went
# unnoticed and the success message was printed regardless. Exit codes are now
# checked, which makes the except branch below meaningful.
try:
    if 'google.colab' in sys.modules:
        print("Pushing results to GitHub...")
        os.chdir(BASE_DIR)

        _run_git('git config --global user.email "krishnasikheriya001@gmail.com"')
        _run_git('git config --global user.name "Krishna200608"')

        _run_git('git pull origin main')
        _run_git('git add Code/Baseline/Models/*.pkl Code/Baseline/*.txt')

        # `git diff --cached --quiet` exits non-zero only when something is
        # staged; skip the commit otherwise, since an empty commit would fail.
        if _run_git('git diff --cached --quiet', check=False) != 0:
            _run_git('git commit -m "Update Classical ML Baseline Results (Colab)"')
        else:
            print("No changes to commit.")

        _run_git('git push origin main')
        print("Successfully pushed to GitHub!")
except (subprocess.CalledProcessError, OSError) as e:
    print(f"Git Push Failed: {e}")

Results saved to /content/NeuroFetal-AI/Code/Baseline/baseline_paper4_results.txt
Pushing results to GitHub...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 5 (delta 3), reused 5 (delta 3), pack-reused 0 (from 0)[K
Unpacking objects: 100% (5/5), 13.92 KiB | 1.26 MiB/s, done.
From https://github.com/Krishna200608/NeuroFetal-AI
 * branch            main       -> FETCH_HEAD
   ed00edf..0c1319b  main       -> origin/main
Updating ed00edf..0c1319b
Fast-forward
 Code/Baseline/Paper3_CNN.ipynb | 1389 [32m++++++++++++++++++++++++++++++++[m[31m--------[m
 1 file changed, 1125 insertions(+), 264 deletions(-)
[main e2ef64c] Update Classical ML Baseline Results (Colab)
 3 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 Code/Baseline/Models/baseline_paper4_Logistic_Regression.pkl
 create mode 100644 Code/Baseline/Models/baseline_paper4_Random_Forest.pkl
Enumerating objects