In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                             accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, roc_curve)
import os
from pathlib import Path

## Training

In [19]:
def _split_features_target(df, target_column, drop_columns):
    """Return (X, y): y is df[target_column]; X is df without the target and without drop_columns."""
    y = df[target_column]
    # errors='ignore' so datasets that simply do not contain an excluded column still load.
    X = df.drop(columns=[target_column]).drop(columns=list(drop_columns), errors='ignore')
    return X, y


def _one_hot_encode_splits(X_train, X_val, X_test):
    """One-hot encode object-dtype columns, using the training set's dummy columns as the layout."""
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    if categorical_cols:
        print(f"\nCategorical columns found: {categorical_cols}")
        print("Applying one-hot encoding...")

        # Only the training set decides which level is the dropped reference level.
        X_train = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
        # Validation/test keep ALL levels here. Dropping "their" first level independently
        # could remove a different level than training did (when a split lacks the
        # training reference level), and the reindex below would then zero-fill a real
        # category. The reindex removes the training reference level instead.
        X_val = pd.get_dummies(X_val, columns=categorical_cols)
        X_test = pd.get_dummies(X_test, columns=categorical_cols)

    # Align columns across all sets: levels unseen in training are discarded,
    # levels missing from a split become all-zero columns.
    X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    return X_train, X_val, X_test


def train_logistic_regression(permutation_folder, target_column='Outcome', random_state=42,
                              output_dir='../results/LogisticRegression/',
                              drop_columns=('diabetes_stage',)):
    """
    Train a Logistic Regression classifier on a permutation dataset.

    Reads train.csv, validation.csv and test.csv from the folder, one-hot encodes
    object-dtype columns, min-max scales the features (scaler fitted on the training
    set only), fits the model, evaluates it on all three splits, and writes a plot
    and a metrics CSV via plot_results / save_metrics (defined elsewhere in this
    notebook; their cells must have been executed before calling this function).

    Parameters:
    - permutation_folder: path to the permutation folder containing train/validation/test CSVs
    - target_column: name of the target column (default: 'Outcome')
    - random_state: random seed for reproducibility
    - output_dir: directory the plot and metrics CSV are written to; created if missing
    - drop_columns: extra columns removed from the features in every split
      (default: 'diabetes_stage'); columns that are not present are ignored

    Returns:
    - model: trained LogisticRegression model
    - scaler: fitted MinMaxScaler
    - results: dictionary keyed by 'train' / 'validation' / 'test', each holding the
      metrics, the confusion matrix, and the y_true / y_pred / y_proba used to compute them

    Raises:
    - KeyError: if target_column is missing from any of the three CSVs
    """

    # A bare string would otherwise be split into single characters by list().
    if isinstance(drop_columns, str):
        drop_columns = [drop_columns]

    print(f"Loading data from: {permutation_folder}")
    print("="*60)

    # Load the datasets
    train_df = pd.read_csv(os.path.join(permutation_folder, 'train.csv'))
    val_df = pd.read_csv(os.path.join(permutation_folder, 'validation.csv'))
    test_df = pd.read_csv(os.path.join(permutation_folder, 'test.csv'))

    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")

    # Separate features and target
    X_train, y_train = _split_features_target(train_df, target_column, drop_columns)
    X_val, y_val = _split_features_target(val_df, target_column, drop_columns)
    X_test, y_test = _split_features_target(test_df, target_column, drop_columns)

    # Handle categorical variables and align columns across all sets
    X_train, X_val, X_test = _one_hot_encode_splits(X_train, X_val, X_test)

    print(f"\nFeatures: {list(X_train.columns)}")
    print(f"Number of features: {X_train.shape[1]}")

    # Scale features; the scaler only ever sees the training data when fitting.
    print("\nScaling features (min-max)...")
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train Logistic Regression model
    print("\nTraining Logistic Regression model...")
    model = LogisticRegression(random_state=random_state, max_iter=1000)
    model.fit(X_train_scaled, y_train)

    print("Training complete!")

    # Predict and score each split
    results = {}
    for split_name, X_scaled, y_true in [
        ('train', X_train_scaled, y_train),
        ('validation', X_val_scaled, y_val),
        ('test', X_test_scaled, y_test)
    ]:
        y_pred = model.predict(X_scaled)
        # Column 1 of predict_proba is the second entry of model.classes_.
        y_proba = model.predict_proba(X_scaled)[:, 1]
        results[split_name] = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, zero_division=0),
            'recall': recall_score(y_true, y_pred, zero_division=0),
            'f1': f1_score(y_true, y_pred, zero_division=0),
            'roc_auc': roc_auc_score(y_true, y_proba),
            'confusion_matrix': confusion_matrix(y_true, y_pred),
            'y_true': y_true,
            'y_pred': y_pred,
            'y_proba': y_proba
        }

    # Print results
    print("\n" + "="*60)
    print("EVALUATION RESULTS")
    print("="*60)

    for split_name in ['train', 'validation', 'test']:
        split_metrics = results[split_name]
        print(f"\n{split_name.upper()} SET:")
        print(f"  Accuracy:  {split_metrics['accuracy']:.4f}")
        print(f"  Precision: {split_metrics['precision']:.4f}")
        print(f"  Recall:    {split_metrics['recall']:.4f}")
        print(f"  F1-Score:  {split_metrics['f1']:.4f}")
        print(f"  ROC-AUC:   {split_metrics['roc_auc']:.4f}")

    # Make sure the output directory exists before anything is written into it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Generate visualizations
    plot_results(results, model, X_train.columns, output_dir)

    # Save model metrics to file
    save_metrics(results, output_dir)

    return model, scaler, results

## Plot

In [3]:
def plot_results(results, model, feature_names, output_dir):
    """
    Generate a 3x3 evaluation figure and save it as 'model_evaluation.png' in output_dir.

    Panels: confusion matrix per split (row 1); metric bar chart, ROC curves and
    model coefficients (row 2); predicted-probability histograms per split (row 3).

    Parameters:
    - results: dictionary keyed by 'train' / 'validation' / 'test'; each entry must provide
      'confusion_matrix', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc',
      'y_true' and 'y_proba'
    - model: fitted model exposing coef_; only coef_[0] is plotted
    - feature_names: feature labels, in the same order as model.coef_[0]
    - output_dir: directory the PNG is written to; created if it does not exist

    Returns:
    - None (the figure is saved to disk and closed, not displayed)
    """
    split_names = ['train', 'validation', 'test']

    # Set style
    sns.set_style("whitegrid")

    # One explicit figure object; every axis is created on it rather than on
    # pyplot's implicit "current figure".
    fig = plt.figure(figsize=(18, 12))

    # 1. Confusion Matrices
    for idx, split_name in enumerate(split_names, 1):
        ax = fig.add_subplot(3, 3, idx)
        cm = results[split_name]['confusion_matrix']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
        ax.set_title(f'{split_name.capitalize()} Confusion Matrix', fontweight='bold')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    # 2. Metrics Comparison Bar Chart
    ax = fig.add_subplot(3, 3, 4)
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    x = np.arange(len(metrics))
    width = 0.25

    # Three bars per metric, shifted by one bar width around each tick.
    for offset, split_name, label in [
        (-width, 'train', 'Train'),
        (0, 'validation', 'Validation'),
        (width, 'test', 'Test')
    ]:
        scores = [results[split_name][m] for m in metrics]
        ax.bar(x + offset, scores, width, label=label, alpha=0.8)

    ax.set_ylabel('Score')
    ax.set_title('Metrics Comparison', fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 1.1)
    ax.grid(axis='y', alpha=0.3)

    # 3. ROC Curves
    ax = fig.add_subplot(3, 3, 5)
    for split_name, color in [('train', 'blue'), ('validation', 'orange'), ('test', 'green')]:
        fpr, tpr, _ = roc_curve(results[split_name]['y_true'], results[split_name]['y_proba'])
        auc = results[split_name]['roc_auc']
        ax.plot(fpr, tpr, label=f'{split_name.capitalize()} (AUC={auc:.3f})', color=color, linewidth=2)

    ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curves', fontweight='bold')
    ax.legend()
    ax.grid(alpha=0.3)

    # 4. Feature Importance (Coefficients)
    ax = fig.add_subplot(3, 3, 6)
    coefficients = model.coef_[0]
    # Sorted ascending by magnitude so the largest coefficients end up at the top of the barh plot.
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients)
    }).sort_values('Abs_Coefficient', ascending=True)

    colors = ['red' if c < 0 else 'green' for c in feature_importance['Coefficient']]
    positions = range(len(feature_importance))
    ax.barh(positions, feature_importance['Coefficient'], color=colors, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_importance['Feature'])
    ax.set_xlabel('Coefficient Value')
    ax.set_title('Feature Importance (Coefficients)', fontweight='bold')
    ax.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
    ax.grid(axis='x', alpha=0.3)

    # 5. Prediction Distribution
    for idx, split_name in enumerate(split_names, 7):
        ax = fig.add_subplot(3, 3, idx)
        # Plain arrays so the boolean masks are applied positionally, whatever
        # container (ndarray, Series, list) the caller stored in results.
        y_proba = np.asarray(results[split_name]['y_proba'])
        y_true = np.asarray(results[split_name]['y_true'])

        ax.hist(y_proba[y_true == 0], bins=30, alpha=0.6, label='No Diabetes', color='green')
        ax.hist(y_proba[y_true == 1], bins=30, alpha=0.6, label='Diabetes', color='red')
        ax.set_xlabel('Predicted Probability')
        ax.set_ylabel('Frequency')
        ax.set_title(f'{split_name.capitalize()} Prediction Distribution', fontweight='bold')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    fig.tight_layout()

    # Save plot; savefig does not create missing directories, so do it here.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    plot_path = os.path.join(output_dir, 'model_evaluation.png')
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"\nVisualization saved to: {plot_path}")
    # Close this specific figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [13]:
def save_metrics(results, output_dir):
    """
    Save the scalar metrics of every split to 'model_metrics.csv' inside output_dir.

    Parameters:
    - results: dictionary keyed by 'train' / 'validation' / 'test'; each entry must
      provide 'accuracy', 'precision', 'recall', 'f1' and 'roc_auc'
    - output_dir: directory the CSV is written to; created if it does not exist

    Returns:
    - None (one CSV row per split is written and the path is printed)
    """
    # CSV column header -> key inside each results[split] dictionary.
    column_to_key = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1',
        'ROC-AUC': 'roc_auc'
    }

    metrics_data = []
    for split_name in ['train', 'validation', 'test']:
        row = {'Split': split_name}
        for column, key in column_to_key.items():
            row[column] = results[split_name][key]
        metrics_data.append(row)

    # Column order is pinned explicitly so the CSV layout never depends on dict ordering.
    metrics_df = pd.DataFrame(metrics_data, columns=['Split', *column_to_key])

    # to_csv does not create missing directories; without this a fresh checkout
    # (no results folder yet) fails only after the whole training run has finished.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    csv_path = os.path.join(output_dir, 'model_metrics.csv')
    metrics_df.to_csv(csv_path, index=False)
    print(f"Metrics saved to: {csv_path}")


In [21]:
# Run the full train/evaluate pipeline on the first permutation
permutation_folder = '../data/processed/permutation_000001'

# Only start training when the permutation folder is present on disk;
# otherwise report the problem and leave model/scaler/results undefined.
if os.path.exists(permutation_folder):
    model, scaler, results = train_logistic_regression(
        permutation_folder,
        target_column='diagnosed_diabetes',
        random_state=42,
    )

    print(f"\n{'=' * 60}")
    print("Training and evaluation complete!")
    print("=" * 60)
else:
    print(f"Error: Folder '{permutation_folder}' not found!")
    print("Please specify the correct permutation folder path.")

Loading data from: ../data/processed/permutation_000001
Train set size: 60000
Validation set size: 20000
Test set size: 20000

Categorical columns found: ['gender', 'ethnicity', 'education_level', 'income_level', 'employment_status', 'smoking_status']
Applying one-hot encoding...

Features: ['age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'glucose_fasting', 'glucose_postprandial', 'insulin_level', 'hba1c', 'diabetes_risk_score', 'gender_Male', 'gender_Other', 'ethnicity_Black', 'ethnicity_Hispanic', 'ethnicity_Other', 'ethnicity_White', 'education_level_Highschool', 'education_level_No formal', 'education_level_Postgraduate', 'income_level_Low', 'income_level_Lower-Midd