In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

def run_phase_f(X=None, y=None):
    """
    Execute Phase F: baseline modeling and evaluation.

    Splits the data 80/20 (stratified on the labels), fits a Logistic
    Regression with ``class_weight='balanced'`` and prints an evaluation
    report: accuracy, ROC-AUC, precision/recall/F1, the per-class
    classification report, the confusion matrix and a short interpretation.

    Args:
        X: Feature matrix. When omitted (``None``), the module-level global
            ``X`` is used (the error message points at Phase E as its source).
        y: Label vector. When omitted (``None``), the module-level global
            ``y`` is used. The report labels class 0 "Non-Toxic" and class 1
            "Toxic" -- NOTE(review): assumes binary 0/1 labels; confirm
            against Phase E.

    Returns:
        A tuple ``(model, X_test, y_test, y_pred, y_prob)``: the fitted
        model, the held-out features and labels, the hard predictions on the
        held-out set, and column 1 of ``predict_proba`` for the same rows.

    Raises:
        AssertionError: If ``X`` or ``y`` is neither passed in nor defined as
            a module-level global.
    """
    print("--- Starting PHASE F: Baseline Modeling and Evaluation ---")

    # Verify pre-conditions. The checks raise explicitly instead of using
    # `assert` so they still run under `python -O`; AssertionError is kept so
    # the exception type seen by existing callers does not change.
    if X is None:
        if 'X' not in globals():
            raise AssertionError(
                "CRITICAL ERROR: Feature matrix X not found. Run Phase E first."
            )
        X = globals()['X']
    if y is None:
        if 'y' not in globals():
            raise AssertionError(
                "CRITICAL ERROR: Label vector y not found. Run Phase E first."
            )
        y = globals()['y']

    # Train-test split: stratify on y so both splits keep the class ratio;
    # the fixed seed makes the split reproducible.
    print("[Setup] Splitting data 80/20 (Stratified)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.20,
        stratify=y,
        random_state=42
    )

    print(f" -> Training Set: {X_train.shape[0]} samples")
    print(f" -> Test Set:     {X_test.shape[0]} samples")

    # Initialize the model. 'balanced' class weights are used to counter the
    # class imbalance in the labels.
    print("[Training] Training Logistic Regression (class_weight='balanced')...")
    model = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        solver='liblinear',
        random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Hard predictions plus the score of the second class column, which is
    # what ROC-AUC is computed from below.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Evaluation metrics
    print("\n" + "="*45)
    print("          MODEL EVALUATION REPORT")
    print("="*45)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)

    print(f"Accuracy:          {acc:.4f}")
    print(f"ROC-AUC:           {roc:.4f}")
    print("-"*45)
    print(f"Precision (Toxic): {prec:.4f}")
    print(f"Recall (Toxic):    {rec:.4f}  <-- CRITICAL METRIC")
    print(f"F1-Score (Toxic):  {f1:.4f}")
    print("="*45)

    # Classification report
    print("\n[Detailed Classification Report]")
    print(classification_report(y_test, y_pred, target_names=['Non-Toxic (0)', 'Toxic (1)']))

    # Confusion matrix: rows are actual classes, columns are predicted ones,
    # so flattening a 2x2 matrix yields tn, fp, fn, tp in that order.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print("\n[Confusion Matrix]")
    print("                 Predicted 0   Predicted 1")
    print(f"Actual 0 (Safe)      {tn:<5}         {fp:<5}")
    print(f"Actual 1 (Toxic)     {fn:<5}         {tp:<5}")
    print(f"\n -> False Negatives (Missed Toxins): {fn}")
    print(f" -> False Positives (False Alarms):  {fp}")

    # Scientific interpretation
    print("\n" + "-"*60)
    print("SCIENTIFIC INTERPRETATION:")
    print("-"*60)
    print("1. Recall vs. Accuracy:")
    print(f"   Accuracy ({acc:.2%}) is less important than Recall ({rec:.2%}) here.")
    print("   In toxicology, our primary goal is to catch toxic molecules.")

    print("\n2. The Cost of False Negatives:")
    print(f"   The model missed {fn} toxic molecules (False Negatives).")
    print("   These represent potential safety failures in drug discovery.")
    print("   A high Recall minimizes these dangerous misses.")

    print("\n3. Baseline Performance:")
    print("   Using class_weight='balanced' forces the model to pay attention")
    print("   to the minority class, often trading some Precision for higher Recall.")
    print("-"*60)

    return model, X_test, y_test, y_pred, y_prob

# --- EXECUTION ---

# Run Phase F once and bind its five outputs to module-level names so they
# stay available for potential later use.
(
    lr_model,
    X_test,
    y_test,
    y_pred_baseline,
    y_prob_baseline,
) = run_phase_f()

# Final success message
print("\n--- PHASE F COMPLETED SUCCESSFULLY ---")

--- Starting PHASE F: Baseline Modeling and Evaluation ---
[Setup] Splitting data 80/20 (Stratified)...
 -> Training Set: 1168 samples
 -> Test Set:     293 samples
[Training] Training Logistic Regression (class_weight='balanced')...

          MODEL EVALUATION REPORT
Accuracy:          0.9215
ROC-AUC:           0.8598
---------------------------------------------
Precision (Toxic): 0.4444
Recall (Toxic):    0.3810  <-- CRITICAL METRIC
F1-Score (Toxic):  0.4103

[Detailed Classification Report]
               precision    recall  f1-score   support

Non-Toxic (0)       0.95      0.96      0.96       272
    Toxic (1)       0.44      0.38      0.41        21

     accuracy                           0.92       293
    macro avg       0.70      0.67      0.68       293
 weighted avg       0.92      0.92      0.92       293


[Confusion Matrix]
                 Predicted 0   Predicted 1
Actual 0 (Safe)      262           10   
Actual 1 (Toxic)     13            8    

 -> False Negatives (