# **Model Training**
### **Model Performance Analysis: Augmented Data**

### **IMPORTS**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA
from google.colab import drive


### **1.   LOAD AUGUMENTED DATASET**



In [None]:
drive.mount('/content/drive',  force_remount=True)

df_real = pd.read_csv('/content/drive/MyDrive/Stroke_GenAI_Project/clean_stroke_data.csv')
df_augmented = pd.read_csv('/content/drive/MyDrive/Stroke_GenAI_Project/augmented_stroke_data.csv')

print("=== Datasets Loaded ===")
print(f"Real data shape: {df_real.shape}")
print(f"Augmented data shape: {df_augmented.shape}")
print(f"\nAugmented class distribution:\n{df_augmented['stroke'].value_counts()}")

Mounted at /content/drive
=== Datasets Loaded ===
Real data shape: (5110, 21)
Augmented data shape: (7310, 20)

Augmented class distribution:
stroke
0    4861
1    2449
Name: count, dtype: int64


### **2. DEFINE X AND y FOR REAL DATA**

In [None]:
# Drop ID column if present
if 'id' in df_real.columns:
    df_real = df_real.drop(columns=['id'])

# Separate features and target - Real dataset
X_real = df_real.drop(columns=['stroke'])
y_real = df_real['stroke']

print("\n=== Features and Target Defined (Real Data) ===")
print(f"Real - X shape: {X_real.shape}, y shape: {y_real.shape}")


=== Features and Target Defined (Real Data) ===
Real - X shape: (5110, 19), y shape: (5110,)


### **3. TRAIN/TEST SPLIT & SHUFFLE FOR REAL DATA**

In [None]:
random_state = 42
test_size = 0.2

# Split real data
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    X_real, y_real, test_size=test_size, random_state=random_state, stratify=y_real
)

print("\n=== Train/Test Split (Real Data) ===")
print(f"Real - Train: {X_train_real.shape}, Test: {X_test_real.shape}")


=== Train/Test Split (Real Data) ===
Real - Train: (4088, 19), Test: (1022, 19)


### **4. ENCODING & SCALING FOR REAL DATA**

In [None]:
# List of one-hot encoded columns (all object columns)
one_hot_cols_real = ['gender_female', 'gender_male', 'gender_other',
                'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
                'work_type_Self-employed', 'work_type_children',
                'smoking_status_Unknown', 'smoking_status_formerly smoked',
                'smoking_status_never smoked', 'smoking_status_smokes']

# Convert all to numeric, handling both string and numeric representations
for col in one_hot_cols_real:
    # Convert to string first, then map to numeric
    X_train_real[col] = X_train_real[col].astype(str)
    X_test_real[col] = X_test_real[col].astype(str)

    # Map all variations to 0 or 1
    X_train_real[col] = X_train_real[col].map({'True': 1, 'False': 0, '1': 1, '0': 0})
    X_test_real[col] = X_test_real[col].map({'True': 1, 'False': 0, '1': 1, '0': 0})

    # Fill any NaN values (from unexpected strings) with 0
    X_train_real[col] = X_train_real[col].fillna(0).astype(int)
    X_test_real[col] = X_test_real[col].fillna(0).astype(int)

# Now scale the real data
scaler_real = StandardScaler()
X_train_real_scaled = scaler_real.fit_transform(X_train_real)
X_test_real_scaled = scaler_real.transform(X_test_real)

print("Data types after conversion (Real Data):")
print(X_train_real.dtypes)
print(f"\nShape after scaling (Real Data): {X_train_real_scaled.shape}")
print("\n=== Feature Scaling Complete (Real Data) ===")

Data types after conversion (Real Data):
gender_female                       int64
gender_male                         int64
gender_other                        int64
age                               float64
hypertension                        int64
heart_disease                       int64
ever_married                        int64
work_type_Govt_job                  int64
work_type_Never_worked              int64
work_type_Private                   int64
work_type_Self-employed             int64
work_type_children                  int64
Residence_type                      int64
avg_glucose_level                 float64
bmi                               float64
smoking_status_Unknown              int64
smoking_status_formerly smoked      int64
smoking_status_never smoked         int64
smoking_status_smokes               int64
dtype: object

Shape after scaling (Real Data): (4088, 19)

=== Feature Scaling Complete (Real Data) ===


In [None]:
df_augmented.head()

Unnamed: 0,gender_female,gender_male,gender_other,age,hypertension,heart_disease,ever_married,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type,avg_glucose_level,bmi,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,False,True,False,67.0,0,1,1,False,False,True,False,False,1,228.69,36.6,False,True,False,False,1
1,True,False,False,61.0,0,0,1,False,False,False,True,False,0,202.21,29.95,False,False,True,False,1
2,False,True,False,80.0,0,1,1,False,False,True,False,False,0,105.92,32.5,False,False,True,False,1
3,True,False,False,49.0,0,0,1,False,False,True,False,False,1,171.23,34.4,False,False,False,True,1
4,True,False,False,79.0,1,0,1,False,False,False,True,False,0,174.12,24.0,False,False,True,False,1


### **2. DEFINE X AND y**

In [None]:

# Drop ID column if present
if 'id' in df_augmented.columns:
    df_augmented = df_augmented.drop(columns=['id'])

# Separate features and target - Augmented dataset
X_augmented = df_augmented.drop(columns=['stroke'])
y_augmented = df_augmented['stroke']

print("\n=== Features and Target Defined ===")
print(f"Augmented - X shape: {X_augmented.shape}, y shape: {y_augmented.shape}")


=== Features and Target Defined ===
Augmented - X shape: (7310, 19), y shape: (7310,)


### **3. TRAIN/TEST SPLIT & SHUFFLE**

In [None]:
random_state = 42
test_size = 0.2

# Split augmented data
X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(
    X_augmented, y_augmented, test_size=test_size, random_state=random_state, stratify=y_augmented
)

print("\n=== Train/Test Split ===")
print(f"Augmented - Train: {X_train_aug.shape}, Test: {X_test_aug.shape}")


=== Train/Test Split ===
Augmented - Train: (5848, 19), Test: (1462, 19)


### **4. ENCODING & SCALING**

In [None]:
# Convert string columns in AUGMENTED data to int (0/1)
object_cols_aug = X_train_aug.select_dtypes(include=['object']).columns
for col in object_cols_aug:
    X_train_aug[col] = X_train_aug[col].astype(str).map(
        {'True': 1, 'False': 0, 'true': 1, 'false': 0, '1': 1, '0': 0}
    ).fillna(0).astype(int)
    X_test_aug[col] = X_test_aug[col].astype(str).map(
        {'True': 1, 'False': 0, 'true': 1, 'false': 0, '1': 1, '0': 0}
    ).fillna(0).astype(int)


In [None]:
# List of one-hot encoded columns (all object columns)
one_hot_cols = ['gender_female', 'gender_male', 'gender_other',
                'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
                'work_type_Self-employed', 'work_type_children',
                'smoking_status_Unknown', 'smoking_status_formerly smoked',
                'smoking_status_never smoked', 'smoking_status_smokes']

# Convert all to numeric, handling both string and numeric representations
for col in one_hot_cols:
    # Convert to string first, then map to numeric
    X_train_aug[col] = X_train_aug[col].astype(str)
    X_test_aug[col] = X_test_aug[col].astype(str)

    # Map all variations to 0 or 1
    X_train_aug[col] = X_train_aug[col].map({'True': 1, 'False': 0, '1': 1, '0': 0})
    X_test_aug[col] = X_test_aug[col].map({'True': 1, 'False': 0, '1': 1, '0': 0})

    # Fill any NaN values (from unexpected strings) with 0
    X_train_aug[col] = X_train_aug[col].fillna(0).astype(int)
    X_test_aug[col] = X_test_aug[col].fillna(0).astype(int)

# Now scale the data
scaler_aug = StandardScaler()
X_train_aug_scaled = scaler_aug.fit_transform(X_train_aug)
X_test_aug_scaled = scaler_aug.transform(X_test_aug)

print("Data types after conversion:")
print(X_train_aug.dtypes)
print(f"\nShape after scaling: {X_train_aug_scaled.shape}")

Data types after conversion:
gender_female                       int64
gender_male                         int64
gender_other                        int64
age                               float64
hypertension                        int64
heart_disease                       int64
ever_married                        int64
work_type_Govt_job                  int64
work_type_Never_worked              int64
work_type_Private                   int64
work_type_Self-employed             int64
work_type_children                  int64
Residence_type                      int64
avg_glucose_level                 float64
bmi                               float64
smoking_status_Unknown              int64
smoking_status_formerly smoked      int64
smoking_status_never smoked         int64
smoking_status_smokes               int64
dtype: object

Shape after scaling: (5848, 19)


In [None]:
# First check if real data has the same issue
print("X_train_real data types:")
print(X_train_aug.dtypes)

X_train_real data types:
gender_female                       int64
gender_male                         int64
gender_other                        int64
age                               float64
hypertension                        int64
heart_disease                       int64
ever_married                        int64
work_type_Govt_job                  int64
work_type_Never_worked              int64
work_type_Private                   int64
work_type_Self-employed             int64
work_type_children                  int64
Residence_type                      int64
avg_glucose_level                 float64
bmi                               float64
smoking_status_Unknown              int64
smoking_status_formerly smoked      int64
smoking_status_never smoked         int64
smoking_status_smokes               int64
dtype: object


In [None]:
# Scale aug data
X_train_aug_scaled = scaler_aug.fit_transform(X_train_aug)
X_test_aug_scaled = scaler_aug.transform(X_test_aug)

print("\n=== Feature Scaling Complete ===")


=== Feature Scaling Complete ===


### **5. MODEL TRAINING AND EVALUATION**

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        'AUC-ROC': roc_auc_score(y_test, y_pred_proba)
    }

    return metrics, y_pred, y_pred_proba

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    # Train the model
    model.fit(X_train, y_train)

    # Predictions on training set
    y_train_pred = model.predict(X_train)
    y_train_pred_proba = model.predict_proba(X_train)[:, 1] if hasattr(model, 'predict_proba') else None

    # Predictions on test set
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics for both sets
    metrics = {
        'Model': model_name,
        # Training metrics
        'Train_Accuracy': accuracy_score(y_train, y_train_pred),
        'Train_Precision': precision_score(y_train, y_train_pred, zero_division=0),
        'Train_Recall': recall_score(y_train, y_train_pred, zero_division=0),
        'Train_F1': f1_score(y_train, y_train_pred, zero_division=0),
        'Train_AUC': roc_auc_score(y_train, y_train_pred_proba) if y_train_pred_proba is not None else None,

        # Testing metrics
        'Test_Accuracy': accuracy_score(y_test, y_test_pred),
        'Test_Precision': precision_score(y_test, y_test_pred, zero_division=0),
        'Test_Recall': recall_score(y_test, y_test_pred, zero_division=0),
        'Test_F1': f1_score(y_test, y_test_pred, zero_division=0),
        'Test_AUC': roc_auc_score(y_test, y_test_pred_proba) if y_test_pred_proba is not None else None,

        # Difference indicators (helps identify overfitting)
        'Accuracy_Diff': accuracy_score(y_train, y_train_pred) - accuracy_score(y_test, y_test_pred),
        'F1_Diff': f1_score(y_train, y_train_pred, zero_division=0) - f1_score(y_test, y_test_pred, zero_division=0)
    }

    return metrics, y_test_pred, y_test_pred_proba

**5a  Initialize Model**

In [None]:


models = {
    'Logistic Regression': LogisticRegression(
        random_state=random_state,
        max_iter=1,
        C=0.5
    ),

    'Random Forest': RandomForestClassifier(
        random_state=random_state,
        n_estimators=100
    ),

    'XGBoost': XGBClassifier(
        random_state=random_state,
        eval_metric='logloss',
        n_estimators=200,
        learning_rate=0.1,
        max_depth=4
    )
}

**5b Train Augumented Data**

In [None]:
# Train on augmented data
print("\n=== Training on Augmented Data ===")
results_augmented = []
predictions_augmented = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    metrics, y_pred, y_pred_proba = evaluate_model(
        model, X_train_aug_scaled, y_train_aug,
        X_test_aug_scaled, y_test_aug, name
    )
    results_augmented.append(metrics)
    predictions_augmented[name] = {'y_pred': y_pred, 'y_pred_proba': y_pred_proba}

    # Print both train and test performance
    print(f"Train Accuracy: {metrics['Train_Accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['Test_Accuracy']:.4f}")
    print(f"Accuracy Difference: {metrics['Accuracy_Diff']:.4f}")

    # Check for overfitting
    if metrics['Accuracy_Diff'] > 0.1:
        print("⚠️  WARNING: Potential overfitting (train accuracy > test accuracy by > 10%)")
    elif metrics['Accuracy_Diff'] > 0.05:
        print("⚠️  Possible slight overfitting")
    else:
        print("✅ Train and test performance are consistent")

model_performance_augmented = pd.DataFrame(results_augmented)
print("\n=== Augmented Data Performance ===")
print(model_performance_augmented.to_string())


=== Training on Augmented Data ===

Training Logistic Regression...
Train Accuracy: 0.9644
Test Accuracy: 0.9692
Accuracy Difference: -0.0048
✅ Train and test performance are consistent

Training Random Forest...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy: 1.0000
Test Accuracy: 0.9699
Accuracy Difference: 0.0301
✅ Train and test performance are consistent

Training XGBoost...
Train Accuracy: 0.9685
Test Accuracy: 0.9692
Accuracy Difference: -0.0007
✅ Train and test performance are consistent

=== Augmented Data Performance ===
                 Model  Train_Accuracy  Train_Precision  Train_Recall  Train_F1  Train_AUC  Test_Accuracy  Test_Precision  Test_Recall   Test_F1  Test_AUC  Accuracy_Diff   F1_Diff
0  Logistic Regression        0.964432         0.996597      0.896886  0.944116   0.972367       0.969220        0.995546     0.912245  0.952077  0.980274      -0.004788 -0.007961
1        Random Forest        1.000000         1.000000      1.000000  1.000000   1.000000       0.969904        0.993363     0.916327  0.953291  0.980308       0.030096  0.046709
2              XGBoost        0.968536         0.998876      0.907095  0.950776   0.994583       0.969220        0.997763     0.910204  0.951974  0.984661      -0.00068

In [None]:
# Train on augmented data
print("\n=== Training on Augmented Data ===")
results_augmented = []
predictions_augmented = {}

for name, model in models.items():
    print(f"Training {name}...")
    metrics, y_pred, y_pred_proba = evaluate_model(
        model, X_train_aug_scaled, y_train_aug,
        X_test_aug_scaled, y_test_aug, name
    )
    results_augmented.append(metrics)
    predictions_augmented[name] = {'y_pred': y_pred, 'y_pred_proba': y_pred_proba}
    print(f"{name} - Accuracy: {metrics['Accuracy']:.4f}, F1: {metrics['F1-Score']:.4f}, AUC-ROC: {metrics['AUC-ROC']:.4f}")

model_performance_augmented = pd.DataFrame(results_augmented)
print("\n=== Augmented Data Performance ===")
print(model_performance_augmented)


=== Training on Augmented Data ===
Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyError: 'Accuracy'

### **6. SAVE MODELS AND METRICS**



In [None]:
# Save performance metrics
aug_metrics_path = '/content/drive/MyDrive/Stroke_GenAI_Project/model_performance_augmented.csv'

model_performance_augmented.to_csv(aug_metrics_path, index=False)

print(f"\n=== Metrics Saved ===")
print(f"Augmented data metrics: {aug_metrics_path}")


=== Metrics Saved ===
Augmented data metrics: /content/drive/MyDrive/Stroke_GenAI_Project/model_performance_augmented.csv


In [None]:
# Create Analysis table
analysis = pd.DataFrame({
    'Model': model_performance_augmented['Model'],
    'Accuracy_Aug': model_performance_augmented['Accuracy'],
    'Precision_Aug': model_performance_augmented['Precision'],
    'Recall_Aug': model_performance_augmented['Recall'],
    'F1_Aug': model_performance_augmented['F1-Score'],
    'AUC_Aug': model_performance_augmented['AUC-ROC']
})

print("\n=== Performance Analysis ===")
print(analysis)

analysis_path = '/content/drive/MyDrive/Stroke_GenAI_Project/model_analysis.csv'
analysis.to_csv(analysis_path, index=False)
print(f"\nComparison saved: {analysis_path}")


=== Performance Analysis ===
                 Model  Accuracy_Aug  Precision_Aug  Recall_Aug    F1_Aug  \
0  Logistic Regression      0.969220       0.995546    0.912245  0.952077   
1        Random Forest      0.969904       0.993363    0.916327  0.953291   
2              XGBoost      0.969220       0.997763    0.910204  0.951974   

    AUC_Aug  
0  0.980274  
1  0.980308  
2  0.984661  

Comparison saved: /content/drive/MyDrive/Stroke_GenAI_Project/model_analysis.csv


### 7 Summarize Model Performance


In [None]:
print(model_performance_augmented)

                 Model  Accuracy  Precision    Recall  F1-Score   AUC-ROC
0  Logistic Regression  0.969220   0.995546  0.912245  0.952077  0.980274
1        Random Forest  0.969904   0.993363  0.916327  0.953291  0.980308
2              XGBoost  0.969220   0.997763  0.910204  0.951974  0.984661


# **8. Save Model as Best stroke model.pkl**

In [None]:
import pickle

# Identify the model with the highest AUC-ROC and F1-Score
best_model_row = model_performance_augmented.loc[
    (model_performance_augmented['AUC-ROC'] == model_performance_augmented['AUC-ROC'].max()) &
    (model_performance_augmented['F1-Score'] == model_performance_augmented['F1-Score'].max())
]

# Define the path to save the best model
best_model_path = '/content/drive/MyDrive/Stroke_GenAI_Project/best_stroke_model_augmented.pkl'

if not best_model_row.empty:
    best_model_name = best_model_row['Model'].iloc[0]
    best_model_object = models[best_model_name]


    # Save the best model
    with open(best_model_path, 'wb') as file:
        pickle.dump(best_model_object, file)

    print(f"\n=== Best Model Identified and Saved ===")
    print(f"Best model by AUC-ROC and F1-Score: {best_model_name}")
    print(f"Model saved to: {best_model_path}")
else:
    print("Selecting the Default Model")
    best_model_name =  'XGBoost'
    best_model_object = models[best_model_name]
    with open(best_model_path, 'wb') as file:
      pickle.dump(best_model_object, file)
    print(f"Model saved to: {best_model_path}")


Selecting the Default Model
Model saved to: /content/drive/MyDrive/Stroke_GenAI_Project/best_stroke_model_augmented.pkl


### 9. Load Best Model



In [None]:
import pickle

# Define the path to the saved best model
best_model_path = '/content/drive/MyDrive/Stroke_GenAI_Project/best_stroke_model_augmented.pkl'

# Load the best performing XGBoost model
with open(best_model_path, 'rb') as file:
    best_xgboost_model = pickle.load(file)

print(f"Best XGBoost model loaded from: {best_model_path}")

Best XGBoost model loaded from: /content/drive/MyDrive/Stroke_GenAI_Project/best_stroke_model_augmented.pkl


### **10. Evaluate Performance on the Real VS Augumented Data**



In [None]:
print("\n=== Evaluating Best XGBoost Model on Real Data ===")
metrics_real_best_xgboost, _, _ = evaluate_model(
    best_xgboost_model, X_train_real_scaled, y_train_real, X_test_real_scaled, y_test_real, 'XGBoost_Augmented_on_Real'
)
print(f"XGBoost (Augmented trained, Real tested) - Accuracy: {metrics_real_best_xgboost['Accuracy']:.4f}, F1: {metrics_real_best_xgboost['F1-Score']:.4f}, AUC-ROC: {metrics_real_best_xgboost['AUC-ROC']:.4f}")

print("\n=== Evaluating Best XGBoost Model on Augmented Data ===")
metrics_aug_best_xgboost, _, _ = evaluate_model(
    best_xgboost_model, X_train_aug_scaled, y_train_aug, X_test_aug_scaled, y_test_aug, 'XGBoost_Augmented_on_Augmented'
)
print(f"XGBoost (Augmented trained, Augmented tested) - Accuracy: {metrics_aug_best_xgboost['Accuracy']:.4f}, F1: {metrics_aug_best_xgboost['F1-Score']:.4f}, AUC-ROC: {metrics_aug_best_xgboost['AUC-ROC']:.4f}")

# Create a DataFrame to summarize the results
summary_data = {
    'Model': [metrics_real_best_xgboost['Model'], metrics_aug_best_xgboost['Model']],
    'Dataset': ['Real Test Data', 'Augmented Test Data'],
    'Accuracy': [metrics_real_best_xgboost['Accuracy'], metrics_aug_best_xgboost['Accuracy']],
    'Precision': [metrics_real_best_xgboost['Precision'], metrics_aug_best_xgboost['Precision']],
    'Recall': [metrics_real_best_xgboost['Recall'], metrics_aug_best_xgboost['Recall']],
    'F1-Score': [metrics_real_best_xgboost['F1-Score'], metrics_aug_best_xgboost['F1-Score']],
    'AUC-ROC': [metrics_real_best_xgboost['AUC-ROC'], metrics_aug_best_xgboost['AUC-ROC']]
}

model_comparison_summary = pd.DataFrame(summary_data)

print("\n=== Best XGBoost Model Performance Summary ===")
print(model_comparison_summary)



=== Evaluating Best XGBoost Model on Real Data ===
XGBoost (Augmented trained, Real tested) - Accuracy: 0.9481, F1: 0.1017, AUC-ROC: 0.8217

=== Evaluating Best XGBoost Model on Augmented Data ===
XGBoost (Augmented trained, Augmented tested) - Accuracy: 0.9692, F1: 0.9520, AUC-ROC: 0.9847

=== Best XGBoost Model Performance Summary ===
                            Model              Dataset  Accuracy  Precision  \
0       XGBoost_Augmented_on_Real       Real Test Data  0.948141   0.333333   
1  XGBoost_Augmented_on_Augmented  Augmented Test Data  0.969220   0.997763   

     Recall  F1-Score   AUC-ROC  
0  0.060000  0.101695  0.821749  
1  0.910204  0.951974  0.984661  


### Summary

The evaluation results for the best performing XGBoost model on both real and augmented data are summarized below, highlighting key performance differences:

*   **On Real Test Data (Model trained on augmented data):**
    *   Accuracy: 0.9413
    *   Precision: 0.2500
    *   Recall: 0.1000
    *   F1-Score: 0.1429
    *   AUC-ROC: 0.8031

*   **On Augmented Test Data (Model trained on augmented data):**
    *   Accuracy: 0.8105
    *   Precision: 0.7643
    *   Recall: 0.6286
    *   F1-Score: 0.6898
    *   AUC-ROC: 0.8716


*   The model exhibited significantly better performance on the augmented test data compared to the real test data. For example, its F1-Score was 0.6898 on augmented data versus 0.1429 on real data, and its AUC-ROC was 0.8716 versus 0.8031, respectively.



## ** Visualization**

### **11. Predictions for real and augumented data**

In [None]:
_, y_pred_real, y_pred_proba_real = evaluate_model(
    best_xgboost_model, X_train_real_scaled, y_train_real, X_test_real_scaled, y_test_real, 'XGBoost_Real_Test'
)

_, y_pred_aug, y_pred_proba_aug = evaluate_model(
    best_xgboost_model, X_train_aug_scaled, y_train_aug, X_test_aug_scaled, y_test_aug, 'XGBoost_Aug_Test'
)

print("Predictions for real test data generated (y_pred_real, y_pred_proba_real).")
print("Predictions for augmented test data generated (y_pred_aug, y_pred_proba_aug).")

Predictions for real test data generated (y_pred_real, y_pred_proba_real).
Predictions for augmented test data generated (y_pred_aug, y_pred_proba_aug).


### **12. Plot ROC Curves**

In [None]:


# Calculate ROC curve for real data
fpr_real, tpr_real, _ = roc_curve(y_test_real, y_pred_proba_real)
roc_auc_real = auc(fpr_real, tpr_real)

# Calculate ROC curve for augmented data
fpr_aug, tpr_aug, _ = roc_curve(y_test_aug, y_pred_proba_aug)
roc_auc_aug = auc(fpr_aug, tpr_aug)

# Create Plotly figure
fig = go.Figure()

# Add ROC curve for real data
fig.add_trace(go.Scatter(x=fpr_real, y=tpr_real, mode='lines', name=f'Real Data (AUC = {roc_auc_real:.2f})'))

# Add ROC curve for augmented data
fig.add_trace(go.Scatter(x=fpr_aug, y=tpr_aug, mode='lines', name=f'Augmented Data (AUC = {roc_auc_aug:.2f})'))

# Add diagonal line (random classifier)
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash')))

# Update layout
fig.update_layout(
    title='ROC Curves for Best XGBoost Model (Real vs. Augmented Data)',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    showlegend=True,
    width=800,
    height=600
)

fig.show()

### **13 .Confusion Matrix**

In [None]:

# Calculate confusion matrix for real data
cm_real = confusion_matrix(y_test_real, y_pred_real)

# Calculate confusion matrix for augmented data
cm_aug = confusion_matrix(y_test_aug, y_pred_aug)

# Create subplots for two confusion matrices
fig = make_subplots(rows=1, cols=2, subplot_titles=('Confusion Matrix - Real Data', 'Confusion Matrix - Augmented Data'))

# Add heatmap for real data
fig.add_trace(
    go.Heatmap(
        z=cm_real,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Blues',
        colorbar_title='Count',
        text=cm_real,
        texttemplate='%{text}',
        textfont={"size": 16},
    ),
    row=1, col=1
)

# Add heatmap for augmented data
fig.add_trace(
    go.Heatmap(
        z=cm_aug,
        x=['Predicted 0', 'Predicted 1'],
        y=['Actual 0', 'Actual 1'],
        colorscale='Greens',
        colorbar_title='Count',
        text=cm_aug,
        texttemplate='%{text}',
        textfont={"size": 16},
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    title_text='Confusion Matrices for Best XGBoost Model (Real vs. Augmented Data)',
    height=500,
    width=1000
)

fig.show()

### **14. Create a 2D PCA Scatter Plot**

In [None]:
boolean_like_cols = one_hot_cols_real

# Function to safely convert boolean-like columns to int (0/1)
def convert_boolean_like_cols(df, cols_to_convert):
    df_copy = df.copy()
    for col in cols_to_convert:
        if col in df_copy.columns:
            df_copy[col] = df_copy[col].astype(str).map({'True': 1, 'False': 0, '1': 1, '0': 0}).fillna(0).astype(int)
    return df_copy

# Apply the conversion to X_real and X_augmented
X_real_processed = convert_boolean_like_cols(X_real, boolean_like_cols)
X_augmented_processed = convert_boolean_like_cols(X_augmented, boolean_like_cols)


X_real_processed = X_real_processed.apply(pd.to_numeric, errors='coerce')
X_augmented_processed = X_augmented_processed.apply(pd.to_numeric, errors='coerce')

X_real_processed = X_real_processed.dropna(axis=1)
X_augmented_processed = X_augmented_processed.dropna(axis=1)

# Align columns - PCA requires same features in the same order
common_cols = sorted(list(set(X_real_processed.columns) & set(X_augmented_processed.columns)))
X_real_aligned = X_real_processed[common_cols]
X_augmented_aligned = X_augmented_processed[common_cols]

# Standardize the combined data before PCA
scaler_pca = StandardScaler()
X_real_scaled = scaler_pca.fit_transform(X_real_aligned)
X_augmented_scaled = scaler_pca.transform(X_augmented_aligned)

# Combine scaled data for PCA and create source labels
X_combined_scaled = np.vstack((X_real_scaled, X_augmented_scaled))
source_labels = ['Real'] * len(X_real_scaled) + ['Augmented'] * len(X_augmented_scaled)


pca = PCA(n_components=2)
components = pca.fit_transform(X_combined_scaled)

# Create a DataFrame for plotting
pca_df = pd.DataFrame(data=components, columns=['PC1', 'PC2'])
pca_df['Source'] = source_labels

# Define custom colors for better distinction
color_map = {'Real': 'blue', 'Augmented': 'red'}

# Plotting with Plotly Express
fig = px.scatter(pca_df, x='PC1', y='PC2', color='Source',
                 color_discrete_map=color_map, # Apply custom color map
                 title='2D PCA of Real vs. Augmented Data Distributions',
                 labels={'PC1': f'Principal Component 1 ({pca.explained_variance_ratio_[0]*100:.2f}%)',
                         'PC2': f'Principal Component 2 ({pca.explained_variance_ratio_[1]*100:.2f}%)'},
                 opacity=0.7, height=600, width=800)

fig.update_layout(showlegend=True)
fig.show()

In [None]:
xgboost_aug_performance = model_performance_augmented[model_performance_augmented['Model'] == 'XGBoost'].iloc[0]
real_data_performance = model_comparison_summary[model_comparison_summary['Dataset'] == 'Real Test Data'].iloc[0]
augmented_data_performance = model_comparison_summary[model_comparison_summary['Dataset'] == 'Augmented Test Data'].iloc[0]

summary_text = f"""
### Summary (Regenerated)

The evaluation results for the best performing XGBoost model on both real and augmented data are summarized below, highlighting key performance differences:

XGBoost was selected as the best model because it exhibited the highest AUC-ROC score (0.9847) among all models trained on the augmented data, while also maintaining a strong F1-Score of 0.9520. This indicates excellent overall discriminative power and a good balance between precision and recall.

*   **On Real Test Data (Model trained on augmented data):**
    *   Accuracy: {real_data_performance['Accuracy']:.4f}
    *   Precision: {real_data_performance['Precision']:.4f}
    *   Recall: {real_data_performance['Recall']:.4f}
    *   F1-Score: {real_data_performance['F1-Score']:.4f}
    *   AUC-ROC: {real_data_performance['AUC-ROC']:.4f}

*   **On Augmented Test Data (Model trained on augmented data):**
    *   Accuracy: {augmented_data_performance['Accuracy']:.4f}
    *   Precision: {augmented_data_performance['Precision']:.4f}
    *   Recall: {augmented_data_performance['Recall']:.4f}
    *   F1-Score: {augmented_data_performance['F1-Score']:.4f}
    *   AUC-ROC: {augmented_data_performance['AUC-ROC']:.4f}

*   The model exhibited significantly better performance on the augmented test data compared to the real test data. For example, its F1-Score was {augmented_data_performance['F1-Score']:.4f} on augmented data versus {real_data_performance['F1-Score']:.4f} on real data, and its AUC-ROC was {augmented_data_performance['AUC-ROC']:.4f} versus {real_data_performance['AUC-ROC']:.4f}, respectively.

This discrepancy suggests that while augmentation improved overall performance on similar data, the model may still struggle with the original, imbalanced distribution present in the real test set, or that the augmented data may contain distributions not fully representative of the real-world minority class.
"""
print(summary_text)


### Summary (Regenerated)

The evaluation results for the best performing XGBoost model on both real and augmented data are summarized below, highlighting key performance differences:

XGBoost was selected as the best model because it exhibited the highest AUC-ROC score (0.9847) among all models trained on the augmented data, while also maintaining a strong F1-Score of 0.9520. This indicates excellent overall discriminative power and a good balance between precision and recall.

*   **On Real Test Data (Model trained on augmented data):**
    *   Accuracy: 0.9481
    *   Precision: 0.3333
    *   Recall: 0.0600
    *   F1-Score: 0.1017
    *   AUC-ROC: 0.8217

*   **On Augmented Test Data (Model trained on augmented data):**
    *   Accuracy: 0.9692
    *   Precision: 0.9978
    *   Recall: 0.9102
    *   F1-Score: 0.9520
    *   AUC-ROC: 0.9847

*   The model exhibited significantly better performance on the augmented test data compared to the real test data. For example, its F1-Scor