# Binary Classification Training Pipeline

This notebook demonstrates the binary classification training pipeline on two datasets (stroke and hiring), with MLflow integration, advanced hyperparameter tuning, and comprehensive evaluation metrics.

In [None]:
import sys
import os
sys.path.append('.')

from train_pipeline import BinaryClassificationTrainer
from data_preprocessing import DataPreprocessor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set the global plotting style for every figure drawn later in the notebook.
# NOTE(review): the palette is set after the style sheet, presumably so the
# style sheet's own color cycle does not override it; confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Initialize Trainer

In [None]:
# Build the trainer from its MLflow experiment name and tracking URI.
trainer_settings = {
    "experiment_name": "binary_classification_experiment",
    "tracking_uri": "file:./mlruns",
}
trainer = BinaryClassificationTrainer(**trainer_settings)

## Load and Preprocess Stroke Dataset

In [None]:
# Preprocess the stroke CSV into features (X) and labels (y).
stroke_data_path = "../data/stroke/Dataset1.csv"
X_stroke, y_stroke = trainer.preprocessor.preprocess_stroke_data(stroke_data_path)

# Report the label distribution, one line per statistic returned by the preprocessor.
stroke_dist = trainer.preprocessor.get_class_distribution(y_stroke)
print("Stroke Dataset Class Distribution:")
for label, key in (
    ("Class counts", "class_counts"),
    ("Class proportions", "class_proportions"),
    ("Is imbalanced", "is_imbalanced"),
    ("Total samples", "total_samples"),
):
    print(f"  {label}: {stroke_dist[key]}")

In [None]:
# Train/test split (test_size=0.2) with SMOTE enabled to address class imbalance.
stroke_split = trainer.preprocessor.split_data(
    X_stroke,
    y_stroke,
    test_size=0.2,
    apply_smote=True,
)
X_train_stroke, X_test_stroke, y_train_stroke, y_test_stroke = stroke_split

# Show how many rows ended up on each side of the split.
print(
    f"Training set size: {len(X_train_stroke)}",
    f"Test set size: {len(X_test_stroke)}",
    sep="\n",
)

## Train Models on Stroke Dataset

In [None]:
# Fit every model on the stroke split.
stroke_training_args = dict(
    dataset_name="stroke",
    X_train=X_train_stroke,
    y_train=y_train_stroke,
    X_test=X_test_stroke,
    y_test=y_test_stroke,
    n_iter=50,  # number of hyperparameter combinations to try
)
trainer.train_all_models(**stroke_training_args)

## Load and Preprocess Hiring Dataset

In [None]:
# Preprocess the hiring CSV into features (X) and labels (y).
hiring_data_path = "../data/HIRING/Dataset2.csv"
X_hiring, y_hiring = trainer.preprocessor.preprocess_hiring_data(hiring_data_path)

# Report the label distribution, one line per statistic returned by the preprocessor.
hiring_dist = trainer.preprocessor.get_class_distribution(y_hiring)
print("Hiring Dataset Class Distribution:")
for label, key in (
    ("Class counts", "class_counts"),
    ("Class proportions", "class_proportions"),
    ("Is imbalanced", "is_imbalanced"),
    ("Total samples", "total_samples"),
):
    print(f"  {label}: {hiring_dist[key]}")

In [None]:
# Train/test split (test_size=0.2) with SMOTE enabled.
hiring_split = trainer.preprocessor.split_data(
    X_hiring,
    y_hiring,
    test_size=0.2,
    apply_smote=True,
)
X_train_hiring, X_test_hiring, y_train_hiring, y_test_hiring = hiring_split

# Show how many rows ended up on each side of the split.
print(
    f"Training set size: {len(X_train_hiring)}",
    f"Test set size: {len(X_test_hiring)}",
    sep="\n",
)

## Train Models on Hiring Dataset

In [None]:
# Fit every model on the hiring split.
hiring_training_args = dict(
    dataset_name="hiring",
    X_train=X_train_hiring,
    y_train=y_train_hiring,
    X_test=X_test_hiring,
    y_test=y_test_hiring,
    n_iter=50,
)
trainer.train_all_models(**hiring_training_args)

## Model Comparison

In [None]:
# Print a banner first, then fetch and show the stroke model comparison.
print("="*60, "MODEL COMPARISON - STROKE DATASET", "="*60, sep="\n")
stroke_comparison = trainer.compare_models("stroke")
print(stroke_comparison)

In [None]:
# Print a banner first, then fetch and show the hiring model comparison.
print("="*60, "MODEL COMPARISON - HIRING DATASET", "="*60, sep="\n")
hiring_comparison = trainer.compare_models("hiring")
print(hiring_comparison)

## Access MLflow UI

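To view all experiments, metrics, and artifacts, start the MLflow UI from this notebook's directory. The trainer above is configured with `tracking_uri="file:./mlruns"`, which is relative to the working directory; if no runs appear, point the UI at the store explicitly with `mlflow ui --backend-store-uri file:./mlruns`. Run: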
```bash
mlflow ui
```

Then open http://localhost:5000 in your browser.