# Machine Learning Assignment 2 - Model Training
## Complete Pipeline for Classification Models

This notebook demonstrates the complete workflow for training and evaluating 6 classification models.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import joblib
import warnings
warnings.filterwarnings('ignore')

print("✓ All libraries imported successfully")

## 2. Load and Explore Dataset

In [None]:
# IMPORTANT: point these two settings at your own dataset before running
DATA_PATH = 'your_dataset.csv'  # path of the CSV file to load
TARGET_COLUMN = 'target'  # name of the column holding the class label

# Read the CSV into a DataFrame
df = pd.read_csv(DATA_PATH)

# Size summary: every column except the target is counted as a feature
row_count, column_count = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Number of Features: {column_count - 1}")
print(f"Number of Instances: {row_count}")
print("\nFirst few rows:")
df.head()

In [None]:
# pandas' built-in frame summary
print("Dataset Info:")
df.info()

# Number of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

# Number of rows per target value
class_counts = df[TARGET_COLUMN].value_counts()
print("\nTarget Distribution:")
print(class_counts)

## 3. Data Preprocessing

In [None]:
# Split the frame into the feature matrix and the target column
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Always fit the label encoder, whatever the target dtype.
# - Recent XGBoost releases require class labels to be exactly 0..n_classes-1,
#   so numeric targets such as {1, 2, 3} or {-1, 1} must be remapped too.
# - The encoder is pickled later in the notebook; fitting it unconditionally
#   guarantees the saved object can transform / inverse_transform.
# For targets that are already 0..n-1 the mapping is the identity.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
print(f"Target encoded. Classes: {label_encoder.classes_}")

# One-hot encode non-numeric features. 'category' columns are included as well
# as 'object' ones, otherwise they would reach the scaler un-encoded.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
if len(categorical_cols) > 0:
    print(f"Encoding categorical features: {list(categorical_cols)}")
    # drop_first=True drops one dummy per feature to avoid a redundant column
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print(f"\nFinal feature shape: {X.shape}")
print(f"Number of classes: {len(np.unique(y))}")

In [None]:
# Hold out 20% of the rows for testing; the split is stratified on the target
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_features = X_train_scaled.shape
print(f"Training set size: {n_train}")
print(f"Test set size: {X_test_scaled.shape[0]}")
print(f"Number of features: {n_features}")

## 4. Initialize All Models

In [None]:
# Initialize all 6 models, keyed by the display name used in tables and plots.
# Every estimator that accepts a seed gets random_state=42 for reproducibility.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'kNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is deprecated in XGBoost's scikit-learn wrapper and
    # is ignored by recent releases (it only triggers an "unused parameter"
    # warning), so it is no longer passed. Labels are expected to be 0..n-1.
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss')
}

print(f"Initialized {len(models)} models:")
for name in models:
    print(f"  ✓ {name}")

## 5. Train and Evaluate Models

In [None]:
# Function to calculate all metrics
def calculate_metrics(y_true, y_pred, y_pred_proba=None):
    """Compute the six evaluation metrics reported for each model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : array-like or None, optional
        Per-class probability estimates (one column per class), as returned
        by ``predict_proba``. When ``None`` the AUC is reported as 0.0.

    Returns
    -------
    dict
        Keys ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1'``,
        ``'MCC'`` and ``'AUC'``. Precision, recall and F1 are
        support-weighted averages over the classes.

    Notes
    -----
    If the AUC cannot be computed (for example only one class is present in
    ``y_true``, or the probability array has an unexpected shape) the reason
    is printed and 0.0 is returned for ``'AUC'`` so that downstream tables
    and plots keep working. Unrelated errors are no longer swallowed.
    """
    metrics = {}

    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['F1'] = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    metrics['MCC'] = matthews_corrcoef(y_true, y_pred)

    # AUC score: 0.0 is the placeholder for "not available"
    metrics['AUC'] = 0.0
    if y_pred_proba is None:
        return metrics

    try:
        proba = np.asarray(y_pred_proba)
        if len(np.unique(y_true)) == 2:
            # Binary case: score with the probability of the second class
            metrics['AUC'] = roc_auc_score(y_true, proba[:, 1])
        else:
            # Multiclass case: one-vs-rest AUC, weighted by class support
            metrics['AUC'] = roc_auc_score(y_true, proba,
                                           multi_class='ovr', average='weighted')
    except (ValueError, IndexError, TypeError) as exc:
        # Only data/shape problems are tolerated here; anything else
        # (KeyboardInterrupt, NameError, ...) propagates to the caller.
        print(f"  ! AUC could not be computed ({exc}); reporting 0.0")

    return metrics

In [None]:
# Train and evaluate all models.
# `results` maps each model name to its fitted estimator, metric dict,
# test-set predictions and confusion matrix.
results = {}

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    # Train on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Predict class labels for the held-out split
    y_pred = model.predict(X_test_scaled)

    # Get class probabilities when the estimator supports them. An explicit
    # capability check replaces the former bare `except:`, so a genuine
    # failure inside predict_proba is raised instead of being silently
    # turned into "no probabilities".
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_scaled)
    else:
        y_pred_proba = None

    # Calculate metrics
    metrics = calculate_metrics(y_test, y_pred, y_pred_proba)

    # Store results
    results[model_name] = {
        'model': model,
        'metrics': metrics,
        'predictions': y_pred,
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    }

    print(f"✓ {model_name} completed")
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")
    print(f"  F1 Score: {metrics['F1']:.4f}")

## 6. Results Comparison Table

In [None]:
# Build one table row per model; each metric is rendered as a 4-decimal string
metric_columns = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
results_data = [
    {
        'ML Model Name': model_name,
        **{column: f"{result['metrics'][column]:.4f}" for column in metric_columns},
    }
    for model_name, result in results.items()
]

results_df = pd.DataFrame(results_data)

# Print the table framed by two 100-character rules
banner = "=" * 100
print("\n" + banner)
print("MODEL COMPARISON TABLE")
print(banner)
print(results_df.to_string(index=False))
print(banner)

## 7. Visualize Results

In [None]:
# Plot metrics comparison: one bar chart per metric on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

metrics_to_plot = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
model_names = list(results.keys())

for ax, metric in zip(axes.flat, metrics_to_plot):
    values = [results[name]['metrics'][metric] for name in model_names]

    bars = ax.bar(range(len(model_names)), values, color='steelblue', alpha=0.8)
    ax.set_xlabel('Models', fontsize=10)
    ax.set_ylabel(metric, fontsize=10)
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_xticks(range(len(model_names)))
    # Break multi-word names onto separate lines so the tick labels fit
    ax.set_xticklabels([name.replace(' ', '\n') for name in model_names],
                        rotation=0, ha='center', fontsize=8)

    # Most metrics live in [0, 1], but MCC ranges over [-1, 1]. Extend the
    # lower limit when any value is negative so those bars are not clipped.
    lowest = min(values, default=0.0)
    ax.set_ylim(min(0.0, lowest * 1.15), 1.0)
    if lowest < 0:
        ax.axhline(0, color='black', linewidth=0.8)
    ax.grid(axis='y', alpha=0.3)

    # Add value labels at the bar tips (below the tip for negative bars)
    for bar, val in zip(bars, values):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{val:.3f}', ha='center',
                va='bottom' if height >= 0 else 'top', fontsize=8)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved as 'model_comparison.png'")

In [None]:
# Plot confusion matrices, three per row. The grid is sized from the number of
# entries in `results` instead of being fixed at 2x3, so adding or removing a
# model neither raises an IndexError nor leaves empty frames behind.
grid_cols = 3
grid_rows = max(1, int(np.ceil(len(results) / grid_cols)))
fig, axes = plt.subplots(grid_rows, grid_cols,
                         figsize=(18, 6 * grid_rows), squeeze=False)
fig.suptitle('Confusion Matrices for All Models', fontsize=16, fontweight='bold')

panel_axes = axes.flatten()
for ax, (model_name, result) in zip(panel_axes, results.items()):
    cm = result['confusion_matrix']
    # Rows are actual classes, columns are predicted classes
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, cbar=False)
    ax.set_title(model_name, fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=10)
    ax.set_ylabel('Actual', fontsize=10)

# Hide any panels left over when the model count is not a multiple of three
for ax in panel_axes[len(results):]:
    ax.set_visible(False)

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Confusion matrices saved as 'confusion_matrices.png'")

## 8. Save Models and Preprocessors

In [None]:
# Make sure the output folder is present before anything is written into it
import os
os.makedirs('model', exist_ok=True)

# Persist each trained estimator; the file name is the display name
# lower-cased with spaces replaced by underscores
print("Saving models...")
for model_name, result in results.items():
    file_stem = model_name.replace(' ', '_').lower()
    model_filename = f"model/{file_stem}_model.pkl"
    joblib.dump(result['model'], model_filename)
    print(f"✓ Saved {model_name} to {model_filename}")

# Persist the preprocessing objects next to the models
preprocessing_artifacts = (
    (scaler, 'model/scaler.pkl'),
    (label_encoder, 'model/label_encoder.pkl'),
)
for artifact, artifact_path in preprocessing_artifacts:
    joblib.dump(artifact, artifact_path)

print("✓ Saved scaler and label encoder")
print("\n✓ All models saved successfully!")

## 9. Export Results for README

In [None]:
# Write the comparison table to disk
results_df.to_csv('model_results.csv', index=False)
print("✓ Results exported to 'model_results.csv'")

# Emit the same table as markdown, ready to paste into the README
rule = "=" * 100
table_columns = ['ML Model Name', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']

print("\n" + rule)
print("COPY THIS TABLE TO YOUR README.md:")
print(rule)
print("\n| ML Model Name | Accuracy | AUC | Precision | Recall | F1 | MCC |")
print("|--------------|----------|-----|-----------|--------|-------|-----|")
for record in results_df.to_dict(orient='records'):
    cells = " | ".join(str(record[column]) for column in table_columns)
    print("| " + cells + " |")
print("\n" + rule)

## 10. Summary

In [None]:
# Pick the model with the highest test accuracy (the first one wins a tie)
best_model_name = max(results, key=lambda name: results[name]['metrics']['Accuracy'])
best_accuracy = results[best_model_name]['metrics']['Accuracy']

# Closing report framed by two 100-character rules
divider = "=" * 100
next_steps = (
    "1. Update README.md with the results table",
    "2. Add your observations for each model",
    "3. Run the Streamlit app: streamlit run app.py",
    "4. Deploy to Streamlit Cloud",
)

print("\n" + divider)
print("SUMMARY")
print(divider)
print(f"\nBest Performing Model: {best_model_name}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print("\nAll models trained and saved successfully!")
print(f"Total models: {len(models)}")
print("\nNext Steps:")
for step in next_steps:
    print(step)
print(divider)