# 🤖 AutoML + SHAP Explainer

**Project**: Automated Machine Learning + Explainable AI  
**Level**: Expert  
**Dataset**: Four datasets for comparison — Breast Cancer Wisconsin, Wine, Diabetes (regression), and a synthetic classification set  

## 📋 Project Overview

This project implements automated machine learning (AutoML) with model explainability using SHAP. We'll learn:

- AutoML frameworks and techniques
- Automated feature engineering
- Hyperparameter optimization
- SHAP for model explainability
- Production-ready ML pipelines

Let's automate machine learning with explainable AI! 🚀

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# AutoML Libraries
try:
    import autosklearn.classification
    import autosklearn.regression
    AUTOSKLEARN_AVAILABLE = True
except ImportError:
    AUTOSKLEARN_AVAILABLE = False
    print("⚠️ Auto-sklearn not available. Using alternative AutoML approach.")

try:
    from tpot import TPOTClassifier, TPOTRegressor
    TPOT_AVAILABLE = True
except ImportError:
    TPOT_AVAILABLE = False
    print("⚠️ TPOT not available.")

try:
    import pycaret
    from pycaret.classification import *
    from pycaret.regression import *
    PYCARET_AVAILABLE = True
except ImportError:
    PYCARET_AVAILABLE = False
    print("⚠️ PyCaret not available.")

# Explainable AI
import shap
try:
    import lime
    from lime.lime_tabular import LimeTabularExplainer
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False
    print("⚠️ LIME not available.")

# Traditional ML (fallback)
from sklearn.datasets import load_breast_cancer, load_wine, load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.inspection import permutation_importance

# Hyperparameter optimization
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Utilities
import warnings
import time
from datetime import datetime

# Suppress every Python warning raised after this point so the notebook output
# stays readable. NOTE(review): this also hides deprecation and convergence
# warnings from the ML libraries — re-enable when debugging.
warnings.filterwarnings('ignore')

# Global plotting defaults for all figures created later in the notebook.
# NOTE(review): 'seaborn-v0_8' presumably requires a Matplotlib release that
# ships this style name — verify against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Availability banner: reports the flags set by the optional-import guards
# above. SHAP is shown as always available because it is imported
# unconditionally (a missing install fails at the import, not here).
print("✅ Core libraries imported successfully!")
print(f"🤖 AutoML Libraries Available:")
print(f"   • Auto-sklearn: {AUTOSKLEARN_AVAILABLE}")
print(f"   • TPOT: {TPOT_AVAILABLE}")
print(f"   • PyCaret: {PYCARET_AVAILABLE}")
print(f"   • SHAP: ✅")
print(f"   • LIME: {LIME_AVAILABLE}")
print(f"🚀 Ready for AutoML + Explainable AI!")

## 2. Dataset Loading and Preparation

In [None]:
# Load multiple datasets for comprehensive AutoML testing.
#
# Every entry of `datasets` maps a short name to a dict with the keys:
#   'X'            - feature table as a pandas DataFrame
#   'y'            - target values as returned by the loader/generator
#   'target_names' - class labels (classification) or the target's name
#   'type'         - 'classification' or 'regression'
#   'description'  - human-readable title used in the printed summaries
from sklearn.datasets import make_classification

print("📊 Loading multiple datasets for AutoML comparison...")

datasets = {}

# 1. Breast Cancer Dataset (Classification)
cancer_data = load_breast_cancer()
datasets['breast_cancer'] = {
    'X': pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names),
    'y': cancer_data.target,
    'target_names': cancer_data.target_names,
    'type': 'classification',
    'description': 'Breast Cancer Wisconsin (Diagnostic)'
}

# 2. Wine Dataset (Classification)
wine_data = load_wine()
datasets['wine'] = {
    'X': pd.DataFrame(wine_data.data, columns=wine_data.feature_names),
    'y': wine_data.target,
    'target_names': wine_data.target_names,
    'type': 'classification',
    'description': 'Wine Recognition Dataset'
}

# 3. Diabetes Dataset (Regression)
diabetes_data = load_diabetes()
datasets['diabetes'] = {
    'X': pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names),
    'y': diabetes_data.target,
    # Not class labels: a single name for the continuous target.
    'target_names': ['diabetes_progression'],
    'type': 'regression',
    'description': 'Diabetes Dataset'
}

# 4. Synthetic Dataset (Classification)
# 1000 samples, 20 features (15 informative + 5 redundant), two classes;
# random_state is fixed so the generated data is reproducible.
X_synthetic, y_synthetic = make_classification(
    n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
    n_classes=2, random_state=42
)
feature_names_synthetic = [f'feature_{i}' for i in range(20)]
datasets['synthetic'] = {
    'X': pd.DataFrame(X_synthetic, columns=feature_names_synthetic),
    'y': y_synthetic,
    'target_names': ['class_0', 'class_1'],
    'type': 'classification',
    'description': 'Synthetic Classification Dataset'
}

print("\n📊 Datasets loaded successfully:")
for name, data in datasets.items():
    print(f"• {name}: {data['description']}")
    print(f"  - Shape: {data['X'].shape}")
    print(f"  - Type: {data['type']}")
    if data['type'] == 'classification':
        print(f"  - Classes: {len(data['target_names'])}")
    else:
        # A regression target has no classes; counting 'target_names' would
        # misleadingly report "Classes: 1", so show the target's name instead.
        print(f"  - Target: {', '.join(data['target_names'])}")
    print()

In [None]:
# Dataset exploration and visualization
def explore_dataset(name, dataset_info):
    """Print a short profile of one dataset to stdout.

    Args:
        name: Label shown in the heading line.
        dataset_info: Mapping with the keys 'X' (feature table), 'y' (target
            values), 'type', 'description' and, for classification tasks,
            'target_names'.

    Returns:
        None. The profile is only printed.
    """
    print(f"🔍 Exploring {name} dataset...")

    features = dataset_info['X']
    target = dataset_info['y']

    print(f"Dataset: {dataset_info['description']}")
    print(f"Samples: {features.shape[0]:,}")
    print(f"Features: {features.shape[1]}")
    print(f"Task: {dataset_info['type']}")

    if dataset_info['type'] != 'classification':
        # Every non-classification task is summarised as a numeric target.
        lowest, highest = target.min(), target.max()
        print(f"Target range: {lowest:.2f} - {highest:.2f}")
        centre, spread = target.mean(), target.std()
        print(f"Target mean: {centre:.2f} ± {spread:.2f}")
    else:
        labels, tallies = np.unique(target, return_counts=True)
        print(f"Classes: {len(labels)}")
        for position, label in enumerate(labels):
            tally = tallies[position]
            known_names = dataset_info['target_names']
            # Labels without a registered name fall back to a generic tag.
            if label < len(known_names):
                display_name = known_names[label]
            else:
                display_name = f'class_{label}'
            print(f"  • {display_name}: {tally} ({tally/len(target):.1%})")

    missing_total = features.isnull().sum().sum()
    print(f"Missing values: {missing_total}")
    duplicate_total = features.duplicated().sum()
    print(f"Duplicate rows: {duplicate_total}")
    print()

# Print the profile of every entry registered in `datasets`; dicts iterate in
# insertion order, so the output follows the order the datasets were added.
for name, dataset_info in datasets.items():
    explore_dataset(name, dataset_info)

## 3. Custom AutoML Implementation

In [None]:
class SimpleAutoML:
    """
    A simple AutoML implementation that:
    1. Tries multiple algorithms
    2. Performs hyperparameter tuning (randomized search with cross-validation)
    3. Selects the best model by cross-validated score

    The class itself does not compute explanations; the fitted estimator
    (`best_model`) and the fitted `scaler` are exposed as attributes so they
    can be handed to an external explainer.

    Attributes set by `fit`:
        best_model: Best estimator found, or None if nothing was evaluated.
        best_score: Its mean CV score (accuracy for classification, negative
            mean squared error for regression, so higher is always better).
        best_model_name: Key of the winning entry in `models`.
        results: One dict per evaluated candidate ('model_name', 'best_model',
            'best_score', 'best_params', 'cv_results').
    """

    _LEADERBOARD_COLUMNS = ['Model', 'CV_Score', 'Best_Params']

    def __init__(self, task_type='classification', time_limit=300, cv_folds=5, n_iter=20):
        """Configure the search.

        Args:
            task_type: 'classification' selects the classifier candidates;
                any other value selects the regression candidates.
            time_limit: Budget in seconds. It is only checked before each
                candidate starts, so a running search is never interrupted.
            cv_folds: Number of cross-validation folds per search.
            n_iter: Number of parameter settings sampled per candidate.
        """
        self.task_type = task_type
        self.time_limit = time_limit
        self.cv_folds = cv_folds
        self.n_iter = n_iter
        self.best_model = None
        self.best_score = None
        # Initialised here so the attribute exists even before/without a fit.
        self.best_model_name = None
        self.results = []
        self.scaler = StandardScaler()

        # Define model candidates
        if task_type == 'classification':
            self.models = self._classification_candidates()
        else:  # regression
            self.models = self._regression_candidates()

    @staticmethod
    def _classification_candidates():
        """Return the classifier candidates with their search distributions."""
        return {
            'RandomForest': {
                'model': RandomForestClassifier(random_state=42),
                'params': {
                    'n_estimators': randint(50, 200),
                    'max_depth': randint(3, 20),
                    'min_samples_split': randint(2, 20),
                    'min_samples_leaf': randint(1, 10)
                }
            },
            'GradientBoosting': {
                'model': GradientBoostingClassifier(random_state=42),
                'params': {
                    'n_estimators': randint(50, 200),
                    # scipy's uniform(loc, scale) samples from [loc, loc + scale]
                    'learning_rate': uniform(0.01, 0.3),
                    'max_depth': randint(3, 10),
                    'subsample': uniform(0.6, 0.4)
                }
            },
            'LogisticRegression': {
                'model': LogisticRegression(random_state=42, max_iter=1000),
                'params': {
                    'C': uniform(0.01, 10),
                    'penalty': ['l1', 'l2'],
                    # liblinear is listed alone so both penalties are valid
                    'solver': ['liblinear']
                }
            },
            'SVM': {
                # probability=True so predict_proba works if SVM wins
                'model': SVC(random_state=42, probability=True),
                'params': {
                    'C': uniform(0.1, 10),
                    'kernel': ['rbf', 'linear'],
                    'gamma': ['scale', 'auto']
                }
            }
        }

    @staticmethod
    def _regression_candidates():
        """Return the regressor candidates with their search distributions."""
        # Imported locally: the file-level imports only cover classifiers.
        from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
        from sklearn.linear_model import Ridge

        return {
            'RandomForest': {
                'model': RandomForestRegressor(random_state=42),
                'params': {
                    'n_estimators': randint(50, 200),
                    'max_depth': randint(3, 20),
                    'min_samples_split': randint(2, 20)
                }
            },
            'GradientBoosting': {
                'model': GradientBoostingRegressor(random_state=42),
                'params': {
                    'n_estimators': randint(50, 200),
                    'learning_rate': uniform(0.01, 0.3),
                    'max_depth': randint(3, 10)
                }
            },
            'Ridge': {
                'model': Ridge(random_state=42),
                'params': {
                    'alpha': uniform(0.01, 10)
                }
            }
        }

    def fit(self, X, y):
        """Fit AutoML pipeline.

        Scales the features, runs a randomized hyperparameter search for each
        candidate and keeps the one with the highest mean CV score. Any state
        from an earlier `fit` call is discarded first.

        Note: the scaler is fitted on all of X before cross-validation, so the
        CV folds share scaling statistics.

        Args:
            X: Feature matrix accepted by StandardScaler.
            y: Target values.

        Returns:
            self. If the time limit expired before any candidate ran,
            `best_model` stays None and a warning is printed.
        """
        print(f"🤖 Starting AutoML for {self.task_type} task...")
        print(f"⏱️ Time limit: {self.time_limit} seconds")
        print(f"🔄 Cross-validation folds: {self.cv_folds}")

        start_time = time.time()

        # Start from a clean slate so repeated fits do not mix leaderboards or
        # compare new scores against a model trained on other data.
        self.best_model = None
        self.best_score = None
        self.best_model_name = None
        self.results = []

        # Scale features
        X_scaled = self.scaler.fit_transform(X)

        # Both scorers follow the "greater is better" convention.
        scoring = 'accuracy' if self.task_type == 'classification' else 'neg_mean_squared_error'

        # Try each model
        for model_name, model_config in self.models.items():
            if time.time() - start_time > self.time_limit:
                print(f"⏰ Time limit reached. Stopping at {model_name}")
                break

            print(f"\n🔧 Tuning {model_name}...")

            # Hyperparameter tuning
            search = RandomizedSearchCV(
                model_config['model'],
                model_config['params'],
                n_iter=self.n_iter,
                cv=self.cv_folds,
                scoring=scoring,
                random_state=42,
                n_jobs=-1
            )

            search.fit(X_scaled, y)

            # Store results
            self.results.append({
                'model_name': model_name,
                'best_model': search.best_estimator_,
                'best_score': search.best_score_,
                'best_params': search.best_params_,
                'cv_results': search.cv_results_
            })

            print(f"✅ {model_name} - Best CV Score: {search.best_score_:.4f}")

            # Update best model
            if self.best_model is None or search.best_score_ > self.best_score:
                self.best_model = search.best_estimator_
                self.best_score = search.best_score_
                self.best_model_name = model_name

        total_time = time.time() - start_time
        print(f"\n🎉 AutoML completed in {total_time:.1f} seconds!")

        if self.best_model is None:
            # The budget ran out before the first candidate; there is nothing
            # to report, and predict() will raise until a successful fit.
            print("⚠️ No model was evaluated within the time limit.")
            return self

        print(f"🏆 Best model: {self.best_model_name}")
        print(f"📊 Best CV score: {self.best_score:.4f}")

        return self

    def predict(self, X):
        """Make predictions with the best model.

        Args:
            X: Unscaled features; the scaler fitted in `fit` is applied.

        Raises:
            ValueError: If no model has been fitted.
        """
        if self.best_model is None:
            raise ValueError("Model not fitted yet!")

        X_scaled = self.scaler.transform(X)
        return self.best_model.predict(X_scaled)

    def predict_proba(self, X):
        """Predict probabilities (classification only).

        Args:
            X: Unscaled features; the scaler fitted in `fit` is applied.

        Raises:
            ValueError: If the task is not classification or no model has
                been fitted.
        """
        if self.task_type != 'classification':
            raise ValueError("predict_proba only available for classification")

        if self.best_model is None:
            raise ValueError("Model not fitted yet!")

        X_scaled = self.scaler.transform(X)
        return self.best_model.predict_proba(X_scaled)

    def get_leaderboard(self):
        """Get model performance leaderboard.

        Returns:
            DataFrame with columns 'Model', 'CV_Score' and 'Best_Params',
            sorted by 'CV_Score' descending. Empty (but with those columns)
            when no candidate has been evaluated yet.
        """
        rows = [
            {
                'Model': result['model_name'],
                'CV_Score': result['best_score'],
                'Best_Params': str(result['best_params'])
            }
            for result in self.results
        ]
        # Explicit columns keep sort_values from failing on an empty result list.
        leaderboard = pd.DataFrame(rows, columns=self._LEADERBOARD_COLUMNS)

        return leaderboard.sort_values('CV_Score', ascending=False)

# Confirmation output so the notebook cell visibly reports that the class
# definition above was executed.
print("✅ SimpleAutoML class defined!")
print("🤖 Ready to automate machine learning!")