In [1]:
# Real-world data integration strategies
#
# NOTE: this cell uses names created by the environment-setup cell
# (np, pd, requests, HAS_REQUESTS, RANDOM_STATE); run that cell first.

import sqlite3
import json
from urllib.parse import urljoin
from datetime import datetime, timedelta
import time


class DataConnector:
    """Professional data connector for various sources.

    Bundles four demonstration data sources behind one object: a generic
    REST GET helper, a sample SQLite database (creation + querying), a
    simulated web-scraping dataset, and a public demo API fetch.
    All methods report progress with print() and signal failure by
    returning None rather than raising.
    """

    def __init__(self):
        # Kept for interface compatibility; nothing in this class reads it.
        self.connection_cache = {}
        # A shared requests.Session, or None when requests is not installed.
        self.request_session = None
        if HAS_REQUESTS:
            self.request_session = requests.Session()
            self.request_session.headers.update({
                'User-Agent': 'ML-Training-Notebook/1.0'
            })

    def connect_to_api(self, base_url, endpoint, params=None, headers=None, rate_limit=1.0):
        """Connect to REST API with rate limiting and error handling.

        Args:
            base_url: Base URL; joined with ``endpoint`` via ``urljoin``.
            endpoint: Path (or URL) resolved against ``base_url``.
            params: Optional query parameters passed to the GET request.
            headers: Optional extra headers layered over the session headers.
            rate_limit: Seconds to sleep before every request.

        Returns:
            The decoded JSON body, or None if requests is unavailable, the
            request fails, or the body is not valid JSON.
        """
        if not HAS_REQUESTS:
            print("❌ Requests library not available. Install with: pip install requests")
            return None

        try:
            url = urljoin(base_url, endpoint)

            # Rate limiting
            time.sleep(rate_limit)

            # Add custom headers
            session_headers = self.request_session.headers.copy()
            if headers:
                session_headers.update(headers)

            print(f"🌐 Connecting to API: {url}")
            response = self.request_session.get(url, params=params, headers=session_headers, timeout=10)
            response.raise_for_status()

            print(f"✅ API request successful (Status: {response.status_code})")
            return response.json()

        # The JSON handler must come first: in recent requests releases the
        # error raised by response.json() is also a RequestException, so the
        # broader handler below would otherwise shadow this one.
        except json.JSONDecodeError as e:
            print(f"❌ Failed to parse JSON response: {e}")
            return None
        except requests.exceptions.RequestException as e:
            print(f"❌ API request failed: {e}")
            return None

    def create_sample_database(self, db_path='sample_data.db'):
        """Create a sample SQLite database for demonstration.

        Creates (if absent) ``customers`` and ``transactions`` tables and
        fills them with 200 customers and 1000 transactions generated from
        NumPy's global RNG, which is re-seeded with RANDOM_STATE here.
        Existing rows with the same primary key are replaced.

        Args:
            db_path: Path of the SQLite file to create or update.

        Returns:
            ``db_path``, for chaining into connect_to_database().
        """
        print(f"🗄️ Creating sample database: {db_path}")

        # Create connection
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()

            # Create tables
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS customers (
                    customer_id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    email TEXT UNIQUE,
                    registration_date DATE,
                    country TEXT,
                    subscription_tier TEXT
                )
            ''')

            cursor.execute('''
                CREATE TABLE IF NOT EXISTS transactions (
                    transaction_id INTEGER PRIMARY KEY,
                    customer_id INTEGER,
                    amount REAL,
                    transaction_date DATETIME,
                    product_category TEXT,
                    FOREIGN KEY (customer_id) REFERENCES customers (customer_id)
                )
            ''')

            # Insert sample data
            np.random.seed(RANDOM_STATE)

            # Generate customers
            countries = ['USA', 'UK', 'Germany', 'France', 'Canada', 'Australia']
            tiers = ['Basic', 'Premium', 'Enterprise']

            customers_data = []
            for i in range(1, 201):  # 200 customers
                registered = datetime.now() - timedelta(days=int(np.random.randint(30, 365)))
                customers_data.append((
                    i,
                    f"Customer_{i:03d}",
                    f"customer{i:03d}@email.com",
                    # Stored as ISO text explicitly instead of relying on the
                    # sqlite3 default date adapter (deprecated in Python 3.12).
                    registered.date().isoformat(),
                    str(np.random.choice(countries)),
                    str(np.random.choice(tiers, p=[0.5, 0.35, 0.15]))
                ))

            cursor.executemany(
                'INSERT OR REPLACE INTO customers VALUES (?, ?, ?, ?, ?, ?)',
                customers_data
            )

            # Generate transactions
            categories = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports']
            transactions_data = []

            for i in range(1, 1001):  # 1000 transactions
                customer_id = int(np.random.randint(1, 201))
                amount = float(np.random.lognormal(3, 1))  # Log-normal distribution for realistic amounts
                transaction_date = datetime.now() - timedelta(days=int(np.random.randint(0, 90)))
                category = str(np.random.choice(categories))

                transactions_data.append((
                    i,
                    customer_id,
                    round(amount, 2),
                    # Same text format the default datetime adapter produced.
                    transaction_date.isoformat(sep=' '),
                    category
                ))

            cursor.executemany(
                'INSERT OR REPLACE INTO transactions VALUES (?, ?, ?, ?, ?)',
                transactions_data
            )

            conn.commit()
        finally:
            # Release the file handle even when table creation or inserts fail.
            conn.close()

        print(f"✅ Database created with {len(customers_data)} customers and {len(transactions_data)} transactions")
        return db_path

    def connect_to_database(self, db_path, query):
        """Connect to SQLite database and execute query.

        Args:
            db_path: Path of the SQLite file to open.
            query: SQL text passed to ``pd.read_sql_query``.

        Returns:
            A DataFrame with the query result, or None on a database error.
        """
        try:
            print(f"🗄️ Connecting to database: {db_path}")
            conn = sqlite3.connect(db_path)
            try:
                # Execute query and return DataFrame
                df = pd.read_sql_query(query, conn)
            finally:
                conn.close()

            print(f"✅ Query executed successfully. Retrieved {len(df)} rows")
            return df

        # pandas re-raises driver failures as its own DatabaseError, so
        # catching sqlite3.Error alone would let bad queries escape.
        except (sqlite3.Error, pd.errors.DatabaseError) as e:
            print(f"❌ Database error: {e}")
            return None

    def simulate_web_scraping(self, num_records=100):
        """Simulate web scraping (without actual scraping).

        Builds a product table with randomly missing ``rating``, ``price``
        and ``brand`` values. Re-seeds NumPy's global RNG with RANDOM_STATE.

        Args:
            num_records: Number of product rows to generate.

        Returns:
            A DataFrame with one row per simulated product.
        """
        print(f"🕷️ Simulating web scraping for {num_records} records...")

        # Simulate realistic web-scraped data
        np.random.seed(RANDOM_STATE)

        # Simulate product data from e-commerce site
        products = []
        categories = ['Electronics', 'Books', 'Clothing', 'Home & Garden', 'Sports']
        brands = ['BrandA', 'BrandB', 'BrandC', 'BrandD', 'BrandE']

        for i in range(num_records):
            # Simulate some missing data (realistic for web scraping).
            # The order of RNG calls below is kept fixed so the generated
            # data stays reproducible for a given seed.
            rating = np.random.uniform(1, 5) if np.random.random() > 0.1 else None
            price = np.random.lognormal(3, 0.8) if np.random.random() > 0.05 else None

            product = {
                'product_id': f'P{i:04d}',
                'name': f'Product {i}',
                'category': np.random.choice(categories),
                'brand': np.random.choice(brands) if np.random.random() > 0.15 else None,
                'price': round(price, 2) if price is not None else None,
                'rating': round(rating, 1) if rating is not None else None,
                # Products without a rating get zero reviews.
                'num_reviews': np.random.poisson(50) if rating is not None else 0,
                'in_stock': np.random.choice([True, False], p=[0.85, 0.15]),
                'scraped_date': datetime.now() - timedelta(hours=int(np.random.randint(0, 24)))
            }
            products.append(product)

        df = pd.DataFrame(products)

        print(f"✅ Simulated scraping complete. Created dataset with shape {df.shape}")
        print(f"Missing data: {df.isnull().sum().sum()} total missing values")

        return df

    def fetch_public_api_data(self, api_name='jsonplaceholder'):
        """Fetch data from public APIs for demonstration.

        Args:
            api_name: Which demo API to use; only 'jsonplaceholder' is handled.

        Returns:
            ``{'users': DataFrame, 'posts': DataFrame}`` on success, or None
            if requests is unavailable, the API name is not handled, or
            either request returns nothing.
        """
        if not HAS_REQUESTS:
            print("❌ Requests library not available")
            return None

        if api_name == 'jsonplaceholder':
            # JSONPlaceholder - fake REST API
            print("📡 Fetching data from JSONPlaceholder API...")

            # Fetch users
            users_data = self.connect_to_api(
                'https://jsonplaceholder.typicode.com/',
                'users'
            )

            # Fetch posts
            posts_data = self.connect_to_api(
                'https://jsonplaceholder.typicode.com/',
                'posts'
            )

            if users_data and posts_data:
                # Clean and normalize the data: json_normalize flattens the
                # nested user records into dotted column names.
                users_clean = pd.json_normalize(users_data)
                posts_clean = pd.DataFrame(posts_data)

                print(f"✅ Fetched {len(users_clean)} users and {len(posts_clean)} posts")
                return {'users': users_clean, 'posts': posts_clean}

        return None


# Initialize data connector
data_connector = DataConnector()

# Demonstrate different data sources
print("🌍 REAL-WORLD DATA INTEGRATION EXAMPLES")
print("=" * 50)

# 1. Database connection
db_path = data_connector.create_sample_database()

# Query customer data
customer_query = '''
    SELECT 
        c.customer_id,
        c.name,
        c.country,
        c.subscription_tier,
        COUNT(t.transaction_id) as num_transactions,
        SUM(t.amount) as total_spent,
        AVG(t.amount) as avg_transaction
    FROM customers c
    LEFT JOIN transactions t ON c.customer_id = t.customer_id
    GROUP BY c.customer_id
    ORDER BY total_spent DESC
    LIMIT 10
'''

customer_data = data_connector.connect_to_database(db_path, customer_query)
if customer_data is not None:
    print("\n🏆 Top 10 customers by total spent:")
    print(customer_data)

# 2. Web scraping simulation
scraped_data = data_connector.simulate_web_scraping(50)
print("\n🕷️ Sample scraped data:")
print(scraped_data.head())
print(f"\nData quality check - Missing values per column:")
print(scraped_data.isnull().sum())

# 3. Public API data
api_data = data_connector.fetch_public_api_data()
if api_data:
    print("\n📡 Public API data:")
    print(f"Users shape: {api_data['users'].shape}")
    print(f"Posts shape: {api_data['posts'].shape}")
    print("\nSample user data:")
    print(api_data['users'][['name', 'email', 'address.city', 'company.name']].head())

### 4.1 Real Data Sources Integration

## 4. Real-World Data Integration & MLOps

**⏱️ Estimated time:** 50 minutes

**Learning objectives:**
- Connect to real data sources (APIs, databases, web scraping)
- Implement model persistence and versioning
- Apply MLOps principles for production deployments
- Handle data quality monitoring and model drift detection

# Machine Learning & AI 101: Complete Professional Training

🎯 **Welcome to the most comprehensive ML/AI training for data professionals!**

This enhanced notebook transforms beginners into competent ML practitioners through:
- **Systematic skill building** with measurable learning outcomes
- **Real-world applications** using actual data sources and deployment techniques
- **Industry best practices** including MLOps, testing, and production considerations
- **Interactive assessments** to validate your progress

---

## 📋 Learning Objectives

By completing this training, you will:

1. **Master data preprocessing pipelines** for production-ready ML systems
2. **Implement robust model evaluation** with proper validation strategies
3. **Build end-to-end ML applications** with real data sources and deployment
4. **Apply MLOps principles** for model versioning, monitoring, and maintenance
5. **Handle ethical considerations** including bias detection and fairness metrics
6. **Debug common ML issues** and optimize model performance

**Estimated completion time:** 8-12 hours (can be completed in modules)

---

## 📊 Prerequisites & Environment Setup

### Required Knowledge
- [ ] Basic Python programming (functions, classes, data structures)
- [ ] Elementary statistics (mean, variance, distributions)
- [ ] High school mathematics (algebra, basic calculus helpful but not required)

### Success Criteria
- [ ] Complete all checkpoint assessments with 70%+ scores
- [ ] Successfully implement at least 2 end-to-end projects
- [ ] Demonstrate ability to debug and optimize ML models

Let's verify your environment and begin your professional ML journey! 🚀

## 1. Environment Setup & Validation

**⏱️ Estimated time:** 15 minutes

**Learning objectives:**
- Set up a reproducible ML environment
- Understand version management for ML projects
- Implement proper random seed management

In [2]:
# Environment setup with version tracking and reproducibility
import sys
import warnings
from datetime import datetime
import os

# Core libraries with version checking
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Machine Learning - Core
import sklearn
from sklearn.datasets import (
    load_iris, load_wine, load_breast_cancer, 
    make_classification, make_regression, make_blobs
)
from sklearn.model_selection import (
    train_test_split, cross_val_score, GridSearchCV, 
    RandomizedSearchCV, validation_curve, learning_curve,
    StratifiedKFold, KFold
)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler, 
    LabelEncoder, OneHotEncoder, PolynomialFeatures
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer

# Algorithms
from sklearn.linear_model import (
    LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
)
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, VotingClassifier
)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_curve, auc, roc_auc_score,
    silhouette_score, adjusted_rand_score
)

# Advanced libraries
try:
    import joblib
    HAS_JOBLIB = True
except ImportError:
    HAS_JOBLIB = False
    
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

# Notebook-wide configuration: silence warnings, reset plot styling, and
# widen/round pandas' console output.
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")
for _option, _value in (('display.max_columns', None), ('display.precision', 3)):
    pd.set_option(_option, _value)

# Single seed shared by every cell so results can be reproduced.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Report the interpreter and library versions this session is running on.
print("🔧 ML ENVIRONMENT VALIDATION")
print("=" * 50)
_core_versions = (
    ("Python", sys.version.split()[0]),
    ("NumPy", np.__version__),
    ("Pandas", pd.__version__),
    ("Scikit-learn", sklearn.__version__),
    ("Matplotlib", plt.matplotlib.__version__),
    ("Seaborn", sns.__version__),
)
for _label, _version in _core_versions:
    print(f"{_label} version: {_version}")

# Optional packages: show a tick when importable, otherwise the install hint.
print("\n📦 Optional Libraries:")
_optional_packages = (
    ("Joblib", HAS_JOBLIB, "joblib"),
    ("Requests", HAS_REQUESTS, "requests"),
)
for _label, _available, _package in _optional_packages:
    _status = '✅' if _available else f'❌ (pip install {_package})'
    print(f"{_label} available: {_status}")

print(f"\n🎲 Random state set to: {RANDOM_STATE}")
_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"📅 Session started: {_started_at}")
print("\n✅ Environment ready for ML training!")

🔧 ML ENVIRONMENT VALIDATION
Python version: 3.12.3
NumPy version: 2.3.3
Pandas version: 2.3.2
Scikit-learn version: 1.7.2
Matplotlib version: 3.10.6
Seaborn version: 0.13.2

📦 Optional Libraries:
Joblib available: ✅
Requests available: ✅

🎲 Random state set to: 42
📅 Session started: 2025-09-28 10:58:12

✅ Environment ready for ML training!


### 📝 Checkpoint 1: Environment Validation

**Quick Assessment (2 minutes):**

1. What is the purpose of setting a random state in ML projects?
2. Why do we suppress warnings in production ML code?
3. What happens if you don't manage package versions in ML projects?

<details>
<summary>Click for answers</summary>

1. **Random state ensures reproducibility** - same results across runs and different environments
2. **Suppress warnings to avoid clutter** in production logs, but keep them during development
3. **Version mismatches can cause** model performance changes, crashes, or different results
</details>

## 2. Data Fundamentals & Professional Preprocessing

**⏱️ Estimated time:** 45 minutes

**Learning objectives:**
- Master production-ready data preprocessing pipelines
- Handle missing data with advanced strategies
- Implement feature engineering and validation
- Understand data leakage and prevention

### 2.1 Advanced Data Loading & Validation

In [None]:
# Professional data validation and quality assessment
def validate_dataset(df, name="Dataset"):
    """Print a data-quality report for ``df`` and return its missing-value table.

    The report covers shape, deep memory usage, a count of columns per
    dtype, missing values, duplicated rows, and the min/max of every
    numeric column.

    Args:
        df: DataFrame to inspect.
        name: Label used in the report heading.

    Returns:
        DataFrame indexed by column name with 'Missing' (count) and
        'Percentage' columns, sorted by 'Missing' in descending order.
    """
    row_count = len(df)

    print(f"\n📊 {name} Validation Report")
    print("=" * 40)

    # Size of the frame
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # How many columns share each dtype
    print("\n📋 Data Types:")
    for dtype, count in df.dtypes.value_counts().items():
        print(f"  {dtype}: {count} columns")

    # Per-column null counts and their share of all rows
    null_counts = df.isnull().sum()
    null_share = (null_counts / row_count) * 100
    missing_df = pd.DataFrame({
        'Missing': null_counts,
        'Percentage': null_share
    }).sort_values('Missing', ascending=False)

    if null_counts.sum() > 0:
        print("\n⚠️ Missing Data:")
        print(missing_df[missing_df['Missing'] > 0])
    else:
        print("\n✅ No missing data found")

    # Fully repeated rows
    duplicates = df.duplicated().sum()
    print(f"\n🔄 Duplicate rows: {duplicates} ({duplicates/row_count*100:.1f}%)")

    # Value range of each numeric column
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\n📈 Numeric columns: {len(numeric_cols)}")
        print("Range check:")
        for col in numeric_cols:
            series = df[col]
            print(f"  {col}: [{series.min():.3f}, {series.max():.3f}]")

    return missing_df

# Create a comprehensive synthetic dataset for demonstration.
# Builds `df_raw`: 1000 synthetic customers with demographic, behavioral and
# technical features, then injects missing values and outliers so the later
# cells have realistic data-quality problems to work on.
np.random.seed(RANDOM_STATE)

n_samples = 1000
synthetic_data = {
    # Demographic features
    'customer_id': range(1, n_samples + 1),
    'age': np.random.randint(18, 80, n_samples),
    'income': np.random.lognormal(10.5, 0.8, n_samples),  # More realistic income distribution
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples, p=[0.3, 0.4, 0.2, 0.1]),
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples),

    # Behavioral features
    'monthly_spend': np.random.gamma(2, 50, n_samples),
    'num_purchases': np.random.poisson(8, n_samples),
    'days_since_last_purchase': np.random.exponential(10, n_samples),
    'customer_rating': np.random.choice([1, 2, 3, 4, 5], n_samples, p=[0.05, 0.1, 0.2, 0.4, 0.25]),

    # Technical features
    'website_visits': np.random.negative_binomial(10, 0.3, n_samples),
    'mobile_app_usage': np.random.beta(2, 5, n_samples) * 100,  # Percentage
    'email_open_rate': np.random.beta(3, 7, n_samples),
}

# Add missing values realistically (missing not at random)
df_raw = pd.DataFrame(synthetic_data)

# customer_rating is generated as integers; cast to float before writing NaN
# into it so the column is not silently upcast by the assignment below.
df_raw['customer_rating'] = df_raw['customer_rating'].astype(float)

# Each random draw below must be one value PER ROW: the boolean group mask
# has len(df_raw) entries, and combining it with a shorter array (one draw per
# group member) fails with a length mismatch.

# Income missing for younger customers (survey bias)
young_mask = df_raw['age'] < 25
df_raw.loc[young_mask & (np.random.random(len(df_raw)) < 0.3), 'income'] = np.nan

# Rating missing for customers with very few purchases
low_purchase_mask = df_raw['num_purchases'] <= 2
df_raw.loc[low_purchase_mask & (np.random.random(len(df_raw)) < 0.4), 'customer_rating'] = np.nan

# App usage missing for older customers
old_mask = df_raw['age'] > 65
df_raw.loc[old_mask & (np.random.random(len(df_raw)) < 0.6), 'mobile_app_usage'] = np.nan

# Add some extreme outliers
outlier_indices = np.random.choice(df_raw.index, 20, replace=False)
df_raw.loc[outlier_indices, 'monthly_spend'] *= 10  # Very high spenders

# Validate the dataset
validation_report = validate_dataset(df_raw, "Customer Dataset")

print("\n🎯 First 5 rows:")
print(df_raw.head())

### 2.2 Professional Missing Data Handling

In [None]:
# Advanced missing data analysis and handling strategies

def analyze_missing_patterns(df):
    """Analyze patterns in missing data.

    Prints the five most frequent combinations of missing columns and draws
    three plots: the correlation between the columns' missing-indicators,
    the missing count per column, and a row-by-column missingness map of
    the first 100 rows.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with one boolean column per input column plus a 'count'
        column giving how many rows share that missingness pattern, sorted
        by 'count' in descending order.
    """
    print("🔍 Missing Data Pattern Analysis")
    print("=" * 40)

    # Boolean mask computed once and reused by every step below.
    missing_df = df.isnull()

    # Missing data patterns: group rows by their full True/False signature.
    missing_patterns = (
        missing_df
        .groupby(list(missing_df.columns))
        .size()
        .reset_index()
        .rename(columns={0: 'count'})
        .sort_values('count', ascending=False)
    )

    print("Top 5 missing patterns:")
    for i, (_, row) in enumerate(missing_patterns.head().iterrows()):
        pattern = [col for col in missing_df.columns if row[col]]
        if pattern:
            print(f"  {i+1}. Missing {pattern}: {row['count']} rows")
        else:
            print(f"  {i+1}. Complete data: {row['count']} rows")

    # Correlation between missing-value indicators (0/1 per cell); computed
    # once here and used directly by the heatmap.
    missing_corr = missing_df.astype(int).corr()

    # Visualize missing data patterns
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Missing data heatmap
    sns.heatmap(missing_corr, annot=True, cmap='RdYlBu_r', center=0, ax=axes[0])
    axes[0].set_title('Missing Data Correlation')

    # Missing data by column (only columns that actually have gaps)
    missing_counts = missing_df.sum().sort_values(ascending=True)
    missing_counts = missing_counts[missing_counts > 0]
    if len(missing_counts) > 0:
        missing_counts.plot(kind='barh', ax=axes[1])
        axes[1].set_title('Missing Values by Column')
        axes[1].set_xlabel('Number of Missing Values')

    # Missing data pattern visualization
    missing_vis = missing_df.head(100).astype(int)  # Sample for visualization
    sns.heatmap(missing_vis.T, cbar=True, cmap='RdYlBu_r', ax=axes[2])
    axes[2].set_title('Missing Data Pattern (First 100 rows)')
    axes[2].set_xlabel('Row Index')

    plt.tight_layout()
    plt.show()

    return missing_patterns

# Multiple imputation strategies
def compare_imputation_strategies(df, target_col):
    """Compare different imputation strategies for one numeric column.

    Fills the gaps in ``target_col`` with mean, median and KNN (k=5)
    imputation and prints how each choice shifts the column's mean and
    standard deviation. KNN is fitted on all numeric columns of ``df``;
    mean/median use the target column alone.

    Args:
        df: DataFrame containing the column.
        target_col: Name of the column to impute.

    Returns:
        Dict mapping strategy name to ``{'mean', 'std', 'mean_diff',
        'std_diff'}``, or None (after printing the reason) when the column
        does not exist, has no missing values, or is not numeric.
    """
    print(f"\n🔧 Imputation Strategy Comparison for '{target_col}'")
    print("=" * 50)

    # Guard clauses: report each "nothing to compare" case with its own
    # message instead of one misleading text (or a crash) for all of them.
    if target_col not in df.columns:
        print(f"Column '{target_col}' not found in dataset")
        return None

    missing_count = df[target_col].isnull().sum()
    if missing_count == 0:
        print(f"Column '{target_col}' has no missing values")
        return None

    # Prepare features for KNN imputation
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if target_col not in numeric_cols:
        print(f"Column '{target_col}' is not numeric; these strategies do not apply")
        return None
    df_numeric = df[numeric_cols]

    # Original statistics (pandas skips NaN here, so these describe the
    # observed values only)
    original_mean = df[target_col].mean()
    original_std = df[target_col].std()

    print(f"Original - Mean: {original_mean:.3f}, Std: {original_std:.3f}")
    print(f"Missing values: {missing_count} ({missing_count/len(df)*100:.1f}%)")

    strategies = {
        'Mean': SimpleImputer(strategy='mean'),
        'Median': SimpleImputer(strategy='median'),
        'KNN (k=5)': KNNImputer(n_neighbors=5)
    }

    results = {}

    for name, imputer in strategies.items():
        if name.startswith('KNN'):
            # KNN needs all numeric data: neighbours are found across every
            # numeric column, then only the target column is read back.
            imputed = pd.DataFrame(
                imputer.fit_transform(df_numeric),
                columns=df_numeric.columns,
                index=df_numeric.index
            )[target_col]
        else:
            # Simple imputation works on the target column in isolation.
            imputed = pd.Series(
                imputer.fit_transform(df_numeric[[target_col]]).ravel(),
                index=df_numeric.index
            )

        imputed_mean = imputed.mean()
        imputed_std = imputed.std()

        results[name] = {
            'mean': imputed_mean,
            'std': imputed_std,
            'mean_diff': abs(imputed_mean - original_mean),
            'std_diff': abs(imputed_std - original_std)
        }

        print(f"{name:12} - Mean: {imputed_mean:.3f}, Std: {imputed_std:.3f}")

    return results

# Analyze missing patterns: prints the most frequent missingness patterns,
# shows the three diagnostic plots, and keeps the pattern table for reuse.
missing_patterns = analyze_missing_patterns(df_raw)

# Compare imputation strategies for income. The result is a dict of
# per-strategy statistics, or None when the function returns early.
imputation_results = compare_imputation_strategies(df_raw, 'income')

### 2.3 Production-Ready Preprocessing Pipeline

In [None]:
# Professional preprocessing pipeline using sklearn

class MLPreprocessor:
    """Production-ready preprocessing pipeline.

    Wraps a scikit-learn ``ColumnTransformer``:

    * numeric columns    -> ``KNNImputer(n_neighbors=5)`` + ``StandardScaler``
    * categorical columns -> constant ``'unknown'`` imputation + one-hot
      encoding (first level dropped, unknown levels ignored)
    * numeric columns whose name contains ``'id'`` are treated as
      identifiers and dropped.

    Outlier handling is detection-only: ``fit`` records per-column bounds in
    ``outlier_bounds`` and prints counts, but the data itself is not clipped
    or filtered.
    """

    def __init__(self, handle_outliers=True, outlier_method='iqr'):
        """Store configuration; nothing is fitted until ``fit`` is called.

        Args:
            handle_outliers: When True, ``fit`` runs outlier detection on
                every numeric feature.
            outlier_method: ``'iqr'`` for the 1.5*IQR rule; any other value
                selects the 3-standard-deviation rule.
        """
        self.handle_outliers = handle_outliers
        self.outlier_method = outlier_method
        self.preprocessor = None      # fitted ColumnTransformer
        self.feature_names = None     # output column names, set by fit()
        self.outlier_bounds = {}      # column -> (lower_bound, upper_bound)

    def detect_outliers(self, X, column, method='iqr'):
        """Detect outliers using IQR or z-score method.

        Also records the computed bounds in ``self.outlier_bounds[column]``.

        Args:
            X: DataFrame holding the column.
            column: Name of the numeric column to check.
            method: ``'iqr'`` or anything else for the z-score rule.

        Returns:
            Boolean Series, True where the value lies outside the bounds.
        """
        values = X[column]
        if method == 'iqr':
            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
        else:  # z-score
            mean = values.mean()
            std = values.std()
            lower_bound = mean - 3 * std
            upper_bound = mean + 3 * std

        self.outlier_bounds[column] = (lower_bound, upper_bound)
        outliers = (values < lower_bound) | (values > upper_bound)
        return outliers

    def fit(self, X, y=None):
        """Fit the preprocessing pipeline.

        Args:
            X: DataFrame of raw features.
            y: Ignored; accepted for scikit-learn style call compatibility.

        Returns:
            self, so calls can be chained.
        """
        X_copy = X.copy()

        # Separate numeric and categorical columns
        numeric_features = X_copy.select_dtypes(include=[np.number]).columns.tolist()
        categorical_features = X_copy.select_dtypes(include=['object', 'category']).columns.tolist()

        # Remove ID columns if present
        # NOTE(review): this is a substring match, so a real feature such as
        # 'paid_amount' or 'width' would also be excluded - confirm that
        # this heuristic suits the column names in use.
        id_columns = [col for col in numeric_features if 'id' in col.lower()]
        numeric_features = [col for col in numeric_features if col not in id_columns]

        print(f"Identified features:")
        print(f"  Numeric: {len(numeric_features)} - {numeric_features}")
        print(f"  Categorical: {len(categorical_features)} - {categorical_features}")
        print(f"  ID columns (excluded): {id_columns}")

        # Handle outliers in numeric features (report only; data unchanged)
        if self.handle_outliers:
            print(f"\n🎯 Outlier Detection ({self.outlier_method} method):")
            for col in numeric_features:
                outliers = self.detect_outliers(X_copy, col, self.outlier_method)
                outlier_count = outliers.sum()
                if outlier_count > 0:
                    print(f"  {col}: {outlier_count} outliers ({outlier_count/len(X_copy)*100:.1f}%)")

        # Create preprocessing pipelines
        numeric_transformer = Pipeline(steps=[
            ('imputer', KNNImputer(n_neighbors=5)),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
            ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
        ])

        # Combine transformers
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            remainder='drop'  # Drop ID columns
        )

        # Fit the preprocessor
        self.preprocessor.fit(X_copy)

        # Store feature names for later use.
        # The encoder is only consulted when there are categorical columns.
        # With none there is nothing to name, and any real failure is allowed
        # to surface here rather than being swallowed, which would leave
        # feature_names shorter than the transformed matrix.
        if categorical_features:
            categorical_feature_names = (
                self.preprocessor
                .named_transformers_['cat']
                .named_steps['encoder']
                .get_feature_names_out(categorical_features)
            )
        else:
            categorical_feature_names = []

        self.feature_names = list(numeric_features) + list(categorical_feature_names)

        print(f"\n✅ Preprocessing pipeline fitted")
        print(f"Final feature count: {len(self.feature_names)}")

        return self

    def transform(self, X):
        """Transform the data using fitted pipeline.

        Args:
            X: DataFrame with the columns seen during ``fit``.

        Returns:
            DataFrame of transformed features, indexed like ``X`` and with
            columns named by ``self.feature_names``.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if self.preprocessor is None:
            raise ValueError("Pipeline not fitted. Call fit() first.")

        X_transformed = self.preprocessor.transform(X)

        # Convert to DataFrame with proper column names
        return pd.DataFrame(X_transformed, columns=self.feature_names, index=X.index)

    def fit_transform(self, X, y=None):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def get_feature_importance_mapping(self):
        """Return the fitted output feature names and recorded outlier bounds.

        Returns:
            Dict with keys ``'feature_names'`` and ``'outlier_bounds'``.
        """
        return {
            'feature_names': self.feature_names,
            'outlier_bounds': self.outlier_bounds
        }

# Apply the preprocessing pipeline to the raw customer data built earlier
# (df_raw) and report what came out.
print("🔧 PROFESSIONAL PREPROCESSING PIPELINE")
print("=" * 50)

# Initialize and fit the preprocessor (IQR rule for the outlier report)
preprocessor = MLPreprocessor(handle_outliers=True, outlier_method='iqr')

# Exclude customer_id for preprocessing: it is an identifier, not a feature
feature_columns = [col for col in df_raw.columns if col != 'customer_id']
X_raw = df_raw[feature_columns]

# Fit and transform in one call; the result is a DataFrame whose columns are
# the fitted feature names and whose index matches X_raw
X_processed = preprocessor.fit_transform(X_raw)

print(f"\n📊 Transformation Results:")
print(f"Original shape: {X_raw.shape}")
print(f"Processed shape: {X_processed.shape}")
print(f"Features created: {list(X_processed.columns)}")

# Validate the processed data (the returned missing-value table is not kept)
validate_dataset(X_processed, "Processed Dataset")

### 📝 Checkpoint 2: Data Preprocessing

**Assessment Questions (5 minutes):**

1. Why is KNN imputation often better than mean/median imputation?
2. What is data leakage and how does proper train/test splitting prevent it?
3. When would you use RobustScaler instead of StandardScaler?
4. What are the risks of dropping rows with missing values?

**Practical Exercise:**
Modify the preprocessing pipeline to:
- Use different imputation strategies for different columns
- Add polynomial features for numeric variables
- Implement custom outlier handling

## 3. Advanced Model Evaluation & Validation

**⏱️ Estimated time:** 40 minutes

**Learning objectives:**
- Implement robust cross-validation strategies
- Understand bias-variance tradeoff
- Master hyperparameter optimization
- Detect overfitting and model selection

### 3.1 Cross-Validation Strategies

In [None]:
# Advanced cross-validation and model evaluation

def comprehensive_model_evaluation(X, y, models, cv_strategy='stratified', n_splits=5, test_size=0.2):
    """Cross-validate each model and score it on a held-out test split.

    The task type is inferred from ``y``: a target with fewer than 20 distinct
    values and a discrete dtype (boolean, any signed/unsigned integer width,
    string, or object) is treated as classification; everything else is
    treated as regression. Float-coded class labels therefore still count as
    regression.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target values (pandas Series, NumPy array, or other array-like).
        models: Mapping of display name -> unfitted estimator. Each estimator
            is fitted in place on the training split and returned in the
            results.
        cv_strategy: 'stratified' selects StratifiedKFold for classification;
            any other value selects KFold. Regression always uses KFold.
        n_splits: Number of cross-validation folds.
        test_size: Fraction of rows held out for the final test evaluation.

    Returns:
        Tuple ``(results, X_test)``. ``results`` maps each model name to a dict
        holding, per cross-validation metric, ``{'mean', 'std', 'scores'}``,
        plus the test-set metrics (``test_accuracy``/``test_precision``/
        ``test_recall``/``test_f1`` or ``test_mse``/``test_mae``/``test_r2``),
        the fitted ``model``, ``y_test`` and ``y_pred``. ``X_test`` is the
        held-out feature split.
    """
    
    print("🎯 COMPREHENSIVE MODEL EVALUATION")
    print("=" * 50)
    
    # Determine if this is classification or regression.
    # dtype.kind covers every integer width ('i'/'u'), bool ('b'), strings
    # ('U'/'S') and object ('O'); comparing against the literal name 'int64'
    # missed e.g. int32 targets and silently switched them to regression.
    # np.asarray also lets plain lists through, which have no .dtype.
    target_kind = np.asarray(y).dtype.kind
    is_classification = len(np.unique(y)) < 20 and target_kind in 'biuOUS'
    
    print(f"Problem type: {'Classification' if is_classification else 'Regression'}")
    print(f"Cross-validation: {cv_strategy} {n_splits}-fold")
    
    # Split data for final evaluation (stratified only when classifying)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=RANDOM_STATE, 
        stratify=y if is_classification else None
    )
    
    # Choose cross-validation strategy and the metrics that match the task
    if is_classification:
        if cv_strategy == 'stratified':
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
        else:
            cv = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
        scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc_ovr']
    else:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
        # scikit-learn reports error metrics negated so that larger is better
        scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']
    
    results = {}
    
    for name, model in models.items():
        print(f"\n🔍 Evaluating {name}...")
        
        model_results = {'name': name}
        
        # Cross-validation scores, one metric at a time so that a metric a
        # model cannot produce (e.g. no probability output) does not abort
        # the others.
        for metric in scoring_metrics:
            try:
                scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=metric)
                model_results[metric] = {
                    'mean': scores.mean(),
                    'std': scores.std(),
                    'scores': scores
                }
                print(f"  {metric}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
            except Exception as e:
                # Best effort: record the failure as NaN and keep going
                print(f"  {metric}: Error - {e}")
                model_results[metric] = {'mean': np.nan, 'std': np.nan, 'scores': []}
        
        # Final model evaluation on test set (fits the caller's estimator in place)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        if is_classification:
            model_results['test_accuracy'] = accuracy_score(y_test, y_pred)
            model_results['test_precision'] = precision_score(y_test, y_pred, average='macro', zero_division=0)
            model_results['test_recall'] = recall_score(y_test, y_pred, average='macro', zero_division=0)
            model_results['test_f1'] = f1_score(y_test, y_pred, average='macro', zero_division=0)
        else:
            model_results['test_mse'] = mean_squared_error(y_test, y_pred)
            model_results['test_mae'] = mean_absolute_error(y_test, y_pred)
            model_results['test_r2'] = r2_score(y_test, y_pred)
        
        model_results['model'] = model
        model_results['y_test'] = y_test
        model_results['y_pred'] = y_pred
        
        results[name] = model_results
    
    return results, X_test

# Create target variables for demonstration:
# customer lifetime value (regression) and a derived high-value flag.
# Re-seed NumPy's global generator so the np.random.normal noise drawn inside
# create_target_variable() below is reproducible, provided nothing else
# consumes np.random between this line and that call.
np.random.seed(RANDOM_STATE)

# Create a realistic target based on features
def create_target_variable(df):
    """Build a synthetic regression target and a derived binary target.

    Reads the columns 'income', 'age', 'monthly_spend', 'num_purchases' and
    'customer_rating' from ``df``. Missing incomes are replaced by the column
    median and missing ratings by 3; the other columns are used as-is.
    Gaussian noise (std 200) is drawn from NumPy's global random generator,
    so seed it beforehand for reproducible output.

    Returns:
        (clv, high_value): ``clv`` is the noisy lifetime value floored at 100;
        ``high_value`` is 1 where ``clv`` exceeds its own 70th percentile and
        0 elsewhere.
    """
    baseline = 1000

    # Inputs with gaps are filled before use
    income_filled = df['income'].fillna(df['income'].median())
    rating_filled = df['customer_rating'].fillna(3)
    age = df['age']

    # Additive contributions
    income_term = np.log1p(income_filled) / 10
    loyalty_term = np.log1p(df['num_purchases']) * 50
    rating_term = rating_filled * 100
    spend_term = np.log1p(df['monthly_spend']) / 2

    # Multiplier: 1.2 above 50, 0.8 below 30, 1.0 otherwise
    age_multiplier = np.select([age > 50, age < 30], [1.2, 0.8], default=1.0)

    lifetime_value = (
        (baseline + income_term + loyalty_term + rating_term) * age_multiplier
        + spend_term
    )
    lifetime_value = lifetime_value + np.random.normal(0, 200, len(df))
    lifetime_value = lifetime_value.clip(lower=100)  # never below 100

    # Binary label: top 30% of customers by lifetime value
    cutoff = lifetime_value.quantile(0.7)
    is_high_value = lifetime_value.gt(cutoff).astype(int)

    return lifetime_value, is_high_value

# Create targets from the raw (unprocessed) table; both targets are derived
# from columns of df_raw, so they are functions of the model inputs plus noise.
clv_target, high_value_target = create_target_variable(df_raw)

print("🎯 Target Variables Created:")
print(f"CLV (regression): Mean={clv_target.mean():.0f}, Std={clv_target.std():.0f}")
print(f"High-value customer (classification): {high_value_target.mean():.1%} positive class")

# Define models for evaluation. Every estimator that takes a seed receives
# RANDOM_STATE.
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0, random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'Neural Network': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=RANDOM_STATE)
}

classification_models = {
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    # probability=True gives SVC a predict_proba output, which the
    # 'roc_auc_ovr' metric requested by comprehensive_model_evaluation can use
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=RANDOM_STATE)
}

# Evaluate regression models
# (cv_strategy only affects classification; regression always uses KFold)
print("\n" + "="*60)
print("REGRESSION EVALUATION (Customer Lifetime Value)")
regression_results, X_test_reg = comprehensive_model_evaluation(
    X_processed, clv_target, regression_models, cv_strategy='standard'
)

# Evaluate classification models
print("\n" + "="*60)
print("CLASSIFICATION EVALUATION (High-Value Customer)")
classification_results, X_test_clf = comprehensive_model_evaluation(
    X_processed, high_value_target, classification_models, cv_strategy='stratified'
)

### 3.2 Learning Curves & Bias-Variance Analysis

In [None]:
# Advanced learning curves and bias-variance analysis

def plot_learning_curves(models, X, y, title_suffix=""):
    """Plot learning curves to diagnose overfitting/underfitting.

    One subplot is drawn per model on a grid with two rows, showing mean
    training and validation scores (with a +/- one standard deviation band)
    against the training-set size, using 5-fold cross-validation. Each
    subplot is annotated with a coarse diagnosis based on the scores at the
    largest training size.

    Args:
        models: Non-empty mapping of display name -> unfitted estimator.
        X: Feature matrix.
        y: Target values.
        title_suffix: Text appended to the figure title.

    Raises:
        ValueError: If ``models`` is empty.
    """
    
    n_models = len(models)
    if n_models == 0:
        raise ValueError("plot_learning_curves requires at least one model")
    
    # squeeze=False always returns a 2-D array of Axes, so flatten() works for
    # any number of models (a single model previously hit list.flatten()).
    _fig, axes = plt.subplots(2, (n_models + 1) // 2, figsize=(15, 8), squeeze=False)
    axes = axes.flatten()
    
    for idx, (name, model) in enumerate(models.items()):
        # Calculate learning curves (default scorer of the estimator)
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y, cv=5, n_jobs=-1, 
            train_sizes=np.linspace(0.1, 1.0, 10),
            random_state=RANDOM_STATE
        )
        
        # Calculate means and standard deviations across the CV folds
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        
        # Plot learning curves
        axes[idx].plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
        axes[idx].fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
        
        axes[idx].plot(train_sizes, val_mean, 'o-', color='red', label='Validation score')
        axes[idx].fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
        
        axes[idx].set_title(f'{name} Learning Curve')
        axes[idx].set_xlabel('Training Set Size')
        axes[idx].set_ylabel('Score')
        axes[idx].legend(loc='best')
        axes[idx].grid(True, alpha=0.3)
        
        # Analyze bias-variance at the largest training size
        final_train_score = train_mean[-1]
        final_val_score = val_mean[-1]
        gap = final_train_score - final_val_score
        
        # Add diagnosis text. The 0.1 gap and 0.7 score thresholds are
        # heuristics that assume scores on a roughly 0-1 scale.
        if gap > 0.1:
            diagnosis = "High Variance (Overfitting)"
            color = 'red'
        elif final_val_score < 0.7:  # Assuming scores are 0-1
            diagnosis = "High Bias (Underfitting)"
            color = 'orange'
        else:
            diagnosis = "Good Balance"
            color = 'green'
        
        axes[idx].text(0.05, 0.95, diagnosis, transform=axes[idx].transAxes, 
                      bbox=dict(boxstyle='round', facecolor=color, alpha=0.3),
                      verticalalignment='top')
    
    # Hide grid cells that have no model assigned
    for idx in range(n_models, len(axes)):
        axes[idx].set_visible(False)
    
    plt.suptitle(f'Learning Curves Analysis {title_suffix}', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

def plot_validation_curves(model, X, y, param_name, param_range, title=""):
    """Plot validation curves for hyperparameter tuning.

    Scores ``model`` with 5-fold cross-validation for every value in
    ``param_range`` and plots mean training/validation scores with a +/- one
    standard deviation band. Scoring is 'accuracy' when ``y`` has fewer than
    20 distinct values and 'r2' otherwise.

    Args:
        model: Unfitted estimator.
        X: Feature matrix.
        y: Target values.
        param_name: Name of the estimator parameter to vary.
        param_range: Sequence of values to try for ``param_name``.
        title: Text appended to the plot title.

    Returns:
        The entry of ``param_range`` with the highest mean validation score.
        Values whose score is NaN (failed fits) are ignored.

    Raises:
        ValueError: If every value in ``param_range`` produced a NaN score.
    """
    
    train_scores, val_scores = validation_curve(
        model, X, y, param_name=param_name, param_range=param_range,
        cv=5, scoring='accuracy' if len(np.unique(y)) < 20 else 'r2'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    # Find optimal parameter. np.argmax would return the position of a NaN
    # (a failed fit) as the "best" value, so NaNs are skipped explicitly.
    if np.all(np.isnan(val_mean)):
        raise ValueError(
            f"No valid validation score for any value of '{param_name}'"
        )
    optimal_idx = int(np.nanargmax(val_mean))
    optimal_param = param_range[optimal_idx]
    
    plt.figure(figsize=(10, 6))
    plt.plot(param_range, train_mean, 'o-', color='blue', label='Training score')
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    
    plt.plot(param_range, val_mean, 'o-', color='red', label='Validation score')
    plt.fill_between(param_range, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
    
    plt.title(f'Validation Curve: {title}')
    plt.xlabel(param_name)
    plt.ylabel('Score')
    plt.grid(True, alpha=0.3)
    
    # Mark the best value, then build the legend once so it includes the marker
    plt.axvline(optimal_param, color='green', linestyle='--', alpha=0.7, 
                label=f'Optimal: {optimal_param}')
    plt.legend(loc='best')
    
    plt.show()
    
    return optimal_param

# Plot learning curves for classification models.
# These are fresh, unfitted estimators (not the ones evaluated earlier); the
# network here uses a single hidden layer of 100 units.
selected_clf_models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'SVM': SVC(random_state=RANDOM_STATE),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=RANDOM_STATE)
}

plot_learning_curves(selected_clf_models, X_processed, high_value_target, "(Classification)")

# Plot learning curves for regression models
selected_reg_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'Ridge': Ridge(alpha=1.0, random_state=RANDOM_STATE)
}

plot_learning_curves(selected_reg_models, X_processed, clv_target, "(Regression)")

# Validation curves for hyperparameter tuning: vary one parameter at a time
# and keep the value with the best mean validation score.
print("\n🔧 Hyperparameter Optimization Examples")
print("=" * 50)

# Random Forest n_estimators (scored with accuracy, since the target is binary)
rf_param_range = [10, 25, 50, 100, 200, 300]
optimal_n_estimators = plot_validation_curves(
    RandomForestClassifier(random_state=RANDOM_STATE),
    X_processed, high_value_target,
    'n_estimators', rf_param_range,
    'Random Forest - Number of Estimators'
)

print(f"Optimal n_estimators for Random Forest: {optimal_n_estimators}")

# Ridge regression alpha. The candidates span four orders of magnitude but
# plot_validation_curves draws them on a linear x-axis, so the small values
# appear bunched together at the left edge.
ridge_param_range = [0.01, 0.1, 1.0, 10.0, 100.0]
optimal_alpha = plot_validation_curves(
    Ridge(random_state=RANDOM_STATE),
    X_processed, clv_target,
    'alpha', ridge_param_range,
    'Ridge Regression - Alpha (Regularization)'
)

print(f"Optimal alpha for Ridge Regression: {optimal_alpha}")

### 3.3 Advanced Hyperparameter Optimization

In [None]:
# Professional hyperparameter optimization

def advanced_hyperparameter_tuning(X, y, model_class, param_distributions, 
                                 search_type='random', n_iter=50, cv=5):
    """Tune an estimator with grid or randomized search and test the winner.

    The data is split 80/20; the search runs with cross-validation on the
    training part and the best estimator is then scored on the held-out part.
    Classifiers are scored with accuracy and regressors with R². The task type
    is taken from the estimator itself; only when the estimator declares
    neither type does it fall back to "fewer than 20 distinct target values
    means classification".

    Args:
        X: Feature matrix (DataFrame or array-like).
        y: Target values.
        model_class: Estimator class that can be constructed without
            arguments. ``random_state`` is set to RANDOM_STATE only if the
            estimator exposes that parameter.
        param_distributions: Parameter grid / distributions for the search.
        search_type: 'grid' for GridSearchCV; anything else uses
            RandomizedSearchCV.
        n_iter: Number of sampled candidates (randomized search only).
        cv: Number of cross-validation folds.

    Returns:
        dict with keys 'best_model', 'best_params', 'best_score' (mean CV
        score), 'test_score', 'search_time' (seconds) and 'cv_results'.
    """
    # Local import keeps this cell self-contained
    from sklearn.base import is_classifier, is_regressor
    
    print(f"\n🎯 {search_type.upper()} HYPERPARAMETER SEARCH")
    print("=" * 50)
    print(f"Model: {model_class.__name__}")
    print(f"Search iterations: {n_iter}")
    print(f"Cross-validation folds: {cv}")
    
    # Build the estimator once. Passing random_state unconditionally raised
    # TypeError for estimators without that parameter (e.g. k-NN).
    estimator = model_class()
    if 'random_state' in estimator.get_params(deep=False):
        estimator.set_params(random_state=RANDOM_STATE)
    
    # Determine scoring metric from the estimator type, so a regressor with a
    # few-valued target is not scored with accuracy (and vice versa).
    if is_classifier(estimator):
        is_classification = True
    elif is_regressor(estimator):
        is_classification = False
    else:
        is_classification = len(np.unique(y)) < 20
    scoring = 'accuracy' if is_classification else 'r2'
    
    # Split data (stratified for classification to preserve class balance)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE,
        stratify=y if is_classification else None
    )
    
    # Initialize search
    if search_type == 'grid':
        search = GridSearchCV(
            estimator,
            param_distributions,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            verbose=1
        )
    else:  # random search
        search = RandomizedSearchCV(
            estimator,
            param_distributions,
            n_iter=n_iter,
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            random_state=RANDOM_STATE,
            verbose=1
        )
    
    # Perform search
    start_time = datetime.now()
    search.fit(X_train, y_train)
    search_time = (datetime.now() - start_time).total_seconds()
    
    # Results
    print(f"\n⏱️ Search completed in {search_time:.1f} seconds")
    print(f"🏆 Best cross-validation score: {search.best_score_:.4f}")
    print(f"🎯 Best parameters: {search.best_params_}")
    
    # Test the best model (refit on the full training split by the search)
    best_model = search.best_estimator_
    test_pred = best_model.predict(X_test)
    
    if is_classification:
        test_score = accuracy_score(y_test, test_pred)
        print(f"🎯 Test accuracy: {test_score:.4f}")
    else:
        test_score = r2_score(y_test, test_pred)
        print(f"🎯 Test R² score: {test_score:.4f}")
    
    # Feature importance (if available)
    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        # Plain arrays have no column labels; fall back to positional names
        feature_names = getattr(X, 'columns', None)
        if feature_names is None:
            feature_names = [f"feature_{i}" for i in range(len(importances))]
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        print(f"\n🔍 Top 5 Feature Importances:")
        for _, row in feature_importance.head().iterrows():
            print(f"  {row['feature']}: {row['importance']:.4f}")
    
    return {
        'best_model': best_model,
        'best_params': search.best_params_,
        'best_score': search.best_score_,
        'test_score': test_score,
        'search_time': search_time,
        'cv_results': search.cv_results_
    }

# Define parameter distributions for different models.
# All entries are explicit lists, so the randomized search samples from a
# finite grid of combinations.

# Random forest: 4 * 4 * 3 * 3 * 3 = 432 combinations. Shared by the
# classifier and the regressor searches below.
rf_param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# SVM: 4 * 6 * 3 = 72 combinations
svm_param_dist = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# MLP grid: defined here but not passed to any of the searches in this cell
mlp_param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (200, 100), (100, 100, 50)],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [1000, 2000]
}

# Hyperparameter tuning for classification
print("🔬 CLASSIFICATION HYPERPARAMETER OPTIMIZATION")
print("=" * 60)

# Random Forest Classification: sample 30 of the 432 combinations
rf_clf_results = advanced_hyperparameter_tuning(
    X_processed, high_value_target,
    RandomForestClassifier,
    rf_param_dist,
    search_type='random',
    n_iter=30
)

# SVM Classification: sample 20 of the 72 combinations
svm_clf_results = advanced_hyperparameter_tuning(
    X_processed, high_value_target,
    SVC,
    svm_param_dist,
    search_type='random',
    n_iter=20
)

# Hyperparameter tuning for regression
print("\n\n🔬 REGRESSION HYPERPARAMETER OPTIMIZATION")
print("=" * 60)

# Random Forest Regression (same grid as the classifier)
rf_reg_results = advanced_hyperparameter_tuning(
    X_processed, clv_target,
    RandomForestRegressor,
    rf_param_dist,
    search_type='random',
    n_iter=30
)

# Compare optimization results.
# Scores are accuracy for the two classifiers and R² for the regressor, so
# the rows are on different scales and should not be ranked against each other.
print("\n\n📊 HYPERPARAMETER OPTIMIZATION SUMMARY")
print("=" * 60)

results_summary = pd.DataFrame([
    {
        'Model': 'Random Forest (Clf)',
        'Best CV Score': rf_clf_results['best_score'],
        'Test Score': rf_clf_results['test_score'],
        'Search Time (s)': rf_clf_results['search_time']
    },
    {
        'Model': 'SVM (Clf)',
        'Best CV Score': svm_clf_results['best_score'],
        'Test Score': svm_clf_results['test_score'],
        'Search Time (s)': svm_clf_results['search_time']
    },
    {
        'Model': 'Random Forest (Reg)',
        'Best CV Score': rf_reg_results['best_score'],
        'Test Score': rf_reg_results['test_score'],
        'Search Time (s)': rf_reg_results['search_time']
    }
])

print(results_summary.round(4))

### 📝 Checkpoint 3: Model Evaluation

**Assessment Questions (5 minutes):**

1. What does a large gap between training and validation scores indicate?
2. When would you use RandomizedSearchCV instead of GridSearchCV?
3. How do you interpret a learning curve that shows both training and validation scores plateauing at a low level?
4. What are the trade-offs between different cross-validation strategies?

**Practical Exercise:**
- Implement nested cross-validation for unbiased model selection
- Create custom scoring functions for business metrics
- Analyze the impact of different train/test split ratios