# 1: Initial Setup and Imports

# In [12]:
print('🚀 INITIALIZING COMPLETE F1 RACE STRATEGY ADVANCED ML PIPELINE...')

import pandas as pd
import numpy as np
import json
import os
import hashlib
import numpy as np
import pandas as pd
import traceback
from pandas import json_normalize
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
import mysql.connector
from mysql.connector import Error
import requests
from datetime import datetime, timedelta
import time
import pickle
import traceback
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit, GridSearchCV, learning_curve, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, mean_absolute_percentage_error, explained_variance_score, max_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE, mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample, class_weight
import scipy.stats as stats
from scipy.stats import linregress, zscore, shapiro, normaltest, yeojohnson, median_abs_deviation
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print('✅ All enhanced dependencies loaded successfully')

# Output:
# 🚀 INITIALIZING COMPLETE F1 RACE STRATEGY ADVANCED ML PIPELINE...
# ✅ All enhanced dependencies loaded successfully


# 2: Comprehensive Data Preprocessing Pipeline

# In [13]:
class ComprehensiveDataPreprocessor:
    """
    COMPREHENSIVE DATA PREPROCESSING: Handling imbalance, outliers, missing values, etc.
    """
    
    def __init__(self):
        self.preprocessing_steps = {}
        self.scalers = {}
        self.encoders = {}
        self.imputer = None
        self.outlier_detector = None
        self.feature_selector = None
        
    def comprehensive_preprocessing_pipeline(self, X, y = None, problem_type = 'regression'):
        'COMPREHENSIVE PREPROCESSING PIPELINE'
        print(f'🔧 COMPREHENSIVE PREPROCESSING PIPELINE: {X.shape[1]} features')
        
        if X.empty:
            return X, y
            
        X_processed = X.copy()
        y_processed = y.copy() if y is not None else None
        
        # Step 1: Handle missing values
        
        X_processed = self.handle_missing_values_comprehensive(X_processed)
        
        # Step 2: Detect and handle outliers
        
        X_processed = self.handle_outliers_robust(X_processed)
        
        # Step 3: Handle data imbalance (for classification)
        
        if y_processed is not None and problem_type == 'classification':
            X_processed, y_processed = self.handle_imbalance(X_processed, y_processed)
        
        # Step 4: Advanced feature engineering
        
        X_processed = self.advanced_feature_engineering(X_processed)
        
        # Step 5: Smart encoding
        
        X_processed = self.smart_encoding_advanced(X_processed)
        
        # Step 6: Advanced scaling
        
        X_processed = self.advanced_scaling_robust(X_processed)
        
        # Step 7: Feature selection
        
        if y_processed is not None:
            X_processed = self.comprehensive_feature_selection(X_processed, y_processed, problem_type)
        
        # Step 8: Dimensionality reduction (if needed)
        
        if X_processed.shape[1] > 50:
            X_processed = self.dimensionality_reduction(X_processed, y_processed)
        
        print(f'✅ COMPREHENSIVE PREPROCESSING COMPLETE: {X_processed.shape[1]} features')
        return X_processed, y_processed
    
    def handle_missing_values_comprehensive(self, X):
        'COMPREHENSIVE MISSING VALUE HANDLING'
        print('   🧹 Handling missing values...')
        
        missing_summary = X.isnull().sum()
        missing_percentage = (missing_summary / len(X)) * 100
        
        high_missing_columns = missing_percentage[missing_percentage > 50].index
        if len(high_missing_columns) > 0:
            X = X.drop(columns = high_missing_columns)
            print(f'   Dropped {len(high_missing_columns)} columns with > 50% missing values')
        
        # Use advanced imputation strategies
        
        numeric_cols = X.select_dtypes(include = [np.number]).columns
        categorical_cols = X.select_dtypes(include = ['object']).columns
        
        # Numeric imputation
        
        if len(numeric_cols) > 0:
        
            # Try KNN imputer for numeric columns
            
            try:
                knn_imputer = KNNImputer(n_neighbors = 5)
                X[numeric_cols] = knn_imputer.fit_transform(X[numeric_cols])
                print('  Applied KNN imputation for numeric features')
            except:
                
                # Fallback to median imputation
                
                imputer = SimpleImputer(strategy = 'median')
                X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
                print('   Applied median imputation for numeric features')
        
        # Categorical imputation
        
        if len(categorical_cols) > 0:
            for col in categorical_cols:
                if X[col].isnull().sum() > 0:
                    X[col] = X[col].fillna('Unknown')
            print('  Applied mode imputation for categorical features')
        
        return X
    
    def handle_outliers_robust(self, X):
        'ROBUST OUTLIER HANDLING USING MULTIPLE METHODS'
        print('   📊 Detecting and handling outliers...')
        
        numeric_cols = X.select_dtypes(include = [np.number]).columns
        outlier_report = {}
        
        for col in numeric_cols:
            if X[col].nunique() > 10:                                 # Only for continuous variables
                
                # Method 1: IQR
                
                Q1 = X[col].quantile(0.25)
                Q3 = X[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                # Method 2: Z-score (for normal distributions)
                
                z_scores = np.abs(stats.zscore(X[col].dropna()))
                z_outliers = np.sum(z_scores > 3)
                
                # Method 3: Modified Z-score (more robust)
                
                median = X[col].median()
                mad = stats.median_abs_deviation(X[col].dropna(), scale = 'normal')
                modified_z_scores = 0.6745 * (X[col] - median) / mad
                mod_z_outliers = np.sum(np.abs(modified_z_scores) > 3.5)
                
                # Use winsorization for extreme outliers
                
                if z_outliers > 0 or mod_z_outliers > 0:
                
                    # Cap outliers at 1st and 99th percentiles
                    
                    lower_cap = X[col].quantile(0.01)
                    upper_cap = X[col].quantile(0.99)
                    X[col] = np.where(X[col] < lower_cap, lower_cap, X[col])
                    X[col] = np.where(X[col] > upper_cap, upper_cap, X[col])
                    
                    outlier_report[col] = {
                        'iqr_outliers': np.sum((X[col] < lower_bound) | (X[col] > upper_bound)),
                        'z_outliers': z_outliers,
                        'modified_z_outliers': mod_z_outliers
                    }
        
        if outlier_report:
            print(f'  Handled outliers in {len(outlier_report)} features')
            for col, report in list(outlier_report.items())[:5]:                                  # Show first 5
                print(f'     {col}: IQR = {report['iqr_outliers']}, Z = {report['z_outliers']}, ModZ = {report['modified_z_outliers']}')
        
        return X
    
    def handle_imbalance(self, X, y):
        'HANDLE IMBALANCED DATA USING ADVANCED TECHNIQUES'
        print(' ⚖️ Handling class imbalance...')
        
        unique_classes, class_counts = np.unique(y, return_counts = True)
        print(f'   Class distribution: {dict(zip(unique_classes, class_counts))}')
        
        # Calculate imbalance ratio
        
        imbalance_ratio = class_counts.max() / class_counts.min()
        print(f'     Imbalance ratio: {imbalance_ratio:.2f}')
        
        if imbalance_ratio > 2:                                             # Significant imbalance
            if len(unique_classes) == 2:                                    # Binary classification
                
                # Use SMOTE for oversampling
                
                smote = SMOTE(random_state = 42, k_neighbors = min(5, class_counts.min() - 1))
                X_resampled, y_resampled = smote.fit_resample(X, y)
                print('  Applied SMOTE for imbalance handling')
            else:  # Multi-class
                # Use RandomUnderSampler for multi-class
                
                rus = RandomUnderSampler(random_state = 42)
                X_resampled, y_resampled = rus.fit_resample(X, y)
                print('   Applied RandomUnderSampler for multi-class imbalance')
            
            unique_resampled, counts_resampled = np.unique(y_resampled, return_counts = True)
            print(f'   Resampled distribution: {dict(zip(unique_resampled, counts_resampled))}')
            return X_resampled, y_resampled
        
        return X, y
    
    def advanced_feature_engineering(self, X):
        'ADVANCED FEATURE ENGINEERING - CORRECTED VERSION'
        print(' 🔬 Advanced feature engineering...')
        
        numeric_cols = X.select_dtypes(include = [np.number]).columns
        
        # Create polynomial features for highly correlated numeric features
        
        if len(numeric_cols) >= 2:
            try:
                # Select top correlated features for polynomial features
                
                corr_matrix = X[numeric_cols].corr().abs()
                
                # Safely get high correlation pairs
                
                high_corr_pairs = []
                for i in corr_matrix.columns:
                    for j in corr_matrix.columns:
                        if i != j and pd.notna(corr_matrix.loc[i, j]) and corr_matrix.loc[i, j] > 0.7:
                            high_corr_pairs.append((i, j))
                
                if high_corr_pairs:
                    for feat1, feat2 in high_corr_pairs[:3]:                       # Limit to top 3 pairs
                        if feat1 in X.columns and feat2 in X.columns:
                            
                            # Check if columns are numeric and not empty
                            
                            if (X[feat1].dtype in [np.number] and 
                                X[feat2].dtype in [np.number] and 
                                len(X[feat1].dropna()) > 0 and 
                                len(X[feat2].dropna()) > 0):
                                
                                X[f'{feat1}_{feat2}_interaction'] = X[feat1] * X[feat2]
                                
                                # Avoid division by zero
                                
                                denominator = X[feat2].copy()
                                denominator[denominator == 0] = 1e-8                   # Replace zeros with small value
                                X[f'{feat1}_{feat2}_ratio'] = X[feat1] / denominator
            
            except Exception as e:
                print(f'  ⚠️ Polynomial feature creation failed: {e}')
        
        # Create statistical features
        
        if len(numeric_cols) >= 3:
            try:
                X['feature_mean'] = X[numeric_cols].mean(axis = 1)
                X['feature_std'] = X[numeric_cols].std(axis = 1)
                X['feature_skew'] = X[numeric_cols].skew(axis = 1)
            except Exception as e:
                print(f'   ⚠️ Statistical feature creation failed: {e}')
        
        return X
    
    def smart_encoding_advanced(self, X):
        'ADVANCED SMART ENCODING'
        print('  🔠 Advanced feature encoding...')
        
        categorical_cols = X.select_dtypes(include=['object']).columns
        
        for col in categorical_cols:
            unique_count = X[col].nunique()
            
            if unique_count <= 10:
                
                # One-hot encoding for low cardinality
                
                dummies = pd.get_dummies(X[col], prefix = col, drop_first = True)
                X = pd.concat([X.drop(columns = [col]), dummies], axis = 1)
                
            elif unique_count <= 50:
                
                # Target encoding simulation (frequency encoding)
                
                freq_encoding = X[col].value_counts().to_dict()
                X[col] = X[col].map(freq_encoding)
                
                # Normalize
                
                X[col] = (X[col] - X[col].mean()) / X[col].std()
                
            else:
                # High cardinality - use clustering or hashing (simplified to frequency)
                
                top_categories = X[col].value_counts().head(20).index
                X[col] = X[col].apply(lambda x: x if x in top_categories else 'Other')
                
                # Then use frequency encoding
                
                freq_encoding = X[col].value_counts().to_dict()
                X[col] = X[col].map(freq_encoding)
        
        return X
    
    def advanced_scaling_robust(self, X):
        'ADVANCED ROBUST SCALING'
        print('   ⚖️ Advanced robust scaling...')
        
        numeric_cols = X.select_dtypes(include = [np.number]).columns
        
        if len(numeric_cols) > 0:
            
            # Use RobustScaler for better outlier resistance
            
            self.scalers['robust'] = RobustScaler()
            X[numeric_cols] = self.scalers['robust'].fit_transform(X[numeric_cols])
        
        return X
    
    def comprehensive_feature_selection(self, X, y, problem_type = 'regression'):
        'COMPREHENSIVE FEATURE SELECTION USING MULTIPLE METHODS'
        print('  🎯 Comprehensive feature selection...')
        
        if X.shape[1] <= 20:                                                      # Don't select if features are already few
            return X
        
        # Method 1: Correlation-based
        
        if problem_type == 'regression':
            selector = SelectKBest(score_func = f_regression, k = min(30, X.shape[1]))
        else:
            selector = SelectKBest(score_func = mutual_info_regression, k = min(30, X.shape[1]))
        
        try:
            X_selected = selector.fit_transform(X, y)
            selected_features = X.columns[selector.get_support()]
            X = X[selected_features]
            print(f'     Selected {len(selected_features)} features using statistical tests')
        except:
            print('   ⚠️ Statistical feature selection failed, using all features')
        
        return X
    
    def dimensionality_reduction(self, X, y = None):
        'DIMENSIONALITY REDUCTION USING PCA'
        print('  📉 Applying dimensionality reduction...')
        
        if X.shape[1] > 50:
            self.pca = PCA(n_components = 0.95, random_state = 42)                          # Keep 95% variance
            X_reduced = self.pca.fit_transform(X)
            print(f'     Reduced from {X.shape[1]} to {X_reduced.shape[1]} components '
                  f'({np.sum(self.pca.explained_variance_ratio_):.1%} variance)')
            
            return pd.DataFrame(X_reduced, columns = [f'PC_{i+1}' for i in range(X_reduced.shape[1])], 
                              index = X.index)
        
        return X

# 3: Comprehensive EDA 

# In [14]:
class ComprehensiveEDA:
    """
    COMPREHENSIVE EDA: Univariate, Bivariate, and Multivariate Analysis with Caching
    """
    
    def __init__(self, cache_dir = 'eda_cache'):
        self.eda_results = {}
        self.correlation_matrix = None
        self.feature_importance = None
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok = True)
        self.cache_hits = 0
        self.cache_misses = 0
    
    def perform_comprehensive_eda(self, df, target_column = None, save_plots = True, use_cache = True):
        'PERFORM COMPREHENSIVE EDA WITH CACHING'
        print(f'\n📊 PERFORMING COMPREHENSIVE EDA ON {len(df)} RECORDS...')
        
        if df.empty:
            print(' ⚠️ No data for EDA')
            return
        
        # Create cache key
        
        cache_key = self._create_eda_cache_key(df, target_column)
        cache_file = os.path.join(self.cache_dir, f'{cache_key}.pkl')
        
        if use_cache and os.path.exists(cache_file):
            try:
                with open(cache_file, 'rb') as f:
                    cached_data = pickle.load(f)
                
                self.eda_results = cached_data.get('eda_results', {})
                self.correlation_matrix = cached_data.get('correlation_matrix')
                self.feature_importance = cached_data.get('feature_importance')
                self.cache_hits += 1
                
                print(f'💾 Using cached EDA results: {cache_key}')
                self.print_eda_summary()
                return
            except Exception as e:
                print(f'⚠️ Could not load cached EDA: {e}')
                self.cache_misses += 1
        
        self.cache_misses += 1
        self.eda_results['dataset_info'] = self.get_dataset_info(df)
        
        # 1. Univariate Analysis
        
        print('   1. 📈 UNIVARIATE ANALYSIS...')
        self.univariate_analysis(df, target_column, save_plots)
        
        # 2. Bivariate Analysis
        
        print('   2. 📊 BIVARIATE ANALYSIS...')
        self.bivariate_analysis(df, target_column, save_plots)
        
        # 3. Multivariate Analysis
        
        print('   3. 🔍 MULTIVARIATE ANALYSIS...')
        self.multivariate_analysis(df, target_column, save_plots)
        
        # 4. Statistical Analysis
        
        print('   4. 📋 STATISTICAL ANALYSIS...')
        self.statistical_analysis(df, target_column)
        
        self.print_eda_summary()
        
        # Cache results
        
        if use_cache:
            try:
                cache_data = {
                    'eda_results': self.eda_results,
                    'correlation_matrix': self.correlation_matrix,
                    'feature_importance': self.feature_importance
                }
                
                with open(cache_file, 'wb') as f:
                    pickle.dump(cache_data, f)
                
                print(f'💾 EDA results cached: {cache_key}')
            except Exception as e:
                print(f'⚠️ Could not cache EDA results: {e}')
    
    def _create_eda_cache_key(self, df, target_column):
        'CREATE CACHE KEY FOR EDA RESULTS'
        key_parts = [
            f'rows_{len(df)}',
            f'cols_{len(df.columns)}',
            f'target_{target_column}',
            f'hash_{hash(str(sorted(df.columns)))}'
        ]
        return hashlib.md5('_'.join(key_parts).encode()).hexdigest()
    
    def get_dataset_info(self, df):
        'GET COMPREHENSIVE DATASET INFORMATION'
        info = {
            'shape': df.shape,
            'data_types': df.dtypes.value_counts().to_dict(),
            'missing_values': df.isnull().sum().sum(),
            'missing_percentage': (df.isnull().sum().sum() / (df.shape[0] * df.shape[1])) * 100,
            'duplicates': df.duplicated().sum(),
            'memory_usage': df.memory_usage(deep = True).sum() / 1024**2,  # MB
            'numeric_columns': len(df.select_dtypes(include = [np.number]).columns),
            'categorical_columns': len(df.select_dtypes(include = ['object']).columns)
        }
        return info
    
    def univariate_analysis(self, df, target_column = None, save_plots = True):
        'COMPREHENSIVE UNIVARIATE ANALYSIS'
        numeric_cols = df.select_dtypes(include = [np.number]).columns
        categorical_cols = df.select_dtypes(include = ['object']).columns
        
        # Create subplots for numeric features
        
        if len(numeric_cols) > 0:
            n_cols = 3
            n_rows = (len(numeric_cols) + n_cols - 1) // n_cols
            
            fig, axes = plt.subplots(n_rows, n_cols, figsize = (20, 5*n_rows))
            axes = axes.flatten()
            
            for i, col in enumerate(numeric_cols):
                if i < len(axes):
                    
                    # Histogram with KDE
                    
                    axes[i].hist(df[col].dropna(), bins = 30, alpha = 0.7, color = 'skyblue', edgecolor = 'black', density = True)
                    df[col].dropna().plot.density(ax = axes[i], color = 'red', linewidth = 2)
                    axes[i].set_title(f'Distribution of {col}', fontweight = 'bold')
                    axes[i].set_xlabel(col)
                    axes[i].set_ylabel('Density')
                    
                    # Add statistical annotations
                    
                    stats_text = f'Mean: {df[col].mean():.2f}\nStd: {df[col].std():.2f}\nSkew: {df[col].skew():.2f}'
                    axes[i].text(0.05, 0.95, stats_text, transform = axes[i].transAxes, 
                                verticalalignment = 'top', bbox = dict(boxstyle = 'round', facecolor = 'white', alpha = 0.8))
            
            # Remove empty subplots
            
            for i in range(len(numeric_cols), len(axes)):
                fig.delaxes(axes[i])
            
            plt.tight_layout()
            if save_plots:
                plt.savefig('univariate_numeric_analysis.png', dpi = 300, bbox_inches = 'tight')
            plt.close()
        
        # Categorical analysis
            
        if len(categorical_cols) > 0:
            n_cols = 2
            n_rows = (len(categorical_cols) + n_cols - 1) // n_cols
            
            fig, axes = plt.subplots(n_rows, n_cols, figsize = (15, 5*n_rows))
            axes = axes.flatten() if n_rows > 1 else [axes]
            
            for i, col in enumerate(categorical_cols):
                if i < len(axes):
                    value_counts = df[col].value_counts().head(10)                          # Top 10 categories
                    axes[i].bar(value_counts.index, value_counts.values, color = 'lightcoral')
                    axes[i].set_title(f'Top Categories in {col}', fontweight = 'bold')
                    axes[i].set_xlabel(col)
                    axes[i].set_ylabel('Count')
                    axes[i].tick_params(axis = 'x', rotation = 45)
            
            # Remove empty subplots
            
            for i in range(len(categorical_cols), len(axes)):
                fig.delaxes(axes[i])
            
            plt.tight_layout()
            if save_plots:
                plt.savefig('univariate_categorical_analysis.png', dpi = 300, bbox_inches = 'tight')
            plt.close()
    
    def bivariate_analysis(self, df, target_column = None, save_plots = True):
        'COMPREHENSIVE BIVARIATE ANALYSIS'
        if target_column is None or target_column not in df.columns:
            print('  ⚠️ No target column specified for bivariate analysis')
            return
        
        numeric_cols = df.select_dtypes(include = [np.number]).columns
        
        # Correlation with target
        
        if target_column in numeric_cols:
            correlations = df[numeric_cols].corr()[target_column].sort_values(ascending = False)
            self.correlation_matrix = df[numeric_cols].corr()
            
            # Plot top correlations
            
            top_correlations = correlations.head(10)[1:]                                   # Exclude target itself
            
            plt.figure(figsize = (12, 8))
            y_pos = np.arange(len(top_correlations))
            plt.barh(y_pos, top_correlations.values, color = 'lightseagreen')
            plt.yticks(y_pos, top_correlations.index)
            plt.xlabel('Correlation Coefficient')
            plt.title(f'Top Features Correlated with {target_column}', fontweight = 'bold')
            plt.grid(axis = 'x', alpha = 0.3)
            
            # Add correlation values on bars
            
            for i, v in enumerate(top_correlations.values):
                plt.text(v + 0.01 if v >= 0 else v - 0.03, i, f'{v:.3f}', 
                        va = 'center', fontweight = 'bold')
            
            if save_plots:
                plt.savefig('bivariate_correlations.png', dpi = 300, bbox_inches = 'tight')
            plt.close()
        
        # Scatter plots for top 4 correlated features
            
        if target_column in numeric_cols and len(numeric_cols) > 1:
            top_features = correlations.head(5).index[1:]                                  # Top 4 excluding target
            
            fig, axes = plt.subplots(2, 2, figsize = (15, 12))
            axes = axes.flatten()
            
            for i, feature in enumerate(top_features[:4]):
                if i < len(axes):
                    axes[i].scatter(df[feature], df[target_column], alpha = 0.6, color = 'steelblue')
                    axes[i].set_xlabel(feature)
                    axes[i].set_ylabel(target_column)
                    axes[i].set_title(f'{feature} vs {target_column}\nCorr: {correlations[feature]:.3f}')
                    
                    # Add trend line
                    
                    z = np.polyfit(df[feature].dropna(), df[target_column].dropna(), 1)
                    p = np.poly1d(z)
                    axes[i].plot(df[feature].dropna(), p(df[feature].dropna()), 'r--', alpha = 0.8)
            
            plt.tight_layout()
            if save_plots:
                plt.savefig('bivariate_scatter_plots.png', dpi = 300, bbox_inches = 'tight')
            plt.close()
    
    def multivariate_analysis(self, df, target_column = None, save_plots = True):
        'COMPREHENSIVE MULTIVARIATE ANALYSIS'
        numeric_cols = df.select_dtypes(include = [np.number]).columns
        
        if len(numeric_cols) > 1:
            
            # Correlation heatmap
            
            plt.figure(figsize = (16, 14))
            correlation_matrix = df[numeric_cols].corr()
            
            # Create mask for upper triangle
            
            mask = np.triu(np.ones_like(correlation_matrix, dtype = bool))
            
            sns.heatmap(correlation_matrix, mask = mask, annot = True, fmt = '.2f', cmap = 'coolwarm',
                       center = 0, square = True, linewidths = 0.5, cbar_kws = {'shrink': .8})
            plt.title('Feature Correlation Heatmap', fontweight = 'bold', pad = 20)
            plt.xticks(rotation = 45, ha = 'right')
            plt.yticks(rotation = 0)
            
            if save_plots:
                plt.savefig('multivariate_correlation_heatmap.png', dpi = 300, bbox_inches = 'tight')
            plt.close()
            
            # Pairplot for top 5 features (if target specified)
            
            if target_column and target_column in numeric_cols and len(numeric_cols) >= 3:
                top_features = correlation_matrix[target_column].abs().sort_values(ascending = False).head(6).index
                pairplot_data = df[top_features].dropna()
                
                if len(pairplot_data) > 0:
                    
                    # Sample if too large for pairplot
                    
                    if len(pairplot_data) > 1000:
                        pairplot_data = pairplot_data.sample(1000, random_state = 42)
                    
                    g = sns.pairplot(pairplot_data, diag_kind = 'hist', corner = True)
                    g.fig.suptitle('Pairplot of Top Correlated Features', y = 1.02, fontweight = 'bold')
                    
                    if save_plots:
                        plt.savefig('multivariate_pairplot.png', dpi = 300, bbox_inches = 'tight')
                    plt.close()
    
    def statistical_analysis(self, df, target_column = None):
        'COMPREHENSIVE STATISTICAL ANALYSIS'
        numeric_cols = df.select_dtypes(include = [np.number]).columns
        
        statistical_summary = {}
        
        for col in numeric_cols:
            statistical_summary[col] = {
                'mean': df[col].mean(),
                'median': df[col].median(),
                'std': df[col].std(),
                'skewness': df[col].skew(),
                'kurtosis': df[col].kurtosis(),
                'min': df[col].min(),
                'max': df[col].max(),
                'q1': df[col].quantile(0.25),
                'q3': df[col].quantile(0.75),
                'missing': df[col].isnull().sum(),
                'zeros': (df[col] == 0).sum()
            }
        
        self.eda_results['statistical_summary'] = statistical_summary
        
        # Normality tests for top numeric features
        
        normality_tests = {}
        for col in list(numeric_cols)[:5]:                              # Test first 5 columns
            data = df[col].dropna()
            if len(data) > 3:
                try:
                    _, shapiro_p = shapiro(data)
                    _, normaltest_p = normaltest(data)
                    normality_tests[col] = {
                        'shapiro_p': shapiro_p,
                        'normaltest_p': normaltest_p,
                        'is_normal': shapiro_p > 0.05 or normaltest_p > 0.05
                    }
                except:
                    pass
        
        self.eda_results['normality_tests'] = normality_tests
    
    def print_eda_summary(self):
        'PRINT COMPREHENSIVE EDA SUMMARY'
        print('\n' + '=' * 80)
        print('COMPREHENSIVE EDA SUMMARY')
        print('=' * 80)
        
        info = self.eda_results['dataset_info']
        print(f'📁 DATASET OVERVIEW:')
        print(f'   Shape: {info['shape'][0]:,} rows × {info['shape'][1]} columns')
        print(f'   Memory Usage: {info['memory_usage']:.2f} MB')
        print(f'   Numeric Features: {info['numeric_columns']}')
        print(f'   Categorical Features: {info['categorical_columns']}')
        print(f'   Missing Values: {info['missing_values']} ({info['missing_percentage']:.2f}%)')
        print(f'   Duplicates: {info['duplicates']}')
        
        if 'statistical_summary' in self.eda_results:
            stats = self.eda_results['statistical_summary']
            print(f'\n📊 STATISTICAL SUMMARY (Sample):')
            for col in list(stats.keys())[:3]:
                col_stats = stats[col]
                print(f'   {col}:')
                print(f'     Mean: {col_stats['mean']:.2f}, Std: {col_stats['std']:.2f}')
                print(f'     Skewness: {col_stats['skewness']:.2f}, Kurtosis: {col_stats['kurtosis']:.2f}')
        
        if 'normality_tests' in self.eda_results:
            normality = self.eda_results['normality_tests']
            print(f'\n📋 NORMALITY TESTS:')
            for col, test in normality.items():
                print(f'   {col}: Shapiro p = {test['shapiro_p']:.3f}, NormalTest p = {test['normaltest_p']:.3f}')
                print(f'     Normally Distributed: {'Yes' if test['is_normal'] else 'No'}')
        
        # Cache statistics
        
        total_requests = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_requests * 100) if total_requests > 0 else 0
        print(f'\n💾 EDA CACHE STATS: {self.cache_hits} hits, {self.cache_misses} misses ({hit_rate:.1f}% hit rate)')
    
    def clear_cache(self):
        'CLEAR EDA CACHE'
        try:
            for filename in os.listdir(self.cache_dir):
                file_path = os.path.join(self.cache_dir, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            print(f'🧹 EDA cache cleared: {self.cache_dir}')
            self.cache_hits = 0
            self.cache_misses = 0
        except Exception as e:
            print(f'⚠️ Error clearing EDA cache: {e}')
    
    def get_cache_stats(self):
        'GET CACHE STATISTICS'
        total_requests = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_requests * 100) if total_requests > 0 else 0
        return {
            'hits': self.cache_hits,
            'misses': self.cache_misses,
            'total': total_requests,
            'hit_rate': hit_rate,
            'cache_dir': self.cache_dir
        }

# 4: Advanced Learning Curves and Model Diagnostics

# In [15]:
class AdvancedLearningCurves:
    """
    ADVANCED LEARNING CURVES AND MODEL DIAGNOSTICS

    Computes scikit-learn learning curves for one or more scoring metrics,
    saves them as a PNG in the current working directory and prints a
    bias/variance diagnosis derived from the last point of the curves.
    """

    # Heuristic thresholds used by analyze_bias_variance (rules of thumb, not statistics).
    VARIANCE_RATIO = 1.5   # CV MAE above this multiple of the training MAE => overfitting
    LOW_TRAIN_R2 = 0.5     # training R² below this => the model cannot fit even its own data

    def __init__(self):
        # model_name -> {metric_name -> curve summary dict}; filled by plot_advanced_learning_curves
        self.learning_curves_data = {}

    def plot_advanced_learning_curves(self, model, X, y, model_name='Model', cv_folds=5, scoring=None):
        """PLOT ADVANCED LEARNING CURVES WITH MULTIPLE METRICS

        Args:
            model: estimator handed to ``sklearn.model_selection.learning_curve``.
            X, y: training data and target.
            model_name: label used in titles, in the output file name and as
                the key under which results are kept in ``learning_curves_data``.
            cv_folds: value passed as ``cv`` to ``learning_curve``.
            scoring: mapping ``{display_name: scorer}``; defaults to MAE, MSE and R².

        Writes ``advanced_learning_curves_<model_name>.png``. Every failure is
        reported on stdout instead of being raised.
        """
        print(f'  📈 Generating advanced learning curves for {model_name}...')

        if scoring is None:
            scoring = {'mae': 'neg_mean_absolute_error', 'mse': 'neg_mean_squared_error', 'r2': 'r2'}

        try:
            learning_curves = self._compute_learning_curves(model, X, y, cv_folds, scoring)
            self.learning_curves_data[model_name] = learning_curves

            self._save_learning_curve_figure(learning_curves, model_name)
            print(f'   ✅ Advanced learning curves saved for {model_name}')

            # Bias-Variance Analysis
            self.analyze_bias_variance(learning_curves, model_name)

        except Exception as e:
            print(f'  ⚠️  Could not generate advanced learning curves: {e}')

    def _compute_learning_curves(self, model, X, y, cv_folds, scoring):
        """Run learning_curve once per metric and reduce each result to mean/std per train size."""
        train_sizes = np.linspace(0.1, 1.0, 10)
        learning_curves = {}

        for metric_name, metric_scorer in scoring.items():
            train_sizes_abs, train_scores, test_scores = learning_curve(
                model, X, y, cv=cv_folds, train_sizes=train_sizes,
                scoring=metric_scorer, n_jobs=-1, random_state=42
            )

            # scikit-learn negates losses ('neg_*' scorers) so that greater is
            # always better; flip them back so the curves show real error values.
            is_negated_loss = metric_name in ('mae', 'mse') or (
                isinstance(metric_scorer, str) and metric_scorer.startswith('neg_')
            )
            if is_negated_loss:
                train_scores = -train_scores
                test_scores = -test_scores

            learning_curves[metric_name] = {
                'train_sizes': train_sizes_abs,
                'train_scores_mean': np.mean(train_scores, axis=1),
                'train_scores_std': np.std(train_scores, axis=1),
                'test_scores_mean': np.mean(test_scores, axis=1),
                'test_scores_std': np.std(test_scores, axis=1)
            }

        return learning_curves

    def _save_learning_curve_figure(self, learning_curves, model_name):
        """Draw one panel per metric and save the figure; the figure is closed even on error."""
        n_metrics = len(learning_curves)
        # squeeze=False always returns a 2-D axes array, so a single metric needs no special case
        fig, axes = plt.subplots(1, n_metrics, figsize=(6 * n_metrics, 5), squeeze=False)

        try:
            for ax, (metric_name, curve_data) in zip(axes[0], learning_curves.items()):
                sizes = curve_data['train_sizes']
                train_mean = curve_data['train_scores_mean']
                train_std = curve_data['train_scores_std']
                test_mean = curve_data['test_scores_mean']
                test_std = curve_data['test_scores_std']

                # Shaded ±1 std bands, then the mean curves on top
                ax.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                                alpha=0.1, color='r')
                ax.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                                alpha=0.1, color='g')
                ax.plot(sizes, train_mean, 'o-', color='r', label='Training score', linewidth=2)
                ax.plot(sizes, test_mean, 'o-', color='g', label='Cross-validation score', linewidth=2)

                ax.set_xlabel('Training examples')
                ax.set_ylabel(metric_name.upper())
                ax.set_title(f'Learning Curve - {metric_name.upper()}\n{model_name}', fontweight='bold')
                ax.legend(loc='best')
                ax.grid(True, alpha=0.3)

                # Annotate the values reached with the full training set
                final_train = train_mean[-1]
                final_test = test_mean[-1]
                gap = final_train - final_test
                ax.text(0.05, 0.15,
                        f'Final Train: {final_train:.3f}\nFinal CV: {final_test:.3f}\nGap: {gap:.3f}',
                        transform=ax.transAxes,
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

            fig.tight_layout()
            file_stem = model_name.replace(' ', '_').lower()
            fig.savefig(f'advanced_learning_curves_{file_stem}.png', dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    def analyze_bias_variance(self, learning_curves, model_name):
        """ANALYZE BIAS-VARIANCE TRADEOFF

        Uses the final point of the ``'mae'`` curve, whose values must be
        positive errors (lower is better), as produced by
        ``plot_advanced_learning_curves``. Nothing beyond the header is printed
        when no ``'mae'`` curve is present.

        Diagnosis:
          * high variance - CV MAE exceeds ``VARIANCE_RATIO`` x training MAE,
            i.e. the model fits the training folds far better than unseen data;
          * high bias - otherwise, when an ``'r2'`` curve is available and the
            final training R² is below ``LOW_TRAIN_R2``;
          * balanced - everything else.
        """
        print(f'   📊 Bias-Variance Analysis for {model_name}:')

        if 'mae' not in learning_curves:
            return

        curve_data = learning_curves['mae']
        final_train_score = curve_data['train_scores_mean'][-1]
        final_test_score = curve_data['test_scores_mean'][-1]
        # For an error metric the generalisation gap is CV minus train (normally >= 0)
        gap = final_test_score - final_train_score

        print(f'   Final Training MAE: {final_train_score:.3f}')
        print(f'   Final CV MAE: {final_test_score:.3f}')
        print(f'   Gap (CV - Train): {gap:.3f}')

        # MAE has no natural scale, so underfitting is judged on training R² when present
        final_train_r2 = None
        if 'r2' in learning_curves:
            final_train_r2 = learning_curves['r2']['train_scores_mean'][-1]

        if final_test_score > final_train_score * self.VARIANCE_RATIO:
            print('  ⚠️  HIGH VARIANCE DETECTED - Model is overfitting')
            print('  Recommendations: Increase regularization, reduce model complexity, get more data')
        elif final_train_r2 is not None and final_train_r2 < self.LOW_TRAIN_R2:
            print('  ⚠️  HIGH BIAS DETECTED - Model is underfitting')
            print('     Recommendations: Increase model complexity, add more features, reduce regularization')
        else:
            print('  ✅ GOOD BIAS-VARIANCE TRADEOFF - Model is well-balanced')

# 5: Comprehensive Hyperparameter Tuning

In [16]:
class ComprehensiveHyperparameterTuner:
    """
    COMPREHENSIVE HYPERPARAMETER TUNING WITH MULTIPLE STRATEGIES

    Wraps scikit-learn's GridSearchCV / RandomizedSearchCV and
    cross_val_score, keeps the results per model and saves bar charts as PNG
    files in the current working directory. Plotting is best-effort: a chart
    that cannot be drawn is reported on stdout and never discards the
    numerical results.
    """

    def __init__(self):
        # All three dicts are keyed by the estimator's class name
        self.best_params = {}
        self.tuning_results = {}
        self.best_models = {}

    def comprehensive_tuning(self, model, param_grid, X, y, problem_type='regression',
                             tuning_method='randomized', cv_folds=5, n_iter=50):
        """COMPREHENSIVE HYPERPARAMETER TUNING

        Args:
            model: estimator to tune.
            param_grid: parameter grid / distributions for the search.
            X, y: training data and target.
            problem_type: ``'regression'`` scores with negated MAE; any other
                value scores with accuracy.
            tuning_method: ``'grid'`` for an exhaustive search; any other
                value runs a randomized search.
            cv_folds: value passed as ``cv`` to the search.
            n_iter: number of sampled candidates (randomized search only).

        Returns:
            The refitted best estimator, or the untouched ``model`` when the
            search raised (the error is printed, not propagated).
        """
        print(f' 🎯 Comprehensive hyperparameter tuning ({tuning_method})...')

        scoring = 'neg_mean_absolute_error' if problem_type == 'regression' else 'accuracy'

        try:
            # With a single metric, refit=True refits on that metric; naming it again is redundant
            shared_options = dict(cv=cv_folds, scoring=scoring, refit=True, n_jobs=-1, verbose=1)
            if tuning_method == 'grid':
                search = GridSearchCV(model, param_grid, **shared_options)
            else:                                                               # randomized
                search = RandomizedSearchCV(model, param_grid, n_iter=n_iter,
                                            random_state=42, **shared_options)

            search.fit(X, y)

            # Store results
            model_name = type(model).__name__
            self.best_params[model_name] = search.best_params_
            self.best_models[model_name] = search.best_estimator_
            self.tuning_results[model_name] = {
                'best_score': search.best_score_,
                'best_params': search.best_params_,
                'cv_results': search.cv_results_
            }

            print(f'  ✅ Tuning completed for {model_name}')
            # For regression this is the negated MAE, so values closer to 0 are better
            print(f'  Best Score: {search.best_score_:.4f}')
            print(f'  Best Parameters: {search.best_params_}')

            # Plot tuning results
            self.plot_tuning_results(search, model_name)

            return search.best_estimator_

        except Exception as e:
            print(f'  ❌ Hyperparameter tuning failed: {e}')
            return model

    def plot_tuning_results(self, search, model_name):
        """PLOT HYPERPARAMETER TUNING RESULTS

        Saves a horizontal bar chart of the (up to) ten best mean CV scores in
        ``search.cv_results_`` to ``hyperparameter_tuning_<model_name>.png``.
        Failures are printed, not raised.
        """
        try:
            results = pd.DataFrame(search.cv_results_)

            # Get the top 10 parameter combinations
            top_results = results.nlargest(10, 'mean_test_score')

            fig = plt.figure(figsize=(12, 8))
            try:
                y_pos = np.arange(len(top_results))

                plt.barh(y_pos, top_results['mean_test_score'],
                         xerr=top_results['std_test_score'],
                         alpha=0.7, color='skyblue', ecolor='black', capsize=5)

                plt.yticks(y_pos, [f'Config {i+1}' for i in range(len(top_results))])
                plt.xlabel('Cross-Validation Score')
                plt.title(f'Top 10 Hyperparameter Configurations - {model_name}', fontweight='bold')
                plt.grid(axis='x', alpha=0.3)

                # Add score values
                for i, v in enumerate(top_results['mean_test_score']):
                    plt.text(v + 0.01, i, f'{v:.4f}', va='center', fontweight='bold')

                plt.tight_layout()
                file_stem = model_name.replace(' ', '_').lower()
                plt.savefig(f'hyperparameter_tuning_{file_stem}.png', dpi=300, bbox_inches='tight')
            finally:
                # Release the figure even when drawing or saving failed
                plt.close(fig)

        except Exception as e:
            print(f'  ⚠️  Could not plot tuning results: {e}')

    def compare_models_comprehensive(self, models, X, y, problem_type='regression', cv_folds=5):
        """COMPREHENSIVE MODEL COMPARISON

        Cross-validates every estimator in ``models`` (``{name: estimator}``).
        Regression uses MAE (reported as a positive error); anything else uses
        accuracy. A model whose evaluation raises is reported and left out.

        Returns:
            ``{name: {'mean_score', 'std_score', 'scores'}}`` for the models
            that were evaluated successfully.
        """
        print(f'  🔄 Comprehensive model comparison...')

        is_regression = problem_type == 'regression'
        scorer = 'neg_mean_absolute_error' if is_regression else 'accuracy'
        model_comparison = {}

        for name, model in models.items():
            try:
                scores = cross_val_score(model, X, y, cv=cv_folds, scoring=scorer)
                if is_regression:
                    scores = -scores                                              # Convert to positive

                mean_score = np.mean(scores)
                std_score = np.std(scores)
                model_comparison[name] = {
                    'mean_score': mean_score,
                    'std_score': std_score,
                    'scores': scores
                }

                print(f'    {name}: {mean_score:.4f} ± {std_score:.4f}')

            except Exception as e:
                print(f'   ⚠️  Error evaluating {name}: {e}')

        # Plot model comparison
        self.plot_model_comparison(model_comparison, problem_type)

        return model_comparison

    def plot_model_comparison(self, model_comparison, problem_type):
        """PLOT MODEL COMPARISON RESULTS

        Saves ``model_comparison.png`` with one bar per model (mean score with
        a ±std error bar). Does nothing for an empty comparison. Failures are
        printed, not raised, so a broken plotting backend cannot lose the
        scores computed by ``compare_models_comprehensive``.
        """
        if not model_comparison:
            return

        try:
            model_names = list(model_comparison)
            means = [model_comparison[name]['mean_score'] for name in model_names]
            stds = [model_comparison[name]['std_score'] for name in model_names]

            fig = plt.figure(figsize=(12, 8))
            try:
                y_pos = np.arange(len(model_names))

                plt.barh(y_pos, means, xerr=stds, alpha=0.7, color='lightgreen',
                         ecolor='black', capsize=5)

                plt.yticks(y_pos, model_names)
                if problem_type == 'regression':
                    plt.xlabel('Mean Absolute Error (Lower is Better)')
                else:
                    plt.xlabel('Accuracy (Higher is Better)')
                plt.title('Model Comparison - Cross-Validation Performance', fontweight='bold')
                plt.grid(axis='x', alpha=0.3)

                # Add values on bars, just past the end of each error bar
                for i, (mean, std) in enumerate(zip(means, stds)):
                    plt.text(mean + std + 0.01, i, f'{mean:.4f} ± {std:.4f}',
                             va='center', fontweight='bold')

                plt.tight_layout()
                plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
            finally:
                plt.close(fig)

        except Exception as e:
            print(f'  ⚠️  Could not plot model comparison: {e}')

# 6: Comprehensive Evaluation Metrics

In [17]:
# Cell 6: Comprehensive Evaluation Metrics
class ComprehensiveEvaluator:
    """
    COMPREHENSIVE EVALUATION METRICS FOR REGRESSION AND CLASSIFICATION

    For a fitted model this computes train/test metrics, prints a comparison
    table, saves a 2x2 diagnostic figure as a PNG in the current working
    directory and keeps everything in ``self.evaluation_results[model_name]``.
    """

    def __init__(self):
        # model_name -> {'train': metrics, 'test': metrics, 'predictions': {...}}
        self.evaluation_results = {}

    # ------------------------------------------------------------------
    # Regression
    # ------------------------------------------------------------------

    def comprehensive_regression_evaluation(self, model, X_train, X_test, y_train, y_test, model_name='Model'):
        """COMPREHENSIVE REGRESSION EVALUATION

        Predicts on both splits with the already fitted ``model``, computes
        the metrics, prints and plots them.

        Returns:
            The entry stored in ``self.evaluation_results[model_name]``.
        """
        print(f'  📊 Comprehensive regression evaluation for {model_name}...')

        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calculate metrics
        train_metrics = self.calculate_regression_metrics(y_train, y_train_pred, 'train')
        test_metrics = self.calculate_regression_metrics(y_test, y_test_pred, 'test')

        # Store results
        self.evaluation_results[model_name] = {
            'train': train_metrics,
            'test': test_metrics,
            'predictions': {
                'train': y_train_pred,
                'test': y_test_pred
            }
        }

        # Print results
        self.print_regression_evaluation(train_metrics, test_metrics, model_name)

        # Plot results
        self.plot_regression_evaluation(y_train, y_train_pred, y_test, y_test_pred, model_name)

        return self.evaluation_results[model_name]

    def calculate_regression_metrics(self, y_true, y_pred, dataset_type):
        """CALCULATE COMPREHENSIVE REGRESSION METRICS

        Args:
            y_true, y_pred: actual and predicted targets (any array-like).
            dataset_type: label of the split; accepted for interface
                compatibility and not used in the computation.

        Returns:
            Dict with the scikit-learn metrics, absolute-error statistics and
            ``within_1`` / ``within_3`` / ``within_5``: the percentage of
            predictions whose absolute error is at most 1, 3 and 5.
        """
        mse = mean_squared_error(y_true, y_pred)
        metrics = {
            'mae': mean_absolute_error(y_true, y_pred),
            'mse': mse,
            'rmse': np.sqrt(mse),
            'r2': r2_score(y_true, y_pred),
            'mape': mean_absolute_percentage_error(y_true, y_pred),
            'explained_variance': explained_variance_score(y_true, y_pred),
            'max_error': max_error(y_true, y_pred)
        }

        # Additional custom metrics. Flatten to plain arrays first so that
        # lists work, pandas index alignment cannot introduce NaNs and an
        # (n, 1) column never broadcasts against an (n,) vector.
        absolute_errors = np.abs(np.asarray(y_true).ravel() - np.asarray(y_pred).ravel())
        metrics['mean_absolute_error'] = np.mean(absolute_errors)
        metrics['median_absolute_error'] = np.median(absolute_errors)
        metrics['std_absolute_error'] = np.std(absolute_errors)

        # Percentage within tolerance
        for tolerance in (5, 3, 1):
            metrics[f'within_{tolerance}'] = np.mean(absolute_errors <= tolerance) * 100

        return metrics

    def print_regression_evaluation(self, train_metrics, test_metrics, model_name):
        """PRINT COMPREHENSIVE REGRESSION EVALUATION

        Prints a train/test table for MAE, RMSE, R² and MAPE, the test-set
        tolerance percentages and a verdict based on the MAE gap: more than
        30 % above the training MAE is flagged as significant overfitting, a
        test MAE below the training MAE as good generalization, anything in
        between as moderate overfitting.
        """
        print(f'\n     🎯 REGRESSION EVALUATION - {model_name}')
        self._print_table_header()
        self._print_metric_rows(train_metrics, test_metrics, ['mae', 'rmse', 'r2', 'mape'])

        print('     ' + '-' * 50)
        print(f"     Within 1 position:  {test_metrics['within_1']:.1f}%")
        print(f"     Within 3 positions: {test_metrics['within_3']:.1f}%")
        print(f"     Within 5 positions: {test_metrics['within_5']:.1f}%")

        # Overfitting assessment
        mae_gap = test_metrics['mae'] - train_metrics['mae']
        if mae_gap > train_metrics['mae'] * 0.3:
            print('   ⚠️  SIGNIFICANT OVERFITTING DETECTED')
        elif mae_gap < 0:
            print('   ✅ Good generalization performance')
        else:
            print('   ℹ️  Moderate overfitting')

    def plot_regression_evaluation(self, y_train, y_train_pred, y_test, y_test_pred, model_name):
        """PLOT COMPREHENSIVE REGRESSION EVALUATION VISUALIZATIONS

        Top row: actual vs predicted for train and test with the identity
        line. Bottom row: residuals (actual - predicted) against predictions.
        Saved as ``regression_evaluation_<model_name>.png``.
        """
        # (title prefix, actual, predicted, colour) - one column of panels per split
        panels = (
            ('Training Set', np.asarray(y_train).ravel(), np.asarray(y_train_pred).ravel(), 'blue'),
            ('Test Set', np.asarray(y_test).ravel(), np.asarray(y_test_pred).ravel(), 'green'),
        )

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        try:
            fig.suptitle(f'Regression Evaluation - {model_name}', fontsize=16, fontweight='bold')

            for col, (label, actual, predicted, color) in enumerate(panels):
                # Actual vs Predicted, with the ideal y = x reference line
                scatter_ax = axes[0, col]
                scatter_ax.scatter(actual, predicted, alpha=0.6, color=color)
                bounds = [actual.min(), actual.max()]
                scatter_ax.plot(bounds, bounds, 'r--', lw=2)
                scatter_ax.set_xlabel('Actual Values')
                scatter_ax.set_ylabel('Predicted Values')
                scatter_ax.set_title(f'{label}: Actual vs Predicted')
                scatter_ax.grid(True, alpha=0.3)

                # Residuals
                residual_ax = axes[1, col]
                residual_ax.scatter(predicted, actual - predicted, alpha=0.6, color=color)
                residual_ax.axhline(y=0, color='r', linestyle='--')
                residual_ax.set_xlabel('Predicted Values')
                residual_ax.set_ylabel('Residuals')
                residual_ax.set_title(f'{label}: Residuals')
                residual_ax.grid(True, alpha=0.3)

            plt.tight_layout()
            # Single-space replacement, matching the classification plot's file naming
            file_stem = model_name.replace(' ', '_').lower()
            plt.savefig(f'regression_evaluation_{file_stem}.png', dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    # ------------------------------------------------------------------
    # Classification
    # ------------------------------------------------------------------

    def comprehensive_classification_evaluation(self, model, X_train, X_test, y_train, y_test, model_name='Classifier'):
        """COMPREHENSIVE CLASSIFICATION EVALUATION

        Same flow as the regression variant. Class probabilities are used
        only when the model exposes ``predict_proba``.

        Returns:
            The entry stored in ``self.evaluation_results[model_name]``.
        """
        print(f'   📊 Comprehensive classification evaluation for {model_name}...')

        # Predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        has_proba = hasattr(model, 'predict_proba')
        y_train_proba = model.predict_proba(X_train) if has_proba else None
        y_test_proba = model.predict_proba(X_test) if has_proba else None

        # Calculate metrics
        train_metrics = self.calculate_classification_metrics(y_train, y_train_pred, y_train_proba, 'train')
        test_metrics = self.calculate_classification_metrics(y_test, y_test_pred, y_test_proba, 'test')

        # Store results
        self.evaluation_results[model_name] = {
            'train': train_metrics,
            'test': test_metrics,
            'predictions': {
                'train': y_train_pred,
                'test': y_test_pred
            }
        }

        # Print results
        self.print_classification_evaluation(train_metrics, test_metrics, model_name)

        # Plot results
        self.plot_classification_evaluation(y_test, y_test_pred, y_test_proba, model_name)

        return self.evaluation_results[model_name]

    def calculate_classification_metrics(self, y_true, y_pred, y_proba, dataset_type):
        """CALCULATE COMPREHENSIVE CLASSIFICATION METRICS

        Args:
            y_true, y_pred: actual and predicted labels.
            y_proba: class-probability matrix or ``None``.
            dataset_type: label of the split; accepted for interface
                compatibility and not used in the computation.

        Returns:
            Dict with accuracy, weighted precision/recall/F1, balanced
            accuracy and Cohen's kappa, plus ``roc_auc`` when the problem is
            binary and two-column probabilities are available.
        """
        from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score

        metrics = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
            'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0)
        }

        # ROC-AUC if probabilities are available (binary classification only)
        positive_scores = self._binary_positive_scores(y_true, y_proba)
        if positive_scores is not None:
            metrics['roc_auc'] = roc_auc_score(y_true, positive_scores)

        # Additional metrics
        metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
        metrics['cohen_kappa'] = cohen_kappa_score(y_true, y_pred)

        return metrics

    def print_classification_evaluation(self, train_metrics, test_metrics, model_name):
        """PRINT COMPREHENSIVE CLASSIFICATION EVALUATION

        Prints a train/test table, balanced accuracy and Cohen's kappa for the
        test split and a verdict from the accuracy gap (train - test): above
        0.10 significant overfitting, below 0.05 good generalization,
        otherwise moderate overfitting.
        """
        print(f'\n     🎯 CLASSIFICATION EVALUATION - {model_name}')
        self._print_table_header()

        metrics_to_display = ['accuracy', 'precision', 'recall', 'f1']
        # ROC-AUC exists only for binary splits; show it only when both splits have it
        if 'roc_auc' in test_metrics and 'roc_auc' in train_metrics:
            metrics_to_display.append('roc_auc')
        self._print_metric_rows(train_metrics, test_metrics, metrics_to_display)

        print('     ' + '-' * 50)
        print(f"     Balanced Accuracy: {test_metrics['balanced_accuracy']:.4f}")
        print(f"     Cohen's Kappa: {test_metrics['cohen_kappa']:.4f}")

        # Overfitting assessment
        accuracy_gap = train_metrics['accuracy'] - test_metrics['accuracy']
        if accuracy_gap > 0.1:
            print('     ⚠️  SIGNIFICANT OVERFITTING DETECTED')
        elif accuracy_gap < 0.05:
            print('     ✅ Good generalization performance')
        else:
            print('   ℹ️  Moderate overfitting')

    def plot_classification_evaluation(self, y_test, y_test_pred, y_test_proba, model_name):
        """PLOT COMPREHENSIVE CLASSIFICATION EVALUATION VISUALIZATIONS

        Panels: confusion matrix, classification-report heatmap, ROC curve
        (binary problems with probabilities only; the panel stays empty
        otherwise) and a feature-importance placeholder. Saved as
        ``classification_evaluation_<model_name>.png``.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        try:
            fig.suptitle(f'Classification Evaluation - {model_name}', fontsize=16, fontweight='bold')

            # 1. Confusion Matrix
            cm = confusion_matrix(y_test, y_test_pred)
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
            axes[0, 0].set_xlabel('Predicted')
            axes[0, 0].set_ylabel('Actual')
            axes[0, 0].set_title('Confusion Matrix')

            # 2. Classification Report Heatmap (last row, 'support', is dropped:
            #    its counts would swamp the 0-1 colour scale of the other rows)
            cr = classification_report(y_test, y_test_pred, output_dict=True)
            cr_df = pd.DataFrame(cr).iloc[:-1, :].T
            sns.heatmap(cr_df, annot=True, cmap='viridis', ax=axes[0, 1])
            axes[0, 1].set_title('Classification Report')

            # 3. ROC Curve (if binary classification and probabilities available)
            positive_scores = self._binary_positive_scores(y_test, y_test_proba)
            if positive_scores is not None:
                from sklearn.metrics import roc_curve
                fpr, tpr, _ = roc_curve(y_test, positive_scores)
                auc_value = roc_auc_score(y_test, positive_scores)
                axes[1, 0].plot(fpr, tpr, color='darkorange', lw=2,
                                label=f'ROC curve (AUC = {auc_value:.2f})')
                axes[1, 0].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
                axes[1, 0].set_xlabel('False Positive Rate')
                axes[1, 0].set_ylabel('True Positive Rate')
                axes[1, 0].set_title('ROC Curve')
                axes[1, 0].legend(loc='lower right')
                axes[1, 0].grid(True, alpha=0.3)

            # 4. Feature Importance (placeholder text only; nothing is computed here)
            axes[1, 1].text(0.5, 0.5, 'Feature Importance\nPlot Available\nfor Tree-based Models',
                            ha='center', va='center', transform=axes[1, 1].transAxes, fontsize=12)
            axes[1, 1].set_title('Feature Importance')

            plt.tight_layout()
            file_stem = model_name.replace(' ', '_').lower()
            plt.savefig(f'classification_evaluation_{file_stem}.png', dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _print_table_header():
        """Print the ruler and column header shared by both evaluation tables."""
        print('     ' + '=' * 50)
        print('     Metric           |   Train   |   Test    |  Difference')
        print('     ' + '-' * 50)

    @staticmethod
    def _print_metric_rows(train_metrics, test_metrics, metric_names):
        """Print one 'metric | train | test | test - train' row per requested metric."""
        for metric in metric_names:
            train_val = train_metrics[metric]
            test_val = test_metrics[metric]
            diff = test_val - train_val
            # Explicit '+' on increases; zero and decreases keep the default formatting
            diff_str = f'+{diff:.4f}' if diff > 0 else f'{diff:.4f}'
            print(f'     {metric.upper():<15} | {train_val:8.4f}  | {test_val:8.4f}  | {diff_str:>10}')

    @staticmethod
    def _binary_positive_scores(y_true, y_proba):
        """Return the second probability column, or None when binary ROC analysis does not apply."""
        if y_proba is None or len(np.unique(y_true)) != 2:
            return None
        y_proba = np.asarray(y_proba)
        # Guard against a model trained on more classes than appear in this split
        if y_proba.ndim != 2 or y_proba.shape[1] != 2:
            return None
        return y_proba[:, 1]

# 7: MySQL Database Connection

In [18]:
class EnhancedMySQLF1Database:
    """
    ENHANCED MYSQL DATABASE CONNECTION MANAGER WITH QUERY CACHING

    Holds one ``mysql.connector`` connection plus an in-memory cache of query
    results. Cache entries are ``(result, timestamp)`` tuples; an entry older
    than the TTL is ignored and overwritten by the next successful query.
    """

    def __init__(self, cache_ttl=3600, config=None):                    # 1 hour default TTL
        """
        Args:
            cache_ttl: lifetime of a cache entry in seconds.
            config: optional mapping of ``mysql.connector.connect`` arguments
                that overrides the built-in defaults key by key.
        """
        self.connection = None
        # NOTE(review): these credentials are hard-coded in source control.
        # Prefer passing `config` (e.g. built from environment variables) and
        # consider rotating this password.
        self.config = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'hump-nduati',
            'password': 'S0663072',
            'database': 'F1--DB',
            'charset': 'utf8mb4',
            'collation': 'utf8mb4_unicode_ci',
            'use_unicode': True
        }
        if config:
            self.config.update(config)
        self.query_cache = {}
        self.cache_ttl = cache_ttl
        self.cache_enabled = True
        self.cache_hits = 0
        self.cache_misses = 0

    def connect(self):
        """ESTABLISH MYSQL DATABASE CONNECTION

        Returns:
            True when a live connection was established, False otherwise
            (connection errors are printed, not raised).
        """
        try:
            self.connection = mysql.connector.connect(**self.config)
            if self.connection.is_connected():
                print('✅ MySQL database connection established successfully')
                return True
        except Error as e:
            print(f'❌ MySQL connection error: {e}')
            return False
        # connect() handed back a handle that is not live
        return False

    def disconnect(self):
        """CLOSE DATABASE CONNECTION (no-op when there is no live connection)"""
        if self.connection and self.connection.is_connected():
            self.connection.close()
            print('✅ MySQL database connection closed')

    def execute_query(self, query, params=None, use_cache=True, cache_ttl=None):
        """EXECUTE SQL QUERY WITH OPTIONAL CACHING

        Args:
            query: SQL text, optionally with ``%s`` placeholders.
            params: values for the placeholders.
            use_cache: set False to bypass the cache for this call.
            cache_ttl: per-call TTL in seconds; defaults to ``self.cache_ttl``.

        Returns:
            List of row dicts. On a cache hit the cached list object itself is
            returned, so callers must not mutate it. Returns ``[]`` when there
            is no connection or the query fails (the error is printed).
        """
        if cache_ttl is None:
            cache_ttl = self.cache_ttl

        caching = use_cache and self.cache_enabled
        cache_key = self._create_cache_key(query, params)

        if caching and cache_key in self.query_cache:
            cached_data, timestamp = self.query_cache[cache_key]

            # Check if cache is still valid
            if time.time() - timestamp < cache_ttl:
                self.cache_hits += 1
                if self.cache_hits % 50 == 0:                                                 # Log every 50 cache hits
                    print(f'  💾 Query cache hits: {self.cache_hits}, misses: {self.cache_misses}')
                return cached_data

        if self.connection is None:
            # connect() was never called or failed; report like any other query error
            print('❌ MySQL query error: no active database connection')
            print(f' Query: {query}')
            return []

        try:
            cursor = self.connection.cursor(dictionary=True)
            try:
                cursor.execute(query, params or ())
                result = cursor.fetchall()
            finally:
                # Always release the cursor, including when execute/fetch raised
                cursor.close()
        except Error as e:
            print(f'❌ MySQL query error: {e}')
            print(f' Query: {query}')
            return []

        # Cache the result
        if caching:
            self.query_cache[cache_key] = (result, time.time())
            self.cache_misses += 1

        return result

    def _create_cache_key(self, query, params):
        """CREATE CACHE KEY FROM QUERY AND PARAMETERS (MD5 hex digest; not a security use)"""
        key_string = f'{query}_{str(params)}'
        return hashlib.md5(key_string.encode()).hexdigest()

    @staticmethod
    def _quote_identifier(name):
        """Backtick-quote a MySQL identifier, doubling any backtick it contains."""
        return '`' + str(name).replace('`', '``') + '`'

    def load_table_data(self, table_name, columns=None, use_cache=True):
        """LOAD TABLE DATA WITH SPECIFIC COLUMNS AND CACHING

        Args:
            table_name: table to read.
            columns: optional list of column names; all columns when falsy.
            use_cache: set False to bypass the cache for this call.

        Returns:
            A DataFrame (a copy, so the cached frame stays untouched). An empty
            DataFrame is returned when there is no connection or the read
            fails (the error is printed).
        """
        caching = use_cache and self.cache_enabled
        cache_key = f'table_{table_name}_{str(columns)}'

        if caching and cache_key in self.query_cache:
            cached_data, timestamp = self.query_cache[cache_key]
            if time.time() - timestamp < self.cache_ttl:
                self.cache_hits += 1
                print(f'   💾 Using cached table data for {table_name}')
                return cached_data.copy()

        if self.connection is None:
            print(f'❌ Error loading table {table_name}: no active database connection')
            return pd.DataFrame()

        if columns:
            columns_str = ', '.join(self._quote_identifier(col) for col in columns)
        else:
            columns_str = '*'
        query = f'SELECT {columns_str} FROM {self._quote_identifier(table_name)}'

        try:
            df = pd.read_sql(query, self.connection)
        except (Error, pd.io.sql.DatabaseError) as e:
            # pandas wraps driver failures on a raw DBAPI connection in its own
            # DatabaseError, so catching only the connector's Error misses them
            print(f'❌ Error loading table {table_name}: {e}')
            return pd.DataFrame()

        # Cache the result
        if caching:
            self.query_cache[cache_key] = (df.copy(), time.time())
            self.cache_misses += 1

        print(f'✅ Loaded {len(df)} records from {table_name}')
        return df

    def clear_cache(self):
        """CLEAR QUERY CACHE (drops all entries and resets the counters)"""
        self.query_cache = {}
        self.cache_hits = 0
        self.cache_misses = 0
        print('🧹 Database query cache cleared')

    def get_cache_stats(self):
        """GET CACHE STATISTICS

        Returns a dict with hits, misses, their total, the hit rate in percent
        (0 before any query) and the number of cached entries.
        """
        total_queries = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_queries * 100) if total_queries > 0 else 0
        return {
            'hits': self.cache_hits,
            'misses': self.cache_misses,
            'total': total_queries,
            'hit_rate': hit_rate,
            'cached_queries': len(self.query_cache)
        }

    def disable_caching(self):
        """DISABLE QUERY CACHING (existing entries are kept but ignored)"""
        self.cache_enabled = False
        print('🚫 Database query caching disabled')

    def enable_caching(self):
        """ENABLE QUERY CACHING"""
        self.cache_enabled = True
        print('✅ Database query caching enabled')

def load_enhanced_mysql_data(use_caching = True):
    """Load the F1 tables from MySQL into a dict of DataFrames.

    Args:
        use_caching: When False, the database object's query cache is
            disabled before loading; the flag is also forwarded as
            ``use_cache`` to every ``load_table_data`` call.

    Returns:
        dict mapping dataset keys (those created by ``create_empty_datasets``)
        to DataFrames. If the connection cannot be established the empty
        datasets are returned as-is. A table whose load raises is stored as an
        empty DataFrame.
    """
    print('🔄 LOADING COMPLETE F1 DATA FROM MYSQL DATABASE...')

    db = EnhancedMySQLF1Database()

    if not use_caching:
        db.disable_caching()

    if not db.connect():
        print('❌ Failed to connect to MySQL database')
        return create_empty_datasets()

    dataframes = create_empty_datasets()

    # MySQL table name -> target dataset key and the columns to select.
    table_configs = {
        'driver': {
            'key': 'drivers',
            'columns': ['id', 'name', 'first_name', 'last_name', 'full_name', 'abbreviation',
                       'permanent_number', 'gender', 'date_of_birth', 'date_of_death',
                       'place_of_birth', 'country_of_birth_country_id', 'nationality_country_id',
                       'best_championship_position', 'best_starting_grid_position', 'best_race_result',
                       'total_championship_wins', 'total_race_entries', 'total_race_starts',
                       'total_race_wins', 'total_race_laps', 'total_podiums', 'total_points',
                       'total_championship_points', 'total_pole_positions', 'total_fastest_laps']
        },
        'constructor': {
            'key': 'constructors',
            'columns': ['id', 'name', 'full_name', 'country_id', 'best_championship_position',
                       'best_starting_grid_position', 'best_race_result', 'total_championship_wins',
                       'total_race_entries', 'total_race_starts', 'total_race_wins',
                       'total_1_and_2_finishes', 'total_race_laps', 'total_podiums',
                       'total_podium_races', 'total_points', 'total_championship_points',
                       'total_pole_positions', 'total_fastest_laps']
        },
        'circuit': {
            'key': 'circuits',
            'columns': ['id', 'name', 'full_name', 'previous_names', 'type', 'direction',
                       'place_name', 'country_id', 'latitude', 'longitude', 'length',
                       'turns', 'total_races_held']
        },
        'race': {
            'key': 'races',
            'columns': ['id', 'year', 'round', 'date', 'time', 'grand_prix_id', 'official_name',
                       'qualifying_format', 'sprint_qualifying_format', 'circuit_id', 'circuit_type',
                       'direction', 'course_length', 'turns', 'laps', 'distance', 'scheduled_laps',
                       'scheduled_distance']
        },
        'race_data': {
            'key': 'results',
            'columns': ['race_id', 'type', 'position_display_order', 'position_number', 'position_text',
                       'driver_number', 'driver_id', 'constructor_id', 'engine_manufacturer_id',
                       'tyre_manufacturer_id', 'race_laps', 'race_time', 'race_time_millis',
                       'race_points', 'race_grid_position_number', 'race_positions_gained',
                       'race_pit_stops', 'race_fastest_lap']
        },
        'qualifying_result': {
            'key': 'qualifying',
            'columns': ['race_id', 'position_display_order', 'position_number', 'position_text',
                       'driver_number', 'driver_id', 'constructor_id', 'time', 'time_millis',
                       'q1', 'q1_millis', 'q2', 'q2_millis', 'q3', 'q3_millis']
        },
        'pit_stop': {
            'key': 'pitStops',
            'columns': ['race_id', 'driver_id', 'stop', 'lap', 'time', 'time_millis']
        },
        'grand_prix': {
            'key': 'grandsPrix',
            'columns': ['id', 'name', 'full_name', 'short_name', 'abbreviation', 'country_id']
        },
        'country': {
            'key': 'countries',
            'columns': ['id', 'alpha2_code', 'alpha3_code', 'name', 'demonym', 'continent_id']
        },
        'continent': {
            'key': 'continents',
            'columns': ['id', 'code', 'name', 'demonym']
        },
        'race_result': {
            'key': 'race_results',
            'columns': ['race_id', 'driver_id', 'constructor_id', 'position_number', 'position_text',
                       'points', 'laps', 'time', 'time_millis', 'fastest_lap', 'grid_position_number']
        }
    }

    # Everything that needs the open connection runs inside try/finally so the
    # connection is released even if an unexpected error escapes the per-table
    # handlers below.
    try:
        # Load all configured tables; a failing table degrades to an empty frame.
        for mysql_table, config in table_configs.items():
            try:
                dataframes[config['key']] = db.load_table_data(mysql_table, config.get('columns'), use_cache=use_caching)
            except Exception as e:
                print(f"⚠️ Error loading table {mysql_table}: {e}")
                dataframes[config['key']] = pd.DataFrame()

        # Load additional data with fallbacks
        try:
            dataframes['lapTimes'] = load_enhanced_lap_times_data(db, use_caching)
        except Exception as e:
            print(f"⚠️ Error loading lap times: {e}")
            dataframes['lapTimes'] = pd.DataFrame()

        try:
            dataframes['fastestLaps'] = db.load_table_data('fastest_lap', ['race_id', 'driver_id', 'lap', 'time', 'time_millis'], use_cache = use_caching)
        except Exception as e:
            print(f'⚠️ Error loading fastest laps: {e}')
            dataframes['fastestLaps'] = pd.DataFrame()

        try:
            dataframes['startingGrid'] = db.load_table_data('starting_grid_position', ['race_id', 'driver_id', 'position_number'], use_cache = use_caching)
        except Exception as e:
            print(f'⚠️ Error loading starting grid: {e}')
            dataframes['startingGrid'] = pd.DataFrame()

        dataframes['status'] = create_enhanced_status_codes()

        # Print cache statistics. The outer quotes are double quotes because
        # reusing the same quote character inside an f-string replacement
        # field is only accepted from Python 3.12 on.
        cache_stats = db.get_cache_stats()
        print(f"📊 Database Cache Stats: {cache_stats['hits']} hits, {cache_stats['misses']} misses ({cache_stats['hit_rate']:.1f}% hit rate)")
    finally:
        db.disconnect()

    # Print loaded data summary
    non_empty = [df for df in dataframes.values() if not df.empty]
    loaded_count = len(non_empty)
    total_records = sum(len(df) for df in non_empty)
    print(f'🎯 MYSQL DATA LOADING COMPLETE: {loaded_count} datasets with {total_records:,} total records')

    return dataframes

def load_enhanced_lap_times_data(db, use_caching = True):
    """Fetch lap-time data, preferring the interactive statistics table.

    The ``RACE`` rows of ``interactive_lap_statistics_mat`` are queried
    directly through ``db.connection`` first. If that query raises or returns
    no rows, ``db.load_table_data('fastest_lap', ...)`` is tried instead.

    Args:
        db: Object exposing ``connection`` and ``load_table_data``.
        use_caching: Forwarded as ``use_cache`` to the ``fastest_lap`` fallback
            only; the first query is always sent through ``pd.read_sql``.

    Returns:
        The first non-empty DataFrame obtained, or an empty DataFrame when
        both sources fail or are empty.
    """
    # Preferred source: the interactive lap statistics table.
    try:
        query = """
        SELECT 
            season_year as race_year,
            round_number as race_round, 
            driver_code as driver_id,
            driver_full_name,
            constructor_name as constructor_id,
            best_lap_time,
            total_laps,
            position,
            session_date as race_date
        FROM `interactive_lap_statistics_mat`
        WHERE session_type = 'RACE'
        LIMIT 10000
        """
        matrix_rows = pd.read_sql(query, db.connection)
        if not matrix_rows.empty:
            print('✅ Loaded lap times from interactive_lap_statistics_mat')
            return matrix_rows
    except Exception as e:
        print(f'⚠️ Could not load from interactive_lap_statistics_mat: {e}')

    # Secondary source: the fastest_lap table.
    try:
        fallback_columns = ['race_id', 'driver_id', 'lap', 'time', 'time_millis']
        fallback_rows = db.load_table_data('fastest_lap', fallback_columns, use_cache = use_caching)
        if not fallback_rows.empty:
            print('✅ Created lap times from fastest_lap table')
            return fallback_rows
    except Exception as e:
        print(f'⚠️ Could not load from fastest_lap: {e}')

    # Neither source produced rows.
    return pd.DataFrame()

def create_enhanced_status_codes():
    """Build the static status lookup table.

    Returns:
        DataFrame with one row per status and the columns ``status_id``
        (1 through 15) and ``status``.
    """
    labels = (
        'Finished', 'Disqualified', 'Accident', 'Collision', 'Engine',
        'Gearbox', 'Retired', 'Technical', 'Tyre', 'Brakes',
        'Electrical', 'Hydraulics', 'Suspension', 'Fuel System', 'Wheel',
    )
    # IDs follow the listing order, starting at 1.
    rows = [
        {'status_id': code, 'status': label}
        for code, label in enumerate(labels, start=1)
    ]
    return pd.DataFrame(rows)

def create_empty_datasets():
    """Return a dict holding a separate empty DataFrame for every expected dataset key."""
    dataset_keys = (
        'drivers', 'constructors', 'circuits', 'races', 'countries', 'continents',
        'grandsPrix', 'results', 'qualifying', 'pitStops', 'lapTimes', 'fastestLaps',
        'startingGrid', 'status', 'engineManufacturers', 'tyreManufacturers', 'entrants',
        'race_results',
    )
    # A fresh DataFrame per key, so filling one dataset never affects another.
    return {name: pd.DataFrame() for name in dataset_keys}

# 8: Weather Data Integration with Enhanced Caching

In [19]:
class RealWeatherDataIntegrator:
    """
    COMPREHENSIVE REAL WEATHER DATA INTEGRATION WITH ENHANCED CACHING
    """
    
    def __init__(self, circuits_df, cache_file='weather_cache.json', cache_ttl_days=30):
        self.base_url = "https://archive-api.open-meteo.com/v1/archive"
        self.weather_cache = {}
        self.cache_file = cache_file
        self.cache_ttl_days = cache_ttl_days
        self.request_delay = 1
        self.circuits_df = circuits_df
        self.circuit_coordinates = self.build_complete_circuit_coordinates()
        self.load_weather_cache()  # Load existing cache
    
    def load_weather_cache(self):
        """LOAD PERSISTENT WEATHER CACHE FROM DISK WITH TTL VALIDATION"""
        try:
            if os.path.exists(self.cache_file):
                with open(self.cache_file, 'r') as f:
                    cache_data = json.load(f)
                
                # Filter out expired cache entries
                current_time = time.time()
                self.weather_cache = {}
                expired_count = 0
                
                for key, cache_entry in cache_data.items():
                    if current_time - cache_entry.get('timestamp', 0) < self.cache_ttl_days * 24 * 3600:
                        self.weather_cache[key] = cache_entry['data']
                    else:
                        expired_count += 1
                
                print(f"✅ Loaded weather cache with {len(self.weather_cache)} valid entries ({expired_count} expired)")
                
        except Exception as e:
            print(f"⚠️ Could not load weather cache: {e}")
            self.weather_cache = {}
    
    def save_weather_cache(self):
        """SAVE WEATHER CACHE TO DISK WITH TIMESTAMP"""
        try:
            cache_data = {}
            current_time = time.time()
            
            for key, data in self.weather_cache.items():
                cache_data[key] = {
                    'data': data,
                    'timestamp': current_time
                }
            
            with open(self.cache_file, 'w') as f:
                json.dump(cache_data, f, indent=2)
            
            print(f"💾 Weather cache saved with {len(self.weather_cache)} entries")
        except Exception as e:
            print(f"⚠️ Could not save weather cache: {e}")
    
    def build_complete_circuit_coordinates(self):
        """BUILD COMPLETE CIRCUIT COORDINATES DATABASE FROM MYSQL DATA"""
        circuit_coordinates = {}
        
        if self.circuits_df.empty:
            print("⚠️ No circuit data available, using fallback coordinates")
            return self.get_fallback_coordinates()
        
        for _, circuit in self.circuits_df.iterrows():
            circuit_id = circuit['id']
            circuit_name = circuit['name']
            latitude = circuit.get('latitude')
            longitude = circuit.get('longitude')
            
            if pd.notna(latitude) and pd.notna(longitude):
                # Determine timezone based on coordinates
                
                timezone = self.estimate_timezone(latitude, longitude)
                circuit_coordinates[circuit_id] = {
                    'latitude': float(latitude),
                    'longitude': float(longitude),
                    'timezone': timezone,
                    'name': circuit_name
                }
                print(f"   📍 Loaded coordinates for {circuit_name}: ({latitude}, {longitude})")
            else:
                # Use name-based lookup as fallback
              
                coords = self.lookup_circuit_by_name(circuit_name)
                circuit_coordinates[circuit_id] = coords
                print(f"   📍 Estimated coordinates for {circuit_name}: {coords['latitude']}, {coords['longitude']}")
        
        print(f"✅ Loaded coordinates for {len(circuit_coordinates)} circuits")
        return circuit_coordinates
    
    def lookup_circuit_by_name(self, circuit_name):
        """Resolve a circuit name to coordinates by substring matching.

        The lower-cased name is first checked against the keys of the built-in
        coordinate table, then against a list of alternative spellings. If
        nothing matches, a warning is printed and the Monza entry is returned.

        Args:
            circuit_name: Circuit name to look up (must support ``.lower()``).

        Returns:
            dict with ``latitude``, ``longitude`` and ``timezone``.
        """
        lowered = circuit_name.lower()

        # Built-in coordinate table, keyed by lower-case circuit keyword.
        known_circuits = {
            'monza': {'latitude': 45.6156, 'longitude': 9.2811, 'timezone': 'Europe/Rome'},
            'monaco': {'latitude': 43.7347, 'longitude': 7.4206, 'timezone': 'Europe/Monaco'},
            'silverstone': {'latitude': 52.0786, 'longitude': -1.0169, 'timezone': 'Europe/London'},
            'spa': {'latitude': 50.4372, 'longitude': 5.9714, 'timezone': 'Europe/Brussels'},
            'hungaroring': {'latitude': 47.5819, 'longitude': 19.2486, 'timezone': 'Europe/Budapest'},
            'red bull ring': {'latitude': 47.2197, 'longitude': 14.7647, 'timezone': 'Europe/Vienna'},
            'zandvoort': {'latitude': 52.3886, 'longitude': 4.5408, 'timezone': 'Europe/Amsterdam'},
            'barcelona': {'latitude': 41.57, 'longitude': 2.2611, 'timezone': 'Europe/Madrid'},
            'imola': {'latitude': 44.3439, 'longitude': 11.7167, 'timezone': 'Europe/Rome'},
            'bahrain': {'latitude': 26.0325, 'longitude': 50.5106, 'timezone': 'Asia/Bahrain'},
            'yas marina': {'latitude': 24.4672, 'longitude': 54.6039, 'timezone': 'Asia/Dubai'},
            'jeddah': {'latitude': 21.6319, 'longitude': 39.1044, 'timezone': 'Asia/Riyadh'},
            'suzuka': {'latitude': 34.8431, 'longitude': 136.5411, 'timezone': 'Asia/Tokyo'},
            'shanghai': {'latitude': 31.3389, 'longitude': 121.2200, 'timezone': 'Asia/Shanghai'},
            'singapore': {'latitude': 1.2914, 'longitude': 103.8644, 'timezone': 'Asia/Singapore'},
            'cota': {'latitude': 30.1339, 'longitude': -97.6406, 'timezone': 'America/Chicago'},
            'interlagos': {'latitude': -23.7036, 'longitude': -46.6997, 'timezone': 'America/Sao_Paulo'},
            'montreal': {'latitude': 45.5000, 'longitude': -73.5228, 'timezone': 'America/Montreal'},
            'mexico city': {'latitude': 19.4042, 'longitude': -99.0907, 'timezone': 'America/Mexico_City'},
            'miami': {'latitude': 25.9581, 'longitude': -80.2389, 'timezone': 'America/New_York'},
            'las vegas': {'latitude': 36.1147, 'longitude': -115.1739, 'timezone': 'America/Los_Angeles'},
            'melbourne': {'latitude': -37.8494, 'longitude': 144.9686, 'timezone': 'Australia/Melbourne'},
            'albert park': {'latitude': -37.8494, 'longitude': 144.9686, 'timezone': 'Australia/Melbourne'},
            'baku': {'latitude': 40.3725, 'longitude': 49.8533, 'timezone': 'Asia/Baku'},
            'sochi': {'latitude': 43.4057, 'longitude': 39.9653, 'timezone': 'Europe/Moscow'},
            'istanbul': {'latitude': 40.9517, 'longitude': 29.4050, 'timezone': 'Europe/Istanbul'},
            'portimão': {'latitude': 37.2275, 'longitude': -8.6267, 'timezone': 'Europe/Lisbon'},
            'nürburgring': {'latitude': 50.3356, 'longitude': 6.9475, 'timezone': 'Europe/Berlin'},
            'hockenheim': {'latitude': 49.3303, 'longitude': 8.5653, 'timezone': 'Europe/Berlin'},
            'sepang': {'latitude': 2.7611, 'longitude': 101.7383, 'timezone': 'Asia/Kuala_Lumpur'},
            'fuji': {'latitude': 35.3717, 'longitude': 138.9272, 'timezone': 'Asia/Tokyo'},
            'indianapolis': {'latitude': 39.7953, 'longitude': -86.2347, 'timezone': 'America/New_York'},
            'rodriguez': {'latitude': 19.4042, 'longitude': -99.0907, 'timezone': 'America/Mexico_City'},
            'kyalami': {'latitude': -25.9894, 'longitude': 28.0750, 'timezone': 'Africa/Johannesburg'},
            'losail': {'latitude': 25.4903, 'longitude': 51.4525, 'timezone': 'Asia/Qatar'}
        }

        # Stage 1: the first table key (in table order) contained in the name wins.
        for keyword in known_circuits:
            if keyword in lowered:
                return known_circuits[keyword]

        # Stage 2: alternative spellings mapped onto a table key.
        aliases = {
            'monza': ['monza', 'autodromo nazionale'],
            'silverstone': ['silverstone', 'british grand prix'],
            'spa': ['spa', 'spa-francorchamps', 'belgian grand prix'],
            'monaco': ['monaco', 'monte carlo'],
            'interlagos': ['interlagos', 'sao paulo', 'brazilian grand prix'],
            'cota': ['cota', 'circuit of the americas', 'austin'],
            'yas marina': ['yas marina', 'abu dhabi', 'yas island'],
            'bahrain': ['bahrain', 'sakhir', 'bahrain international circuit'],
            'red bull ring': ['red bull', 'spielberg', 'austrian grand prix'],
            'hungaroring': ['hungaroring', 'hungarian grand prix'],
            'zandvoort': ['zandvoort', 'dutch grand prix'],
            'barcelona': ['barcelona', 'catalunya', 'spanish grand prix'],
            'imola': ['imola', 'emilia romagna', 'san marino'],
            'suzuka': ['suzuka', 'japanese grand prix'],
            'melbourne': ['melbourne', 'albert park', 'australian grand prix']
        }

        for canonical, spellings in aliases.items():
            if any(spelling in lowered for spelling in spellings):
                return known_circuits[canonical]

        # No match at all: warn and default to Monza.
        print(f"   ⚠️ Unknown circuit: {circuit_name}, using Monza coordinates")
        return known_circuits['monza']
    
    def estimate_timezone(self, latitude, longitude):
        """Pick a rough timezone name from the longitude alone.

        ``latitude`` is accepted but not used. Longitudes that fall outside
        every band below yield ``'UTC'``.

        NOTE(review): the bands leave gaps (for example longitudes west of
        -80 or between 40 and 50), which also map to ``'UTC'``; confirm this
        is acceptable for the circuits in use.
        """
        # (west bound, east bound, timezone); both bounds are inclusive.
        longitude_bands = (
            (-10, 40, 'Europe/London'),       # Europe/Africa
            (-80, -30, 'America/New_York'),   # Americas
            (100, 150, 'Asia/Tokyo'),         # East Asia
            (50, 80, 'Asia/Dubai'),           # Middle East/India
        )

        for west, east, zone in longitude_bands:
            if west <= longitude <= east:
                return zone
        return 'UTC'
    
    def get_fallback_coordinates(self):
        """Return the coordinate table used when no circuit data is available.

        The table has a single ``'default'`` entry pointing at Monza.
        """
        monza = dict(latitude=45.6156, longitude=9.2811, timezone='Europe/Rome', name='Monza')
        return {'default': monza}
    
    def fetch_real_weather_data(self, circuit_id, race_date, days_before = 3, days_after = 1):
        'FETCH REAL HISTORICAL WEATHER DATA WITH ENHANCED CACHING'
        cache_key = f"{circuit_id}_{race_date}"
        
        # Check memory cache first
        
        if cache_key in self.weather_cache:
            print(f"   📦 Using cached weather data for circuit {circuit_id}")
            return self.weather_cache[cache_key]
        
        # Check if we can use similar historical data (same circuit, similar date)
        
        similar_cache_key = self.find_similar_weather_data(circuit_id, race_date)
        if similar_cache_key:
            print(f"   🔄 Using similar weather data from {similar_cache_key}")
            similar_data = self.weather_cache[similar_cache_key]
            
            # Store this as the new cache entry to avoid future lookups
        
            self.weather_cache[cache_key] = similar_data
            self.save_weather_cache()
            return similar_data
        
        # Get circuit coordinates
        
        if circuit_id not in self.circuit_coordinates:
            print(f"   ⚠️ Circuit {circuit_id} not found in coordinates database")
            return None
        
        coords = self.circuit_coordinates[circuit_id]
        circuit_name = coords['name']
        
        try:
            start_date = (pd.to_datetime(race_date) - timedelta(days=days_before)).strftime('%Y-%m-%d')
            end_date = (pd.to_datetime(race_date) + timedelta(days=days_after)).strftime('%Y-%m-%d')
            
            params = {
                'latitude': coords['latitude'],
                'longitude': coords['longitude'],
                'start_date': start_date,
                'end_date': end_date,
                'hourly': [
                    'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
                    'precipitation', 'rain', 'snowfall', 'weather_code',
                    'pressure_msl', 'surface_pressure', 'cloud_cover',
                    'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
                    'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m',
                    'visibility', 'is_day'
                ],
                'timezone': coords['timezone']
            }
            
            print(f'   🌤️ Fetching real weather data for {circuit_name} ({start_date} to {end_date})...')
            
            response = requests.get(self.base_url, params = params, timeout = 30)
            
            if response.status_code == 200:
                weather_data = response.json()
                
                # Cache the result
                
                self.weather_cache[cache_key] = weather_data
                self.save_weather_cache()  
                print(f'   ✅ Real weather data fetched and cached for {circuit_name}')
                return weather_data
            else:
                print(f'   ❌ API request failed for {circuit_name}: HTTP {response.status_code}')
                
                # Return demo data that matches real conditions
                
                demo_data = self.generate_realistic_weather_data(race_date, coords)
                self.weather_cache[cache_key] = demo_data
                self.save_weather_cache()
                return demo_data
                
        except Exception as e:
            print(f'   ❌ Error fetching real weather for {circuit_name}: {str(e)}')
            
            # Return realistic demo data based on circuit location and season
            
            demo_data = self.generate_realistic_weather_data(race_date, coords)
            self.weather_cache[cache_key] = demo_data
            self.save_weather_cache()
            return demo_data
    
    def find_similar_weather_data(self, circuit_id, target_date):
        'FIND SIMILAR WEATHER DATA FOR THE SAME CIRCUIT AND SIMILAR DATE'
        target_dt = pd.to_datetime(target_date)
        
        for cache_key in self.weather_cache.keys():
            if circuit_id in cache_key:
                
                # Extract date from cache key
                
                try:
                    cached_circuit, cached_date = cache_key.split('_', 1)
                    if cached_circuit == circuit_id:
                        cached_dt = pd.to_datetime(cached_date)
                
                        # Check if dates are close (within 2 weeks)
                        
                        date_diff = abs((cached_dt - target_dt).days)
                        if date_diff <= 14:  # Same season period
                            return cache_key
                except:
                    continue
        return None
    
    def generate_realistic_weather_data(self, race_date, coords):
        """Generate random hourly weather shaped by latitude, month and circuit name.

        Args:
            race_date: Race date; parsed with ``pd.to_datetime``.
            coords: dict with ``latitude`` and, optionally, ``name``. A missing
                name simply skips the circuit-specific adjustments.

        Returns:
            ``{'hourly': {...}}`` with 96 hourly values (starting two days
            before the race date) for ``time``, ``temperature_2m``,
            ``precipitation``, ``rain``, ``wind_speed_10m``, ``pressure_msl``,
            ``relative_humidity_2m`` and ``cloud_cover``.

        Values are drawn from ``np.random``, so results differ between calls
        unless the caller seeds the generator.
        """
        race_dt = pd.to_datetime(race_date)
        month = race_dt.month
        latitude = coords['latitude']
        # Tolerate coordinate dicts without a name; lower-case once for the
        # substring checks below.
        circuit_label = coords.get('name') or ''
        name_lower = str(circuit_label).lower()

        # Seasonal patterns
        if latitude > 40:  # Northern hemisphere
            if month in [12, 1, 2]:  # Winter
                base_temp = 5 + np.random.normal(0, 3)
                rain_prob = 0.4
            elif month in [3, 4, 5]:  # Spring
                base_temp = 15 + np.random.normal(0, 4)
                rain_prob = 0.3
            elif month in [6, 7, 8]:  # Summer
                base_temp = 25 + np.random.normal(0, 5)
                rain_prob = 0.2
            else:  # Autumn
                base_temp = 15 + np.random.normal(0, 4)
                rain_prob = 0.35
        elif latitude < -20:  # Southern hemisphere
            if month in [12, 1, 2]:  # Summer
                base_temp = 25 + np.random.normal(0, 5)
                rain_prob = 0.2
            elif month in [3, 4, 5]:  # Autumn
                base_temp = 15 + np.random.normal(0, 4)
                rain_prob = 0.35
            elif month in [6, 7, 8]:  # Winter
                base_temp = 5 + np.random.normal(0, 3)
                rain_prob = 0.4
            else:  # Spring
                base_temp = 15 + np.random.normal(0, 4)
                rain_prob = 0.3
        else:  # Everything between the two latitude thresholds
            base_temp = 28 + np.random.normal(0, 3)
            rain_prob = 0.25

        # Adjust for specific circuit characteristics
        if 'monaco' in name_lower or 'monte carlo' in name_lower:
            base_temp += 2  # Mediterranean climate
            rain_prob = 0.15
        elif 'bahrain' in name_lower or 'abu dhabi' in name_lower:
            base_temp = 35 + np.random.normal(0, 2)  # Desert climate
            rain_prob = 0.01
        elif 'silverstone' in name_lower:
            rain_prob = 0.5  # UK climate
        elif 'spa' in name_lower:
            rain_prob = 0.4  # Belgian climate

        # Generate hourly data: 4 days, starting two days before the race date.
        hours = 24 * 4
        time_index = [race_dt - timedelta(days = 2) + timedelta(hours = i) for i in range(hours)]

        demo_data = {
            'hourly': {
                'time': [t.isoformat() for t in time_index],
                'temperature_2m': [max(base_temp + np.random.normal(0, 2), -5) for _ in range(hours)],
                'precipitation': [np.random.exponential(0.5) if np.random.random() < rain_prob else 0 for _ in range(hours)],
                'rain': [np.random.exponential(0.3) if np.random.random() < rain_prob else 0 for _ in range(hours)],
                'wind_speed_10m': [max(np.random.weibull(1.5) * 5, 0) for _ in range(hours)],
                'pressure_msl': [1013 + np.random.normal(0, 5) for _ in range(hours)],
                'relative_humidity_2m': [max(min(np.random.normal(60, 15), 100), 20) for _ in range(hours)],
                'cloud_cover': [min(np.random.beta(2, 2) * 100, 100) for _ in range(hours)]
            }
        }

        # Double-quoted outer string: reusing the same quote character inside
        # an f-string replacement field needs Python 3.12 or newer.
        print(f"   🌤️ Generated realistic weather data for {circuit_label}")
        return demo_data
    
    def process_weather_features(self, weather_data, race_date):
        'PROCESS REAL WEATHER DATA INTO STRATEGY-RELEVANT FEATURES'
        if not weather_data or 'hourly' not in weather_data:
            print(f'   ⚠️ No weather data available, using realistic defaults')
            return self.get_realistic_default_weather_features(race_date)
        
        hourly_data = weather_data['hourly']
        df_weather = pd.DataFrame(hourly_data)
        
        df_weather['time'] = pd.to_datetime(df_weather['time'])
        race_datetime = pd.to_datetime(race_date)
        
        race_day = race_datetime.date()
        df_race_day = df_weather[df_weather['time'].dt.date == race_day]
        
        if df_race_day.empty:
            print(f'   ⚠️ No race day weather data, using weekend averages')
            return self.process_weekend_weather(df_weather, race_date)
        
        # Focus on race window (typically 14:00-16:00 local time)
        
        race_start = race_datetime.replace(hour = 14, minute = 0, second = 0)
        race_end = race_start + timedelta(hours = 2)
        
        df_race_window = df_weather[
            (df_weather['time'] >= race_start) & 
            (df_weather['time'] <= race_end)
        ]
        
        if df_race_window.empty:
            df_race_window = df_race_day
            print(f'   ⚠️ No race window data, using full day data')
        
        weather_features = {}
        
        # Temperature analysis
        
        if 'temperature_2m' in df_race_window.columns:
            temp_data = df_race_window['temperature_2m'].dropna()
            if len(temp_data) > 0:
                weather_features.update({
                    'temperature_avg': float(temp_data.mean()),
                    'temperature_max': float(temp_data.max()),
                    'temperature_min': float(temp_data.min()),
                    'temperature_range': float(temp_data.max() - temp_data.min()),
                    'temperature_std': float(temp_data.std())
                })
        
        # Precipitation analysis
        
        if 'precipitation' in df_race_window.columns:
            precip_data = df_race_window['precipitation'].dropna()
            if len(precip_data) > 0:
                weather_features.update({
                    'precipitation_total': float(precip_data.sum()),
                    'precipitation_max': float(precip_data.max()),
                    'rain_probability': float((precip_data > 0).mean()),
                    'heavy_rain_probability': float((precip_data > 2.5).mean())
                })
        
        # Wind analysis
        
        if 'wind_speed_10m' in df_race_window.columns:
            wind_data = df_race_window['wind_speed_10m'].dropna()
            if len(wind_data) > 0:
                weather_features.update({
                    'wind_speed_avg': float(wind_data.mean()),
                    'wind_speed_max': float(wind_data.max())
                })
        
        # Pressure analysis
        
        if 'pressure_msl' in df_race_window.columns:
            pressure_data = df_race_window['pressure_msl'].dropna()
            if len(pressure_data) > 0:
                weather_features.update({
                    'pressure_avg': float(pressure_data.mean()),
                    'pressure_trend': float(self.calculate_pressure_trend(pressure_data))
                })
        
        # Humidity analysis
        
        if 'relative_humidity_2m' in df_race_window.columns:
            humidity_data = df_race_window['relative_humidity_2m'].dropna()
            if len(humidity_data) > 0:
                weather_features.update({
                    'humidity_avg': float(humidity_data.mean()),
                    'humidity_max': float(humidity_data.max())
                })
        
        # Cloud cover analysis
        
        cloud_columns = ['cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high']
        for col in cloud_columns:
            if col in df_race_window.columns:
                cloud_data = df_race_window[col].dropna()
                if len(cloud_data) > 0:
                    weather_features[f'{col}_avg'] = float(cloud_data.mean())
        
        # Derived features
        
        weather_features['weather_condition'] = self.classify_weather_condition(df_race_window)
        weather_features['estimated_track_temp'] = self.estimate_track_temperature(weather_features)
        weather_features['tire_degradation_factor'] = self.calculate_tire_degradation_factor(weather_features)
        
        print(f'   ✅ Processed {len(weather_features)} real weather features')
        return weather_features
    
    def calculate_pressure_trend(self, pressure_data):
        """Return the least-squares slope of the pressure readings over their position.

        The readings are regressed against 0, 1, 2, ... (their order in
        ``pressure_data``). Fewer than two readings give ``0``.
        """
        sample_count = len(pressure_data)
        if sample_count < 2:
            return 0

        positions = np.arange(sample_count)
        fit = linregress(positions, pressure_data)
        return fit.slope
    
    def classify_weather_condition(self, df_weather_window):
        """Classify a weather window into a single condition label.

        Precipitation takes priority over temperature: a wet window is always
        reported as rain, even when it is also very hot or very cold. The
        temperature labels are only used for windows without significant rain.

        Args:
            df_weather_window: DataFrame that may contain 'precipitation' and
                'temperature_2m' columns.

        Returns:
            One of 'heavy_rain', 'light_rain', 'very_hot', 'very_cold' or 'dry'.
            An empty frame is classified as 'dry'.
        """
        if df_weather_window.empty:
            return 'dry'

        # Rain first: peak precipitation decides between heavy and light rain.
        if 'precipitation' in df_weather_window.columns:
            max_precip = df_weather_window['precipitation'].max()
            if max_precip > 5.0:
                return 'heavy_rain'
            if max_precip > 0.5:
                return 'light_rain'

        # Only dry windows are labelled by temperature extremes, so a rain
        # classification is never overwritten.
        if 'temperature_2m' in df_weather_window.columns:
            avg_temp = df_weather_window['temperature_2m'].mean()
            if avg_temp > 35:
                return 'very_hot'
            if avg_temp < 10:
                return 'very_cold'

        return 'dry'
    
    def estimate_track_temperature(self, weather_features):
        """Estimate the track temperature from air temperature and cloud cover.

        Reads 'temperature_avg' (default 20) and 'cloud_cover_avg' (default 50)
        from ``weather_features``. The estimate is the air temperature plus a
        fixed 15 plus 0.15 for every percentage point of clear sky, clamped to
        the range [air + 5, air + 40].

        Args:
            weather_features: Mapping of already computed weather features.

        Returns:
            The clamped track temperature estimate.
        """
        ambient = weather_features.get('temperature_avg', 20)
        cloudiness = weather_features.get('cloud_cover_avg', 50)

        # Less cloud means a larger bonus on top of the fixed offset.
        clear_sky_bonus = (100 - cloudiness) * 0.15
        estimate = (ambient + 15) + clear_sky_bonus

        lower_bound = ambient + 5
        upper_bound = ambient + 40
        return max(lower_bound, min(estimate, upper_bound))
    
    def calculate_tire_degradation_factor(self, weather_features):
        """Compute a multiplicative tire degradation factor from temperatures.

        Two multipliers are combined, starting from 1.0:
        - air temperature ('temperature_avg', default 20): 1.3 above 30,
          0.8 below 15, otherwise neutral;
        - track temperature ('estimated_track_temp', default air + 15):
          1.5 above 45, 0.7 below 25, otherwise neutral.

        Args:
            weather_features: Mapping of already computed weather features.

        Returns:
            The product of the two multipliers as a float.
        """
        air_temp = weather_features.get('temperature_avg', 20)
        if air_temp > 30:
            air_multiplier = 1.3    # hot air speeds up degradation
        elif air_temp < 15:
            air_multiplier = 0.8    # cool air slows it down
        else:
            air_multiplier = 1.0

        surface_temp = weather_features.get('estimated_track_temp', air_temp + 15)
        if surface_temp > 45:
            surface_multiplier = 1.5    # very hot track: strongest penalty
        elif surface_temp < 25:
            surface_multiplier = 0.7    # cold track: reduced degradation
        else:
            surface_multiplier = 1.0

        return 1.0 * air_multiplier * surface_multiplier
    
    def process_weekend_weather(self, df_weather, race_date):
        'Process weekend weather when race day data is unavailable'
        weather_features = {}
        
        if 'temperature_2m' in df_weather.columns:
            temp_data = df_weather['temperature_2m'].dropna()
            if len(temp_data) > 0:
                weather_features['temperature_avg'] = float(temp_data.mean())
        
        if 'precipitation' in df_weather.columns:
            precip_data = df_weather['precipitation'].dropna()
            if len(precip_data) > 0:
                weather_features['rain_probability'] = float((precip_data > 0).mean())
        
        # Fill missing features with realistic defaults
        
        default_features = self.get_realistic_default_weather_features(race_date)
        for key, value in default_features.items():
            if key not in weather_features:
                weather_features[key] = value
        
        return weather_features
    
    def get_realistic_default_weather_features(self, race_date):
        """Return fallback weather features chosen by the month of ``race_date``.

        Only the calendar month is used; the circuit location is not taken
        into account, so the season boundaries are the same for every race.

        Args:
            race_date: Anything ``pd.to_datetime`` can parse.

        Returns:
            dict with temperature, precipitation, wind, pressure, humidity,
            cloud cover, condition label, track temperature and tire
            degradation defaults.
        """
        month = pd.to_datetime(race_date).month

        # (base temperature, rain probability) per season; any month not listed
        # below falls back to the autumn pair.
        winter, spring, summer, autumn = (8, 0.3), (16, 0.25), (24, 0.15), (15, 0.28)
        season_by_month = {
            12: winter, 1: winter, 2: winter,
            3: spring, 4: spring, 5: spring,
            6: summer, 7: summer, 8: summer,
        }
        base_temp, rain_prob = season_by_month.get(month, autumn)

        if rain_prob < 0.2:
            condition = 'dry'
        else:
            condition = 'light_rain'

        defaults = {}
        defaults['temperature_avg'] = base_temp
        defaults['temperature_max'] = base_temp + 5
        defaults['temperature_min'] = base_temp - 5
        defaults['temperature_range'] = 10.0
        defaults['precipitation_total'] = rain_prob * 10
        defaults['precipitation_max'] = rain_prob * 5
        defaults['rain_probability'] = rain_prob
        defaults['heavy_rain_probability'] = rain_prob * 0.3
        defaults['wind_speed_avg'] = 3.0
        defaults['wind_speed_max'] = 6.0
        defaults['pressure_avg'] = 1013.0
        defaults['pressure_trend'] = 0.0
        defaults['humidity_avg'] = 65.0
        defaults['humidity_max'] = 85.0
        defaults['cloud_cover_avg'] = 50.0
        defaults['weather_condition'] = condition
        defaults['estimated_track_temp'] = base_temp + 15
        defaults['tire_degradation_factor'] = 1.0
        return defaults
    
    def clear_cache(self):
        """Reset the in-memory weather cache and delete its file, if present.

        Replaces ``self.weather_cache`` with an empty dict and removes the file
        at ``self.cache_file`` when that path exists.
        """
        self.weather_cache = dict()

        cache_path = self.cache_file
        cache_on_disk = os.path.exists(cache_path)
        if cache_on_disk:
            os.remove(cache_path)

        print('🧹 Weather cache cleared')

def integrate_real_weather_data(df, circuits, use_caching = True):
    """Fetch weather features for every race in ``df`` and merge them in.

    One weather lookup is made per distinct ``race_id``; the resulting feature
    rows are left-joined back onto ``df`` via ``race_id``.

    Args:
        df: Race-level DataFrame. Must contain 'race_id', 'circuit_id' and
            'date' columns, otherwise it is returned unchanged.
        circuits: Circuit DataFrame handed to ``RealWeatherDataIntegrator``.
        use_caching: When False the integrator's cache is cleared before any
            data is fetched.

    Returns:
        A new DataFrame with the weather columns appended, or ``df`` itself
        when the inputs are empty or required columns are missing.
    """
    print(f'\n🌤️ INTEGRATING REAL WEATHER DATA INTO DATASET...')

    if df.empty or circuits.empty:
        print('   ⚠️ No data available for weather integration')
        return df

    # race_id is needed both for de-duplication and for the final merge, so it
    # is validated together with the two lookup columns.
    required_columns = ('race_id', 'circuit_id', 'date')
    if any(column not in df.columns for column in required_columns):
        print('   ⚠️ Missing race_id, circuit_id or date columns for weather integration')
        return df

    weather_integrator = RealWeatherDataIntegrator(circuits)

    if not use_caching:
        weather_integrator.clear_cache()
        print('   🚫 Caching disabled for weather data')

    unique_races = df[list(required_columns)].drop_duplicates()
    total_races = len(unique_races)
    weather_data_list = []
    seen_race_ids = set()

    print(f'   Processing {total_races} unique races for real weather data...')

    # enumerate() supplies the progress counter: the index labels left over
    # from drop_duplicates() are neither contiguous nor guaranteed numeric.
    for processed, (_, race) in enumerate(unique_races.iterrows(), start = 1):
        race_id = race['race_id']

        # A race_id can still repeat here if it appears with different
        # circuit/date values; only the first occurrence is fetched.
        if race_id in seen_race_ids:
            continue
        seen_race_ids.add(race_id)

        race_date = race['date']
        weather_data = weather_integrator.fetch_real_weather_data(race['circuit_id'], race_date)
        weather_features = weather_integrator.process_weather_features(weather_data, race_date)
        weather_features['race_id'] = race_id
        weather_data_list.append(weather_features)

        # Respectful API delay
        time.sleep(weather_integrator.request_delay)

        if processed % 5 == 0:
            print(f'   Processed {processed}/{total_races} races...')

    if weather_data_list:
        weather_df = pd.DataFrame(weather_data_list)
        df_with_weather = df.merge(weather_df, on = 'race_id', how = 'left')

        print(f'   ✅ Real weather data integrated for {len(weather_data_list)} races')
        print(f'   📊 Added {len(weather_df.columns) - 1} weather features')

        return df_with_weather

    print('   ⚠️ No weather data could be fetched, using realistic defaults')

    # Fall back to seasonal defaults derived from the first race date; these
    # are written onto ``df`` in place.
    default_weather = weather_integrator.get_realistic_default_weather_features(df['date'].iloc[0])
    for key, value in default_weather.items():
        df[key] = value
    return df

#  9: Data Processing and Feature Engineering

In [20]:
class AdvancedDataCleaner:
    """Cleans the F1 source tables and records what each cleaning pass removed.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # dataset name -> dict of row/column counts before and after cleaning
        self.cleaning_report = {}

    def comprehensive_clean(self, df, dataset_name):
        """Run dataset-specific and general cleaning on a copy of ``df``.

        Args:
            df: DataFrame to clean. It is never modified.
            dataset_name: One of 'drivers', 'constructors', 'circuits',
                'races' or 'results' to enable the matching specific cleaner;
                any other name receives the general cleaning only.

        Returns:
            The cleaned copy. An empty ``df`` is returned as-is and no report
            entry is recorded for it.
        """
        if df.empty:
            return df

        original_rows, original_cols = df.shape
        df_clean = df.copy()

        # Dataset-specific cleaning
        specific_cleaners = {
            'drivers': self.clean_driver_data,
            'constructors': self.clean_constructor_data,
            'circuits': self.clean_circuit_data,
            'races': self.clean_race_data,
            'results': self.clean_results_data,
        }
        specific_cleaner = specific_cleaners.get(dataset_name)
        if specific_cleaner is not None:
            df_clean = specific_cleaner(df_clean)

        # General cleaning for all datasets
        df_clean = self.clean_general_data(df_clean)

        # Store cleaning report
        cleaned_rows, cleaned_cols = df_clean.shape
        self.cleaning_report[dataset_name] = {
            'original_rows': original_rows,
            'cleaned_rows': cleaned_rows,
            'original_cols': original_cols,
            'cleaned_cols': cleaned_cols,
            'rows_removed': original_rows - cleaned_rows,
            'cols_removed': original_cols - cleaned_cols
        }

        return df_clean

    @staticmethod
    def _keep_positive(df, column):
        """Keep rows where ``column`` is > 0; no-op when the column is absent."""
        if column in df.columns:
            df = df[df[column] > 0]
        return df

    @staticmethod
    def _keep_present(df, column):
        """Keep rows where ``column`` is not null; no-op when the column is absent."""
        if column in df.columns:
            df = df[df[column].notna()]
        return df

    def clean_driver_data(self, df):
        """Remove drivers with no race entries."""
        return self._keep_positive(df, 'total_race_entries')

    def clean_constructor_data(self, df):
        """Remove constructors with no race entries."""
        return self._keep_positive(df, 'total_race_entries')

    def clean_circuit_data(self, df):
        """Remove circuits with no races held."""
        return self._keep_positive(df, 'total_races_held')

    def clean_race_data(self, df):
        """Remove races without a date."""
        return self._keep_present(df, 'date')

    def clean_results_data(self, df):
        """Remove results without position data."""
        return self._keep_present(df, 'position_number')

    def clean_general_data(self, df):
        """Drop columns that are entirely null, then drop duplicate rows."""
        df = df.dropna(axis = 1, how = 'all')
        df = df.drop_duplicates()
        return df

    def print_cleaning_report(self):
        """Print one summary line per dataset cleaned so far."""
        print('\n🧹 DATA CLEANING REPORT:')
        print('=' * 50)
        for dataset, report in self.cleaning_report.items():
            # Double-quoted f-strings so the single-quoted keys inside the
            # replacement fields also parse on Python versions before 3.12.
            print(f"📊 {dataset.upper():<15} | Rows: {report['original_rows']:>4} → {report['cleaned_rows']:<4} | "
                  f"Cols: {report['original_cols']:>2} → {report['cleaned_cols']:<2} | "
                  f"Removed: {report['rows_removed']:>2} rows, {report['cols_removed']:>1} cols")

def _merge_reference_table(df_merged, reference, left_key, suffix, label):
    """Left-join ``reference`` (keyed by its 'id' column) onto ``df_merged[left_key]``.

    Returns ``df_merged`` unchanged when the reference table is empty or either
    key column is missing. Clashing column names from ``reference`` receive
    ``suffix``.
    """
    if reference.empty or left_key not in df_merged.columns or 'id' not in reference.columns:
        return df_merged

    df_merged = df_merged.merge(
        reference,
        left_on = left_key,
        right_on = 'id',
        how = 'left',
        suffixes = ('', suffix)
    )
    print(f'   ✅ Merged {label}: {len(df_merged)} records')
    return df_merged


def build_enhanced_merged_dataset(results, races, circuits, drivers, constructors, qualifying, pit_stops, race_results):
    """Join result rows with race, circuit, driver, constructor, qualifying and pit-stop data.

    ``race_results`` is used as the base table when it is non-empty, otherwise
    ``results``. Every join is a left join, so the base rows are preserved.

    Args:
        results: Fallback base table of results.
        races, circuits, drivers, constructors: Reference tables keyed by 'id'.
        qualifying: Table with 'race_id', 'driver_id' and 'position_number';
            its position becomes the 'qualifying_position' column.
        pit_stops: Table with 'race_id', 'driver_id' and 'stop'; the number of
            stops per driver and race becomes 'total_pit_stops'.
        race_results: Preferred base table of results.

    Returns:
        The merged DataFrame, or an empty DataFrame when both base tables are
        empty.
    """
    print(f'\n🔄 BUILDING ENHANCED MERGED DATASET FROM MYSQL...')

    # Use race_results as base if available, otherwise use results
    if not race_results.empty:
        df_merged = race_results.copy()
        print(f'   Base race_results: {len(df_merged)} records')
    elif not results.empty:
        df_merged = results.copy()
        print(f'   Base results: {len(df_merged)} records')
    else:
        print('❌ No results or race_results data available - cannot build dataset')
        return pd.DataFrame()

    # Order matters: 'circuit_id' is typically only available after the races
    # table has been merged in.
    reference_merges = (
        (races, 'race_id', '_race', 'races'),
        (circuits, 'circuit_id', '_circuit', 'circuits'),
        (drivers, 'driver_id', '_driver', 'drivers'),
        (constructors, 'constructor_id', '_constructor', 'constructors'),
    )
    for reference, left_key, suffix, label in reference_merges:
        df_merged = _merge_reference_table(df_merged, reference, left_key, suffix, label)

    has_entry_keys = 'race_id' in df_merged.columns and 'driver_id' in df_merged.columns
    entry_keys = ['race_id', 'driver_id']

    # Add qualifying data
    if not qualifying.empty and has_entry_keys:
        try:
            # One qualifying row per (race, driver): repeated keys would
            # otherwise multiply the result rows in the left join.
            qualifying_positions = (
                qualifying[['race_id', 'driver_id', 'position_number']]
                .rename(columns = {'position_number': 'qualifying_position'})
                .drop_duplicates(subset = entry_keys)
            )
            df_merged = df_merged.merge(qualifying_positions, on = entry_keys, how = 'left')
            print(f'   ✅ Added qualifying data: {len(df_merged)} records')
        except Exception as e:
            # Best effort: qualifying data is optional enrichment.
            print(f'   ⚠️  Error merging qualifying data: {e}')

    # Add pit stop data
    if not pit_stops.empty and has_entry_keys:
        try:
            pit_stops_agg = pit_stops.groupby(entry_keys).agg(
                total_pit_stops = ('stop', 'count')
            ).reset_index()
            df_merged = df_merged.merge(pit_stops_agg, on = entry_keys, how = 'left')
            print(f'   ✅ Added pit stop data: {len(df_merged)} records')
        except Exception as e:
            # Best effort: pit stop data is optional enrichment.
            print(f'   ⚠️  Error merging pit stop data: {e}')

    # Clean up any duplicate columns (the first occurrence is kept)
    df_merged = df_merged.loc[:, ~df_merged.columns.duplicated()]

    print(f'🎯 ENHANCED MERGED DATASET: {len(df_merged)} records with {len(df_merged.columns)} columns')
    return df_merged

# 10: Enhanced Feature Engineering with Real Weather and Caching

In [21]:
def _first_usable_value(record, keys, default):
    """Return the first ``record[key]`` that is present, non-null and truthy.

    Missing keys, None, NaN/NaT/NA and falsy values (0, '') are all skipped;
    ``default`` is returned when no key yields a usable value.
    """
    for key in keys:
        value = record.get(key)
        if value is None or pd.isna(value) or not value:
            continue
        return value
    return default


def _feature_cache_key(df):
    """Build a cache key for ``df`` that is stable across interpreter runs.

    The key combines the row count, the number of distinct races and a SHA-256
    digest of the sorted column names. The built-in ``hash()`` is avoided
    because string hashes are salted per process.
    """
    race_count = df['race_id'].nunique() if 'race_id' in df.columns else 'unknown'
    column_names = str(sorted(str(column) for column in df.columns))
    digest = hashlib.sha256(column_names.encode('utf-8')).hexdigest()[:16]
    return f'features_{len(df)}_{race_count}_{digest}'


def create_enhanced_strategy_features_with_weather(df, drivers, constructors, circuits, 
                                                  cache_file = 'features_cache.pkl', 
                                                  use_caching = True):
    """Build one strategy feature row per record of ``df``.

    Each row combines identifiers, race outcome, driver / constructor /
    circuit statistics, weather features and weather interaction features.
    Records that raise during processing are reported and skipped.

    Args:
        df: Merged race-level DataFrame, one row per driver and race.
        drivers, constructors, circuits: Reference tables passed to the
            ``extract_enhanced_*_stats`` helpers.
        cache_file: Path of the pickle file used as the feature cache.
        use_caching: When True, a cached result for an identically shaped
            ``df`` is returned if present, and new results are stored.

    Returns:
        DataFrame of engineered features (empty when ``df`` is empty).

    Note:
        The cache key only reflects the shape and column names of ``df``, not
        its values or the reference tables; call ``clear_feature_cache`` after
        the underlying data changes.
    """
    cache_key = None

    # Try to load from cache first
    if use_caching:
        cache_key = _feature_cache_key(df)

        try:
            if os.path.exists(cache_file):
                # NOTE(security): pickle.load executes arbitrary code from the
                # file; only point cache_file at files this pipeline wrote.
                with open(cache_file, 'rb') as f:
                    cache_data = pickle.load(f)
                if cache_key in cache_data:
                    print(f'💾 Using cached features: {cache_key}')
                    return cache_data[cache_key].copy()
        except Exception as e:
            print(f'⚠️ Could not load feature cache: {e}')

    print('\n🔄 CREATING ENHANCED STRATEGY FEATURES WITH REAL WEATHER...')

    if df.empty:
        print('❌ No data available for feature engineering')
        return pd.DataFrame()

    strategy_features = []
    processed_count = 0

    # The reference-table lookups depend only on the id, so each id is
    # resolved once instead of once per result row.
    driver_stats_by_id = {}
    constructor_stats_by_id = {}
    circuit_stats_by_id = {}

    for idx, race in df.iterrows():
        try:
            race_id = _first_usable_value(race, ('race_id',), f'race_{idx}')
            driver_id = _first_usable_value(race, ('driver_id',), f'driver_{idx}')
            constructor_id = _first_usable_value(race, ('constructor_id',), f'constructor_{idx}')
            circuit_id = _first_usable_value(race, ('circuit_id',), f'circuit_{idx}')

            feature_set = {
                'race_id': race_id,
                'driver_id': driver_id,
                'constructor_id': constructor_id,
                'circuit_id': circuit_id
            }

            # Extract race performance data. A missing position (None or NaN)
            # is recorded as position 20, not finished, no positions gained.
            raw_position = _first_usable_value(race, ('position_number',), None)
            has_position = raw_position is not None
            position = raw_position if has_position else 20
            points = _first_usable_value(race, ('race_points', 'points'), 0)
            grid = _first_usable_value(
                race,
                ('qualifying_position', 'race_grid_position_number', 'grid_position_number'),
                20
            )

            feature_set.update({
                'final_position': position,
                'points_scored': points,
                'qualifying_position': grid,
                'finished': 1 if has_position and position > 0 else 0,
                'position_gain': max(0, grid - position) if has_position else 0
            })

            # Extract driver statistics
            if driver_id not in driver_stats_by_id:
                driver_stats_by_id[driver_id] = extract_enhanced_driver_stats(drivers, driver_id)
            driver_data = driver_stats_by_id[driver_id]
            feature_set.update(driver_data)

            # Extract constructor statistics
            if constructor_id not in constructor_stats_by_id:
                constructor_stats_by_id[constructor_id] = extract_enhanced_constructor_stats(constructors, constructor_id)
            constructor_data = constructor_stats_by_id[constructor_id]
            feature_set.update(constructor_data)

            # Extract circuit characteristics
            if circuit_id not in circuit_stats_by_id:
                circuit_stats_by_id[circuit_id] = extract_enhanced_circuit_stats(circuits, circuit_id)
            feature_set.update(circuit_stats_by_id[circuit_id])

            # Calculate qualifying performance (0.15 per grid slot behind pole)
            feature_set['qualifying_gap_to_pole'] = max(0, (grid - 1) * 0.15)

            # Extract pit stop data
            feature_set['total_pit_stops'] = _first_usable_value(
                race, ('total_pit_stops', 'race_pit_stops'), 1
            )

            # Extract weather features
            feature_set.update(extract_enhanced_weather_features(race))

            # Create weather-driver interaction features
            interaction_features = create_weather_interaction_features(feature_set, driver_data, constructor_data)
            feature_set.update(interaction_features)

            strategy_features.append(feature_set)
            processed_count += 1

            if processed_count % 100 == 0:
                print(f'   Processed {processed_count} records...')

        except Exception as e:
            # Best effort: one bad record must not abort the whole run.
            print(f'⚠️ Error processing record {idx}: {e}')
            continue

    result_df = pd.DataFrame(strategy_features)
    result_df = fill_missing_features_with_enhanced_defaults(result_df)

    # Cache the results if caching is enabled
    if use_caching:
        try:
            if os.path.exists(cache_file):
                with open(cache_file, 'rb') as f:
                    cache_data = pickle.load(f)
            else:
                cache_data = {}

            cache_data[cache_key] = result_df.copy()

            with open(cache_file, 'wb') as f:
                pickle.dump(cache_data, f)

            print(f'💾 Features cached: {cache_key}')
        except Exception as e:
            print(f'⚠️ Could not cache features: {e}')

    print(f'✅ ENHANCED FEATURE ENGINEERING COMPLETE: {len(result_df)} records')
    print(f'📊 Total features created: {len(result_df.columns)}')

    return result_df

def extract_enhanced_driver_stats(drivers, driver_id):
    """Look up one driver and derive per-race rates and age.

    Args:
        drivers: DataFrame with an 'id' column and optional 'total_*' and
            'date_of_birth' columns.
        driver_id: Value matched against ``drivers['id']``.

    Returns:
        dict with driver_experience, driver_win_rate, driver_podium_rate,
        driver_points_per_race, driver_age, driver_fastest_laps and
        driver_pole_positions. ``get_default_driver_stats()`` is returned when
        the table is empty or the id is not found.
    """
    if drivers.empty:
        return get_default_driver_stats()

    driver_match = drivers[drivers['id'] == driver_id]
    if driver_match.empty:
        return get_default_driver_stats()

    driver = driver_match.iloc[0]

    # Extract driver statistics
    driver_races = driver.get('total_race_entries', 1)
    driver_wins = driver.get('total_race_wins', 0)
    driver_podiums = driver.get('total_podiums', 0)
    driver_points = driver.get('total_points', 0)

    # Calculate driver age in completed years as of today; 28 is the fallback
    # when the birth date is missing or cannot be parsed.
    driver_age = 28
    if 'date_of_birth' in driver and pd.notna(driver['date_of_birth']):
        try:
            birth_date = pd.to_datetime(driver['date_of_birth'])
            today = pd.to_datetime('today')
            had_birthday = (today.month, today.day) >= (birth_date.month, birth_date.day)
            driver_age = today.year - birth_date.year - (0 if had_birthday else 1)
        except (ValueError, TypeError, OverflowError):
            # Unparseable or out-of-range date: keep the fallback age.
            driver_age = 28

    # Guard against division by zero for drivers without recorded entries.
    races_for_rates = max(driver_races, 1)

    return {
        'driver_experience': driver_races,
        'driver_win_rate': driver_wins / races_for_rates,
        'driver_podium_rate': driver_podiums / races_for_rates,
        'driver_points_per_race': driver_points / races_for_rates,
        'driver_age': driver_age,
        'driver_fastest_laps': driver.get('total_fastest_laps', 0),
        'driver_pole_positions': driver.get('total_pole_positions', 0)
    }

def extract_enhanced_constructor_stats(constructors, constructor_id):
    """Look up one constructor and derive its per-race rates and reliability.

    Args:
        constructors: DataFrame with an 'id' column and optional 'total_*'
            columns.
        constructor_id: Value matched against ``constructors['id']``.

    Returns:
        dict with constructor_experience, constructor_win_rate,
        constructor_podium_rate, constructor_reliability,
        constructor_points_per_race and constructor_fastest_laps.
        ``get_default_constructor_stats()`` is returned when the table is
        empty or the id is not found.
    """
    if constructors.empty:
        return get_default_constructor_stats()

    matches = constructors[constructors['id'] == constructor_id]
    if matches.empty:
        return get_default_constructor_stats()

    team = matches.iloc[0]

    entries = team.get('total_race_entries', 1)
    wins = team.get('total_race_wins', 0)
    podiums = team.get('total_podiums', 0)

    # Reliability is the share of entries that turned into race starts; 0.85
    # is used when the counts are unavailable or there are no entries.
    has_start_counts = 'total_race_starts' in team and 'total_race_entries' in team
    if has_start_counts and team['total_race_entries'] > 0:
        start_ratio = team['total_race_starts'] / team['total_race_entries']
    else:
        start_ratio = 0.85

    # Avoid dividing by zero for constructors without recorded entries.
    races_for_rates = max(entries, 1)

    summary = {}
    summary['constructor_experience'] = entries
    summary['constructor_win_rate'] = wins / races_for_rates
    summary['constructor_podium_rate'] = podiums / races_for_rates
    summary['constructor_reliability'] = start_ratio
    summary['constructor_points_per_race'] = team.get('total_points', 0) / races_for_rates
    summary['constructor_fastest_laps'] = team.get('total_fastest_laps', 0)
    return summary

def extract_enhanced_circuit_stats(circuits, circuit_id):
    """Look up one circuit and return its characteristics as features.

    Args:
        circuits: DataFrame with an 'id' column and optional 'length',
            'turns', 'altitude', 'total_races_held', 'type' and 'direction'
            columns.
        circuit_id: Value matched against ``circuits['id']``.

    Returns:
        dict with circuit_length, circuit_corners, circuit_altitude,
        circuit_races_held, circuit_type and circuit_direction. Columns that
        are absent use a fixed fallback. ``get_default_circuit_stats()`` is
        returned when the table is empty or the id is not found.
    """
    if circuits.empty:
        return get_default_circuit_stats()

    matches = circuits[circuits['id'] == circuit_id]
    if matches.empty:
        return get_default_circuit_stats()

    venue = matches.iloc[0]

    # (feature name, source column, fallback when the column is absent)
    field_map = (
        ('circuit_length', 'length', 5.0),
        ('circuit_corners', 'turns', 12),
        ('circuit_altitude', 'altitude', 100),
        ('circuit_races_held', 'total_races_held', 1),
        ('circuit_type', 'type', 'permanent'),
        ('circuit_direction', 'direction', 'clockwise'),
    )
    return {feature: venue.get(column, fallback) for feature, column, fallback in field_map}

def extract_enhanced_weather_features(race):
    """Copy the known weather metrics that are present on a race record.

    Args:
        race: Mapping-like record (for example a DataFrame row) supporting
            ``in`` and item access.

    Returns:
        dict containing only those of the fixed weather metric names that
        exist in ``race``, in the fixed metric order.
    """
    known_metrics = (
        'temperature_avg', 'temperature_max', 'temperature_min', 'temperature_range',
        'precipitation_total', 'rain_probability', 'heavy_rain_probability',
        'wind_speed_avg', 'wind_speed_max', 'pressure_avg', 'humidity_avg',
        'estimated_track_temp', 'tire_degradation_factor', 'weather_condition',
    )
    return {name: race[name] for name in known_metrics if name in race}

def create_weather_interaction_features(feature_set, driver_data, constructor_data):
    """Derive features that combine weather with driver, team and circuit data.

    Args:
        feature_set: Mapping providing 'rain_probability', 'temperature_avg',
            'circuit_corners', 'wind_speed_avg' and 'estimated_track_temp'
            (each optional, with a fixed fallback).
        driver_data: Mapping providing 'driver_experience' and
            'driver_win_rate' (optional).
        constructor_data: Mapping providing 'constructor_reliability'
            (optional).

    Returns:
        dict with driver_wet_experience, team_extreme_temp_performance,
        wind_circuit_impact and hot_track_winning_chance.
    """
    # Experience weighted by the chance of rain.
    experience = driver_data.get('driver_experience', 50)
    rain_chance = feature_set.get('rain_probability', 0)
    wet_experience = experience * rain_chance

    # Reliability discounted by how far the air temperature is from 20.
    reliability = constructor_data.get('constructor_reliability', 0.85)
    air_temp = feature_set.get('temperature_avg', 20)
    distance_from_mild = abs(air_temp - 20) / 20
    extreme_temp_performance = reliability * (1 - distance_from_mild)

    # Corner count scaled by the average wind speed.
    corners = feature_set.get('circuit_corners', 12)
    wind = feature_set.get('wind_speed_avg', 3)
    wind_impact = corners * wind / 100

    # Win rate scaled by track temperature relative to 30.
    surface_temp = feature_set.get('estimated_track_temp', 35)
    win_rate = driver_data.get('driver_win_rate', 0.05)
    hot_track_chance = win_rate * (surface_temp / 30)

    return {
        'driver_wet_experience': wet_experience,
        'team_extreme_temp_performance': extreme_temp_performance,
        'wind_circuit_impact': wind_impact,
        'hot_track_winning_chance': hot_track_chance,
    }

def get_default_driver_stats():
    """Return the fallback driver statistics used when no driver record is found."""
    fallback = dict(
        driver_experience=50,
        driver_win_rate=0.05,
        driver_podium_rate=0.15,
        driver_points_per_race=5.0,
        driver_age=28,
        driver_fastest_laps=2,
        driver_pole_positions=1,
    )
    return fallback

def get_default_constructor_stats():
    """Return the fallback constructor statistics used when no constructor record is found."""
    fallback = dict(
        constructor_experience=300,
        constructor_win_rate=0.08,
        constructor_podium_rate=0.25,
        constructor_reliability=0.85,
        constructor_points_per_race=8.0,
        constructor_fastest_laps=15,
    )
    return fallback

def get_default_circuit_stats():
    """Return the fallback circuit characteristics used when no circuit record is found."""
    fallback = dict(
        circuit_length=5.0,
        circuit_corners=12,
        circuit_altitude=100,
        circuit_races_held=10,
        circuit_type='permanent',
        circuit_direction='clockwise',
    )
    return fallback

def fill_missing_features_with_enhanced_defaults(df):
    """Replace nulls in known feature columns with fixed fallback values.

    Only columns that exist in ``df`` and contain at least one null are
    touched. ``df`` is modified in place and also returned.

    Args:
        df: Feature DataFrame.

    Returns:
        The same DataFrame object, with nulls in known columns filled.
    """
    driver_fallbacks = {
        'driver_experience': 50,
        'driver_win_rate': 0.05,
        'driver_podium_rate': 0.15,
        'driver_points_per_race': 5.0,
        'driver_age': 28,
        'driver_fastest_laps': 2,
        'driver_pole_positions': 1,
    }
    constructor_fallbacks = {
        'constructor_experience': 300,
        'constructor_win_rate': 0.08,
        'constructor_podium_rate': 0.25,
        'constructor_reliability': 0.85,
        'constructor_points_per_race': 8.0,
        'constructor_fastest_laps': 15,
    }
    circuit_fallbacks = {
        'circuit_length': 5.0,
        'circuit_corners': 12,
        'circuit_altitude': 100,
        'circuit_races_held': 10,
    }
    race_fallbacks = {
        'qualifying_position': 20,
        'qualifying_gap_to_pole': 2.0,
        'total_pit_stops': 1,
        'final_position': 20,
        'points_scored': 0,
        'finished': 1,
        'position_gain': 0,
    }
    weather_fallbacks = {
        'temperature_avg': 20.0,
        'temperature_max': 25.0,
        'temperature_min': 15.0,
        'temperature_range': 10.0,
        'precipitation_total': 0.0,
        'rain_probability': 0.0,
        'heavy_rain_probability': 0.0,
        'wind_speed_avg': 3.0,
        'wind_speed_max': 6.0,
        'pressure_avg': 1013.0,
        'humidity_avg': 60.0,
        'estimated_track_temp': 35.0,
        'tire_degradation_factor': 1.0,
        'weather_condition': 'dry',
    }
    interaction_fallbacks = {
        'driver_wet_experience': 0.0,
        'team_extreme_temp_performance': 0.85,
        'wind_circuit_impact': 0.36,
        'hot_track_winning_chance': 0.05,
    }
    fallback_values = {
        **driver_fallbacks,
        **constructor_fallbacks,
        **circuit_fallbacks,
        **race_fallbacks,
        **weather_fallbacks,
        **interaction_fallbacks,
    }

    for column, fallback in fallback_values.items():
        if column not in df.columns:
            continue
        if not df[column].isnull().any():
            continue
        df[column] = df[column].fillna(fallback)

    return df

def clear_feature_cache(cache_file = 'features_cache.pkl'):
    """Delete the feature-engineering cache file if it exists.

    Any error raised while checking for or removing the file is reported on
    stdout instead of being propagated.

    Args:
        cache_file: Path of the cache file to delete.
    """
    try:
        if not os.path.exists(cache_file):
            print(f'ℹ️ No feature cache found: {cache_file}')
            return
        os.remove(cache_file)
        print(f'🧹 Feature cache cleared: {cache_file}')
    except Exception as e:
        print(f'⚠️ Error clearing feature cache: {e}')

# 11: Strategy Optimization Engine

In [22]:
def optimize_complete_race_strategy(predictor, driver_profile, constructor_profile, circuit_profile, qualifying_pos, weather_conditions):
    """Predict a finishing position and turn it into a strategy recommendation.

    The model input is assembled with the same feature names and formulas that
    ``create_enhanced_strategy_features_with_weather`` uses for training, so
    the predictor sees consistent values at inference time.

    Args:
        predictor: Object exposing ``feature_columns`` (iterable of feature
            names) and ``predict_enhanced_position(DataFrame)``, whose result
            is indexed with ``[0]``.
        driver_profile: Mapping with optional 'experience', 'win_rate',
            'podium_rate', 'points_per_race' and 'age'.
        constructor_profile: Mapping with optional 'experience', 'win_rate'
            and 'reliability'.
        circuit_profile: Mapping with optional 'length', 'corners' and
            'altitude'.
        qualifying_pos: Grid position (1 = pole).
        weather_conditions: Mapping of weather feature name to value; merged
            into the model input as-is.

    Returns:
        Whatever ``generate_comprehensive_strategy_recommendation`` returns.
    """
    input_features = {
        'driver_experience': driver_profile.get('experience', 50),
        'driver_win_rate': driver_profile.get('win_rate', 0.05),
        'driver_podium_rate': driver_profile.get('podium_rate', 0.15),
        'driver_points_per_race': driver_profile.get('points_per_race', 5.0),
        'driver_age': driver_profile.get('age', 28),
        'constructor_experience': constructor_profile.get('experience', 300),
        'constructor_win_rate': constructor_profile.get('win_rate', 0.08),
        'constructor_reliability': constructor_profile.get('reliability', 0.85),
        'circuit_length': circuit_profile.get('length', 5.0),
        'circuit_corners': circuit_profile.get('corners', 12),
        'circuit_altitude': circuit_profile.get('altitude', 100),
        'qualifying_position': qualifying_pos,
        # 0.15 per grid slot, matching the value used when the training
        # features are engineered.
        'qualifying_gap_to_pole': max(0, (qualifying_pos - 1) * 0.15)
    }

    # Add all weather features
    input_features.update(weather_conditions)

    # Create interaction features with the training-time helper so inference
    # covers the same interaction columns with the same formulas. The combined
    # mapping holds the driver, constructor and weather keys the helper reads.
    input_features.update(
        create_weather_interaction_features(input_features, input_features, input_features)
    )

    # Use only features the model was trained on, in the model's column order.
    model_columns = [column for column in predictor.feature_columns if column in input_features]
    input_df_available = pd.DataFrame(
        [{column: input_features[column] for column in model_columns}],
        columns = model_columns
    )

    predicted_position = predictor.predict_enhanced_position(input_df_available)[0]

    # Generate comprehensive strategy recommendation
    recommendation = generate_comprehensive_strategy_recommendation(
        predicted_position, qualifying_pos, weather_conditions,
        driver_profile, constructor_profile, circuit_profile
    )

    return recommendation

def generate_comprehensive_strategy_recommendation(predicted_position, qualifying_pos, weather_conditions, driver_profile, constructor_profile, circuit_profile):
    """
    GENERATE COMPREHENSIVE STRATEGY RECOMMENDATION

    Combines the predicted finishing position with weather, driver and
    circuit information into a single recommendation dictionary.

    Args:
        predicted_position: Numeric predicted finishing position.
        qualifying_pos: Numeric grid position.
        weather_conditions: Dict read for 'rain_probability' and
            'temperature_avg'; also forwarded to the helper functions.
        driver_profile: Dict read for 'experience'.
        constructor_profile: Accepted for interface symmetry; not read here.
        circuit_profile: Dict forwarded to the pit/tire/overtaking helpers.

    Returns:
        Dict with the keys 'predicted_finish', 'qualifying_gain',
        'confidence', 'confidence_score', 'risk_level', 'risk_factors',
        'recommended_pit_stops', 'pit_stop_laps', 'tire_strategy',
        'overtaking_opportunities', 'weather_impact' and 'key_considerations'.
    """
    rain_prob = weather_conditions.get('rain_probability', 0)
    driver_exp = driver_profile.get('experience', 50)

    # Only places won count as a gain; a predicted loss is reported as 0
    position_gain = max(0, qualifying_pos - predicted_position)
    position_diff = abs(predicted_position - qualifying_pos)

    # Each factor is a value in (0, 1]; the overall confidence is their mean
    grid_factor = 1.0 if position_diff <= 2 else 0.7 if position_diff <= 5 else 0.4
    weather_factor = 1.0 if rain_prob < 0.1 else 0.8 if rain_prob < 0.3 else 0.5
    experience_factor = 1.0 if driver_exp > 100 else 0.8 if driver_exp > 50 else 0.6

    confidence_score = np.mean([grid_factor, weather_factor, experience_factor])
    confidence = 'High' if confidence_score > 0.8 else 'Medium' if confidence_score > 0.6 else 'Low'

    # Risk assessment: every triggered check contributes one message
    risk_checks = (
        (position_gain > 5, 'Aggressive position gain predicted'),
        (rain_prob > 0.5, 'High probability of rain'),
        (weather_conditions.get('temperature_avg', 20) > 35, 'Extreme temperatures expected'),
    )
    risk_factors = [message for triggered, message in risk_checks if triggered]
    risk_level = {0: 'Low', 1: 'Medium'}.get(len(risk_factors), 'High')

    # Delegate the detailed sub-strategies to the dedicated helpers
    pit_plan = calculate_pit_stop_strategy(weather_conditions, circuit_profile)
    tire_plan = calculate_tire_strategy(weather_conditions, circuit_profile)
    overtaking_plan = calculate_overtaking_opportunities(circuit_profile, predicted_position, qualifying_pos)

    return {
        'predicted_finish': round(predicted_position),
        'qualifying_gain': position_gain,
        'confidence': confidence,
        'confidence_score': round(confidence_score, 2),
        'risk_level': risk_level,
        'risk_factors': risk_factors,
        'recommended_pit_stops': pit_plan['stops'],
        'pit_stop_laps': pit_plan['laps'],
        'tire_strategy': tire_plan,
        'overtaking_opportunities': overtaking_plan,
        'weather_impact': calculate_weather_impact_analysis(weather_conditions),
        'key_considerations': generate_key_considerations(weather_conditions, circuit_profile, driver_profile)
    }

def calculate_pit_stop_strategy(weather_conditions, circuit_profile):
    """
    CALCULATE OPTIMAL PIT STOP STRATEGY

    Starts from a single stop and adds or removes half-stop increments for
    circuit layout, rain probability and track temperature, then maps the
    resulting score onto a one-, two- or three-stop plan.

    Args:
        weather_conditions: Dict read for 'rain_probability' (default 0)
            and 'estimated_track_temp' (default 35).
        circuit_profile: Dict read for 'corners' (default 12) and
            'length' (default 5.0).

    Returns:
        Dict with 'stops' (label string) and 'laps' (list of lap numbers).
    """
    corners = circuit_profile.get('corners', 12)
    length = circuit_profile.get('length', 5.0)
    rain = weather_conditions.get('rain_probability', 0)
    track_temp = weather_conditions.get('estimated_track_temp', 35)

    # Circuits with more than 15 corners use the earlier pit windows below
    many_corners = corners > 15

    adjustments = [
        0.5 if many_corners else 0,                                  # High downforce circuits
        0.5 if length > 6 else 0,                                    # Long circuits
        1 if rain > 0.7 else 0.5 if rain > 0.3 else 0,               # Rain tires / possible extra stop
        0.5 if track_temp > 45 else -0.5 if track_temp < 25 else 0,  # Higher / lower degradation
    ]
    stop_score = 1 + sum(adjustments)

    if stop_score >= 2.5:
        label, early_windows, late_windows = 'Three-stop', [15, 30, 45], [20, 40, 55]
    elif stop_score >= 1.5:
        label, early_windows, late_windows = 'Two-stop', [20, 40], [25, 50]
    else:
        label, early_windows, late_windows = 'One-stop', [30], [35]

    return {'stops': label, 'laps': early_windows if many_corners else late_windows}

def calculate_tire_strategy(weather_conditions, circuit_profile):
    """
    CALCULATE OPTIMAL TIRE STRATEGY

    Picks one textual tire recommendation. Rain probability takes priority,
    then hot or cold track temperature, and finally the corner count
    decides between the two dry-weather defaults.

    Args:
        weather_conditions: Dict read for 'rain_probability' (default 0)
            and 'estimated_track_temp' (default 35).
        circuit_profile: Dict read for 'corners' (default 12).

    Returns:
        A single recommendation string.
    """
    rain = weather_conditions.get('rain_probability', 0)
    temp = weather_conditions.get('estimated_track_temp', 35)
    many_corners = circuit_profile.get('corners', 12) > 15

    # Wet-weather cases override everything else
    if rain > 0.7:
        return 'Start on wet tires, switch to intermediates when track dries, then slicks'
    if rain > 0.3:
        return 'Flexible strategy: prepare for wet conditions with intermediate option available'

    # Hot track: degradation-driven choices
    if temp > 45:
        if many_corners:
            return 'Hard-medium one-stop to manage degradation'
        return 'Medium-hard one-stop with focus on tire preservation'

    # Cold track: warm-up driven choice
    if temp < 25:
        return 'Soft-medium aggressive strategy to manage tire warm-up'

    # Normal dry conditions
    if many_corners:
        return 'Soft-medium-hard two-stop for optimal performance'
    return 'Medium-hard one-stop balanced strategy'

def calculate_overtaking_opportunities(circuit_profile, predicted_position, qualifying_pos):
    """
    CALCULATE OVERTAKING OPPORTUNITIES

    Builds a list of overtaking hints from the expected position gain and
    the circuit's corner count and length.

    Args:
        circuit_profile: Dict read for 'corners' (default 12) and
            'length' (default 5.0).
        predicted_position: Numeric predicted finishing position.
        qualifying_pos: Numeric grid position.

    Returns:
        List of advice strings (always contains at least one entry).
    """
    corners = circuit_profile.get('corners', 12)
    length = circuit_profile.get('length', 5.0)
    expected_gain = qualifying_pos - predicted_position

    advice = []

    if expected_gain > 3:
        advice.append('Aggressive early overtaking recommended')

    # Exactly one layout-specific hint is always added
    advice.append(
        'Focus on corner exit overtaking in sector 3'
        if corners > 15
        else 'DRS zones and long straights provide best opportunities'
    )

    if length > 6:
        advice.append('Strategic tire advantage can be used for late-race overtakes')

    # Defensive fallback; the layout hint above means the list is never empty
    return advice or ['Conventional overtaking approach recommended']

def calculate_weather_impact_analysis(weather_conditions):
    """
    ANALYZE WEATHER IMPACT ON RACE STRATEGY

    Produces at most one note each for rain, temperature and wind, in that
    order. When none of the thresholds is crossed, a single
    "stable conditions" note is returned instead.

    Args:
        weather_conditions: Dict read for 'rain_probability' (default 0),
            'temperature_avg' (default 20) and 'wind_speed_avg' (default 3).

    Returns:
        Non-empty list of impact description strings.
    """
    rain = weather_conditions.get('rain_probability', 0)
    temp = weather_conditions.get('temperature_avg', 20)
    wind = weather_conditions.get('wind_speed_avg', 3)

    rain_note = (
        'Heavy rain expected - major strategy implications with multiple tire changes' if rain > 0.7
        else 'Chance of rain - flexible strategy required with intermediate tire option' if rain > 0.3
        else None
    )
    temp_note = (
        'High temperatures - increased tire degradation and cooling concerns' if temp > 35
        else 'Low temperatures - tire warm-up challenges and potential graining' if temp < 10
        else None
    )
    wind_note = (
        'Strong winds - aerodynamic instability and potential safety car period' if wind > 8
        else None
    )

    notes = [note for note in (rain_note, temp_note, wind_note) if note is not None]
    return notes or ['Stable conditions - conventional strategy optimization']

def generate_key_considerations(weather_conditions, circuit_profile, driver_profile):
    """
    GENERATE KEY STRATEGY CONSIDERATIONS

    Collects advisory notes for three independent situations: a driver with
    experience below 50 when rain probability exceeds 0.5, a circuit with
    more than 18 corners, and an estimated track temperature above 40.

    Args:
        weather_conditions: Dict read for 'rain_probability' (default 0)
            and 'estimated_track_temp' (default 35).
        circuit_profile: Dict read for 'corners' (default 12).
        driver_profile: Dict read for 'experience' (default 50).

    Returns:
        List of consideration strings; empty when no rule applies.
    """
    rain = weather_conditions.get('rain_probability', 0)
    experience = driver_profile.get('experience', 50)
    corners = circuit_profile.get('corners', 12)
    track_temp = weather_conditions.get('estimated_track_temp', 35)

    rules = (
        (rain > 0.5 and experience < 50,
         'Inexperienced driver in wet conditions - conservative approach recommended'),
        (corners > 18,
         'High-downforce circuit - focus on aerodynamic efficiency and tire management'),
        (track_temp > 40,
         'High track temperatures - monitor tire wear closely and consider extra stop'),
    )

    return [message for applies, message in rules if applies]

# 12: ML Pipeline + Prediction 

In [23]:
class EnhancedF1StrategyPredictor:
    """
    ENHANCED F1 STRATEGY PREDICTOR WITH COMPLETE ML PIPELINE INTEGRATION AND CACHING

    Bundles preprocessing, hyperparameter tuning, model comparison,
    evaluation and prediction behind one object. Predictions are memoised
    in an in-memory dict keyed by a hash of the complete input frame.
    """
    
    def __init__(self, cache_enabled = True):
        """Create an untrained predictor; `cache_enabled` toggles prediction caching."""
        self.models = {}
        self.preprocessor = ComprehensiveDataPreprocessor()
        self.tuner = ComprehensiveHyperparameterTuner()
        self.evaluator = ComprehensiveEvaluator()
        self.learning_curves = AdvancedLearningCurves()
        self.feature_columns = []
        self.is_trained = False
        self.best_model = None
        self.prediction_cache = {}
        self.cache_enabled = cache_enabled
        self.cache_hits = 0
        self.cache_misses = 0
    
    def prepare_enhanced_features(self, X, y = None, is_training = True):
        """
        PREPARE ENHANCED FEATURES WITH COMPREHENSIVE PREPROCESSING

        Runs the preprocessor's regression pipeline on (X, y). When called
        for training with a target, the resulting column names are stored in
        `self.feature_columns`.

        Returns:
            Tuple (X_processed, y_processed) as produced by the preprocessor.
        """
        X_processed, y_processed = self.preprocessor.comprehensive_preprocessing_pipeline(
            X, y, problem_type = 'regression'
        )
        
        if is_training and y_processed is not None:
            self.feature_columns = X_processed.columns.tolist()
            
        return X_processed, y_processed
    
    def train_enhanced_models(self, X, y_position, y_pit_stops = None, use_caching = True):
        """
        TRAIN ENHANCED MODELS WITH COMPLETE ML PIPELINE

        Preprocesses the data, tunes and compares five regressors, evaluates
        the best one on a 20% hold-out split and finally refits it on all
        data. Errors are printed (with traceback) instead of being raised;
        check `self.is_trained` afterwards to see whether training succeeded.

        Args:
            X: Feature DataFrame; training is skipped when it has fewer than 10 rows.
            y_position: Target finishing positions.
            y_pit_stops: Accepted for interface compatibility; not used.
            use_caching: Sets `self.cache_enabled`; when False the
                prediction cache is cleared up front.
        """
        print(f'\n🎯 TRAINING ENHANCED MODELS WITH COMPLETE ML PIPELINE...')
        
        if X.empty or len(X) < 10:
            print('❌ Insufficient data for enhanced training')
            return
        
        # Cache configuration
        
        self.cache_enabled = use_caching
        if not use_caching:
            self.clear_prediction_cache()
            print('   🚫 Prediction caching disabled for training')
        
        try:
            # STEP 1: Comprehensive Preprocessing
            
            X_processed, y_processed = self.prepare_enhanced_features(X, y_position, is_training = True)
            
            # Store feature columns for prediction alignment
            
            self.feature_columns = X_processed.columns.tolist()
            print(f'   Feature columns stored: {len(self.feature_columns)} features')
            
            # STEP 2: Train-Test Split
            
            X_train, X_test, y_train, y_test = train_test_split(
                X_processed, y_processed, test_size = 0.2, random_state = 42, shuffle = True
            )
            
            print(f'   Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features')
            print(f'   Test set: {X_test.shape[0]} samples')
            
            # STEP 3: Define Models and Parameter Grids
            
            models = {
                'Random Forest': RandomForestRegressor(random_state = 42),
                'Gradient Boosting': GradientBoostingRegressor(random_state = 42),
                'SVR': SVR(),
                'KNN': KNeighborsRegressor(),
                'Decision Tree': DecisionTreeRegressor(random_state = 42)
            }
            
            param_grids = {
                'Random Forest': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
                'Gradient Boosting': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7]
                },
                'SVR': {
                    'C': [0.1, 1, 10],
                    'kernel': ['linear', 'rbf'],
                    'gamma': ['scale', 'auto']
                },
                'KNN': {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'metric': ['euclidean', 'manhattan']
                },
                'Decision Tree': {
                    'max_depth': [5, 10, 15, 20, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                }
            }
            
            # STEP 4: Comprehensive Hyperparameter Tuning
            
            print('  🔧 Performing comprehensive hyperparameter tuning...')
            tuned_models = {}
            
            for name, model in models.items():
                tuned_models[name] = self.tuner.comprehensive_tuning(
                    model, param_grids[name], X_train, y_train, 
                    problem_type = 'regression', tuning_method = 'randomized', n_iter = 20
                )
            
            # STEP 5: Model Comparison
            
            print('   📊 Comparing model performance...')
            model_comparison = self.tuner.compare_models_comprehensive(
                tuned_models, X_train, y_train, problem_type = 'regression', cv_folds = 5
            )
            
            # Select best model (lowest 'mean_score')
            # NOTE(review): assumes the tuner reports an error metric where lower is better - confirm
            
            best_model_name = min(model_comparison, key = lambda x: model_comparison[x]['mean_score'])
            self.best_model = tuned_models[best_model_name]
            best_score = model_comparison[best_model_name]['mean_score']
            
            print(f' 🏆 BEST MODEL: {best_model_name} (MAE: {best_score:.4f})')
            
            # STEP 6: Advanced Learning Curves
            
            print('  📈 Generating advanced learning curves...')
            self.learning_curves.plot_advanced_learning_curves(
                self.best_model, X_train, y_train, model_name = best_model_name, cv_folds = 5
            )
            
            # STEP 7: Comprehensive Evaluation
            
            print(' 📋 Performing comprehensive model evaluation...')
            self.evaluator.comprehensive_regression_evaluation(
                self.best_model, X_train, X_test, y_train, y_test, model_name = best_model_name
            )
            
            # STEP 8: Train final model on full data
            
            self.best_model.fit(X_processed, y_processed)
            self.models['position'] = self.best_model
            
            # Predictions memoised from a previously trained model are stale now
            
            if self.prediction_cache:
                self.clear_prediction_cache()
            
            self.is_trained = True
            
            # STEP 9: Feature Importance Analysis
            
            self.analyze_feature_importance(self.best_model, X_processed.columns)
            
            print('✅ Enhanced model training pipeline completed successfully')
            
        except Exception as e:
            print(f'❌ Error in enhanced training pipeline: {e}')
            import traceback
            traceback.print_exc()
    
    def analyze_feature_importance(self, model, feature_names):
        """
        ANALYZE FEATURE IMPORTANCE FOR MODEL INTERPRETABILITY

        For models exposing `feature_importances_`, prints the ten most
        important features and saves a bar chart of the top fifteen to
        'feature_importance.png'. Other models are skipped silently; any
        error is reported as a warning only.
        """
        try:
            if hasattr(model, 'feature_importances_'):
                importances = model.feature_importances_
                feature_importance_df = pd.DataFrame({
                    'feature': feature_names,
                    'importance': importances
                }).sort_values('importance', ascending = False)
                
                print('\n📊 TOP 10 FEATURE IMPORTANCES:')
                for _, row in feature_importance_df.head(10).iterrows():
                    print(f"   {row['feature']}: {row['importance']:.4f}")
                
                # Plot feature importance
                
                plt.figure(figsize = (12, 8))
                top_features = feature_importance_df.head(15)
                plt.barh(range(len(top_features)), top_features['importance'], color = 'skyblue')
                plt.yticks(range(len(top_features)), top_features['feature'])
                plt.xlabel('Feature Importance')
                plt.title('Top 15 Feature Importances', fontweight = 'bold')
                plt.gca().invert_yaxis()
                plt.tight_layout()
                plt.savefig('feature_importance.png', dpi = 300, bbox_inches = 'tight')
                plt.close()
                
        except Exception as e:
            print(f'⚠️ Could not analyze feature importance: {e}')
    
    @staticmethod
    def _default_feature_value(col):
        """Return the placeholder used for a training feature missing at prediction time."""
        if 'driver' in col:
            return 50.0 if 'experience' in col else 0.05
        if 'constructor' in col:
            return 300.0 if 'experience' in col else 0.08
        if 'circuit' in col:
            return 5.0 if 'length' in col else 12.0
        if 'temperature' in col:
            return 20.0
        
        # Rain-related and all remaining features default to zero
        
        return 0.0
    
    def ensure_feature_alignment(self, X_processed):
        """
        ENSURE FEATURE ALIGNMENT BETWEEN TRAINING AND PREDICTION

        Returns a frame that has exactly the columns stored in
        `self.feature_columns`, in that order. Columns missing from
        `X_processed` are filled with name-based placeholder values; extra
        columns are dropped. When no feature columns are stored, the input
        is returned unchanged.
        """
        if not hasattr(self, 'feature_columns') or not self.feature_columns:
            return X_processed
        
        # Create a new DataFrame with the correct feature structure
        
        aligned_data = pd.DataFrame(index = X_processed.index)
        
        for col in self.feature_columns:
            if col in X_processed.columns:
                aligned_data[col] = X_processed[col]
            else:
                # Fill missing features with appropriate defaults
                
                aligned_data[col] = self._default_feature_value(col)
        
        return aligned_data
    
    def create_prediction_cache_key(self, X):
        """
        CREATE CACHE KEY FROM INPUT DATA FOR PREDICTION CACHING

        Hashes every column name together with all of its values, so two
        inputs share a key only when they are identical in all features.
        Hashing just a subset of columns would hand one scenario's cached
        prediction to a different scenario. The key is independent of column
        order and stable across processes (no use of the salted built-in
        `hash`), which matters because the cache is pickled by `save_model`.

        Returns:
            'empty_input' for an empty frame, otherwise an MD5 hex digest.
        """
        if X.empty:
            return 'empty_input'
        
        # MD5 is used purely as a fast fingerprint, not for security
        
        digest = hashlib.md5()
        column_order = sorted(range(X.shape[1]), key = lambda i: str(X.columns[i]))
        for i in column_order:
            values = X.iloc[:, i].tolist()
            digest.update(f'{X.columns[i]}={values!r};'.encode('utf-8'))
        
        return digest.hexdigest()
    
    def predict_enhanced_position(self, X):
        """
        PREDICT ENHANCED POSITION WITH COMPLETE PIPELINE AND CACHING

        Falls back to `enhanced_fallback_prediction` when the model is not
        trained, the input is empty, or any step raises. Model output is
        clipped to the range 1..30. Cached results are stored and returned as
        copies so callers cannot modify cache entries in place.

        Returns:
            numpy array with one predicted position per input row.
        """
        if not self.is_trained or X.empty:
            return self.enhanced_fallback_prediction(X)
        
        try:
            # Create cache key (only needed while caching is active)
            
            use_cache = self.cache_enabled
            cache_key = self.create_prediction_cache_key(X) if use_cache else None
            
            if use_cache and cache_key in self.prediction_cache:
                self.cache_hits += 1
                if self.cache_hits % 20 == 0:                          # Log every 20 cache hits
                    print(f'  💾 Prediction cache: {self.cache_hits} hits, {self.cache_misses} misses')
                return np.array(self.prediction_cache[cache_key], copy = True)
            
            X_processed, _ = self.prepare_enhanced_features(X, is_training = False)
            
            # Ensure feature alignment with training data
            
            X_aligned = self.ensure_feature_alignment(X_processed)
            
            predictions = self.best_model.predict(X_aligned)
            predictions = np.clip(predictions, 1, 30)
            
            # Cache the predictions
            
            if use_cache:
                self.prediction_cache[cache_key] = np.array(predictions, copy = True)
                self.cache_misses += 1
            
            return predictions
            
        except Exception as e:
            print(f'⚠️ Enhanced prediction error: {e}')
            return self.enhanced_fallback_prediction(X)
    
    def enhanced_fallback_prediction(self, X):
        """
        FALLBACK PREDICTION WITH WEATHER CONSIDERATION

        Heuristic used when no trained model is available: starts from the
        qualifying position and adds `rain_probability * 2` plus
        `|temperature_avg - 20| / 10 * 1.5` when those columns exist, then
        clips to 1..30. Without a 'qualifying_position' column (or on error)
        every row is predicted as 10.

        Returns:
            numpy array, regardless of the index of `X`, so positional
            indexing such as `result[0]` always works.
        """
        if X.empty:
            return np.array([10])                                           # Default prediction
        
        try:
            if 'qualifying_position' not in X.columns:
                return np.full(len(X), 10)
            
            base_prediction = X['qualifying_position'].to_numpy(dtype = float)
            
            # Weather-based adjustments (plain arrays keep the result index-free)
            
            weather_adjustment = np.zeros(len(X))
            
            if 'rain_probability' in X.columns:
                rain = X['rain_probability'].fillna(0).to_numpy(dtype = float)
                weather_adjustment += rain * 2
            
            if 'temperature_avg' in X.columns:
                temperature = X['temperature_avg'].fillna(20).to_numpy(dtype = float)
                weather_adjustment += np.abs(temperature - 20) / 10 * 1.5
            
            return np.clip(base_prediction + weather_adjustment, 1, 30)
        except Exception as e:
            print(f'⚠️ Fallback prediction error: {e}')
            return np.full(len(X), 10)
    
    def clear_prediction_cache(self):
        """CLEAR PREDICTION CACHE (entries and hit/miss counters)."""
        self.prediction_cache = {}
        self.cache_hits = 0
        self.cache_misses = 0
        print('🧹 Prediction cache cleared')
    
    def get_prediction_cache_stats(self):
        """
        GET PREDICTION CACHE STATISTICS

        Returns:
            Dict with 'hits', 'misses', 'total', 'hit_rate' (percentage,
            0 when nothing was predicted yet) and 'cached_predictions'.
        """
        total_predictions = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total_predictions * 100) if total_predictions > 0 else 0
        return {
            'hits': self.cache_hits,
            'misses': self.cache_misses,
            'total': total_predictions,
            'hit_rate': hit_rate,
            'cached_predictions': len(self.prediction_cache)
        }
    
    def enable_caching(self):
        """ENABLE PREDICTION CACHING"""
        self.cache_enabled = True
        print('✅ Prediction caching enabled')
    
    def disable_caching(self):
        """DISABLE PREDICTION CACHING"""
        self.cache_enabled = False
        print('🚫 Prediction caching disabled')
    
    def save_model(self, filepath = 'f1_strategy_predictor.pkl'):
        """
        SAVE TRAINED MODEL AND CACHE

        Pickles the best model, feature columns, training flag, prediction
        cache and hit/miss counters to `filepath`. Errors are printed, not
        raised.
        """
        try:
            model_data = {
                'best_model': self.best_model,
                'feature_columns': self.feature_columns,
                'is_trained': self.is_trained,
                'prediction_cache': self.prediction_cache,
                'cache_stats': {
                    'hits': self.cache_hits,
                    'misses': self.cache_misses
                }
            }
            
            with open(filepath, 'wb') as f:
                pickle.dump(model_data, f)
            
            print(f'💾 Model saved to {filepath}')
        except Exception as e:
            print(f'❌ Error saving model: {e}')
    
    def load_model(self, filepath = 'f1_strategy_predictor.pkl'):
        """
        LOAD TRAINED MODEL AND CACHE

        Restores the state written by `save_model`. All required entries are
        read before any attribute is assigned, so a malformed file leaves the
        predictor unchanged. Errors are printed, not raised.

        SECURITY: unpickling executes arbitrary code embedded in the file -
        only load model files from a trusted source.
        """
        try:
            with open(filepath, 'rb') as f:
                model_data = pickle.load(f)
            
            # Read required entries first so a missing key cannot leave
            # the predictor half-updated
            
            best_model = model_data['best_model']
            feature_columns = model_data['feature_columns']
            is_trained = model_data['is_trained']
            cache_stats = model_data.get('cache_stats', {})
            
            self.best_model = best_model
            self.feature_columns = feature_columns
            self.is_trained = is_trained
            self.prediction_cache = model_data.get('prediction_cache', {})
            self.cache_hits = cache_stats.get('hits', 0)
            self.cache_misses = cache_stats.get('misses', 0)
            
            print(f'📂 Model loaded from {filepath}')
            print(f'   Feature columns: {len(self.feature_columns)}')
            print(f'   Cached predictions: {len(self.prediction_cache)}')
        except Exception as e:
            print(f'❌ Error loading model: {e}')

# 13: Complete Main Pipeline Execution

In [24]:
def execute_complete_f1_strategy_pipeline(use_caching = True, use_weather = True):
    """
    EXECUTE COMPLETE F1 STRATEGY ML PIPELINE

    Runs the nine pipeline steps in order: load data, clean, merge,
    optional weather integration, feature engineering, EDA, model training,
    strategy-engine smoke test and model persistence.

    Args:
        use_caching: Forwarded to the loading, weather, feature, EDA and
            prediction stages.
        use_weather: When False, the weather integration step is skipped.

    Returns:
        Tuple (predictor, features_df) on success. On every failure path
        (empty merged dataset, failed feature engineering, or an
        exception) the tuple (None, None) is returned, so callers can
        always unpack the result into two names.
    """
    print('\n' + '=' * 80)
    print('F1 RACE STRATEGY ML SYSTEM - COMPLETE EXECUTION')
    print('=' * 80)
    
    try:
        # STEP 1: Load MySQL Data
        
        print('\n📥 STEP 1: LOADING COMPLETE F1 DATA FROM MYSQL DATABASE...')
        dataframes = load_enhanced_mysql_data(use_caching = use_caching)
        
        # STEP 2: Data Cleaning
        
        print('\n🧹 STEP 2: CLEANING AND PREPARING DATA...')
        cleaner = AdvancedDataCleaner()
        cleaned_dataframes = {}
        
        for key, df in dataframes.items():
            if not df.empty:
                cleaned_dataframes[key] = cleaner.comprehensive_clean(df, key)
        
        cleaner.print_cleaning_report()
        
        # STEP 3: Build Merged Dataset
        
        print('\n🔗 STEP 3: BUILDING MERGED DATASET...')
        merged_df = build_enhanced_merged_dataset(
            cleaned_dataframes.get('results', pd.DataFrame()),
            cleaned_dataframes.get('races', pd.DataFrame()),
            cleaned_dataframes.get('circuits', pd.DataFrame()),
            cleaned_dataframes.get('drivers', pd.DataFrame()),
            cleaned_dataframes.get('constructors', pd.DataFrame()),
            cleaned_dataframes.get('qualifying', pd.DataFrame()),
            cleaned_dataframes.get('pitStops', pd.DataFrame()),
            cleaned_dataframes.get('race_results', pd.DataFrame())
        )
        
        if merged_df.empty:
            print('❌ Failed to build merged dataset')
            
            # Same shape as the success/exception paths so callers can unpack
            
            return None, None
            
        circuit_count = merged_df['circuit_id'].nunique()
        print(f'📊 Merged dataset contains {len(merged_df)} records from {circuit_count} circuits')
        
        # STEP 4: Weather Integration
        
        if use_weather:
            print('\n🌤️ STEP 4: INTEGRATING REAL WEATHER DATA FOR ALL CIRCUITS...')
            merged_df = integrate_real_weather_data(
                merged_df, 
                cleaned_dataframes.get('circuits', pd.DataFrame()),
                use_caching = use_caching
            )
        else:
            print('\n⏭️ STEP 4: SKIPPING WEATHER INTEGRATION (disabled)')
        
        # STEP 5: Feature Engineering
        
        print('\n🔧 STEP 5: ENGINEERING STRATEGY FEATURES WITH REAL WEATHER...')
        features_df = create_enhanced_strategy_features_with_weather(
            merged_df,
            cleaned_dataframes.get('drivers', pd.DataFrame()),
            cleaned_dataframes.get('constructors', pd.DataFrame()),
            cleaned_dataframes.get('circuits', pd.DataFrame()),
            use_caching = use_caching
        )
        
        if features_df.empty:
            print('❌ Feature engineering failed')
            return None, None
            
        # STEP 6: EDA Analysis
        
        print('\n📊 STEP 6: PERFORMING COMPREHENSIVE EDA...')
        eda = ComprehensiveEDA()
        eda.perform_comprehensive_eda(features_df, target_column = 'final_position', use_cache = use_caching)
        
        # STEP 7: Train ML Models
        
        print('\n🤖 STEP 7: TRAINING ENHANCED ML MODELS...')
        
        # Prepare features and target (identifier columns and the target are excluded)
        
        excluded_columns = ('race_id', 'driver_id', 'constructor_id', 'circuit_id', 'final_position')
        feature_columns = [col for col in features_df.columns if col not in excluded_columns]
        
        X = features_df[feature_columns]
        y = features_df['final_position']
        
        # Initialize and train predictor
        
        predictor = EnhancedF1StrategyPredictor(cache_enabled = use_caching)
        predictor.train_enhanced_models(X, y, use_caching = use_caching)
        
        # STEP 8: Test Strategy Optimization
        print('\n🎯 STEP 8: TESTING STRATEGY OPTIMIZATION ENGINE...')
        test_strategy_optimization(predictor, cleaned_dataframes)
        
        # STEP 9: Save Model
        
        print('\n💾 STEP 9: SAVING TRAINED MODEL...')
        predictor.save_model('f1_strategy_predictor_final.pkl')
        
        # Print final statistics
        
        print('\n✅ PIPELINE EXECUTION COMPLETED SUCCESSFULLY!')
        print_cache_statistics(cleaned_dataframes, predictor, eda)
        
        return predictor, features_df
        
    except Exception as e:
        print(f'❌ Pipeline execution failed: {e}')
        
        traceback.print_exc()
        return None, None

def test_strategy_optimization(predictor, dataframes):
    """
    TEST THE STRATEGY OPTIMIZATION ENGINE WITH SAMPLE DATA

    Runs two hard-coded scenarios through `optimize_complete_race_strategy`
    and prints a short summary for each. A failure in one scenario is
    printed and does not stop the remaining ones.

    Args:
        predictor: Predictor object handed to the optimization function.
        dataframes: Not used by this function.
    """
    print('   🧪 Testing strategy optimization with sample scenarios...')

    # Scenario 1: front-running entry on a hot, mostly dry day
    front_runner = {
        'driver_profile': {
            'experience': 80,
            'win_rate': 0.15,
            'podium_rate': 0.35,
            'points_per_race': 12.5,
            'age': 26
        },
        'constructor_profile': {
            'experience': 450,
            'win_rate': 0.12,
            'reliability': 0.92,
            'points_per_race': 15.0
        },
        'circuit_profile': {
            'length': 5.8,
            'corners': 16,
            'altitude': 200,
            'type': 'permanent'
        },
        'qualifying_pos': 3,
        'weather_conditions': {
            'temperature_avg': 28.0,
            'rain_probability': 0.1,
            'wind_speed_avg': 4.5,
            'estimated_track_temp': 45.0,
            'tire_degradation_factor': 1.2
        }
    }

    # Scenario 2: low-experience entry with a high rain probability
    backmarker = {
        'driver_profile': {
            'experience': 25,
            'win_rate': 0.02,
            'podium_rate': 0.08,
            'points_per_race': 3.5,
            'age': 22
        },
        'constructor_profile': {
            'experience': 150,
            'win_rate': 0.03,
            'reliability': 0.78,
            'points_per_race': 4.0
        },
        'circuit_profile': {
            'length': 4.3,
            'corners': 12,
            'altitude': 50,
            'type': 'street'
        },
        'qualifying_pos': 15,
        'weather_conditions': {
            'temperature_avg': 18.0,
            'rain_probability': 0.6,
            'wind_speed_avg': 8.0,
            'estimated_track_temp': 30.0,
            'tire_degradation_factor': 0.9
        }
    }

    # (label including any value prefix, key in the returned strategy dict)
    summary_fields = (
        ('🎯 Predicted Finish: P', 'predicted_finish'),
        ('📈 Position Gain: +', 'qualifying_gain'),
        ('🛑 Recommended Stops: ', 'recommended_pit_stops'),
        ('🔧 Tire Strategy: ', 'tire_strategy'),
        ('⚠️ Risk Level: ', 'risk_level'),
    )

    for number, case in enumerate((front_runner, backmarker), 1):
        weather = case['weather_conditions']
        rain_percent = weather['rain_probability'] * 100

        print(f'\n   📋 Test Scenario {number}:')
        print(f"      Qualifying Position: P{case['qualifying_pos']}")
        print(f"      Weather: {weather['temperature_avg']}°C, Rain: {rain_percent}%")

        try:
            strategy = optimize_complete_race_strategy(
                predictor,
                case['driver_profile'],
                case['constructor_profile'],
                case['circuit_profile'],
                case['qualifying_pos'],
                weather
            )

            # Printed one by one so a missing key only cuts the summary short
            for label, key in summary_fields:
                print(f'      {label}{strategy[key]}')

        except Exception as e:
            print(f'      ❌ Strategy optimization failed: {e}')

def print_cache_statistics(dataframes, predictor, eda):
    """
    PRINT COMPREHENSIVE CACHE STATISTICS

    Writes a short report to stdout: a banner, the number of loaded
    (non-empty) entries in ``dataframes``, then one line each for the EDA
    cache and the prediction cache.

    Args:
        dataframes: Mapping whose values expose an ``.empty`` attribute
            (e.g. pandas DataFrames). ``None`` values are tolerated and
            counted as not loaded.
        predictor: Object providing ``get_prediction_cache_stats()``.
        eda: Object providing ``get_cache_stats()``.

    Both stats methods must return a mapping with ``'hits'``, ``'misses'``
    and ``'hit_rate'`` keys; ``hit_rate`` is rendered with one decimal place
    followed by a literal ``%``.

    Returns:
        None.
    """

    def _cache_line(label, stats):
        # Outer double quotes keep the nested ['key'] lookups valid on
        # interpreters older than Python 3.12 (reusing the enclosing quote
        # inside an f-string field is only legal from 3.12 onwards).
        return (f"{label}: {stats['hits']} hits, {stats['misses']} misses "
                f"({stats['hit_rate']:.1f}% hit rate)")

    print('\n📊 CACHE PERFORMANCE STATISTICS:')
    print('-' * 50)

    # Database cache stats: count entries that actually hold rows.
    # A None entry is treated as "not loaded" rather than crashing on .empty.
    db_cache_hits = sum(
        1 for df in dataframes.values() if df is not None and not df.empty
    )
    print(f'💾 Database Tables Loaded: {db_cache_hits}')

    # EDA cache stats
    eda_stats = eda.get_cache_stats()
    print(_cache_line('📈 EDA Cache', eda_stats))

    # Prediction cache stats
    pred_stats = predictor.get_prediction_cache_stats()
    print(_cache_line('🤖 Prediction Cache', pred_stats))

# Clear caches if needed

def clear_all_caches():
    """
    RESET EVERY PIPELINE CACHE

    Runs three clearing steps in a fixed order and prints a banner before
    and after:

      1. the feature cache, via ``clear_feature_cache()``;
      2. the EDA cache, via ``clear_cache()`` on a newly constructed
         ``ComprehensiveEDA``;
      3. the weather cache, via ``clear_cache()`` on a newly constructed
         ``RealWeatherDataIntegrator`` built from an empty DataFrame.

    NOTE(review): both objects are created here solely to call
    ``clear_cache()``. Presumably their caches live outside a single
    instance (class-level or on disk) - confirm; otherwise these calls would
    not touch the caches of instances used elsewhere.

    Returns:
        None.
    """
    print('🧹 CLEARING ALL CACHES...')

    # Step 1: feature cache, through the helper defined elsewhere in the file.
    clear_feature_cache()

    # Step 2: EDA cache, cleared on a throwaway instance.
    ComprehensiveEDA().clear_cache()

    # Step 3: weather cache; the integrator is constructed with an empty frame.
    empty_frame = pd.DataFrame()
    RealWeatherDataIntegrator(empty_frame).clear_cache()

    print('✅ All caches cleared successfully')

# Main execution block
if __name__ == '__main__':

    # Entry point: announce the run, then execute the full pipeline with
    # both caching and weather integration switched on. The two results are
    # bound at module level as `predictor` and `features`.
    print('🚀 STARTING F1 STRATEGY ML PIPELINE...')
    predictor, features = execute_complete_f1_strategy_pipeline(
        use_caching=True,
        use_weather=True,
    )
    
    

🚀 STARTING F1 STRATEGY ML PIPELINE...

F1 RACE STRATEGY ML SYSTEM - COMPLETE EXECUTION

📥 STEP 1: LOADING COMPLETE F1 DATA FROM MYSQL DATABASE...
🔄 LOADING COMPLETE F1 DATA FROM MYSQL DATABASE...
✅ MySQL database connection established successfully
✅ Loaded 912 records from driver
✅ Loaded 185 records from constructor
✅ Loaded 77 records from circuit
✅ Loaded 1149 records from race
✅ Loaded 183464 records from race_data
✅ Loaded 26576 records from qualifying_result
✅ Loaded 21866 records from pit_stop
✅ Loaded 53 records from grand_prix
✅ Loaded 249 records from country
✅ Loaded 7 records from continent
✅ Loaded 27151 records from race_result
✅ Loaded lap times from interactive_lap_statistics_mat
✅ Loaded 16716 records from fastest_lap
✅ Loaded 25388 records from starting_grid_position
📊 Database Cache Stats: 0 hits, 13 misses (0.0% hit rate)
✅ MySQL database connection closed
🎯 MYSQL DATA LOADING COMPLETE: 15 datasets with 313,808 total records

🧹 STEP 2: CLEANING AND PREPARING DATA..