# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

# Read the training and test tables from the working directory.
train_df, test_df = map(pd.read_csv, ('Train.csv', 'Test.csv'))

def advanced_feature_engineering(df, is_training=True):
    """
    Add hand-crafted numeric features to a copy of the raw car data.

    The input must provide the columns ``engine_capacity_cc``,
    ``owner_count``, ``fuel_category``, ``transmission_type``,
    ``car_brand`` and ``exterior_color``. All multipliers in the lookup
    tables below are fixed constants; categories missing from a table
    fall back to a neutral factor of 1.0.

    Args:
        df: Raw DataFrame. It is copied, never modified in place.
        is_training: Accepted so training and test call sites look alike;
            the transformation does not read it and is identical for both.

    Returns:
        A new DataFrame holding the original columns plus the engineered
        features, appended in the order they are computed below.
    """
    df_processed = df.copy()
    engine_cc = df_processed['engine_capacity_cc']
    owners = df_processed['owner_count']

    # 1. Core ratios. The +0.5 keeps the denominator non-zero for owner_count == 0.
    df_processed['engine_per_owner'] = engine_cc / (owners + 0.5)
    # Despite its name this is only the displacement in litres (cc / 1000);
    # no weight column is involved.
    df_processed['power_to_weight_ratio'] = engine_cc / 1000

    # 2. Fuel type multiplier
    fuel_efficiency_map = {
        'Electric': 3.2, 'Hybrid': 2.1, 'Diesel': 1.45,
        'Petrol': 1.0, 'CNG': 1.15, 'LPG': 1.1,
    }
    df_processed['fuel_efficiency_factor'] = (
        df_processed['fuel_category'].map(fuel_efficiency_map).fillna(1.0)
    )

    # 3. Transmission multiplier
    transmission_map = {
        'Manual': 1.25, 'Automatic': 1.0, 'CVT': 1.1,
        'Semi-Automatic': 1.15, 'AMT': 1.05,
    }
    df_processed['transmission_efficiency'] = (
        df_processed['transmission_type'].map(transmission_map).fillna(1.0)
    )

    # 4. Brand multiplier
    brand_efficiency = {
        'Toyota': 1.3, 'Honda': 1.25, 'Hyundai': 1.2, 'Maruti': 1.25,
        'Nissan': 1.15, 'Kia': 1.15, 'Volkswagen': 1.1, 'Ford': 1.05,
        'Chevrolet': 0.95, 'BMW': 0.9, 'Mercedes': 0.85, 'Audi': 0.88,
        'Tesla': 1.5, 'Tata': 1.1, 'Mahindra': 1.0,
    }
    df_processed['brand_efficiency_score'] = (
        df_processed['car_brand'].map(brand_efficiency).fillna(1.0)
    )

    # 5. Exponential decay driven by owner_count (there is no age column here),
    #    floored at 0.5.
    df_processed['age_efficiency_loss'] = np.exp(-0.08 * (owners - 1)).clip(lower=0.5)

    # 6. Displacement bucket: 6 for the smallest engines down to 0 for the
    #    largest. Values outside (0, 5000] fall in no bin and become NaN.
    df_processed['engine_category'] = pd.cut(
        engine_cc,
        bins=[0, 800, 1200, 1600, 2000, 2500, 3000, 5000],
        labels=[6, 5, 4, 3, 2, 1, 0]
    ).astype(float)

    # 7. Product of the four multipliers above
    df_processed['combined_efficiency'] = (
        df_processed['fuel_efficiency_factor'] *
        df_processed['transmission_efficiency'] *
        df_processed['brand_efficiency_score'] *
        df_processed['age_efficiency_loss']
    )

    # 8. Engine metrics
    df_processed['engine_load_factor'] = np.log1p(engine_cc) / 8
    # owner_count == 0 would make the denominator 0 and the ratio +/-inf,
    # which a later fillna() does not remove and which scalers/models reject.
    # Mark the ratio as undefined (NaN) for those rows instead.
    owner_scale = (owners ** 0.8).replace(0, np.nan)
    df_processed['displacement_per_owner'] = engine_cc / owner_scale

    # 9. Market segment flags
    luxury_brands = ['BMW', 'Mercedes', 'Audi', 'Tesla', 'Jaguar', 'Porsche']
    economy_brands = ['Maruti', 'Tata', 'Hyundai', 'Honda', 'Toyota']
    df_processed['is_luxury'] = df_processed['car_brand'].isin(luxury_brands).astype(int)
    df_processed['is_economy'] = df_processed['car_brand'].isin(economy_brands).astype(int)

    # 10. Exterior colour multiplier
    color_efficiency = {
        'White': 1.05, 'Silver': 1.03, 'Gray': 1.02, 'Blue': 1.01,
        'Black': 0.97, 'Red': 0.98, 'Green': 1.0, 'Brown': 0.99,
    }
    df_processed['color_efficiency'] = (
        df_processed['exterior_color'].map(color_efficiency).fillna(1.0)
    )

    # 11. Interaction features
    df_processed['fuel_engine_interaction'] = (
        df_processed['fuel_efficiency_factor'] * np.log1p(engine_cc)
    )
    df_processed['brand_age_interaction'] = (
        df_processed['brand_efficiency_score'] *
        df_processed['age_efficiency_loss']
    )
    df_processed['transmission_engine_interaction'] = (
        df_processed['transmission_efficiency'] *
        df_processed['engine_category']
    )

    # 12. Polynomial features for key variables
    df_processed['engine_squared'] = engine_cc ** 2
    df_processed['owner_squared'] = owners ** 2

    # 13. Efficiency ratios (the +1 keeps the denominator non-zero for 0 cc)
    df_processed['efficiency_per_cc'] = df_processed['combined_efficiency'] / (engine_cc + 1)
    df_processed['brand_fuel_synergy'] = (
        df_processed['brand_efficiency_score'] * df_processed['fuel_efficiency_factor']
    )

    return df_processed

def remove_outliers_comprehensive(df, target_col='fuel_efficiency_kmpl'):
    """
    Drop rows whose target value fails any of three outlier criteria.

    A row is kept only if its target value passes all of:

    1. Modified z-score ``0.6745 * (x - median) / MAD`` below 3.5 in
       absolute value. This test is skipped when the MAD is zero or
       undefined, because the score cannot be computed then and would
       otherwise reject every row.
    2. Within 2.0 spreads of the 10th/90th percentiles, where the spread is
       the distance between those two percentiles (wider than a classic
       25/75 IQR fence).
    3. Inside the fixed range [5.0, 35.0].

    Rows with a missing target fail the range comparisons and are dropped.
    A one-line summary of the number of removed rows is printed.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Name of the numeric column to screen.

    Returns:
        A filtered copy of ``df`` containing only the rows that pass.
    """
    df_clean = df.copy()
    initial_count = len(df_clean)
    target = df_clean[target_col]

    # Method 1: Modified z-score. pandas median()/abs() skip NaN, so a few
    # missing targets do not poison the statistic for every other row.
    median = target.median()
    mad = (target - median).abs().median()
    if mad > 0:
        modified_z_scores = 0.6745 * (target - median) / mad
        z_score_ok = modified_z_scores.abs() < 3.5
    else:
        # MAD == 0 means more than half of the values equal the median (and
        # NaN means there is no data); the score would be 0/0 or x/0.
        z_score_ok = pd.Series(True, index=target.index)

    # Method 2: percentile fence with adaptive bounds
    Q1 = target.quantile(0.1)
    Q3 = target.quantile(0.9)
    IQR = Q3 - Q1
    lower_bound = Q1 - 2.0 * IQR
    upper_bound = Q3 + 2.0 * IQR

    # Method 3: fixed bounds for the target value
    domain_lower = 5.0
    domain_upper = 35.0

    # A row must satisfy every criterion to be kept.
    mask = (
        z_score_ok &
        (target >= lower_bound) &
        (target <= upper_bound) &
        (target >= domain_lower) &
        (target <= domain_upper)
    )

    removed_count = initial_count - int(mask.sum())
    # Avoid 0/0 when the input frame is empty.
    removed_pct = removed_count / initial_count * 100 if initial_count else 0.0
    print(f"Removed {removed_count} outliers ({removed_pct:.2f}%)")

    return df_clean[mask]

class SuperiorEnsemble:
    """
    Weighted blend of eight regressors with its own feature preprocessing.

    Typical use: ``prepare_features(..., is_training=True)`` to fit the
    encoders and scaler, ``train_superior_models`` to fit the models and
    derive blend weights from validation RMSE, then ``predict``.
    """

    def __init__(self):
        # name -> fitted regressor, filled by train_superior_models()
        self.models = {}
        # name -> blend weight (weights sum to 1), filled by train_superior_models()
        self.weights = {}
        # categorical column name -> fitted LabelEncoder
        self.label_encoders = {}
        self.scaler = RobustScaler()
        # Created here but not used by any method of this class.
        self.feature_selector = SelectKBest(f_regression, k='all')
        self.feature_names = []

    @staticmethod
    def _encode_with_unknown(values, classes):
        """Map each label in ``values`` to its position in ``classes``; unseen labels become -1."""
        code_of = {label: code for code, label in enumerate(classes)}
        return values.map(code_of).fillna(-1).astype(int)

    @staticmethod
    def _softmax_weights(val_scores, temperature=3.0):
        """
        Turn validation RMSEs into blend weights via softmax(temperature / RMSE).

        The largest logit is subtracted before exponentiating so a very small
        RMSE cannot overflow ``exp`` and turn every weight into NaN. An RMSE of
        zero is treated as a tiny positive value instead of dividing by zero.

        Args:
            val_scores: Mapping of model name -> validation RMSE.
            temperature: Multiplier on 1/RMSE; larger values concentrate more
                weight on the best-scoring models.

        Returns:
            Dict of model name -> weight, in the order of ``val_scores``.
        """
        names = list(val_scores)
        if not names:
            return {}
        rmse = np.asarray([val_scores[name] for name in names], dtype=float)
        logits = temperature / np.maximum(rmse, 1e-12)
        logits = logits - logits.max()
        exp_logits = np.exp(logits)
        return dict(zip(names, exp_logits / exp_logits.sum()))

    def prepare_features(self, df, is_training=True):
        """
        Encode categoricals, select the model columns and scale them.

        Args:
            df: DataFrame that already contains the engineered feature
                columns listed in ``feature_cols`` below plus the four raw
                categorical columns.
            is_training: When True the label encoders and scaler are fitted
                on ``df``; when False the previously fitted ones are reused
                and categories not seen during fitting are encoded as -1.

        Returns:
            The scaled feature matrix produced by the scaler; column order
            is stored in ``self.feature_names``.
        """
        df_prep = df.copy()

        # Categorical encoding
        categorical_features = ['fuel_category', 'car_brand', 'transmission_type', 'exterior_color']

        for col in categorical_features:
            as_text = df_prep[col].astype(str)
            if is_training:
                le = LabelEncoder()
                df_prep[col + '_encoded'] = le.fit_transform(as_text)
                self.label_encoders[col] = le
            else:
                le = self.label_encoders[col]
                # One dict lookup per row instead of one le.transform() call
                # per row; the codes are the same positions in le.classes_.
                df_prep[col + '_encoded'] = self._encode_with_unknown(as_text, le.classes_)

        # Columns fed to the models
        feature_cols = [
            'engine_capacity_cc', 'owner_count', 'engine_per_owner', 'power_to_weight_ratio',
            'fuel_efficiency_factor', 'transmission_efficiency', 'brand_efficiency_score',
            'age_efficiency_loss', 'engine_category', 'combined_efficiency',
            'engine_load_factor', 'displacement_per_owner', 'is_luxury', 'is_economy',
            'color_efficiency', 'fuel_engine_interaction', 'brand_age_interaction',
            'transmission_engine_interaction', 'engine_squared', 'owner_squared',
            'efficiency_per_cc', 'brand_fuel_synergy'
        ] + [col + '_encoded' for col in categorical_features]

        # fillna() leaves +/-inf untouched (e.g. from a division by zero in a
        # ratio feature), so convert those to NaN first and zero-fill both.
        X = df_prep[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
        self.feature_names = feature_cols

        # Scaling
        if is_training:
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = self.scaler.transform(X)

        return X_scaled

    def train_superior_models(self, X_train, y_train, X_val, y_val):
        """
        Fit all eight models and derive blend weights from validation RMSE.

        Replaces ``self.models`` entries and ``self.weights``; prints the
        per-model RMSE and the resulting weights.

        Args:
            X_train, y_train: Training features and targets.
            X_val, y_val: Hold-out data used to score each model.

        Returns:
            Dict of model name -> validation RMSE.
        """

        # Model 1: XGBoost
        self.models['xgb'] = xgb.XGBRegressor(
            n_estimators=1500,
            max_depth=8,
            learning_rate=0.02,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            n_jobs=-1
        )

        # Model 2: LightGBM
        self.models['lgb'] = lgb.LGBMRegressor(
            n_estimators=1500,
            max_depth=8,
            learning_rate=0.02,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        )

        # Model 3: CatBoost
        self.models['catboost'] = CatBoostRegressor(
            iterations=1500,
            depth=8,
            learning_rate=0.02,
            reg_lambda=0.1,
            random_seed=42,
            verbose=False
        )

        # Model 4: Random Forest
        self.models['rf'] = RandomForestRegressor(
            n_estimators=800,
            max_depth=20,
            min_samples_split=3,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        )

        # Model 5: Extra Trees (additional diversity)
        self.models['et'] = ExtraTreesRegressor(
            n_estimators=800,
            max_depth=20,
            min_samples_split=3,
            min_samples_leaf=1,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        )

        # Model 6: Gradient Boosting
        self.models['gb'] = GradientBoostingRegressor(
            n_estimators=800,
            max_depth=8,
            learning_rate=0.02,
            subsample=0.8,
            random_state=42
        )

        # Model 7: Ridge
        self.models['ridge'] = Ridge(alpha=1.5, random_state=42)

        # Model 8: ElasticNet for additional regularization
        self.models['elastic'] = ElasticNet(alpha=0.5, l1_ratio=0.5, random_state=42)

        # Train all models and score each one on the validation data
        val_scores = {}

        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_train, y_train)
            val_pred = model.predict(X_val)
            val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            val_scores[name] = val_rmse
            print(f"{name} - Validation RMSE: {val_rmse:.4f}")

        # Softmax over temperature / RMSE. The temperature multiplies the
        # inverse scores, so a higher value sharpens the weights towards the
        # best models (it does not flatten them).
        self.weights = self._softmax_weights(val_scores, temperature=3.0)

        print("\nOptimal model weights:")
        for name, weight in self.weights.items():
            print(f"{name}: {weight:.4f}")

        return val_scores

    def predict(self, X):
        """
        Return the weighted sum of every model's predictions for ``X``.

        With no trained models the result is an all-zero vector of length
        ``X.shape[0]``.
        """
        predictions = np.zeros(X.shape[0])

        for name, model in self.models.items():
            pred = model.predict(X)
            predictions += self.weights[name] * pred

        return predictions

# Main execution
print("Starting Superior Fuel Efficiency Prediction...")

# Feature engineering (same transformation for both tables)
train_enhanced = advanced_feature_engineering(train_df, is_training=True)
test_enhanced = advanced_feature_engineering(test_df, is_training=False)

# Remove outliers from the training targets only
train_clean = remove_outliers_comprehensive(train_enhanced)
print(f"Training data shape after cleaning: {train_clean.shape}")

# Initialize ensemble
ensemble = SuperiorEnsemble()

# Prepare features (fits the encoders and scaler on all cleaned training rows)
X_full = ensemble.prepare_features(train_clean, is_training=True)
y_full = train_clean['fuel_efficiency_kmpl'].values

# Stratify the continuous target through 10 equal-width bins so both sides
# of the split cover the whole target range.
bins = pd.cut(y_full, bins=10, labels=False)
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42, stratify=bins
)

print("Training Superior Ensemble...")
val_scores = ensemble.train_superior_models(X_train, y_train, X_val, y_val)

# Ensemble validation
# NOTE(review): the blend weights were derived from this same validation
# split, so these figures are optimistic estimates.
ensemble_pred = ensemble.predict(X_val)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
ensemble_r2 = r2_score(y_val, ensemble_pred)
ensemble_mae = mean_absolute_error(y_val, ensemble_pred)

print(f"\nFinal Ensemble Performance:")
print(f"Validation RMSE: {ensemble_rmse:.4f}")
print(f"Validation R²: {ensemble_r2:.4f}")
print(f"Validation MAE: {ensemble_mae:.4f}")

# Cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, bins)):
    y_train_cv, y_val_cv = y_full[train_idx], y_full[val_idx]

    # Fit the encoders and scaler on the training fold only, then apply them
    # to the held-out fold, so the fold score is not influenced by statistics
    # computed from the rows it is evaluated on.
    temp_ensemble = SuperiorEnsemble()
    X_train_cv = temp_ensemble.prepare_features(train_clean.iloc[train_idx], is_training=True)
    X_val_cv = temp_ensemble.prepare_features(train_clean.iloc[val_idx], is_training=False)
    temp_ensemble.train_superior_models(X_train_cv, y_train_cv, X_val_cv, y_val_cv)

    # NOTE(review): the blend weights are still tuned on the fold being
    # scored; a nested split would be needed for a fully unbiased estimate.
    cv_pred = temp_ensemble.predict(X_val_cv)
    cv_rmse = np.sqrt(mean_squared_error(y_val_cv, cv_pred))
    cv_scores.append(cv_rmse)

    print(f"Fold {fold + 1} RMSE: {cv_rmse:.4f}")

print(f"\nCross-Validation Results:")
print(f"Mean RMSE: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

# Final predictions
print("Generating final predictions...")
final_ensemble = SuperiorEnsemble()
X_full_final = final_ensemble.prepare_features(train_clean, is_training=True)
y_full_final = train_clean['fuel_efficiency_kmpl'].values

# Final training: 85% of the rows fit the models, the remaining 15% are used
# only to derive the blend weights.
bins_final = pd.cut(y_full_final, bins=10, labels=False)
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_full_final, y_full_final, test_size=0.15, random_state=42, stratify=bins_final
)

final_ensemble.train_superior_models(X_train_final, y_train_final, X_val_final, y_val_final)

# Test predictions (reuses the encoders/scaler fitted by final_ensemble)
X_test_final = final_ensemble.prepare_features(test_enhanced, is_training=False)
test_predictions = final_ensemble.predict(X_test_final)

# Post-processing: clamp predictions to a fixed range.
# NOTE(review): training targets are kept in [5.0, 35.0] by the outlier
# filter, while predictions are clamped to the narrower [8.0, 30.0] —
# confirm this difference is intended.
test_predictions = np.clip(test_predictions, 8.0, 30.0)

# Create submission
# NOTE(review): only the prediction column is written; confirm whether the
# expected submission format also requires a row identifier.
submission = pd.DataFrame({
    'fuel_efficiency_kmpl': test_predictions
})

print(f"\nFinal Test Predictions Summary:")
print(f"Min: {test_predictions.min():.2f}")
print(f"Max: {test_predictions.max():.2f}")
print(f"Mean: {test_predictions.mean():.2f}")
print(f"Std: {test_predictions.std():.2f}")

# Save submission
submission.to_csv('superior_submission.csv', index=False)
print("\nSuperior submission saved as 'superior_submission.csv'")
