In [6]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

In [7]:
def prepare_repair_cost_data(df):
    """
    Build the feature matrix and both cost targets for the repair cost model.

    Only rows where ``needs_repair`` is True are kept. Categorical columns are
    label-encoded with freshly fitted encoders, boolean columns are cast to
    int, and any missing feature values are replaced by the column mean.

    NOTE(review): the feature list contains cost columns (``labor_cost_lkr``,
    ``parts_cost_lkr``, ``diagnostic_fee_lkr``). Confirm these are known at
    prediction time; if they are components of the targets this is target
    leakage.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw dataset with EV repair information

    Returns:
    --------
    X : pandas.DataFrame
        Feature matrix (columns in ``feature_cols`` order)
    y_total : pandas.Series
        ``total_repair_cost_lkr`` for the retained rows
    y_out_of_pocket : pandas.Series
        ``out_of_pocket_cost_lkr`` for the retained rows
    encoders : dict
        Fitted LabelEncoder per categorical variable, keyed by 'model',
        'repair_type', 'service_provider', 'region', 'charging_frequency'
    feature_cols : list
        Feature column names, in the order used for ``X``
    """

    # Work on a copy of the repair rows so the caller's frame is never modified
    needs_repair_mask = df['needs_repair'] == True
    repairs = df.loc[needs_repair_mask].copy()

    print(f"Total records with repairs: {len(repairs)}")
    print(f"Original dataset size: {len(df)}")

    # Columns used as-is
    numeric_cols = [
        'age_years',
        'age_months',
        'mileage_km',
        'mileage_miles',
        'battery_capacity_kwh',
        'current_capacity_kwh',
        'battery_health_percent',
        'battery_cycles',
        'base_price_lkr',
        'fast_charge_ratio',
        'avg_temperature_c',
        'previous_repairs',
        'labor_hours',
        'labor_rate_lkr_per_hour',
        'labor_cost_lkr',
        'parts_cost_lkr',
        'diagnostic_fee_lkr',
    ]

    # (encoders key, source column, encoded column) for each categorical variable
    categorical_spec = [
        ('model', 'model', 'model_encoded'),
        ('repair_type', 'repair_type', 'repair_type_encoded'),
        ('service_provider', 'service_provider', 'service_provider_encoded'),
        ('region', 'region', 'region_encoded'),
        ('charging_frequency', 'public_charging_frequency', 'charging_freq_encoded'),
    ]

    # (source column, encoded column) for each boolean flag
    boolean_spec = [
        ('home_charging_available', 'home_charging_encoded'),
        ('under_warranty', 'under_warranty_encoded'),
    ]

    # Fit one encoder per categorical column and keep it for inference time
    encoders = {}
    for encoder_key, source_col, encoded_col in categorical_spec:
        encoder = LabelEncoder()
        repairs[encoded_col] = encoder.fit_transform(repairs[source_col])
        encoders[encoder_key] = encoder

    for source_col, encoded_col in boolean_spec:
        repairs[encoded_col] = repairs[source_col].astype(int)

    # Final feature order: numeric, label-encoded, boolean, then the region multiplier
    feature_cols = (
        numeric_cols
        + [encoded_col for _, _, encoded_col in categorical_spec]
        + [encoded_col for _, encoded_col in boolean_spec]
        + ['region_cost_multiplier']
    )

    X = repairs[feature_cols].copy()

    # Both candidate targets are returned; the caller picks one
    y_total = repairs['total_repair_cost_lkr']
    y_out_of_pocket = repairs['out_of_pocket_cost_lkr']

    # Report and mean-impute any gaps in the feature matrix
    missing_per_column = X.isnull().sum()
    if missing_per_column.sum() > 0:
        print("\nWarning: Missing values found!")
        print(missing_per_column[missing_per_column > 0])
        X = X.fillna(X.mean())

    return X, y_total, y_out_of_pocket, encoders, feature_cols

In [8]:
def _print_banner(title):
    """Print a section title framed by 70-character '=' rules."""
    print("\n" + "="*70)
    print(title)
    print("="*70)


def _evaluate_split(split_name, y_true, y_pred):
    """Print and return (MAE, RMSE, R²) for one data split."""
    print(f"\n{split_name}:")
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"MAE:  LKR {mae:,.2f}")
    print(f"RMSE: LKR {rmse:,.2f}")
    print(f"R²:   {r2:.4f}")
    return mae, rmse, r2


def train_repair_cost_model(df, target='out_of_pocket', model_dir='models'):
    """
    Train repair cost prediction model for Sri Lankan EV dataset

    Fits a GradientBoostingRegressor on an 80/20 train/test split, reports
    train/test metrics and 5-fold cross-validated MAE, and writes the model,
    encoders, feature list and feature-importance table under ``model_dir``.

    Parameters:
    -----------
    df : pandas.DataFrame
        Raw dataset with EV repair information
    target : str, optional (default='out_of_pocket')
        Which cost to predict: 'out_of_pocket' or 'total'
    model_dir : str, optional (default='models')
        Directory to save trained models

    Returns:
    --------
    model : GradientBoostingRegressor
        Trained model
    encoders : dict
        Dictionary of label encoders
    feature_cols : list
        List of feature column names
    metrics : dict
        Performance metrics: 'train_mae', 'train_rmse', 'train_r2',
        'test_mae', 'test_rmse', 'test_r2', plus 'cv_mae_mean' and
        'cv_mae_std' from the 5-fold cross-validation

    Raises:
    -------
    ValueError
        If ``target`` is not 'out_of_pocket' or 'total'
    """

    # Validate before touching the filesystem: an unrecognised target used to
    # fall through to the total-cost branch and save artifacts under the
    # misspelled name.
    target_titles = {
        'out_of_pocket': "Training model to predict OUT-OF-POCKET COST (LKR)",
        'total': "Training model to predict TOTAL REPAIR COST (LKR)",
    }
    if target not in target_titles:
        raise ValueError(
            f"Unknown target {target!r}; expected one of {sorted(target_titles)}"
        )

    # Create model directory if it doesn't exist
    os.makedirs(model_dir, exist_ok=True)

    # Prepare data
    X, y_total, y_out_of_pocket, encoders, feature_cols = prepare_repair_cost_data(df)

    # Select target variable
    y = y_out_of_pocket if target == 'out_of_pocket' else y_total
    _print_banner(target_titles[target])

    print(f"\nDataset shape: {X.shape}")
    print(f"Number of features: {len(feature_cols)}")
    print(f"Target variable range: LKR {y.min():,.2f} - LKR {y.max():,.2f}")
    print(f"Target variable mean: LKR {y.mean():,.2f}")

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, shuffle=True
    )

    print(f"\nTraining set size: {len(X_train):,}")
    print(f"Test set size: {len(X_test):,}")

    # Train Gradient Boosting model
    # NOTE(review): the recorded run shows train MAE of about LKR 1,004 against
    # test MAE of about LKR 33,013; max_depth=10 may be overfitting - consider tuning.
    print("\nTraining Gradient Boosting Regressor...")
    model = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        min_samples_split=5,
        min_samples_leaf=2,
        subsample=0.8,
        random_state=42,
        verbose=0
    )

    model.fit(X_train, y_train)
    print("✓ Training complete!")

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    _print_banner("MODEL PERFORMANCE")
    train_mae, train_rmse, train_r2 = _evaluate_split("Training Set", y_train, y_pred_train)
    test_mae, test_rmse, test_r2 = _evaluate_split("Test Set", y_test, y_pred_test)

    # Cross-validation on the training portion only, so the test set stays held out
    print("\n" + "-"*70)
    print("Cross-Validation (5-fold):")
    cv_scores = cross_val_score(
        model, X_train, y_train,
        cv=5,
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )
    # The scorer returns negated MAE, so flip the sign for reporting
    cv_mae_mean = -cv_scores.mean()
    cv_mae_std = cv_scores.std()
    print(f"CV MAE: LKR {cv_mae_mean:,.2f} (+/- LKR {cv_mae_std * 2:,.2f})")

    # Feature importance
    _print_banner("TOP 15 MOST IMPORTANT FEATURES")

    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(feature_importance.head(15).to_string(index=False))

    # Save models and artifacts
    model_prefix = os.path.join(model_dir, f'repair_cost_{target}')

    joblib.dump(model, f'{model_prefix}_model.pkl')
    joblib.dump(encoders, f'{model_prefix}_encoders.pkl')
    joblib.dump(feature_cols, f'{model_prefix}_features.pkl')

    _print_banner("MODEL SAVED")
    print(f"✓ Model saved to: {model_prefix}_model.pkl")
    print(f"✓ Encoders saved to: {model_prefix}_encoders.pkl")
    print(f"✓ Features saved to: {model_prefix}_features.pkl")

    # Save feature importance
    feature_importance.to_csv(f'{model_prefix}_feature_importance.csv', index=False)
    print(f"✓ Feature importance saved to: {model_prefix}_feature_importance.csv")

    # Create evaluation plots. Plotting is best-effort: create_evaluation_plots
    # is not defined in this cell and is expected to come from elsewhere in the
    # notebook; any failure (including the name being missing) is reported
    # rather than aborting a finished training run.
    try:
        create_evaluation_plots(y_test, y_pred_test, feature_importance, model_prefix, target)
    except Exception as e:
        print(f"Note: Could not create plots - {e}")

    # Return results
    metrics = {
        'train_mae': train_mae,
        'train_rmse': train_rmse,
        'train_r2': train_r2,
        'test_mae': test_mae,
        'test_rmse': test_rmse,
        'test_r2': test_r2,
        'cv_mae_mean': cv_mae_mean,
        'cv_mae_std': cv_mae_std
    }

    return model, encoders, feature_cols, metrics

In [9]:
# Load the raw repair-cost dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='sl_ev_repair_cost_dataset.csv')
df.head(n=5)

Unnamed: 0,vehicle_id,model,manufacture_year,age_years,age_months,mileage_km,mileage_miles,battery_capacity_kwh,current_capacity_kwh,battery_health_percent,...,labor_cost_lkr,parts_cost_lkr,diagnostic_fee_lkr,service_charge_lkr,total_repair_cost_lkr,under_warranty,warranty_coverage_percent,out_of_pocket_cost_lkr,total_cost_usd,out_of_pocket_usd
0,SLEV_00001,BYD Atto 3,2024,2,24,17248,10717,60,55.37,92.28,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0
1,SLEV_00002,Nissan Leaf,2019,7,84,72872,45280,40,27.2,68.0,...,11520.23,549652.65,7589.54,44329.21,611053.89,False,0.0,611053.89,2036.85,2036.85
2,SLEV_00003,Tesla Model 3,2017,9,108,107531,66816,75,48.29,64.38,...,14580.76,240732.71,3375.76,18009.07,275308.01,False,0.0,275308.01,917.69,917.69
3,SLEV_00004,Nissan Leaf 62kWh,2020,6,72,99840,62037,62,47.16,76.06,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0
4,SLEV_00005,MG ZS EV,2021,5,60,55950,34765,44,36.91,83.89,...,0.0,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0


In [10]:
# Train the out-of-pocket cost model; artifacts are written under the 'models' directory.
# Note: `repair_model` holds the full return value of train_repair_cost_model,
# i.e. the (model, encoders, feature_cols, metrics) tuple, not just the estimator.
repair_model = train_repair_cost_model(
    df,
    target='out_of_pocket',
    model_dir='models',
)

print("✅ All models trained successfully!")

Total records with repairs: 2861
Original dataset size: 10000

Training model to predict OUT-OF-POCKET COST (LKR)

Dataset shape: (2861, 25)
Number of features: 25
Target variable range: LKR 1,735.22 - LKR 12,586,670.66
Target variable mean: LKR 644,999.54

Training set size: 2,288
Test set size: 573

Training Gradient Boosting Regressor...
✓ Training complete!

MODEL PERFORMANCE

Training Set:
MAE:  LKR 1,003.73
RMSE: LKR 3,000.79
R²:   1.0000

Test Set:
MAE:  LKR 33,013.30
RMSE: LKR 122,135.69
R²:   0.9881

----------------------------------------------------------------------
Cross-Validation (5-fold):
CV MAE: LKR 32,066.16 (+/- LKR 11,594.21)

TOP 15 MOST IMPORTANT FEATURES
               feature  importance
        parts_cost_lkr    0.982303
under_warranty_encoded    0.007104
         mileage_miles    0.001985
        battery_cycles    0.001432
            mileage_km    0.001394
        labor_cost_lkr    0.001048
     fast_charge_ratio    0.000783
battery_health_percent    0.00066

In [31]:
# Reload the persisted out-of-pocket model, its label encoders and the feature order.
# joblib.load unpickles the file contents, so only load artifacts from a trusted source.
model, encoders, feature_cols = (
    joblib.load(f'models/repair_cost_out_of_pocket_{artifact}.pkl')
    for artifact in ('model', 'encoders', 'features')
)

In [35]:
def safe_label_transform(le, value):
    """
    Encode a single label, tolerating labels the encoder has never seen.

    Parameters:
    -----------
    le : fitted encoder
        Object exposing ``classes_`` and ``transform`` (e.g. a fitted LabelEncoder)
    value : object
        The raw label to encode

    Returns:
    --------
    The encoded value for ``value`` if it is one of ``le.classes_``; otherwise
    the encoded value of ``le.classes_[0]``.

    Warns:
    ------
    UserWarning
        When ``value`` is not a known class and the fallback is used. The
        fallback class is arbitrary, so the resulting prediction should be
        treated with suspicion.
    """
    if value in le.classes_:
        return le.transform([value])[0]

    # Unknown label: keep the pipeline running, but make the substitution
    # visible instead of silently feeding the model an arbitrary category.
    fallback = le.classes_[0]
    warnings.warn(
        f"Unseen label {value!r}; falling back to {fallback!r}",
        UserWarning,
        stacklevel=2,
    )
    return le.transform([fallback])[0]

# NOTE(review): this encodes only the repair_type of row 0 and assigns that one
# scalar to the whole 'repair_type_encoded' column of `df`. It looks like a
# leftover smoke test of safe_label_transform - confirm whether it is still needed.
repair_type_encoder = encoders['repair_type']
first_row_repair_type = df.loc[0, 'repair_type']
df['repair_type_encoded'] = safe_label_transform(repair_type_encoder, first_row_repair_type)

In [36]:
# Example vehicle used for a single prediction. Keys are the raw (un-encoded)
# column names consumed by the inference cell.
input_data = dict(
    # Vehicle age and usage
    age_years=6,
    age_months=72,
    mileage_km=60_000,
    mileage_miles=37_282,
    # Battery state
    battery_capacity_kwh=64,
    current_capacity_kwh=54.4,
    battery_health_percent=85,
    battery_cycles=300,
    # Price, charging behaviour and climate
    base_price_lkr=15_000_000,
    fast_charge_ratio=0.4,
    avg_temperature_c=29,
    previous_repairs=2,
    # Repair cost inputs
    labor_hours=8,
    labor_rate_lkr_per_hour=3_000,
    labor_cost_lkr=24_000,
    parts_cost_lkr=500_000,
    diagnostic_fee_lkr=7_500,
    # Categorical values (label-encoded at inference time)
    model='Hyundai Kona Electric',
    repair_type='Battery Replacement',
    service_provider='Authorized Dealer',
    region='Colombo',
    public_charging_frequency='Weekly',
    # Boolean flags (cast to int at inference time)
    home_charging_available=True,
    under_warranty=False,
    # Regional pricing factor
    region_cost_multiplier=1.15,
)


In [37]:
# Build a one-row frame for the example vehicle. It gets its own name so the raw
# dataset in `df` is not overwritten and earlier cells stay re-runnable.
input_row = pd.DataFrame([input_data])

# (encoded column, key in `encoders`, raw input column)
label_encoded_columns = [
    ('model_encoded', 'model', 'model'),
    ('repair_type_encoded', 'repair_type', 'repair_type'),
    ('service_provider_encoded', 'service_provider', 'service_provider'),
    ('region_encoded', 'region', 'region'),
    ('charging_freq_encoded', 'charging_frequency', 'public_charging_frequency'),
]
for encoded_col, encoder_key, raw_col in label_encoded_columns:
    input_row[encoded_col] = safe_label_transform(
        encoders[encoder_key], input_row.loc[0, raw_col]
    )

input_row['home_charging_encoded'] = int(input_row.loc[0, 'home_charging_available'])
input_row['under_warranty_encoded'] = int(input_row.loc[0, 'under_warranty'])

# Selecting feature_cols both orders the columns as the model was trained and
# discards the raw categorical/boolean columns, so no separate drop is needed.
input_features = input_row[feature_cols]

# The loaded artifact is the out-of-pocket model, so this is the out-of-pocket estimate.
prediction = model.predict(input_features)[0]
print(f"Predicted Repair Cost: LKR {prediction:,.2f}")

Predicted Repair Cost: LKR 554,943.78
