In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
import joblib

In [14]:
# Load the new dataset that the saved models will be evaluated on.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact file exists.
new_data_file = r"/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/Cl_no_FR/DT_Cleaned.csv"

# Start with every listed column typed as float32, then override the two
# columns that are not floats ('Energy' and 'year') in place.
dtype_spec = dict.fromkeys(
    [
        'Fuel consumption',
        'Engine_cm3',
        'Kg_veh',
        'Test_mass',
        'Power_KW',
        'El_Consumpt_whkm',
        'Energy',
        'year',
        'Wheelbase_mm',
        'Axle_width_steer_mm',
        'Axle_width_other_mm',
        'Eco-innovation program',
        'Electric range (km)',
        'Erwltp (g/km)',
        'CO2_wltp',
    ],
    'float32',
)
dtype_spec['Energy'] = 'category'
dtype_spec['year'] = 'int64'

new_data = pd.read_csv(new_data_file, low_memory=False, dtype=dtype_spec)

In [15]:
# Feature columns used for each energy type.
# Keys are the labels compared against the 'Energy' column when rows are
# filtered; each list gives the feature columns (in order) selected for that
# energy type.
features_dict = {
    'petrol': [
        'Fuel consumption',
        'Wheelbase_mm',
        'Engine_cm3',
        'Power_KW',
        'Axle_width_steer_mm',
        'Test_mass',
        'Axle_width_other_mm',
        'Eco-innovation program',
        'Kg_veh',
        'year',
        'Erwltp (g/km)',
        'El_Consumpt_whkm',
    ],
    'diesel': [
        'Wheelbase_mm',
        'Axle_width_other_mm',
        'Power_KW',
        'Test_mass',
        'Axle_width_steer_mm',
        'Kg_veh',
        'Engine_cm3',
        'Fuel consumption',
        'year',
        'Erwltp (g/km)',
        'Eco-innovation program',
        'El_Consumpt_whkm',
    ],
    'lpg': [
        'Kg_veh',
        'Fuel consumption',
        'Axle_width_steer_mm',
        'Axle_width_other_mm',
        'year',
        'Test_mass',
        'Power_KW',
        'Erwltp (g/km)',
        'Engine_cm3',
        'Wheelbase_mm',
        'Eco-innovation program',
        'Electric range (km)',
    ],
    'hybrid petrol': [
        'Engine_cm3',
        'Axle_width_other_mm',
        'Axle_width_steer_mm',
        'Fuel consumption',
        'El_Consumpt_whkm',
        'Power_KW',
        'Electric range (km)',
        'year',
        'Wheelbase_mm',
        'Eco-innovation program',
        'Test_mass',
        'Kg_veh',
    ],
    'hybrid diesel': [
        'El_Consumpt_whkm',
        'Axle_width_other_mm',
        'Electric range (km)',
        'Axle_width_steer_mm',
        'year',
        'Kg_veh',
        'Test_mass',
        'Fuel consumption',
        'Wheelbase_mm',
        'Eco-innovation program',
        'Erwltp (g/km)',
        'Power_KW',
    ],
}

In [16]:
# Function to prepare data for prediction
def prepare_data_for_prediction(df, energy_types, features_dict):
    """Select the feature matrix and the CO2 target for one or several energy types.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Energy' column, a 'CO2_wltp' column and the feature
        columns listed in ``features_dict`` for the requested energy type(s).
    energy_types : str or list of str
        A single energy label, or a list of labels whose rows are stacked
        (in list order) into one frame.
    features_dict : dict
        Maps each energy label to the list of its feature column names.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns, ordered like ``features``.
    y : pandas.Series
        The 'CO2_wltp' column of the selected rows.
    features : list of str
        Column names of ``X``. For a list of energy types this is the union of
        their feature lists in first-seen order (deterministic across runs).

    When no rows match, ``(empty DataFrame, empty Series, [])`` is returned and
    a message is printed.

    Notes
    -----
    In the list case each energy's rows keep only that energy's own features
    before stacking, so a feature that is not listed for an energy is NaN on
    that energy's rows. A feature listed only for energy types that have no
    rows is still returned, as an all-NaN column.
    """
    if isinstance(energy_types, list):
        per_energy_frames = []
        for energy in energy_types:
            df_energy = df[df['Energy'] == energy].copy()
            if df_energy.empty:
                print(f"No data for energy type: {energy}")
                continue
            per_energy_frames.append(df_energy[features_dict[energy] + ['CO2_wltp']])
        if not per_energy_frames:
            # Explicit dtype: a bare pd.Series() warns on older pandas versions.
            return pd.DataFrame(), pd.Series(dtype='float64'), []
        df_combined = pd.concat(per_energy_frames, axis=0)
        # Order-preserving de-duplication. A set would make the column order
        # depend on string hash randomisation, i.e. vary between kernel runs.
        features = list(dict.fromkeys(
            feat for energy in energy_types for feat in features_dict[energy]
        ))
        # reindex (rather than plain selection) so that a feature belonging only
        # to skipped energy types becomes a NaN column instead of a KeyError.
        X = df_combined.reindex(columns=features)
    else:
        df_combined = df[df['Energy'] == energy_types].copy()
        if df_combined.empty:
            print(f"No data for energy type: {energy_types}")
            return pd.DataFrame(), pd.Series(dtype='float64'), []
        # Copy so callers cannot mutate the list stored in features_dict.
        features = list(features_dict[energy_types])
        X = df_combined[features]

    y = df_combined['CO2_wltp']

    return X, y, features


In [17]:
# Saved model files, keyed by the energy type each model is evaluated on.
# The 'all' entry is evaluated on every energy type in features_dict combined.
models = dict([
    ('petrol', 'best_model_petrol.joblib'),
    ('diesel', 'best_model_diesel.joblib'),
    ('lpg', 'best_model_lpg.joblib'),
    ('hybrid petrol', 'best_model_hybrid_petrol.joblib'),
    ('hybrid diesel', 'best_model_hybrid_diesel.joblib'),
    ('all', 'best_model_all.joblib'),
])


In [18]:
# Test all models on the new dataset
def test_models_on_new_data(new_data, models, features_dict, degree=2):
    """Score every saved model on ``new_data`` and collect R^2 and MSE.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Dataset passed to ``prepare_data_for_prediction`` for row/column selection.
    models : dict
        Maps an energy label (or the special key ``'all'``) to the path of a
        model file readable by ``joblib.load``.
    features_dict : dict
        Maps each energy label to its list of feature column names. For the
        ``'all'`` model, every key of this dict is used.
    degree : int, default 2
        Degree of the polynomial feature expansion applied before scaling.

    Returns
    -------
    dict
        ``{energy: {'r2': float, 'mse': float}}`` for every model that had
        data to be scored on. Energy types without data are skipped.

    Notes
    -----
    NOTE(review): the polynomial expansion and the StandardScaler are fitted on
    the evaluation data itself. If the saved models were trained on features
    scaled with training-set statistics, the fitted training-time transformers
    should be persisted and reused here instead. Confirm against the training
    notebook.
    """
    results = {}

    for energy, model_file in models.items():
        print(f"\nTesting model for {energy}...")

        # Load the saved model.
        # NOTE(review): joblib.load unpickles arbitrary Python objects; only
        # load model files that come from a trusted source.
        model = joblib.load(model_file)

        # The 'all' model is evaluated on every known energy type, stacked.
        energy_types = list(features_dict.keys()) if energy == 'all' else energy

        X, y, _ = prepare_data_for_prediction(new_data, energy_types, features_dict)

        if X.empty or y.empty:
            print(f"No data available for energy type: {energy}")
            continue

        # Rows without a target cannot be scored: the metric functions reject
        # NaN input. Drop them (positionally, so index labels do not matter).
        has_target = y.notna().to_numpy()
        if not has_target.all():
            print(f"Dropping {int((~has_target).sum())} rows with missing CO2_wltp for {energy}")
            X = X[has_target]
            y = y[has_target]
            if y.empty:
                print(f"No data available for energy type: {energy}")
                continue

        # Handle missing values: impute each feature with its column mean.
        X = X.fillna(X.mean())

        # Polynomial Features
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        X_poly = poly.fit_transform(X)

        # Standardize the features (see NOTE(review) in the docstring).
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_poly)

        # Predictions with the loaded model
        y_pred = model.predict(X_scaled)

        # Evaluation of the model
        r2 = r2_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)

        # Results reporting
        print(f"R^2 Score: {r2}")
        print(f"Mean Squared Error: {mse}")

        results[energy] = {
            'r2': r2,
            'mse': mse
        }

    return results

In [19]:
# Run the evaluation for every saved model; degree=2 is passed explicitly
# although it is also the function's default.
results = test_models_on_new_data(new_data, models, features_dict, degree=2)

# Report the collected metrics, one three-line paragraph per model.
for energy, metrics in results.items():
    print(
        f"\nResults for {energy} model:",
        f"R^2 Score: {metrics['r2']}",
        f"Mean Squared Error: {metrics['mse']}",
        sep="\n",
    )


Testing model for petrol...
R^2 Score: 0.7400788024872671
Mean Squared Error: 317.9210289000305

Testing model for diesel...
R^2 Score: -1.1900488172179098
Mean Squared Error: 2225.31301378844

Testing model for lpg...
R^2 Score: -1866.1765196275783
Mean Squared Error: 72296.3617563038

Testing model for hybrid petrol...
R^2 Score: -19.87219645043516
Mean Squared Error: 2779.7046266912093

Testing model for hybrid diesel...
R^2 Score: -133.15617182926636
Mean Squared Error: 17500.585644923685

Testing model for all...
R^2 Score: -115542.97565405961
Mean Squared Error: 318096028.44870085

Results for petrol model:
R^2 Score: 0.7400788024872671
Mean Squared Error: 317.9210289000305

Results for diesel model:
R^2 Score: -1.1900488172179098
Mean Squared Error: 2225.31301378844

Results for lpg model:
R^2 Score: -1866.1765196275783
Mean Squared Error: 72296.3617563038

Results for hybrid petrol model:
R^2 Score: -19.87219645043516
Mean Squared Error: 2779.7046266912093

Results for hybrid 