In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score
import shap

In [7]:
# Load data
df = pd.read_csv('/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/Cl_no_FR/PT_Cleaned.csv')
var_to_include = ['Em_on_target', 'Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']
dfdt = df[var_to_include]

# Drop duplicates
dfdt = dfdt.drop_duplicates()

# Drop vehicles with 'Energy' as 'electric'
dfdt = dfdt[dfdt['Energy'] != 'electric']

In [8]:
# Split data
X = dfdt.drop('Em_on_target', axis=1)
y = dfdt['Em_on_target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define OneHotEncoder and ColumnTransformer
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ],
    remainder='passthrough'
)

In [10]:
# Create and fit the modeling pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [11]:
# Define parameter grid for RandomizedSearchCV
param_grid = {
    'feature_selection__k': [10, 15],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__min_samples_leaf': [1, 5, 10],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1]
}

In [12]:
# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, cv=2, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)
print("Best parameters:", random_search.best_params_)
print("Best cross-validated score:", random_search.best_score_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Best parameters: {'feature_selection__k': 15, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 15, 'classifier__ccp_alpha': 0.0}
Best cross-validated score: 0.9997302740415738
Confusion Matrix:
 [[11028     2]
 [    0  2873]]
Test Accuracy: 0.999856146155506


In [13]:
# Extract important features from the best model
def get_feature_names(column_transformer):
    output_features = []
    for name, transformer, features in column_transformer.transformers_:
        if name != 'remainder':
            if hasattr(transformer, 'get_feature_names_out'):
                transformer_features = transformer.get_feature_names_out(features)
            else:
                transformer_features = [f"{name}_{f}" for f in features]
            output_features.extend(transformer_features)
        else:
            output_features.extend(features)
    return output_features

feature_selector = best_model.named_steps['feature_selection']
features_selected = feature_selector.get_support()
transformed_features = get_feature_names(best_model.named_steps['preprocessor'])
important_features = [feature for feature, selected in zip(transformed_features, features_selected) if selected]
print("Important features:", important_features)

Important features: ['Energy_diesel', 'Energy_hybrid diesel', 'Energy_hybrid petrol', 'Energy_petrol', 'Fuel_mode_M', 'Fuel_mode_P', 'Brand_VOLVO', 'Veh_Model_A 250 E', 'Veh_Model_GLC 300 DE 4MATIC', 0, 2, 3, 4, 5, 6]


In [17]:
# Extract feature importances
importances = pd.DataFrame({
    "Feature": important_features,
    "Importance": best_model.named_steps['classifier'].feature_importances_
}).sort_values(by="Importance", ascending=False)
print("Feature importances:\n", importances.head(15))

from sklearn.metrics import classification_report

print("Classification Report for Decision Tree:")
report_dt = classification_report(y_test, y_pred)
print(report_dt)

Feature importances:
                         Feature  Importance
10                            2    0.998028
11                            3    0.000806
9                             0    0.000474
4                   Fuel_mode_M    0.000329
12                            4    0.000247
13                            5    0.000062
14                            6    0.000055
0                 Energy_diesel    0.000000
1          Energy_hybrid diesel    0.000000
2          Energy_hybrid petrol    0.000000
3                 Energy_petrol    0.000000
5                   Fuel_mode_P    0.000000
6                   Brand_VOLVO    0.000000
7             Veh_Model_A 250 E    0.000000
8   Veh_Model_GLC 300 DE 4MATIC    0.000000
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11030
           1       1.00      1.00      1.00      2873

    accuracy                           1.00     13903
   macro avg   