In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score
import shap

In [25]:
# Load data
df = pd.read_csv('/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/Cl_no_FR/PT_Cleaned.csv')
var_to_include = ['Em_on_target', 'Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']
dfdt = df[var_to_include]

In [26]:
# Split data
X = dfdt.drop('Em_on_target', axis=1)
y = dfdt['Em_on_target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Define OneHotEncoder and ColumnTransformer
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ],
    remainder='passthrough'
)

In [28]:
# Create and fit the modeling pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [29]:
# Define parameter grid for GridSearchCV
param_grid = {
    'feature_selection__k': [10, 15],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__min_samples_leaf': [1, 5, 10],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1]
}

In [30]:
# Perform GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

Best parameters: {'classifier__ccp_alpha': 0.0, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'feature_selection__k': 10}
Best cross-validated score: 0.9999971119216984


In [31]:
# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[152618      0]
 [     0  20508]]
Test Accuracy: 1.0


In [32]:
# Extract important features from the best model
def get_feature_names(column_transformer):
    output_features = []
    for name, transformer, features in column_transformer.transformers_:
        if name != 'remainder':
            if hasattr(transformer, 'get_feature_names_out'):
                transformer_features = transformer.get_feature_names_out(features)
            else:
                transformer_features = [f"{name}_{f}" for f in features]
            output_features.extend(transformer_features)
        else:
            output_features.extend(features)
    return output_features

feature_selector = best_model.named_steps['feature_selection']
features_selected = feature_selector.get_support()
transformed_features = get_feature_names(best_model.named_steps['preprocessor'])
important_features = [feature for feature, selected in zip(transformed_features, features_selected) if selected]
print("Important features:", important_features)

Important features: ['Energy_electric', 'Energy_hybrid petrol', 'Fuel_mode_E', 'Fuel_mode_M', 'Fuel_mode_P', 0, 2, 3, 4, 6]


In [33]:
# Extract feature importances
importances = pd.DataFrame({
    "Feature": important_features,
    "Importance": best_model.named_steps['classifier'].feature_importances_
}).sort_values(by="Importance", ascending=False)
print("Feature importances:\n", importances.head(8))

Feature importances:
                 Feature  Importance
6                     2    0.971286
5                     0    0.021044
8                     4    0.005928
7                     3    0.001177
9                     6    0.000513
3           Fuel_mode_M    0.000053
0       Energy_electric    0.000000
1  Energy_hybrid petrol    0.000000


In [40]:
from sklearn.metrics import classification_report

In [43]:
print("Classification Report for Decision Tree:")
report_dt = classification_report(y_test, y_pred)
print(report_dt)

Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    152618
           1       1.00      1.00      1.00     20508

    accuracy                           1.00    173126
   macro avg       1.00      1.00      1.00    173126
weighted avg       1.00      1.00      1.00    173126

