In [3]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix

In [5]:
# Define a function to get feature names from ColumnTransformer
def get_feature_names(column_transformer):
    """Return the output feature names of a fitted ColumnTransformer as a list.

    Parameters
    ----------
    column_transformer : object exposing ``transformers_``
        Iterable of ``(name, transformer, columns)`` triples, as stored by a
        fitted ``ColumnTransformer``. If it also exposes ``feature_names_in_``,
        integer column positions are translated to the input column names.

    Returns
    -------
    list
        One entry per output column, in the order of ``transformers_``.

    Notes
    -----
    * Transformers set to the string ``'drop'`` (including a dropped
      remainder) contribute no output columns and are skipped.
    * Non-remainder transformers without ``get_feature_names_out`` get names
      of the form ``"<transformer name>_<column>"``.
    """
    import numbers  # local import so this cell does not depend on a later one

    input_names = getattr(column_transformer, 'feature_names_in_', None)

    def _as_names(cols):
        # A single string selects one column; do not iterate over its characters.
        if isinstance(cols, str):
            cols = [cols]
        names = []
        for col in cols:
            is_position = isinstance(col, numbers.Integral) and not isinstance(col, bool)
            # Remainder columns may be stored as integer positions; map them back
            # to the original column names when those are known.
            if is_position and input_names is not None:
                names.append(input_names[col])
            else:
                names.append(col)
        return names

    output_features = []
    for name, transformer, features in column_transformer.transformers_:
        # Dropped columns never reach the output matrix.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        resolved = _as_names(features)
        if name == 'remainder':
            output_features.extend(resolved)
        elif hasattr(transformer, 'get_feature_names_out'):
            output_features.extend(transformer.get_feature_names_out(resolved))
        else:
            output_features.extend(f"{name}_{f}" for f in resolved)
    return output_features

In [7]:
# Load data
# Explicit per-column dtypes keep the memory footprint of the frame small
dtype_spec = {
    'Em_on_target': 'category',
    'Fuel consumption': 'float32',
    'Engine_cm3': 'float32',
    'Electric range (km)': 'float32',
    'Kg_veh': 'float32',
    'Test_mass': 'float32',
    'Power_KW': 'float32',
    'El_Consumpt_whkm': 'float32',
    'Energy': 'category',
    'Fuel_mode': 'category',
    'Brand': 'category',
    'Veh_type': 'category',
    'Veh_Model': 'category',
    'year': 'int32'
}
# Only the columns that have a declared dtype are read from the CSV
columns = list(dtype_spec)
csv_path = '/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Concatenate/PT_FR_ES_IT.csv'
dfdt = pd.read_csv(
    csv_path,
    usecols=columns,
    dtype=dtype_spec,
    low_memory=False,
)
# Engine displacement: coerce unparseable values to NaN, replace missing values
# with 0, then store the column as int32
engine_cm3 = pd.to_numeric(dfdt['Engine_cm3'], errors='coerce').astype('float32')
dfdt['Engine_cm3'] = engine_cm3.fillna(0).astype('int32')


NameError: name 'pd' is not defined

In [9]:
# Remove fully duplicated rows, rebinding the name instead of mutating in place
dfdt = dfdt.drop_duplicates()

In [11]:
# Print the frame summary: index range, columns, non-null counts, dtypes and memory usage
dfdt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 868057 entries, 0 to 22786155
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   Veh_type             868057 non-null  category
 1   Brand                868057 non-null  category
 2   Veh_Model            868057 non-null  category
 3   Kg_veh               868057 non-null  float32 
 4   Test_mass            868057 non-null  float32 
 5   Energy               868057 non-null  category
 6   Fuel_mode            868057 non-null  category
 7   Engine_cm3           868057 non-null  int32   
 8   Power_KW             868057 non-null  float32 
 9   El_Consumpt_whkm     868057 non-null  float32 
 10  Fuel consumption     868057 non-null  float32 
 11  Electric range (km)  868057 non-null  float32 
 12  Em_on_target         868057 non-null  category
dtypes: category(6), float32(6), int32(1)
memory usage: 36.6 MB


In [13]:
# Separate the target column from the predictors, then hold out 20% of the rows for testing
y = dfdt['Em_on_target']
X = dfdt.drop(columns=['Em_on_target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# One-hot encode the categorical columns; every other column is passed through unchanged
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']
# Categories unseen during fit are ignored at transform time; output is a dense array
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

In [17]:
# Modeling pipeline: preprocessing -> univariate feature selection -> decision tree
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ]
)
# Fit on the training split
pipeline.fit(X_train, y_train)

In [20]:
# Score the held-out split with the fitted pipeline
y_pred = pipeline.predict(X_test)
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", test_confusion)

Confusion Matrix:
 [[150031     18]
 [    18  23545]]


In [22]:
# Exhaustive search over the selector's k and the tree's complexity parameters
param_grid = dict(
    feature_selection__k=[10, 15],
    classifier__max_depth=[None, 5, 10],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 5],
    classifier__ccp_alpha=[0.0, 0.01],
)
# 2-fold cross-validation scored by accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)



KeyboardInterrupt: 

In [None]:
# Names of the transformed columns that the best pipeline's selector kept
best_model = grid_search.best_estimator_
feature_selector = best_model.named_steps['feature_selection']
# Boolean mask with one entry per transformed column
features_selected = feature_selector.get_support()
transformed_features = get_feature_names(best_model.named_steps['preprocessor'])
important_features = [
    column_name
    for column_name, is_kept in zip(transformed_features, features_selected)
    if is_kept
]
print("Important features:", important_features)


In [None]:
## Feature importance
import numpy as np

def get_final_feature_names(pipeline, input_features, dtype_dict):
    """Return the names of the features that reach the pipeline's final estimator.

    Parameters
    ----------
    pipeline : fitted pipeline-like object
        Must expose ``named_steps`` with a ``'preprocessor'`` step providing
        ``get_feature_names_out()``. An optional ``'feature_selection'`` step
        must provide ``get_support()`` returning a boolean mask.
    input_features, dtype_dict
        Accepted for backward compatibility and not used. The preprocessor
        reports its output names directly, so no dummy data has to be built
        and pushed through it.

    Returns
    -------
    numpy.ndarray
        Output names of the preprocessor, restricted to the columns kept by
        the feature-selection step when that step is present.
    """
    feature_names = np.asarray(pipeline.named_steps['preprocessor'].get_feature_names_out())

    # Without a selection step every preprocessed column reaches the estimator.
    if 'feature_selection' not in pipeline.named_steps:
        return feature_names

    support_mask = np.asarray(
        pipeline.named_steps['feature_selection'].get_support(), dtype=bool
    )
    return feature_names[support_mask]

# Per-column dtypes of the training frame, passed on to get_final_feature_names
dtype_dict = X_train.dtypes.to_dict()

# Names of the columns seen by the classifier of the best model
final_features = get_final_feature_names(best_model, X_train.columns, dtype_dict)

# Pair each name with the tree's importance for it, most important first
tree_importances = best_model.named_steps['classifier'].feature_importances_
importances = pd.DataFrame(
    {"Feature": final_features, "Importance": tree_importances}
).sort_values(by="Importance", ascending=False)

print("Feature importances:\n", importances.head(8))

In [None]:
### Best model parameters
from sklearn.metrics import accuracy_score

# Define the OneHotEncoder and ColumnTransformer.
# 'Veh_Model' must be encoded as well: with remainder='passthrough' an unencoded
# string-category column would be handed to the decision tree as-is, and the tree
# cannot convert it to a number.
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model']  # same categorical columns as before
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ],
    remainder='passthrough'
)

# Create the modeling pipeline including preprocessing (no feature-selection step here)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Parameter grid for the decision tree
param_grid = {
    'classifier__min_samples_leaf': [1, 4, 6, 10],
    'classifier__min_samples_split': [2, 10, 20],
    'classifier__max_depth': [None, 5, 10, 15],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1]
}

# Initialize and fit the GridSearchCV (2-fold CV, accuracy, all cores).
# The search fits clones of `pipeline`; the fitted result is `best_estimator_`.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=2, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model and parameters
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model on the held-out split
y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# Extracting feature importance from the best model
feature_importances = best_model.named_steps['classifier'].feature_importances_
transformed_features = best_model.named_steps['preprocessor'].get_feature_names_out()

# Pairing feature names with their importances, most important first
importances = pd.DataFrame({
    'Feature': transformed_features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print(importances.head(10))

In [None]:
pip install shap

In [None]:
import shap

# Load SHAP's JavaScript visualization code into the notebook so that
# interactive SHAP plots can render
shap.initjs()

In [None]:
pip install graphviz

In [None]:
# Function to extract feature names
def get_feature_names(column_transformer):
    """Return the output column names of a fitted ColumnTransformer as a list.

    Parameters
    ----------
    column_transformer : object exposing ``transformers_``
        Iterable of ``(name, transformer, columns)`` triples. If the object
        also exposes ``feature_names_in_``, integer column positions are
        translated to the corresponding input column names.

    Returns
    -------
    list
        Names in the order of ``transformers_``. Transformers that provide
        ``get_feature_names_out`` report their own names; all others
        contribute their column names unchanged.

    Notes
    -----
    Transformers set to the string ``'drop'`` (including a dropped remainder)
    produce no output columns and are skipped.
    """
    import numbers  # local import keeps this cell self-contained

    input_names = getattr(column_transformer, 'feature_names_in_', None)

    def _as_names(cols):
        # A single string selects one column; do not iterate over its characters.
        if isinstance(cols, str):
            cols = [cols]
        names = []
        for col in cols:
            is_position = isinstance(col, numbers.Integral) and not isinstance(col, bool)
            # Remainder columns may be stored as integer positions; map them back
            # to the original column names when those are known.
            if is_position and input_names is not None:
                names.append(input_names[col])
            else:
                names.append(col)
        return names

    col_names = []
    for name, transformer, cols in column_transformer.transformers_:
        # Dropped columns never appear in the transformed matrix.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            col_names.extend(transformer.get_feature_names_out())
        else:
            # Covers the 'passthrough' string and transformers without name support.
            col_names.extend(_as_names(cols))
    return col_names

# Use the fitted best estimator: GridSearchCV fits clones of `pipeline`, so the
# `pipeline` object itself is never fitted and cannot transform data.
fitted_preprocessor = best_model.named_steps['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)
feature_names = get_feature_names(fitted_preprocessor)
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)

## check number of feature names
print("Number of feature names:", len(feature_names))
print("Number of features in transformed data:", X_train_transformed_df.shape[1])

# Check if any mismatch in numbers
assert len(feature_names) == X_train_transformed_df.shape[1], "Mismatch in number of features!"

# Initialize SHAP Explainer on the fitted tree, with the transformed training data as background
explainer = shap.Explainer(best_model.named_steps['classifier'], X_train_transformed_df)

# Generate SHAP values by calling the explainer on the transformed training data
shap_values = explainer(X_train_transformed_df)

# Initialize JavaScript for SHAP in Jupyter Notebooks
shap.initjs()

# Check if SHAP values are in a list (indicative of multi-class outputs)
if isinstance(shap_values, list):
    # Assuming we're interested in the first class (adjust as necessary)
    class_index = 0
    shap.summary_plot(shap_values[class_index], X_train_transformed_df, feature_names=feature_names)
else:
    shap.summary_plot(shap_values, X_train_transformed_df, feature_names=feature_names)
# Simple bar plot to display the mean absolute values of SHAP for all features
shap.summary_plot(shap_values, X_train_transformed_df, plot_type="bar")

In [None]:
# Bar-style summary plot of the SHAP values for the transformed training features.
# Relies on `shap_values` and `X_train_transformed_df` from the preceding cell,
# which ends with this same call.
shap.summary_plot(shap_values, X_train_transformed_df, plot_type="bar")

In [None]:
# Debug feature names
print(type(feature_names))
print(feature_names[:10])

# Initialize SHAP Explainer on the classifier of the fitted best estimator.
# (`pipeline` is only the template passed to GridSearchCV and is never fitted itself.)
explainer = shap.Explainer(best_model.named_steps['classifier'], X_train_transformed_df)
shap_values = explainer(X_train_transformed_df)

# Initialize JavaScript for visualization in Jupyter Notebooks
shap.initjs()

# shap.plots.bar() is only called when shap_values is an Explanation object
if isinstance(shap_values, shap.Explanation):
    shap.plots.bar(shap_values)
else:
    print("shap_values must be an shap.Explanation object for shap.plots.bar().")

In [None]:
# SHAP cannot explain an sklearn Pipeline object directly, and the tree only
# understands preprocessed columns. Transform the test split with the fitted
# preprocessor, then explain the fitted decision tree on that matrix.
X_test_transformed = best_model.named_steps['preprocessor'].transform(X_test)
explainer = shap.TreeExplainer(best_model.named_steps['classifier'])
shap_values = explainer.shap_values(X_test_transformed)

In [None]:


# draw model
# Imported in this cell because neither name is imported in the notebook's import cell.
import graphviz
from sklearn.tree import export_graphviz

# NOTE(review): `single_split_model` is not defined in any visible cell; it must be a
# fitted tree estimator created before this cell runs — confirm where it comes from.
dot_data = export_graphviz(
    single_split_model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Last expression of the cell: the notebook renders the graph
graph