In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [18]:
# Load data
# Define the data types for each column to optimize memory usage
dtype_spec = {
    'Em_on_target': 'float32',
    'Fuel consumption': 'float32',
    'Engine_cm3': 'float32',
    'Electric range (km)': 'float32',
    'Kg_veh': 'float32',
    'Test_mass': 'float32',
    'Power_KW': 'float32',
    'El_Consumpt_whkm': 'float32',
    'Energy': 'category',
    'Fuel_mode': 'category',
    'Brand': 'category',
    'Veh_type': 'category',
    'Veh_Model': 'category',
    'year': 'float32',
    'Version': 'category'
}
columns = ['Em_on_target', 'Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model','year','Version','Country']
df = pd.read_csv('/Users/livalacaisse/Documents/DataScience/CO2/000-C02 First Delivery/Cleaned_countries/FR_Cleaned.csv', usecols=columns, dtype=dtype_spec, low_memory=False)
# Convert data types after inspection
df['Engine_cm3'] = pd.to_numeric(df['Engine_cm3'], errors='coerce').astype('float32').fillna(0).astype('int32')

df.drop_duplicates(inplace=True)

In [20]:
# NaN status
# Calculate NaN counts and percentages for each column
nan_counts = df.isna().sum()
nan_percentages = (df.isna().sum() / len(df)) * 100

# Create a DataFrame to display the NaN information in a tidy format
nan_df = pd.DataFrame({
    'NaN Count': nan_counts,
    'NaN Percentage': nan_percentages
})

# Print the DataFrame
print(nan_df)

                     NaN Count  NaN Percentage
Country                      0             0.0
Veh_type                     0             0.0
Version                      0             0.0
Brand                        0             0.0
Veh_Model                    0             0.0
Kg_veh                       0             0.0
Test_mass                    0             0.0
Energy                       0             0.0
Fuel_mode                    0             0.0
Engine_cm3                   0             0.0
Power_KW                     0             0.0
El_Consumpt_whkm             0             0.0
year                         0             0.0
Fuel consumption             0             0.0
Electric range (km)          0             0.0
Em_on_target                 0             0.0


In [21]:
# Select relevant features and target
features = ['Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model','year','Version']
target = 'Em_on_target'

X = df[features]
y = df[target]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Categorical and numerical features
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type', 'Veh_Model','Version']
numerical_features = [feat for feat in features if feat not in categorical_features]

# Creating the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Pipeline for Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, max_features='sqrt'))
])

# Pipeline for SVM
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

In [23]:
from sklearn.metrics import classification_report

# Fit, predict, and evaluate the Random Forest model
rf_pipeline.fit(X_train, y_train)
rf_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))

# Fit, predict, and evaluate the SVM model
svm_pipeline.fit(X_train, y_train)
svm_pred = svm_pipeline.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f'SVM Accuracy: {svm_accuracy}')
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

Random Forest Accuracy: 0.9998089032868634
Random Forest Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     95011
         1.0       1.00      1.00      1.00     14881

    accuracy                           1.00    109892
   macro avg       1.00      1.00      1.00    109892
weighted avg       1.00      1.00      1.00    109892

SVM Accuracy: 0.99976340406945
SVM Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     95011
         1.0       1.00      1.00      1.00     14881

    accuracy                           1.00    109892
   macro avg       1.00      1.00      1.00    109892
weighted avg       1.00      1.00      1.00    109892



In [24]:
### Optimisation wiht RandomizedSearch

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Random search parameters for Random Forest
rf_param_dist = {
    'classifier__max_depth': randint(3, 20),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 10),
    'classifier__max_features': ['sqrt', None],  # assuming you've adjusted this from 'auto' as discussed previously
    'classifier__n_estimators': randint(100, 200)  # Example parameter for number of trees
}

In [29]:
# Initialize and fit the RandomizedSearchCV
rf_random_search = RandomizedSearchCV(
    estimator=rf_pipeline, 
    param_distributions=rf_param_dist, 
    n_iter=50,  # Adjusted to match the actual size of the parameter space
    cv=2, 
    scoring='accuracy', 
    n_jobs=-1, 
    random_state=42
)
rf_random_search.fit(X_train, y_train)

# Output the best model parameters and accuracy
print("Best Parameters:", rf_random_search.best_params_)
print("Best cross-validated accuracy:", rf_random_search.best_score_)

Best Parameters: {'classifier__max_depth': 19, 'classifier__max_features': None, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 6, 'classifier__n_estimators': 195}
Best cross-validated accuracy: 0.9998202764557607


In [30]:
# Random search parameters for SVM
svm_param_dist = {
    'classifier__C': np.logspace(-3, 2, 6),
    'classifier__kernel': ['rbf', 'linear'],
    'classifier__gamma': ['scale', 'auto', 0.1, 1, 10]
}

In [40]:
# Initialize and fit the RandomizedSearchCV
svm_random_search = RandomizedSearchCV(estimator=svm_pipeline, param_distributions=svm_param_dist, n_iter=20, cv=2, scoring='accuracy', n_jobs=-1, random_state=42)
svm_random_search.fit(X_train, y_train)

# Best model and parameters
print("Best Parameters:", svm_random_search.best_params_)
print("Best cross-validated accuracy:", svm_random_search.best_score_)




Best Parameters: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__C': 100.0}
Best cross-validated accuracy: 0.9998271014004787


In [41]:
# Feature importance

In [42]:
# Feature importance from Random Forest
if hasattr(rf_random_search.best_estimator_.named_steps['classifier'], 'feature_importances_'):
    feature_importances_rf = rf_random_search.best_estimator_.named_steps['classifier'].feature_importances_
    feature_names = preprocessor.transformers_[0][-1] + preprocessor.transformers_[1][-1].get_feature_names_out().tolist()  # Adjust this if your feature names are stored differently

    # Create a DataFrame to view the feature importances
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances_rf
    }).sort_values(by='Importance', ascending=False)

    print("Random Forest Feature Importances:")
    print(feature_importance_df)
else:
    print("No feature_importances_ attribute available.")

AttributeError: 'list' object has no attribute 'get_feature_names_out'

In [None]:
# Feature importance for SVM
if 'linear' in svm_random_search.best_params_['classifier__kernel']:
    svm_coefficients = svm_random_search.best_estimator_.named_steps['classifier'].coef_[0]
    feature_names = preprocessor.transformers_[0][-1] + preprocessor.transformers_[1][-1].get_feature_names_out().tolist()  # Adjust if different

    svm_feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': svm_coefficients
    }).sort_values(by='Importance', key=abs, ascending=False)

    print("SVM Feature Importances (Linear Kernel):")
    print(svm_feature_importance_df)
else:
    print("SVM is using a non-linear kernel; feature importance is not directly interpretable.")
