In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [9]:
# Load data
df = pd.read_csv('PtCO2180424.csv')
var_to_include = ['Em_on_target', 'Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type']
dfdt = df[var_to_include]

In [10]:
# Select relevant features and target
features = ['Fuel consumption', 'Engine_cm3', 'Electric range (km)', 'Kg_veh', 'Test_mass', 'Power_KW', 'El_Consumpt_whkm', 'Energy', 'Fuel_mode', 'Brand', 'Veh_type']
target = 'Em_on_target'

X = df[features]
y = df[target]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Categorical and numerical features
categorical_features = ['Energy', 'Fuel_mode', 'Brand', 'Veh_type']
numerical_features = [feat for feat in features if feat not in categorical_features]

# Creating the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Pipeline for Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Pipeline for SVM
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

In [12]:
# Fit Random Forest
rf_pipeline.fit(X_train, y_train)
# Predict with Random Forest
rf_pred = rf_pipeline.predict(X_test)
# Evaluate Random Forest
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')

# Fit SVM
svm_pipeline.fit(X_train, y_train)
# Predict with SVM
svm_pred = svm_pipeline.predict(X_test)
# Evaluate SVM
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f'SVM Accuracy: {svm_accuracy}')

Random Forest Accuracy: 1.0
SVM Accuracy: 0.9987217650148835


In [13]:
from sklearn.metrics import classification_report

# Fit, predict, and evaluate the Random Forest model
rf_pipeline.fit(X_train, y_train)
rf_pred = rf_pipeline.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))

# Fit, predict, and evaluate the SVM model
svm_pipeline.fit(X_train, y_train)
svm_pred = svm_pipeline.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f'SVM Accuracy: {svm_accuracy}')
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

# Perform Grid Search to optimize the Random Forest model
rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_features': ['sqrt'],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1]
}
rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
rf_best_pred = rf_grid_search.best_estimator_.predict(X_test)
print("Optimized Random Forest Classification Report:\n", classification_report(y_test, rf_best_pred))

# Perform Grid Search to optimize the SVM model
svm_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear'],
    'classifier__gamma': ['scale', 'auto']
}
svm_grid_search = GridSearchCV(estimator=svm_pipeline, param_grid=svm_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(X_train, y_train)
svm_best_pred = svm_grid_search.best_estimator_.predict(X_test)
print("Optimized SVM Classification Report:\n", classification_report(y_test, svm_best_pred))

Random Forest Accuracy: 1.0
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95809
           1       1.00      1.00      1.00     18411

    accuracy                           1.00    114220
   macro avg       1.00      1.00      1.00    114220
weighted avg       1.00      1.00      1.00    114220

SVM Accuracy: 0.9987217650148835
SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95809
           1       0.99      1.00      1.00     18411

    accuracy                           1.00    114220
   macro avg       1.00      1.00      1.00    114220
weighted avg       1.00      1.00      1.00    114220

Optimized Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95809
           1       1.00      1.00      1.00     18411

    acc



Optimized SVM Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     95809
           1       1.00      1.00      1.00     18411

    accuracy                           1.00    114220
   macro avg       1.00      1.00      1.00    114220
weighted avg       1.00      1.00      1.00    114220



In [14]:
# Grid search parameters for Random Forest
rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Grid search parameters for SVM
svm_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear'],
    'classifier__gamma': ['scale', 'auto']
}

# Grid search parameters for Random Forest
rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_features': ['sqrt'],  # Updated from 'auto' to 'sqrt'
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__ccp_alpha': [0.0, 0.01, 0.1]
}

# Initialize and fit the GridSearchCV
rf_grid_search = GridSearchCV(estimator=rf_pipeline, param_grid=rf_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best model and parameters
print("Best Parameters:", rf_grid_search.best_params_)
print("Best cross-validated accuracy:", rf_grid_search.best_score_)

Best Parameters: {'classifier__ccp_alpha': 0.0, 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-validated accuracy: 0.9999956224589742


In [15]:
# Feature importance

In [16]:
feature_importances_rf = rf_grid_search.best_estimator_.named_steps['classifier'].feature_importances_
feature_names = X_train.columns  # Adjust this if your feature names are stored differently

# Create a DataFrame to view the feature importances
import pandas as pd
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances_rf
}).sort_values(by='Importance', ascending=False)

print("Random Forest Feature Importances:")
print(feature_importance_df)

ValueError: All arrays must be of the same length

In [None]:
### coefficients SVM

# Check if the SVM model is linear and extract coefficients if true
if 'linear' in svm_grid_search.best_params_['classifier__kernel']:
    svm_coefficients = svm_grid_search.best_estimator_.named_steps['classifier'].coef_[0]
    svm_feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': svm_coefficients
    }).sort_values(by='Importance', key=abs, ascending=False)

    print("SVM Feature Importances (Linear Kernel):")
    print(svm_feature_importance_df)
else:
    print("SVM is using a non-linear kernel; feature importance is not directly interpretable.")