In [522]:
import warnings
warnings.filterwarnings('ignore')

In [523]:
import numpy as np
import pandas as pd
import hvplot.pandas
import plotly.express as px
from pathlib import Path
from collections import Counter
import dataframe_image as dfi

In [524]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.metrics import classification_report_imbalanced

In [525]:
car_clean_df = pd.read_csv("Resources/car_clean.csv")
car_clean_df.head()

Unnamed: 0.1,Unnamed: 0,manufacturer,model,year,selling_price,fuel,seller_type,mileage
0,0,Maruti,Swift Dzire VDI,2014,450000,Diesel,Individual,23.4 kmpl
1,1,Skoda,Rapid 1.5 TDI Ambition,2014,370000,Diesel,Individual,21.14 kmpl
2,2,Honda,City 2017-2020 EXi,2006,158000,Petrol,Individual,17.7 kmpl
3,3,Hyundai,i20 Sportz Diesel,2010,225000,Diesel,Individual,23.0 kmpl
4,4,Maruti,Swift VXI BSIII,2007,130000,Petrol,Individual,16.1 kmpl


In [526]:
# Split our preprocessed data into our features and target arrays
X = pd.get_dummies(car_clean_df.drop(columns="fuel").copy())
y = car_clean_df["fuel"].copy()

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Logistic Regression

In [529]:
# Train the Logistic Regression model
LR = LogisticRegression(solver='liblinear', random_state=1)

LR.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [530]:
# Check all classes in the model
LR.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [531]:
# Calculated the balanced accuracy score
y_pred = LR.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.4181174880876182

In [532]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[859,   0, 231],
       [ 13,   0,  14],
       [459,   0, 401]], dtype=int64)

In [533]:
# Create a DataFrame from the confusion matrix.
LR_cm = confusion_matrix(y_test, y_pred)

LR_cm_df = pd.DataFrame(
    LR_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

LR_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,859,0,231
Actual Other,13,0,14
Actual Petrol,459,0,401


In [534]:
# Print the imbalanced classification report
print("Logistic Regression")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Logistic Regression


Accuracy: 41.81% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.65      0.79      0.47      0.71      0.61      0.38      1090
      Other       0.00      0.00      1.00      0.00      0.00      0.00        27
     Petrol       0.62      0.47      0.78      0.53      0.60      0.35       860

avg / total       0.63      0.64      0.61      0.62      0.60      0.36      1977



### Balanced Random Forest Classifier

In [535]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [536]:
# Check all classes in the model
brfc.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [537]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.8518318332556296

In [538]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[983,  75,  32],
       [  1,  26,   0],
       [180,  86, 594]], dtype=int64)

In [539]:
# Create a DataFrame from the confusion matrix.
BRFC_cm = confusion_matrix(y_test, y_pred)

BRFC_cm_df = pd.DataFrame(
    BRFC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

BRFC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,983,75,32
Actual Other,1,26,0
Actual Petrol,180,86,594


In [540]:
# Print the imbalanced classification report
print("Balanced Random Forest Classifier")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Classifier


Accuracy: 85.18% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.84      0.90      0.80      0.87      0.85      0.73      1090
      Other       0.14      0.96      0.92      0.24      0.94      0.89        27
     Petrol       0.95      0.69      0.97      0.80      0.82      0.65       860

avg / total       0.88      0.81      0.87      0.83      0.84      0.70      1977



### Easy Ensemble AdaBoost Classifier

In [541]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eeac = EasyEnsembleClassifier(n_estimators=100, random_state=1)

eeac.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [542]:
# Check all classes in the model
eeac.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [543]:
# Calculated the balanced accuracy score
y_pred = eeac.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.7201286985015027

In [544]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[714,   8, 368],
       [  0,  23,   4],
       [193, 105, 562]], dtype=int64)

In [545]:
# Create a DataFrame from the confusion matrix.
EEAC_cm = confusion_matrix(y_test, y_pred)
EEAC_cm_df = pd.DataFrame(
    EEAC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

EEAC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,714,8,368
Actual Other,0,23,4
Actual Petrol,193,105,562


In [546]:
# Print the imbalanced classification report
print("Easy Ensemble AdaBoost Classifier")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Accuracy: 72.01% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.79      0.66      0.78      0.72      0.72      0.51      1090
      Other       0.17      0.85      0.94      0.28      0.90      0.80        27
     Petrol       0.60      0.65      0.67      0.63      0.66      0.44       860

avg / total       0.70      0.66      0.73      0.67      0.69      0.48      1977

