In [37]:
import warnings
warnings.filterwarnings('ignore')

In [38]:
import numpy as np
import pandas as pd
import hvplot.pandas
import plotly.express as px
import dataframe_image as dfi
import tensorflow as tf
from pathlib import Path
from collections import Counter

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from imblearn.metrics import classification_report_imbalanced

In [40]:
car_clean_df = pd.read_csv("Resources/car_clean.csv")
car_clean_df.head()

Unnamed: 0.1,Unnamed: 0,manufacturer,model,year,selling_price,fuel,seller_type,mileage
0,0,Maruti,Swift Dzire VDI,2014,450000,Diesel,Individual,23.4
1,1,Skoda,Rapid 1.5 TDI Ambition,2014,370000,Diesel,Individual,21.14
2,2,Honda,City 2017-2020 EXi,2006,158000,Petrol,Individual,17.7
3,3,Hyundai,i20 Sportz Diesel,2010,225000,Diesel,Individual,23.0
4,4,Maruti,Swift VXI BSIII,2007,130000,Petrol,Individual,16.1


In [41]:
# Split our preprocessed data into our features and target arrays
X = pd.get_dummies(car_clean_df.drop(columns="fuel").copy())
y = car_clean_df["fuel"].copy()

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Logistic Regression

In [42]:
# Train the Logistic Regression model
LR = LogisticRegression(solver='liblinear', random_state=1)

LR.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [43]:
# Check all classes in the model
LR.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [44]:
# Calculated the balanced accuracy score
y_pred = LR.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.4055100672747731

In [45]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[851,   0, 220],
       [  8,   0,  14],
       [511,   0, 373]], dtype=int64)

In [46]:
# Create a DataFrame from the confusion matrix.
LR_cm = confusion_matrix(y_test, y_pred)

LR_cm_df = pd.DataFrame(
    LR_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

LR_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,851,0,220
Actual Other,8,0,14
Actual Petrol,511,0,373


In [47]:
# Print the imbalanced classification report
print("Logistic Regression")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Logistic Regression


Accuracy: 40.55% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.62      0.79      0.43      0.70      0.58      0.35      1071
      Other       0.00      0.00      1.00      0.00      0.00      0.00        22
     Petrol       0.61      0.42      0.79      0.50      0.58      0.32       884

avg / total       0.61      0.62      0.59      0.60      0.57      0.33      1977



### Balanced Random Forest Classifier

In [48]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [49]:
# Check all classes in the model
brfc.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [50]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.8700177817824878

In [51]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[947,  46,  78],
       [  0,  20,   2],
       [ 88,  74, 722]], dtype=int64)

In [52]:
# Create a DataFrame from the confusion matrix.
BRFC_cm = confusion_matrix(y_test, y_pred)

BRFC_cm_df = pd.DataFrame(
    BRFC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

BRFC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,947,46,78
Actual Other,0,20,2
Actual Petrol,88,74,722


In [53]:
# Print the imbalanced classification report
print("Balanced Random Forest Classifier")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Classifier


Accuracy: 87.00% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.91      0.88      0.90      0.90      0.89      0.80      1071
      Other       0.14      0.91      0.94      0.25      0.92      0.85        22
     Petrol       0.90      0.82      0.93      0.86      0.87      0.75       884

avg / total       0.90      0.85      0.91      0.87      0.88      0.78      1977



### Easy Ensemble AdaBoost Classifier

In [54]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eeac = EasyEnsembleClassifier(n_estimators=100, random_state=1)

eeac.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [55]:
# Check all classes in the model
eeac.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [56]:
# Calculated the balanced accuracy score
y_pred = eeac.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.8125888272947096

In [57]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[691,   6, 374],
       [  0,  20,   2],
       [ 87,  16, 781]], dtype=int64)

In [58]:
# Create a DataFrame from the confusion matrix.
EEAC_cm = confusion_matrix(y_test, y_pred)
EEAC_cm_df = pd.DataFrame(
    EEAC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

EEAC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,691,6,374
Actual Other,0,20,2
Actual Petrol,87,16,781


In [59]:
# Print the imbalanced classification report
print("Easy Ensemble AdaBoost Classifier")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Accuracy: 81.26% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.89      0.65      0.90      0.75      0.76      0.57      1071
      Other       0.48      0.91      0.99      0.62      0.95      0.89        22
     Petrol       0.68      0.88      0.66      0.77      0.76      0.59       884

avg / total       0.79      0.75      0.79      0.75      0.76      0.58      1977

