In [87]:
import warnings
warnings.filterwarnings('ignore')

In [88]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import dataframe_image as dfi

In [89]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [90]:
cars_df = pd.read_csv("Car details v3.csv")
cars_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [91]:
# Remove rows that have at least 1 null value.
cars_no_Null_df = cars_df.dropna()

print(cars_no_Null_df.shape)
cars_no_Null_df.head(10)

(7906, 13)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0


In [92]:
# Split our preprocessed data into our features and target arrays
y = cars_no_Null_df["fuel"].values
X = pd.get_dummies(cars_no_Null_df.drop(columns="fuel").copy())

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### Easy Ensemble AdaBoost Classifier

In [93]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eeac = EasyEnsembleClassifier(n_estimators=100, random_state=1)

eeac.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [94]:
# Calculated the balanced accuracy score
y_pred = eeac.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.7568847342604055

In [95]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  9,   0,   1,   1],
       [ 68, 715,  94, 224],
       [  1,   0,   7,   0],
       [ 65,  31, 174, 587]], dtype=int64)

In [96]:
# Create a DataFrame from the confusion matrix.
EEAC_cm = confusion_matrix(y_test, y_pred)
EEAC_cm_df = pd.DataFrame(
    EEAC_cm, 
    index=["Actual CNG", "Actual Diesel", "Actual LPG", "Actual Petrol"], 
    columns=["Predicted CNG", "Predicted Diesel", "Predicted LPG", "Predicted Petrol"])

EEAC_cm_df

Unnamed: 0,Predicted CNG,Predicted Diesel,Predicted LPG,Predicted Petrol
Actual CNG,9,0,1,1
Actual Diesel,68,715,94,224
Actual LPG,1,0,7,0
Actual Petrol,65,31,174,587


In [97]:
# Print the imbalanced classification report
print("Easy Ensemble AdaBoost Classifier")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Accuracy: 75.69% 


                   pre       rec       spe        f1       geo       iba       sup

        CNG       0.06      0.82      0.93      0.12      0.87      0.75        11
     Diesel       0.96      0.65      0.96      0.77      0.79      0.61      1101
        LPG       0.03      0.88      0.86      0.05      0.87      0.76         8
     Petrol       0.72      0.68      0.80      0.70      0.74      0.54       857

avg / total       0.85      0.67      0.89      0.74      0.77      0.58      1977



### Balanced Random Forest Classifier

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brfc.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Create a DataFrame from the confusion matrix.
BRFC_cm = confusion_matrix(y_test, y_pred)

BRFC_cm_df = pd.DataFrame(
    BRFC_cm, 
    index=["Actual CNG", "Actual Diesel", "Actual LPG", "Actual Petrol"], 
    columns=["Predicted CNG", "Predicted Diesel", "Predicted LPG", "Predicted Petrol"])

BRFC_cm_df