In [374]:
import warnings
warnings.filterwarnings('ignore')

In [375]:
import numpy as np
import pandas as pd
import hvplot.pandas
import plotly.express as px
from pathlib import Path
from collections import Counter
import dataframe_image as dfi

In [376]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.metrics import classification_report_imbalanced

In [377]:
cars_df = pd.read_csv("Resources/Car details v3.csv")
cars_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [378]:
# Remove rows that have at least 1 null value.
cars_no_Null_df = cars_df.dropna()

print(cars_no_Null_df.shape)
cars_no_Null_df.head(10)

(7906, 13)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5.0


In [379]:
# Seperate name and split it between manufactuerer and model
cars_no_Null_df[['brand','type']] =  cars_no_Null_df["name"].str.split(" ", n=1, expand=True)
cars_no_Null_df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,brand,type
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0,Maruti,Swift Dzire VDI
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0,Skoda,Rapid 1.5 TDI Ambition
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0,Honda,City 2017-2020 EXi
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0,Hyundai,i20 Sportz Diesel
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0,Maruti,Swift VXI BSIII


In [380]:
# Select column
car_clean_df=cars_no_Null_df[['brand',"year","selling_price","fuel","seller_type","mileage"]]
car_clean_df

Unnamed: 0,brand,year,selling_price,fuel,seller_type,mileage
0,Maruti,2014,450000,Diesel,Individual,23.4 kmpl
1,Skoda,2014,370000,Diesel,Individual,21.14 kmpl
2,Honda,2006,158000,Petrol,Individual,17.7 kmpl
3,Hyundai,2010,225000,Diesel,Individual,23.0 kmpl
4,Maruti,2007,130000,Petrol,Individual,16.1 kmpl
...,...,...,...,...,...,...
8123,Hyundai,2013,320000,Petrol,Individual,18.5 kmpl
8124,Hyundai,2007,135000,Diesel,Individual,16.8 kmpl
8125,Maruti,2009,382000,Diesel,Individual,19.3 kmpl
8126,Tata,2013,290000,Diesel,Individual,23.57 kmpl


In [381]:
# Clean fuel types
car_clean_df['fuel'] = car_clean_df['fuel'].replace(['CNG', 'LPG'], 'Other')

In [382]:
car_over19=car_clean_df[car_clean_df["year"]>2017]

print(car_over19.count)

<bound method DataFrame.count of         brand  year  selling_price    fuel seller_type      mileage
24       Tata  2018         500000  Petrol  Individual    20.3 kmpl
29     Maruti  2018         254999  Petrol  Individual    16.8 kmpl
33    Hyundai  2018         730000  Petrol  Individual    18.6 kmpl
35     Maruti  2019         330000   Other  Individual  33.44 km/kg
36     Maruti  2019         366000  Petrol  Individual   23.95 kmpl
...       ...   ...            ...     ...         ...          ...
8097     Ford  2018         880000  Diesel  Individual    23.0 kmpl
8100     Tata  2018         800000  Diesel  Individual    21.5 kmpl
8108      Kia  2019        1575000  Diesel  Individual    17.8 kmpl
8109     Ford  2018         750000  Diesel  Individual    24.4 kmpl
8112     Ford  2018         746000  Diesel  Individual    24.4 kmpl

[1463 rows x 6 columns]>


In [383]:
# Split our preprocessed data into our features and target arrays
X = pd.get_dummies(car_over19.drop(columns="fuel").copy())
y = car_over19["fuel"].copy()

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

### Logistic Regression

In [384]:
# Train the Logistic Regression model
LR = LogisticRegression(solver='liblinear', random_state=1)

LR.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [385]:
# Check all classes in the model
LR.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [386]:
# Calculated the balanced accuracy score
y_pred = LR.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.42429106474050293

In [387]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 65,   0, 117],
       [  0,   0,   6],
       [ 15,   0, 163]], dtype=int64)

In [388]:
# Create a DataFrame from the confusion matrix.
LR_cm = confusion_matrix(y_test, y_pred)

LR_cm_df = pd.DataFrame(
    LR_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

LR_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,65,0,117
Actual Other,0,0,6
Actual Petrol,15,0,163


In [389]:
# Print the imbalanced classification report
print("Logistic Regression")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Logistic Regression


Accuracy: 42.43% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.81      0.36      0.92      0.50      0.57      0.31       182
      Other       0.00      0.00      1.00      0.00      0.00      0.00         6
     Petrol       0.57      0.92      0.35      0.70      0.56      0.33       178

avg / total       0.68      0.62      0.64      0.59      0.56      0.32       366



### Balanced Random Forest Classifier

In [390]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brfc.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [391]:
# Check all classes in the model
brfc.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [392]:
# Calculated the balanced accuracy score
y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.8460509527925258

In [393]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[146,   1,  35],
       [  0,   6,   0],
       [ 28,  19, 131]], dtype=int64)

In [394]:
# Create a DataFrame from the confusion matrix.
BRFC_cm = confusion_matrix(y_test, y_pred)

BRFC_cm_df = pd.DataFrame(
    BRFC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

BRFC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,146,1,35
Actual Other,0,6,0
Actual Petrol,28,19,131


In [395]:
# Print the imbalanced classification report
print("Balanced Random Forest Classifier")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Balanced Random Forest Classifier


Accuracy: 84.61% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.84      0.80      0.85      0.82      0.82      0.68       182
      Other       0.23      1.00      0.94      0.38      0.97      0.95         6
     Petrol       0.79      0.74      0.81      0.76      0.77      0.59       178

avg / total       0.80      0.77      0.83      0.78      0.80      0.64       366



### Easy Ensemble AdaBoost Classifier

In [396]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eeac = EasyEnsembleClassifier(n_estimators=100, random_state=1)

eeac.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [397]:
# Check all classes in the model
eeac.classes_

array(['Diesel', 'Other', 'Petrol'], dtype=object)

In [398]:
# Calculated the balanced accuracy score
y_pred = eeac.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.8405152899534922

In [399]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[144,   0,  38],
       [  0,   6,   0],
       [ 34,  14, 130]], dtype=int64)

In [400]:
# Create a DataFrame from the confusion matrix.
EEAC_cm = confusion_matrix(y_test, y_pred)
EEAC_cm_df = pd.DataFrame(
    EEAC_cm, 
    index=["Actual Diesel", "Actual Other", "Actual Petrol"], 
    columns=["Predicted Diesel", "Predicted Other", "Predicted Petrol"])

EEAC_cm_df

Unnamed: 0,Predicted Diesel,Predicted Other,Predicted Petrol
Actual Diesel,144,0,38
Actual Other,0,6,0
Actual Petrol,34,14,130


In [401]:
# Print the imbalanced classification report
print("Easy Ensemble AdaBoost Classifier")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Easy Ensemble AdaBoost Classifier
Accuracy: 84.05% 


                   pre       rec       spe        f1       geo       iba       sup

     Diesel       0.81      0.79      0.82      0.80      0.80      0.64       182
      Other       0.30      1.00      0.96      0.46      0.98      0.96         6
     Petrol       0.77      0.73      0.80      0.75      0.76      0.58       178

avg / total       0.78      0.77      0.81      0.77      0.79      0.62       366

