# Supervised Learning Model
This notebook contains supervised learning model using Balanced Random Forest Classifier to see what features from the used car dataset influence the target of price.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import password
import psycopg2

The first model used is BRFC used on all features besides identifier and geographical columns. The BRFC is then used on a sample of 5000. It is also worth mentioning that the price column is converted into 7 seperate bins to reduce the amount of unique prices.

In [2]:
file_path = 'Used_Cars.csv'
df = pd.read_csv(file_path)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
0,0,38b2f52e-8f5d,1GCWGFCF3F1284719,20998.0,115879.0,W1T503168C,2015.0,Chevrolet,Express Cargo,Work Van,...,RWD,Automatic,E85 / Unleaded,4.8,V,nissan ellicott city,8569 Baltimore National Pike,Ellicott City,MD,21043
1,1,97ba4955-ccf0,WBY7Z8C59JVB87514,27921.0,7339.0,P33243,2018.0,BMW,i3,s,...,RWD,Automatic,Electric / Premium Unleaded,0.6,I,hendrick honda pompano beach,5381 N Federal Highway,Pompano Beach,FL,33064
2,2,be1da9fd-0f34,ML32F4FJ2JHF10325,11055.0,39798.0,WM2091A,2018.0,Mitsubishi,Mirage G4,SE,...,FWD,Automatic,Unleaded,1.2,I,russ darrow toyota,2700 West Washington St.,West Bend,WI,53095
3,3,84327e45-6cb6,1GCPTEE15K1291189,52997.0,28568.0,9U2Y425A,2019.0,Chevrolet,Colorado,ZR2,...,4WD,Automatic,Diesel,2.8,I,young kia,308 North Main Street,Layton,UT,84041
4,6,43847b9a-6fed,1B7HC16Y8YS543285,3995.0,137537.0,BP8246A,2000.0,Dodge,Ram Pickup,ST,...,RWD,Manual,Unleaded,5.2,V,baumann auto group,2379 W. State St.,Fremont,OH,43420


In [3]:
clean_df = df.drop(columns=['Unnamed: 0', 'id', 'vin', 'seller_name', 'street', 'stock_no', 'city', 'state', 'zip',])
clean_df.drop(clean_df.loc[clean_df['miles'] == 0].index, inplace=True)
clean_df = clean_df.sample(n=5000)
clean_df.head()

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
3073917,78000.0,29386.0,2019.0,Ford,F-350 Super Duty,Lariat,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
5411703,21955.0,27787.0,2018.0,Kia,Sportage,EX,SUV,Truck,4WD,Automatic,Unleaded,2.4,I
5850372,16994.0,76929.0,2015.0,Cadillac,XTS,Luxury Collection,Sedan,Car,4WD,Automatic,Unleaded,3.6,V
5245137,12484.0,70886.0,2014.0,Volkswagen,Jetta,SE,Sedan,Car,FWD,Automatic,Unleaded,1.8,I
1450444,42988.0,48792.0,2015.0,Toyota,Tundra,Platinum,Pickup,Truck,4WD,Automatic,E85 / Unleaded,5.7,V


In [4]:
clean_df.describe()

Unnamed: 0,price,miles,year,engine_size
count,5000.0,5000.0,5000.0,5000.0
mean,27536.7116,53064.5384,2016.6438,3.12694
std,15390.959043,42832.756657,3.304044,1.3484
min,1588.0,1.0,1995.0,1.0
25%,17166.75,22864.0,2015.0,2.0
50%,24537.0,40056.5,2018.0,2.5
75%,34924.0,73640.25,2019.0,3.6
max,262989.0,399199.0,2021.0,7.0


In [5]:
bins = [0, 5000, 10000, 15000, 25000, 50000, 100000, 400000]
labels = ['0-4,999', '5,000-9,999', '10,000-14999', '15,000-24,999', '25,000-49,999', '50,000-99,999', '100,000-400,000']

In [6]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )


In [7]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
3073917,"50,000-99,999",29386.0,2019.0,Ford,F-350 Super Duty,Lariat,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
5411703,"15,000-24,999",27787.0,2018.0,Kia,Sportage,EX,SUV,Truck,4WD,Automatic,Unleaded,2.4,I
5850372,"15,000-24,999",76929.0,2015.0,Cadillac,XTS,Luxury Collection,Sedan,Car,4WD,Automatic,Unleaded,3.6,V
5245137,"10,000-14999",70886.0,2014.0,Volkswagen,Jetta,SE,Sedan,Car,FWD,Automatic,Unleaded,1.8,I
1450444,"25,000-49,999",48792.0,2015.0,Toyota,Tundra,Platinum,Pickup,Truck,4WD,Automatic,E85 / Unleaded,5.7,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
792560,"15,000-24,999",84027.0,2017.0,Chevrolet,Impala,1LT,Sedan,Car,FWD,Automatic,Unleaded,2.5,I
2512074,"15,000-24,999",47715.0,2019.0,Jeep,Cherokee,Limited,SUV,Truck,4WD,Automatic,Unleaded,3.2,V
1097218,"25,000-49,999",35232.0,2018.0,Ford,F-150,Lariat,Pickup,Truck,4WD,Automatic,Unleaded,3.5,V
2215426,"15,000-24,999",46878.0,2017.0,Jeep,Renegade,Altitude Package,SUV,Truck,FWD,Automatic,E85 / Unleaded,2.4,I


In [8]:
clean_df.nunique()

price              7
miles           4833
year              25
make              41
model            457
trim             468
body_type         15
vehicle_type       2
drivetrain         3
transmission       2
fuel_type         14
engine_size       48
engine_block       3
dtype: int64

In [9]:
clean_df['price'].describe()

count              5000
unique                7
top       25,000-49,999
freq               2061
Name: price, dtype: object

In [10]:
clean_df = clean_df.dropna()

In [11]:
drop_columns = ['price_0-4,999', 'price_5,000-9,999', 'price_10,000-14999', 'price_15,000-24,999', 'price_25,000-49,999', 'price_50,000-99,999', 'price_100,000-400,000']

In [12]:

X = pd.get_dummies(clean_df)
X = X.drop(columns=drop_columns)
y = clean_df['price']

In [13]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
3073917,29386.0,2019.0,6.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5411703,27787.0,2018.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
5850372,76929.0,2015.0,3.6,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
5245137,70886.0,2014.0,1.8,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1450444,48792.0,2015.0,5.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792560,84027.0,2017.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2512074,47715.0,2019.0,3.2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1097218,35232.0,2018.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2215426,46878.0,2017.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
X.describe()

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,53064.5384,2016.6438,3.12694,0.0136,0.0016,0.019,0.0402,0.0004,0.0116,0.016,...,0.0174,0.2128,0.0002,0.0012,0.6598,0.0002,0.0008,0.0276,0.5008,0.4716
std,42832.756657,3.304044,1.3484,0.115835,0.039972,0.136538,0.196448,0.019998,0.107088,0.125488,...,0.130769,0.409328,0.014142,0.034624,0.473824,0.014142,0.028276,0.16384,0.500049,0.499243
min,1.0,1995.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22864.0,2015.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,40056.5,2018.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,73640.25,2019.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,399199.0,2021.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1008)

In [16]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [17]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  8,   0,   0,   0,   0,   7,   0],
       [ 25,  65,   1,  14,   7,  19,   2],
       [  0,   0,   3,   0,   0,   0,   1],
       [ 13,  68,  10, 240,  62,   6,  18],
       [  4,   8,  53, 125, 236,   0,  96],
       [ 26,  16,   0,   1,   0,  38,   1],
       [  0,   0,  27,   4,  10,   0,  36]], dtype=int64)

In [18]:
balanced_accuracy_score(y_test, y_pred)

0.5329498696517472

In [19]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.11      0.53      0.94      0.18      0.71      0.48        15
   10,000-14999       0.41      0.49      0.92      0.45      0.67      0.43       133
100,000-400,000       0.03      0.75      0.93      0.06      0.83      0.68         4
  15,000-24,999       0.62      0.58      0.83      0.60      0.69      0.46       417
  25,000-49,999       0.75      0.45      0.89      0.56      0.63      0.39       522
    5,000-9,999       0.54      0.46      0.97      0.50      0.67      0.43        82
  50,000-99,999       0.23      0.47      0.90      0.31      0.65      0.40        77

    avg / total       0.62      0.50      0.88      0.54      0.66      0.42      1250



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.08970717172050738, 'year'),
 (0.08664439234472071, 'miles'),
 (0.05690506444147572, 'engine_size'),
 (0.025687759492793324, 'fuel_type_Unleaded'),
 (0.024123208663402236, 'fuel_type_Premium Unleaded'),
 (0.023060419043461265, 'drivetrain_FWD'),
 (0.022439677708152535, 'engine_block_I'),
 (0.020721663580223582, 'drivetrain_4WD'),
 (0.018806291876133464, 'engine_block_V'),
 (0.0166258857268406, 'vehicle_type_Truck'),
 (0.014243400663976256, 'body_type_SUV'),
 (0.013397226080301904, 'make_Porsche'),
 (0.013166513720925123, 'drivetrain_RWD'),
 (0.013134915502553874, 'body_type_Sedan'),
 (0.01262504801523297, 'body_type_Pickup'),
 (0.010636373428650498, 'make_Ford'),
 (0.010564485207216224, 'vehicle_type_Car'),
 (0.010121831762451463, 'model_911'),
 (0.009611935558300132, 'make_Toyota'),
 (0.009487614465182872, 'trim_SE'),
 (0.009096687118204081, 'transmission_Manual'),
 (0.008080546572549332, 'trim_Base'),
 (0.007781707954285964, 'make_Jeep'),
 (0.007706257361365188, 'make_Honda'),
 (0

Looking at the feature importances we can see that year and miles has the most influence over price, which is then followed by enginesize, fuel type, and the drivetrain. This makes sense as the year and miles on a car play a big part in deciding if the car is worth what they are selling it for. While this model is informatiive, it doesn't answer the question of what make has most influence on price.

# BRFC Focused on Make
The next model is a BRFC with the used car dataset, except this time the data is cleaned to focus on the make of the car. This means we drop some of the columns such as body_type, fuel_type and more, so the feature importances highlights the makes that influence price.

In [21]:
clean_df = pd.read_csv('make_model.csv')

In [22]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_size', 'engine_block'])

In [23]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van
1,27921.0,7339.0,2018.0,BMW,i3,s
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST
...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT


In [24]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [25]:

make_df = clean_df.sample(n=5000)


In [26]:
make_df

Unnamed: 0,price,miles,year,make,model,trim
1687077,"10,000-14999",35447.0,2017.0,Ford,C-Max,SE
3971881,"15,000-24,999",62960.0,2017.0,Nissan,Rogue,S
2412659,"10,000-14999",115473.0,2011.0,Toyota,Camry,SE
1925670,"15,000-24,999",103890.0,2014.0,Nissan,Frontier,SV
224265,"25,000-49,999",18104.0,2019.0,Ford,Edge,SEL
...,...,...,...,...,...,...
2934755,"50,000-99,999",32657.0,2020.0,GMC,Sierra 1500,AT4
4425987,"25,000-49,999",44140.0,2018.0,GENESIS,G80,Base
5467496,"25,000-49,999",27684.0,2020.0,Toyota,RAV4,Limited
953195,"25,000-49,999",64854.0,2015.0,Ford,F-150,Lariat


In [27]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [28]:
X.describe()

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Buick,make_Cadillac,make_Chevrolet,make_Chrysler,...,trim_s,trim_s Sport,trim_sDrive35i,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive40e,trim_xDrive50i
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52808.4176,2016.6258,0.0146,0.0012,0.0164,0.0398,0.012,0.0146,0.1148,0.011,...,0.0002,0.0002,0.0016,0.0006,0.0018,0.0002,0.0032,0.0002,0.0004,0.0008
std,42893.478583,3.422472,0.119957,0.034624,0.127021,0.195509,0.108896,0.119957,0.318812,0.104313,...,0.014142,0.014142,0.039972,0.02449,0.042392,0.014142,0.056484,0.014142,0.019998,0.028276
min,0.0,1997.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22777.75,2015.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,40025.0,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,72937.0,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,376396.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 962)

In [30]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [31]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  7,   0,   0,   0,   0,   4,   0],
       [ 15,  39,   4,  23,   1,  42,   0],
       [  0,   0,   4,   0,   0,   0,   0],
       [ 12,  41,  26, 243,  48,  33,   8],
       [  3,  18,  70,  92, 243,   9,  80],
       [ 48,  16,   0,   1,   0,  42,   0],
       [  0,   0,  19,   2,   9,   0,  48]], dtype=int64)

In [32]:
balanced_accuracy_score(y_test, y_pred)

0.5745533259104392

In [33]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.08      0.64      0.94      0.15      0.77      0.58        11
   10,000-14999       0.34      0.31      0.93      0.33      0.54      0.28       124
100,000-400,000       0.03      1.00      0.90      0.06      0.95      0.91         4
  15,000-24,999       0.67      0.59      0.86      0.63      0.71      0.49       411
  25,000-49,999       0.81      0.47      0.92      0.60      0.66      0.42       515
    5,000-9,999       0.32      0.39      0.92      0.35      0.60      0.34       107
  50,000-99,999       0.35      0.62      0.92      0.45      0.75      0.55        78

    avg / total       0.64      0.50      0.90      0.54      0.67      0.43      1250



In [34]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

[(0.12974143001857716, 'miles'),
 (0.1151207094160488, 'year'),
 (0.019324221352737835, 'trim_Base'),
 (0.015328002768468432, 'make_Mercedes-Benz'),
 (0.01275113400589603, 'make_Ford'),
 (0.012313477882929129, 'trim_SE'),
 (0.01221782764212053, 'make_Chevrolet'),
 (0.011884133407419372, 'make_Toyota'),
 (0.0113805026270288, 'trim_Limited'),
 (0.01133397215901601, 'make_RAM'),
 (0.01060575060579816, 'trim_S'),
 (0.00969452380252859, 'make_Nissan'),
 (0.009496263749753634, 'make_Jeep'),
 (0.009375522138588254, 'make_Honda'),
 (0.00819634328576046, 'make_Hyundai'),
 (0.007141484098895135, 'trim_EX'),
 (0.007070922225794664, 'model_AMG GT 4-Door Coupe'),
 (0.007039631291624823, 'trim_LX'),
 (0.006993076199958701, 'model_F-150'),
 (0.006938785004547812, 'trim_XLT'),
 (0.0068227670439302765, 'trim_LT'),
 (0.006659181031886547, 'make_BMW'),
 (0.006288072750639508, 'make_Porsche'),
 (0.006069818605500571, 'make_Kia'),
 (0.0060468715911546475, 'trim_63'),
 (0.005832699311959656, 'trim_1LT'),
 (

# Luxury Makes
After the model focusing on makes, the next step was to process the data even more so that the makes are split on luxury and non luxury makes. This is because the pricing and quality of luxury makes create a bias when comparing to regular makes.

In [35]:
clean_df = pd.read_csv('luxury_makes.csv')

In [36]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [37]:
clean_df = clean_df.dropna()

In [38]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [39]:
clean_df['make'].unique()

array(['BMW', 'Mercedes-Benz', 'Lexus', 'Porsche', 'Audi', 'Lamborghini',
       'Alfa Romeo', 'Ferrari', 'Maserati', 'Jaguar', 'Aston Martin',
       'Bentley', 'Land Rover', 'Cadillac', 'Rolls-Royce', 'INFINITI',
       'Maybach', 'Acura', 'Fisker', 'McLaren', 'Lotus'], dtype=object)

In [40]:

luxury_make_df = clean_df.sample(n=5000)

In [41]:
X = pd.get_dummies(luxury_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = luxury_make_df['price']

In [42]:
X

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Cadillac,make_Ferrari,...,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive35i Sport Activity,trim_xDrive40e,trim_xDrive50i,transmission_Automatic,transmission_Manual
76839,11131.0,2018.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
739869,90398.0,2014.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
474617,21616.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1106366,30.0,2020.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
221769,83983.0,2014.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126033,22432.0,2019.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
509620,12118.0,2019.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
487758,96693.0,2014.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1056339,28009.0,2018.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 564)

In [44]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

ValueError: Input contains NaN

In [None]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

# Non-Luxury Makes

In [None]:
clean_df = pd.read_csv('regular_makes.csv')

In [None]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [None]:
clean_df = clean_df.dropna()

In [None]:
clean_df['make'].unique()

In [None]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [None]:
regular_make_df = clean_df.sample(n=5000)

In [None]:
X = pd.get_dummies(regular_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = regular_make_df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

In [None]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [None]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

# Testing other Models

# Easy Ensemble AdaBoosy Classifier

In [45]:
from imblearn.ensemble import EasyEnsembleClassifier


In [72]:
clean_df = pd.read_csv('make_model.csv')

In [73]:
clean_df = clean_df.drop(columns=['id','vin'])

In [74]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van,Cargo Van,Truck,RWD,Automatic,E85 / Unleaded,4.8,V
1,27921.0,7339.0,2018.0,BMW,i3,s,Hatchback,Car,RWD,Automatic,Electric / Premium Unleaded,0.6,I
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE,Sedan,Car,FWD,Automatic,Unleaded,1.2,I
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2,Pickup,Truck,4WD,Automatic,Diesel,2.8,I
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST,Pickup,Truck,RWD,Manual,Unleaded,5.2,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT,Pickup,Truck,4WD,Automatic,Diesel,6.7,V


In [75]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [76]:
make_df = clean_df.sample(n=5000)

In [77]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [78]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Buick,make_Cadillac,make_Chevrolet,...,fuel_type_Electric / Premium Unleaded,fuel_type_Electric / Premium Unleaded; Premium Unleaded,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,engine_block_H,engine_block_I,engine_block_V
5463579,153565.0,2011.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1736052,47189.0,2017.0,2.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
50873,33539.0,2019.0,2.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4007441,111624.0,2015.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3012970,57372.0,2017.0,5.7,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5260911,2206.0,2019.0,1.8,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4488952,24580.0,2020.0,5.3,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
2183844,82420.0,2017.0,1.8,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1192784,25245.0,2018.0,3.6,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1022)

In [80]:
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [81]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  8,   0,   0,   0,   0,   7,   0],
       [  0,  81,   0,  31,   3,  21,   0],
       [  0,   1,   2,   0,   0,   0,   1],
       [  2,  80,   0, 257,  85,  11,   0],
       [  1,  38,   0, 120, 329,   2,  13],
       [  3,  28,   0,   1,   0,  38,   0],
       [  0,   0,   1,   8,  51,   0,  27]], dtype=int64)

In [82]:
balanced_accuracy_score(y_test, y_pred)

0.5324290976416617

In [83]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.57      0.53      1.00      0.55      0.73      0.51        15
   10,000-14999       0.36      0.60      0.87      0.45      0.72      0.50       136
100,000-400,000       0.67      0.50      1.00      0.57      0.71      0.47         4
  15,000-24,999       0.62      0.59      0.80      0.60      0.69      0.46       435
  25,000-49,999       0.70      0.65      0.81      0.68      0.73      0.52       503
    5,000-9,999       0.48      0.54      0.97      0.51      0.72      0.50        70
  50,000-99,999       0.66      0.31      0.99      0.42      0.55      0.29        87

    avg / total       0.62      0.59      0.84      0.60      0.70      0.48      1250



In [84]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

[(0.12974143001857716, 'miles'),
 (0.1151207094160488, 'year'),
 (0.019324221352737835, 'trim_AMG® GLC43'),
 (0.015328002768468432, 'make_Maserati'),
 (0.01275113400589603, 'make_Ferrari'),
 (0.012313477882929129, 'trim_RTL-T'),
 (0.01221782764212053, 'make_Cadillac'),
 (0.011884133407419372, 'make_Subaru'),
 (0.0113805026270288, 'trim_LX'),
 (0.01133397215901601, 'make_Pontiac'),
 (0.01060575060579816, 'trim_Pure Plus'),
 (0.00969452380252859, 'make_Mercedes-Benz'),
 (0.009496263749753634, 'make_INFINITI'),
 (0.009375522138588254, 'make_GMC'),
 (0.00819634328576046, 'make_Honda'),
 (0.007141484098895135, 'trim_Crew'),
 (0.007070922225794664, 'model_A5 Sportback'),
 (0.007039631291624823, 'trim_LAREDO'),
 (0.006993076199958701, 'model_F-150'),
 (0.006938785004547812, 'trim_WRX Premium'),
 (0.0068227670439302765, 'trim_I Premium'),
 (0.006659181031886547, 'make_Audi'),
 (0.006288072750639508, 'make_Nissan'),
 (0.006069818605500571, 'make_Jaguar'),
 (0.0060468715911546475, 'trim_45'),
 (

Non-luxury makes only

In [98]:
clean_df = pd.read_csv('regular_makes.csv')

In [99]:
clean_df = clean_df.drop(columns=['id','vin'])

In [100]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [101]:
make_df = clean_df.sample(n=20000)

In [102]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(15000, 1063)

In [104]:
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [105]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  26,    1,    0,    0,    0,   34,    0],
       [  13,  396,    0,  102,    2,   99,    0],
       [   0,    0,    8,    0,    0,    0,    0],
       [  10,  502,   52, 1089,  137,   70,   18],
       [   1,  242,   94,  521,  736,   17,  167],
       [  52,  109,    1,    4,    0,  225,    0],
       [   0,    4,   36,    5,   99,    1,  127]], dtype=int64)

In [106]:
balanced_accuracy_score(y_test, y_pred)

0.5870668753864688

In [107]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.25      0.43      0.98      0.32      0.65      0.40        61
   10,000-14999       0.32      0.65      0.80      0.42      0.72      0.51       612
100,000-400,000       0.04      1.00      0.96      0.08      0.98      0.97         8
  15,000-24,999       0.63      0.58      0.80      0.61      0.68      0.45      1878
  25,000-49,999       0.76      0.41      0.93      0.53      0.62      0.36      1778
    5,000-9,999       0.50      0.58      0.95      0.54      0.74      0.53       391
  50,000-99,999       0.41      0.47      0.96      0.43      0.67      0.43       272

    avg / total       0.61      0.52      0.87      0.54      0.67      0.43      5000



In [108]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

[(0.12974143001857716, 'miles'),
 (0.1151207094160488, 'year'),
 (0.019324221352737835, 'trim_FE'),
 (0.015328002768468432, 'make_Suzuki'),
 (0.01275113400589603, 'make_Hyundai'),
 (0.012313477882929129, 'trim_SEL'),
 (0.01221782764212053, 'make_Ford'),
 (0.011884133407419372, 'model_370Z Roadster'),
 (0.0113805026270288, 'trim_Outback Sport'),
 (0.01133397215901601, 'model_300'),
 (0.01060575060579816, 'trim_SE Hybrid'),
 (0.00969452380252859, 'make_Volvo'),
 (0.009496263749753634, 'make_Mercury'),
 (0.009375522138588254, 'make_Kia'),
 (0.00819634328576046, 'make_Lincoln'),
 (0.007141484098895135, 'trim_High Country'),
 (0.007070922225794664, 'model_B9 Tribeca'),
 (0.007039631291624823, 'trim_NISMO RS'),
 (0.006993076199958701, 'model_Fusion Energi'),
 (0.006938785004547812, 'trim_Trekking'),
 (0.0068227670439302765, 'trim_Mainstreet'),
 (0.006659181031886547, 'make_Chrysler'),
 (0.006288072750639508, 'model_200'),
 (0.006069818605500571, 'make_Mitsubishi'),
 (0.0060468715911546475, '

# Database

In [None]:
db_string = f"postgresql://postgres:ilovedata@localhost:5432/final_project_db"

In [None]:
engine=create_engine(db_string)

In [None]:
clean_df.to_sql(name='used_car_sample', con=engine, if_exists='replace')
make_df.to_sql(name='make_sample', con=engine, if_exists='replace')
luxury_make_df.to_sql(name='luxury_sample', con=engine, if_exists='replace')
regular_make_df.to_sql(name='regular_sample', con=engine, if_exists='replace')