# Supervised Learning Model
This notebook contains supervised learning model using Balanced Random Forest Classifier to see what features from the used car dataset influence the target of price.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import password
import psycopg2

The first model used is BRFC used on all features besides identifier and geographical columns. The BRFC is then used on a sample of 5000. It is also worth mentioning that the price column is converted into 7 seperate bins to reduce the amount of unique prices.

In [2]:
file_path = 'Used_Cars.csv'
df = pd.read_csv(file_path)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
0,0,38b2f52e-8f5d,1GCWGFCF3F1284719,20998.0,115879.0,W1T503168C,2015.0,Chevrolet,Express Cargo,Work Van,...,RWD,Automatic,E85 / Unleaded,4.8,V,nissan ellicott city,8569 Baltimore National Pike,Ellicott City,MD,21043
1,1,97ba4955-ccf0,WBY7Z8C59JVB87514,27921.0,7339.0,P33243,2018.0,BMW,i3,s,...,RWD,Automatic,Electric / Premium Unleaded,0.6,I,hendrick honda pompano beach,5381 N Federal Highway,Pompano Beach,FL,33064
2,2,be1da9fd-0f34,ML32F4FJ2JHF10325,11055.0,39798.0,WM2091A,2018.0,Mitsubishi,Mirage G4,SE,...,FWD,Automatic,Unleaded,1.2,I,russ darrow toyota,2700 West Washington St.,West Bend,WI,53095
3,3,84327e45-6cb6,1GCPTEE15K1291189,52997.0,28568.0,9U2Y425A,2019.0,Chevrolet,Colorado,ZR2,...,4WD,Automatic,Diesel,2.8,I,young kia,308 North Main Street,Layton,UT,84041
4,6,43847b9a-6fed,1B7HC16Y8YS543285,3995.0,137537.0,BP8246A,2000.0,Dodge,Ram Pickup,ST,...,RWD,Manual,Unleaded,5.2,V,baumann auto group,2379 W. State St.,Fremont,OH,43420


In [3]:
clean_df = df.drop(columns=['Unnamed: 0', 'id', 'vin', 'seller_name', 'street', 'stock_no', 'city', 'state', 'zip',])
clean_df.drop(clean_df.loc[clean_df['miles'] == 0].index, inplace=True)
clean_df = clean_df.sample(n=5000)
clean_df.head()

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
2275270,16690.0,27864.0,2019.0,Kia,Optima,LX,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
3509422,50488.0,9193.0,2019.0,Ford,F-150,XLT,Pickup,Truck,4WD,Automatic,E85 / Unleaded,5.0,V
4038465,25850.0,92000.0,2016.0,Ford,F-150,Lariat,Pickup,Truck,4WD,Automatic,Unleaded,2.7,V
3916493,24000.0,17589.0,2018.0,Toyota,RAV4,XLE,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
665248,21998.0,45838.0,2016.0,Acura,TLX,Technology Package,Sedan,Car,FWD,Automatic,Premium Unleaded,2.4,I


In [4]:
clean_df.describe()

Unnamed: 0,price,miles,year,engine_size
count,5000.0,5000.0,5000.0,5000.0
mean,28366.6486,51822.5784,2016.7076,3.1267
std,16875.960804,41587.304112,3.389572,1.326937
min,0.0,2.0,1997.0,1.0
25%,17979.75,22400.75,2016.0,2.0
50%,24999.0,39090.5,2018.0,2.7
75%,35991.75,72022.0,2019.0,3.6
max,329990.0,306632.0,2021.0,7.3


In [5]:
bins = [0, 5000, 10000, 15000, 25000, 50000, 100000, 400000]
labels = ['0-4,999', '5,000-9,999', '10,000-14,999', '15,000-24,999', '25,000-49,999', '50,000-99,999', '100,000-400,000']

In [6]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )


In [7]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
2275270,"15,000-24,999",27864.0,2019.0,Kia,Optima,LX,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
3509422,"50,000-99,999",9193.0,2019.0,Ford,F-150,XLT,Pickup,Truck,4WD,Automatic,E85 / Unleaded,5.0,V
4038465,"25,000-49,999",92000.0,2016.0,Ford,F-150,Lariat,Pickup,Truck,4WD,Automatic,Unleaded,2.7,V
3916493,"15,000-24,999",17589.0,2018.0,Toyota,RAV4,XLE,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
665248,"15,000-24,999",45838.0,2016.0,Acura,TLX,Technology Package,Sedan,Car,FWD,Automatic,Premium Unleaded,2.4,I
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4171557,"10,000-14,999",156906.0,2011.0,Toyota,Camry,XLE,Sedan,Car,FWD,Automatic,Unleaded,3.5,V
1030667,"5,000-9,999",115120.0,2004.0,Nissan,350Z Roadster,Enthusiast,Roadster,Car,RWD,Automatic,Premium Unleaded,3.5,V
614728,"5,000-9,999",77000.0,2012.0,Honda,Civic,LX,Coupe,Car,FWD,Automatic,Unleaded,1.8,I
5636124,"15,000-24,999",68410.0,2015.0,Toyota,Sienna,SE,Minivan,Truck,FWD,Automatic,Unleaded,3.5,V


In [8]:
clean_df.nunique()

price              7
miles           4835
year              25
make              45
model            449
trim             466
body_type         18
vehicle_type       2
drivetrain         3
transmission       2
fuel_type         13
engine_size       50
engine_block       3
dtype: int64

In [9]:
clean_df['price'].describe()

count              4999
unique                7
top       25,000-49,999
freq               2129
Name: price, dtype: object

In [10]:
clean_df = clean_df.dropna()

In [11]:
drop_columns = ['price_0-4,999', 'price_5,000-9,999', 'price_10,000-14,999', 'price_15,000-24,999', 'price_25,000-49,999', 'price_50,000-99,999', 'price_100,000-400,000']

In [12]:
X = pd.get_dummies(clean_df)
X = X.drop(columns=drop_columns)
y = clean_df['price']

In [13]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Premium Unleaded,fuel_type_Electric / Premium Unleaded; Premium Unleaded,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
2275270,27864.0,2019.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3509422,9193.0,2019.0,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4038465,92000.0,2016.0,2.7,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3916493,17589.0,2018.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
665248,45838.0,2016.0,2.4,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4171557,156906.0,2011.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1030667,115120.0,2004.0,3.5,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
614728,77000.0,2012.0,1.8,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5636124,68410.0,2015.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [14]:
X.describe()

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Premium Unleaded,fuel_type_Electric / Premium Unleaded; Premium Unleaded,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
count,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,...,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0,4999.0
mean,51801.934587,2016.708142,3.126845,0.013003,0.001,0.020604,0.038608,0.0006,0.012603,0.014203,...,0.003401,0.0002,0.014403,0.212242,0.0008,0.662733,0.0004,0.032006,0.486097,0.481896
std,41565.835021,3.389695,1.32703,0.113297,0.031613,0.142069,0.192677,0.024492,0.111562,0.118338,...,0.058222,0.014144,0.119157,0.408936,0.028279,0.472824,0.02,0.176035,0.499857,0.499722
min,2.0,1997.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22400.5,2016.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39090.0,2018.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,71975.5,2019.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,306632.0,2021.0,7.3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3749, 1004)

In [16]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [17]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 13,   1,   0,   0,   0,   2,   0],
       [ 16,  59,   6,  15,   2,  15,   6],
       [  0,   0,   7,   0,   0,   0,   2],
       [ 13,  61,  10, 226,  58,   2,  27],
       [  6,   5,  60,  90, 258,   0, 112],
       [ 25,  32,   2,   4,   0,  19,   0],
       [  0,   0,  21,   3,  12,   0,  60]], dtype=int64)

In [18]:
balanced_accuracy_score(y_test, y_pred)

0.5711326631148498

In [19]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.18      0.81      0.95      0.29      0.88      0.76        16
  10,000-14,999       0.37      0.50      0.91      0.43      0.67      0.43       119
100,000-400,000       0.07      0.78      0.92      0.12      0.85      0.71         9
  15,000-24,999       0.67      0.57      0.87      0.61      0.70      0.48       397
  25,000-49,999       0.78      0.49      0.90      0.60      0.66      0.42       531
    5,000-9,999       0.50      0.23      0.98      0.32      0.48      0.21        82
  50,000-99,999       0.29      0.62      0.87      0.40      0.74      0.53        96

    avg / total       0.64      0.51      0.90      0.55      0.67      0.44      1250



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.08848894057101156, 'miles'),
 (0.08003564745005144, 'year'),
 (0.061724659871885586, 'engine_size'),
 (0.031065648693140543, 'fuel_type_Unleaded'),
 (0.025057959612664552, 'engine_block_V'),
 (0.02402752469379794, 'fuel_type_Premium Unleaded'),
 (0.02280343806092011, 'drivetrain_FWD'),
 (0.020161062489782996, 'vehicle_type_Truck'),
 (0.018641279245813255, 'body_type_Sedan'),
 (0.017443693533474917, 'engine_block_I'),
 (0.015872409602041598, 'drivetrain_4WD'),
 (0.014347513775414447, 'make_Chevrolet'),
 (0.012986654104056456, 'make_Ford'),
 (0.012303597219456654, 'body_type_Pickup'),
 (0.01219447003898541, 'transmission_Manual'),
 (0.012136112012606098, 'vehicle_type_Car'),
 (0.01186377983932343, 'body_type_SUV'),
 (0.010402171255264494, 'make_Toyota'),
 (0.010282620739484747, 'drivetrain_RWD'),
 (0.009556609594528977, 'transmission_Automatic'),
 (0.008822857952131636, 'trim_Limited'),
 (0.00882179984085719, 'body_type_Coupe'),
 (0.008102345253929373, 'trim_Base'),
 (0.0078947407322

Looking at the feature importances we can see that year and miles has the most influence over price, which is then followed by enginesize, fuel type, and the drivetrain. This makes sense as the year and miles on a car play a big part in deciding if the car is worth what they are selling it for. While this model is informatiive, it doesn't answer the question of what make has most influence on price.

# BRFC Focused on Make
The next model is a BRFC with the used car dataset, except this time the data is cleaned to focus on the make of the car. This means we drop some of the columns such as body_type, fuel_type and more, so the feature importances highlights the makes that influence price.

In [21]:
clean_df = pd.read_csv('make_model.csv')

In [22]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_size', 'engine_block'])

In [23]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van
1,27921.0,7339.0,2018.0,BMW,i3,s
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST
...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT


In [24]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [25]:

make_df = clean_df.sample(n=5000)


In [26]:
make_df = make_df.dropna()
make_df

Unnamed: 0,price,miles,year,make,model,trim
5237904,"15,000-24,999",45171.0,2019.0,Honda,HR-V,Sport
27618,"10,000-14,999",137665.0,2011.0,Ford,Expedition,Limited
5090526,"10,000-14,999",75445.0,2013.0,Chevrolet,Cruze,1LT
497623,"25,000-49,999",56925.0,2018.0,Honda,CR-V,EX
4421836,"15,000-24,999",96456.0,2014.0,INFINITI,QX50,Journey
...,...,...,...,...,...,...
3747283,"5,000-9,999",165905.0,2011.0,Dodge,Journey,Express
4560302,"50,000-99,999",11394.0,2020.0,RAM,Ram 1500 Pickup,Limited
1639833,"15,000-24,999",19081.0,2019.0,Subaru,Impreza,Base
5090906,"15,000-24,999",19122.0,2019.0,Chevrolet,Cruze,LT


In [27]:
make_df.isnull().values.sum()

0

In [28]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [29]:
X.describe()

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,make_Chevrolet,...,trim_passion,trim_s Grand Touring,trim_s Touring,trim_sDrive28i,trim_sDrive35i,trim_xDrive28i,trim_xDrive30i,trim_xDrive35i,trim_xDrive40e,trim_xDrive50i
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52578.8574,2016.656,0.012,0.0014,0.02,0.0394,0.0004,0.0122,0.0126,0.1122,...,0.0002,0.0002,0.0002,0.0002,0.0008,0.0016,0.0012,0.0022,0.0008,0.0002
std,42956.989935,3.332069,0.108896,0.037394,0.140014,0.194564,0.019998,0.109789,0.111551,0.315644,...,0.014142,0.014142,0.014142,0.014142,0.028276,0.039972,0.034624,0.046857,0.028276,0.014142
min,0.0,1997.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22448.75,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39336.5,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,73000.25,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,310611.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 973)

In [31]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [32]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  8,   1,   0,   0,   0,   0,   0],
       [ 16,  66,   3,  14,   1,  20,   0],
       [  0,   0,   6,   0,   0,   0,   0],
       [  6, 108,  24, 157,  73,  17,  14],
       [  6,  27,  84,  48, 284,   7,  69],
       [ 50,  33,   2,   1,   0,  23,   0],
       [  0,   0,  23,   5,  14,   0,  40]], dtype=int64)

In [33]:
balanced_accuracy_score(y_test, y_pred)

0.5817341473535943

In [34]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.09      0.89      0.94      0.17      0.91      0.83         9
  10,000-14,999       0.28      0.55      0.85      0.37      0.68      0.45       120
100,000-400,000       0.04      1.00      0.89      0.08      0.94      0.90         6
  15,000-24,999       0.70      0.39      0.92      0.50      0.60      0.34       399
  25,000-49,999       0.76      0.54      0.88      0.63      0.69      0.46       525
    5,000-9,999       0.34      0.21      0.96      0.26      0.45      0.19       109
  50,000-99,999       0.33      0.49      0.93      0.39      0.67      0.43        82

    avg / total       0.62      0.47      0.90      0.51      0.64      0.40      1250



In [35]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

[(0.1174667137595566, 'miles'),
 (0.1021479879783411, 'year'),
 (0.018411783143244174, 'trim_Base'),
 (0.013940763484361284, 'make_Ford'),
 (0.01215427576395488, 'make_RAM'),
 (0.012064997946671369, 'make_Toyota'),
 (0.011801134430459567, 'make_Honda'),
 (0.011213638754357644, 'make_Chevrolet'),
 (0.010428685763227854, 'make_Mercedes-Benz'),
 (0.010149370284106653, 'make_Jeep'),
 (0.010025360660631978, 'make_Porsche'),
 (0.009585505022875867, 'trim_SE'),
 (0.00891625236472175, 'make_Nissan'),
 (0.008278677177965578, 'trim_S'),
 (0.008116939121086103, 'make_Kia'),
 (0.008099601759433583, 'trim_Limited'),
 (0.007455413897890736, 'make_GMC'),
 (0.007275058781434938, 'make_Hyundai'),
 (0.007138718284254246, 'model_911'),
 (0.007030344882983147, 'model_F-150'),
 (0.006977969902684918, 'model_Corvette'),
 (0.006976058889856323, 'trim_LX'),
 (0.00695861454405992, 'model_Silverado 1500'),
 (0.006802983752157827, 'make_Dodge'),
 (0.00658323579456909, 'trim_LT'),
 (0.006576978541033758, 'make_BM

# Luxury Makes
After the model focusing on makes, the next step was to process the data even more so that the makes are split on luxury and non luxury makes. This is because the pricing and quality of luxury makes create a bias when comparing to regular makes.

In [36]:
clean_df = pd.read_csv('luxury_makes.csv')

In [37]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [38]:
clean_df = clean_df.dropna()

In [39]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [40]:
clean_df['make'].unique()

array(['BMW', 'Mercedes-Benz', 'Lexus', 'Porsche', 'Audi', 'Lamborghini',
       'Alfa Romeo', 'Ferrari', 'Maserati', 'Jaguar', 'Aston Martin',
       'Bentley', 'Land Rover', 'Cadillac', 'Rolls-Royce', 'INFINITI',
       'Maybach', 'Acura', 'Fisker', 'McLaren', 'Lotus'], dtype=object)

In [41]:

luxury_make_df = clean_df.sample(n=5000)

In [42]:
luxury_make_df = luxury_make_df.dropna()

In [43]:
X = pd.get_dummies(luxury_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = luxury_make_df['price']

In [44]:
X

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Cadillac,make_Ferrari,...,trim_xDrive28d,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive40e,trim_xDrive50i,transmission_Automatic,transmission_Manual
451047,92835.0,2012.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1008659,83710.0,2015.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
116218,14148.0,2018.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1074968,32174.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
299154,21537.0,2019.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693469,100032.0,2013.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
604422,99758.0,2015.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
406889,54361.0,2017.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
706822,10485.0,2019.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3747, 580)

In [46]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [47]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  3,   0,   0,   0,   0,   1,   0],
       [  8,  28,   0,   8,   0,  15,   0],
       [  1,   0,  19,   0,   0,   0,   4],
       [ 10,  31,  11, 113,  51,   0,   1],
       [  4,   6,  78,  77, 480,   2,  72],
       [ 21,   6,   0,   1,   0,  13,   0],
       [  0,   0,  55,   8,  36,   0,  87]], dtype=int64)

In [48]:
balanced_accuracy_score(y_test, y_pred)

0.5699127502352936

In [49]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.06      0.75      0.96      0.12      0.85      0.71         4
  10,000-14,999       0.39      0.47      0.96      0.43      0.68      0.44        59
100,000-400,000       0.12      0.79      0.88      0.20      0.84      0.69        24
  15,000-24,999       0.55      0.52      0.91      0.53      0.69      0.45       217
  25,000-49,999       0.85      0.67      0.84      0.75      0.75      0.55       719
    5,000-9,999       0.42      0.32      0.99      0.36      0.56      0.29        41
  50,000-99,999       0.53      0.47      0.93      0.50      0.66      0.41       186

    avg / total       0.70      0.59      0.87      0.63      0.72      0.50      1250



In [50]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.15330334375442983, 'miles'),
 (0.1358761762960699, 'year'),
 (0.030532140123813556, 'trim_Base'),
 (0.01763352970405818, 'make_Cadillac'),
 (0.017380718214061366, 'make_Mercedes-Benz'),
 (0.015768516286401267, 'make_BMW'),
 (0.015371941863788699, 'trim_Premium'),
 (0.014687287294947955, 'make_Audi'),
 (0.013784217158376823, 'make_Porsche'),
 (0.012766221878963212, 'make_Lexus'),
 (0.012065305584738981, 'make_Acura'),
 (0.011697001490578207, 'transmission_Automatic'),
 (0.011633585776283326, 'make_Land Rover'),
 (0.011620512560996546, 'trim_350'),
 (0.011021028568009834, 'transmission_Manual'),
 (0.010497287091125327, 'model_ES'),
 (0.01032652037328537, 'make_INFINITI'),
 (0.008404334679973051, 'model_RX'),
 (0.008173505522675322, 'model_MDX'),
 (0.008146573427783157, 'trim_300'),
 (0.00807426559196727, 'model_E-Class'),
 (0.0077521928186525995, 'model_3 Series'),
 (0.007214532115752809, 'model_C-Class'),
 (0.007015843065280542, 'model_Q50'),
 (0.0069296910443202575, 'model_A4'),
 (

# Non-Luxury Makes

In [51]:
clean_df = pd.read_csv('regular_makes.csv')

In [52]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [53]:
clean_df = clean_df.dropna()

In [54]:
clean_df['make'].unique()

array(['Chevrolet', 'Mitsubishi', 'Dodge', 'RAM', 'Ford', 'Mercury',
       'GMC', 'smart', 'Jeep', 'Pontiac', 'Volvo', 'Scion', 'Buick',
       'Toyota', 'Lincoln', 'Honda', 'FIAT', 'Saturn', 'Oldsmobile',
       'Kia', 'Chrysler', 'Saab', 'Volkswagen', 'Isuzu', 'Subaru',
       'Am General', 'Hummer', 'Mazda', 'MINI', 'Hyundai', 'Nissan',
       'Suzuki', 'GENESIS', 'KARMA', 'Plymouth', 'Geo', 'Eagle'],
      dtype=object)

In [55]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [56]:
regular_make_df = clean_df.sample(n=5000)

In [57]:
regular_make_df = regular_make_df.dropna()

In [58]:
X = pd.get_dummies(regular_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = regular_make_df['price']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 710)

In [60]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [61]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 10,   0,   0,   1,   0,   1,   0],
       [ 41,  19,  23,  48,  15,  18,   1],
       [  0,   0,   0,   1,   0,   0,   0],
       [ 26,  18,  40, 348,  29,  11,   4],
       [  9,   9,  42, 244,  73,  11,  62],
       [ 45,   4,  11,   3,   5,  12,   0],
       [  0,   1,  10,  37,   2,   0,  16]], dtype=int64)

In [62]:
balanced_accuracy_score(y_test, y_pred)

0.31917482144372894

In [63]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.08      0.83      0.90      0.14      0.87      0.75        12
  10,000-14,999       0.37      0.12      0.97      0.18      0.33      0.10       165
100,000-400,000       0.00      0.00      0.90      0.00      0.00      0.00         1
  15,000-24,999       0.51      0.73      0.57      0.60      0.64      0.42       476
  25,000-49,999       0.59      0.16      0.94      0.25      0.39      0.14       450
    5,000-9,999       0.23      0.15      0.96      0.18      0.38      0.13        80
  50,000-99,999       0.19      0.24      0.94      0.21      0.48      0.21        66

    avg / total       0.48      0.38      0.80      0.37      0.49      0.25      1250



In [64]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.11197108129801629, 'miles'),
 (0.09508428505905284, 'year'),
 (0.050102287581699344, 'make_Chevrolet'),
 (0.04358192007797271, 'transmission_Automatic'),
 (0.036701152390780874, 'transmission_Manual'),
 (0.03612209035661049, 'model_Corvette'),
 (0.028662444100447194, 'trim_2LT'),
 (0.024107699805068227, 'make_Ford'),
 (0.018627554179566564, 'trim_SE'),
 (0.017291804265565872, 'trim_LE'),
 (0.01711111111111111, 'model_F-150'),
 (0.01647760577915377, 'make_Toyota'),
 (0.014813576424721935, 'make_Hyundai'),
 (0.014127192982456136, 'make_Nissan'),
 (0.01386111111111111, 'model_Explorer'),
 (0.01141860165118679, 'make_Jeep'),
 (0.010102941176470587, 'trim_Base'),
 (0.009508771929824562, 'model_Terrain'),
 (0.008283333333333335, 'model_Silverado 1500'),
 (0.008055555555555555, 'model_Elantra'),
 (0.00782920536635707, 'trim_LX'),
 (0.007617647058823529, 'make_GMC'),
 (0.00736842105263158, 'make_Dodge'),
 (0.007266081871345028, 'model_Equinox'),
 (0.007257309941520469, 'trim_SLE'),
 (0.007

# Testing other Models

# Easy Ensemble AdaBoost Classifier

In [65]:
from imblearn.ensemble import EasyEnsembleClassifier


In [66]:
clean_df = pd.read_csv('make_model.csv')

In [67]:
clean_df = clean_df.drop(columns=['id','vin'])

In [68]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van,Cargo Van,Truck,RWD,Automatic,E85 / Unleaded,4.8,V
1,27921.0,7339.0,2018.0,BMW,i3,s,Hatchback,Car,RWD,Automatic,Electric / Premium Unleaded,0.6,I
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE,Sedan,Car,FWD,Automatic,Unleaded,1.2,I
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2,Pickup,Truck,4WD,Automatic,Diesel,2.8,I
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST,Pickup,Truck,RWD,Manual,Unleaded,5.2,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch,Pickup,Truck,4WD,Automatic,Diesel,6.7,V
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT,Pickup,Truck,4WD,Automatic,Diesel,6.7,V


In [69]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [70]:
make_df = clean_df.sample(n=5000)

In [71]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

KeyError: "['price_10,000-14999'] not found in axis"

In [None]:
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

In [None]:
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec.fit(X_train, y_train)

In [None]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

Non-luxury makes only

In [None]:
clean_df = pd.read_csv('regular_makes.csv')

In [None]:
clean_df = clean_df.drop(columns=['id','vin'])

In [None]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [None]:
make_df = clean_df.sample(n=20000)

In [None]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

In [None]:
eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec.fit(X_train, y_train)

In [None]:
# Display the confusion matrix
y_pred = eec.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

# Database

In [None]:
db_string = f"postgresql://postgres:ilovedata@localhost:5432/final_project_db"

In [None]:
engine=create_engine(db_string)

In [None]:
clean_df.to_sql(name='used_car_sample', con=engine, if_exists='replace')
make_df.to_sql(name='make_sample', con=engine, if_exists='replace')
luxury_make_df.to_sql(name='luxury_sample', con=engine, if_exists='replace')
regular_make_df.to_sql(name='regular_sample', con=engine, if_exists='replace')