# Supervised Learning Model
This notebook contains supervised learning model using Balanced Random Forest Classifier to see what features from the used car dataset influence the target of price.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import password

The first model used is BRFC used on all features besides identifier and geographical columns. The BRFC is then used on a sample of 5000. It is also worth mentioning that the price column is converted into 7 seperate bins to reduce the amount of unique prices.

In [2]:
file_path = 'Used_Cars.csv'
df = pd.read_csv(file_path)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
0,0,38b2f52e-8f5d,1GCWGFCF3F1284719,20998.0,115879.0,W1T503168C,2015.0,Chevrolet,Express Cargo,Work Van,...,RWD,Automatic,E85 / Unleaded,4.8,V,nissan ellicott city,8569 Baltimore National Pike,Ellicott City,MD,21043
1,1,97ba4955-ccf0,WBY7Z8C59JVB87514,27921.0,7339.0,P33243,2018.0,BMW,i3,s,...,RWD,Automatic,Electric / Premium Unleaded,0.6,I,hendrick honda pompano beach,5381 N Federal Highway,Pompano Beach,FL,33064
2,2,be1da9fd-0f34,ML32F4FJ2JHF10325,11055.0,39798.0,WM2091A,2018.0,Mitsubishi,Mirage G4,SE,...,FWD,Automatic,Unleaded,1.2,I,russ darrow toyota,2700 West Washington St.,West Bend,WI,53095
3,3,84327e45-6cb6,1GCPTEE15K1291189,52997.0,28568.0,9U2Y425A,2019.0,Chevrolet,Colorado,ZR2,...,4WD,Automatic,Diesel,2.8,I,young kia,308 North Main Street,Layton,UT,84041
4,6,43847b9a-6fed,1B7HC16Y8YS543285,3995.0,137537.0,BP8246A,2000.0,Dodge,Ram Pickup,ST,...,RWD,Manual,Unleaded,5.2,V,baumann auto group,2379 W. State St.,Fremont,OH,43420


In [3]:
clean_df = df.drop(columns=['Unnamed: 0', 'id', 'vin', 'seller_name', 'street', 'stock_no', 'city', 'state', 'zip',])
clean_df.drop(clean_df.loc[clean_df['miles'] == 0].index, inplace=True)
clean_df = clean_df.sample(n=5000)
clean_df.head()

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
3157429,25995.0,40067.0,2019.0,Lincoln,MKC,Select,SUV,Truck,FWD,Automatic,Unleaded,2.0,I
5343413,21995.0,17799.0,2015.0,Honda,Accord,Sport,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
3964742,24100.0,7790.0,2020.0,Nissan,Rogue,S,Crossover,Truck,FWD,Automatic,Unleaded,2.5,I
718024,7745.0,125523.0,2009.0,Honda,Accord,EX,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
2928559,22991.0,26343.0,2011.0,GMC,Sierra 1500,SLE,Pickup,Truck,RWD,Automatic,E85 / Unleaded,5.3,V


In [4]:
clean_df.describe()

Unnamed: 0,price,miles,year,engine_size
count,5000.0,5000.0,5000.0,5000.0
mean,28057.256,52238.1982,2016.7474,3.11098
std,16621.071408,42287.662736,3.242298,1.340305
min,2790.0,1.0,1997.0,0.6
25%,17500.0,22747.25,2016.0,2.0
50%,24708.5,39105.5,2018.0,2.5
75%,35291.5,72429.75,2019.0,3.6
max,333383.0,359920.0,2021.0,7.3


In [5]:
bins = [0, 5000, 10000, 15000, 25000, 50000, 100000, 400000]
labels = ['0-4,999', '5,000-9,999', '10,000-14999', '15,000-24,999', '25,000-49,999', '50,000-99,999', '100,000-400,000']

In [6]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )


In [7]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
3157429,"25,000-49,999",40067.0,2019.0,Lincoln,MKC,Select,SUV,Truck,FWD,Automatic,Unleaded,2.0,I
5343413,"15,000-24,999",17799.0,2015.0,Honda,Accord,Sport,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
3964742,"15,000-24,999",7790.0,2020.0,Nissan,Rogue,S,Crossover,Truck,FWD,Automatic,Unleaded,2.5,I
718024,"5,000-9,999",125523.0,2009.0,Honda,Accord,EX,Sedan,Car,FWD,Automatic,Unleaded,2.4,I
2928559,"15,000-24,999",26343.0,2011.0,GMC,Sierra 1500,SLE,Pickup,Truck,RWD,Automatic,E85 / Unleaded,5.3,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3930598,"15,000-24,999",23489.0,2018.0,Toyota,RAV4,XLE,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
5775382,"25,000-49,999",59844.0,2016.0,Ford,Explorer,XLT,SUV,Truck,FWD,Automatic,E85 / Unleaded,3.5,V
3923734,"15,000-24,999",135989.0,2015.0,Toyota,RAV4,Limited,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
1920072,"25,000-49,999",51021.0,2015.0,BMW,5 Series,535i,Sedan,Car,4WD,Automatic,Premium Unleaded,3.0,I


In [8]:
clean_df.nunique()

price              7
miles           4866
year              25
make              43
model            442
trim             476
body_type         16
vehicle_type       2
drivetrain         3
transmission       2
fuel_type         13
engine_size       51
engine_block       3
dtype: int64

In [9]:
clean_df['price'].describe()

count              5000
unique                7
top       25,000-49,999
freq               2032
Name: price, dtype: object

In [10]:
clean_df = clean_df.dropna()

In [11]:
drop_columns = ['price_0-4,999', 'price_5,000-9,999', 'price_10,000-14999', 'price_15,000-24,999', 'price_25,000-49,999', 'price_50,000-99,999', 'price_100,000-400,000']

In [12]:

X = pd.get_dummies(clean_df)
X = X.drop(columns=drop_columns)
y = clean_df['price']

In [13]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Premium Unleaded,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
3157429,40067.0,2019.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5343413,17799.0,2015.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3964742,7790.0,2020.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
718024,125523.0,2009.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2928559,26343.0,2011.0,5.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3930598,23489.0,2018.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5775382,59844.0,2016.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3923734,135989.0,2015.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1920072,51021.0,2015.0,3.0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [14]:
X.describe()

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,fuel_type_Electric / Premium Unleaded,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52238.1982,2016.7474,3.11098,0.0152,0.0016,0.018,0.044,0.0002,0.0152,0.0152,...,0.0048,0.0202,0.22,0.0006,0.001,0.6546,0.0008,0.0286,0.5058,0.4656
std,42287.662736,3.242298,1.340305,0.12236,0.039972,0.132964,0.205116,0.014142,0.12236,0.12236,...,0.069122,0.140698,0.414288,0.02449,0.03161,0.475546,0.028276,0.166696,0.500016,0.498865
min,1.0,1997.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22747.25,2016.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39105.5,2018.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,72429.75,2019.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,359920.0,2021.0,7.3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1003)

In [16]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [17]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  5,   0,   0,   0,   0,   5,   0],
       [ 13,  62,   5,  22,   5,  17,   3],
       [  0,   0,   3,   0,   0,   0,   0],
       [ 10,  64,  14, 260,  70,   8,  13],
       [  0,   1,  44, 104, 249,   0,  99],
       [ 24,  18,   1,   1,   1,  31,   3],
       [  0,   0,  31,   2,  10,   0,  52]], dtype=int64)

In [18]:
balanced_accuracy_score(y_test, y_pred)

0.5744605174605378

In [19]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.10      0.50      0.96      0.16      0.69      0.46        10
   10,000-14999       0.43      0.49      0.93      0.46      0.67      0.43       127
100,000-400,000       0.03      1.00      0.92      0.06      0.96      0.93         3
  15,000-24,999       0.67      0.59      0.84      0.63      0.71      0.49       439
  25,000-49,999       0.74      0.50      0.89      0.60      0.67      0.43       497
    5,000-9,999       0.51      0.39      0.97      0.44      0.62      0.36        79
  50,000-99,999       0.31      0.55      0.90      0.39      0.70      0.47        95

    avg / total       0.63      0.53      0.88      0.56      0.68      0.45      1250



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.09394590297994633, 'year'),
 (0.08896593232456786, 'miles'),
 (0.06463679499746655, 'engine_size'),
 (0.0245411833659355, 'fuel_type_Unleaded'),
 (0.023154176373687833, 'fuel_type_Premium Unleaded'),
 (0.02044200724534808, 'drivetrain_FWD'),
 (0.019861271704586334, 'drivetrain_4WD'),
 (0.019251057818901452, 'vehicle_type_Car'),
 (0.01879059644377965, 'engine_block_I'),
 (0.01863590556202437, 'engine_block_V'),
 (0.014348940136034072, 'body_type_SUV'),
 (0.014082075848076107, 'body_type_Pickup'),
 (0.013806404210756112, 'body_type_Sedan'),
 (0.013086902831347386, 'make_Ford'),
 (0.012257244461205925, 'drivetrain_RWD'),
 (0.010725164564818406, 'make_Toyota'),
 (0.010171231708228853, 'trim_Base'),
 (0.009438909013036732, 'vehicle_type_Truck'),
 (0.008440975221331417, 'make_Chevrolet'),
 (0.008055329312603941, 'body_type_Coupe'),
 (0.007955549717967515, 'trim_SE'),
 (0.007817946034557679, 'make_Nissan'),
 (0.0072642182529823775, 'fuel_type_E85 / Unleaded'),
 (0.00724026447588758, 'make

Looking at the feature importances we can see that year and miles has the most influence over price, which is then followed by enginesize, fuel type, and the drivetrain. This makes sense as the year and miles on a car play a big part in deciding if the car is worth what they are selling it for. While this model is informatiive, it doesn't answer the question of what make has most influence on price.

# BRFC Focused on Make
The next model is a BRFC with the used car dataset, except this time the data is cleaned to focus on the make of the car. This means we drop some of the columns such as body_type, fuel_type and more, so the feature importances highlights the makes that influence price.

In [21]:
clean_df = pd.read_csv('make_model.csv')

In [22]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_size', 'engine_block'])

In [23]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van
1,27921.0,7339.0,2018.0,BMW,i3,s
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST
...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT


In [24]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [25]:

make_df = clean_df.sample(n=5000)


In [26]:
make_df

Unnamed: 0,price,miles,year,make,model,trim
3640542,"15,000-24,999",35754.0,2020.0,Chevrolet,Equinox,LT
2497573,"25,000-49,999",43854.0,2017.0,Ford,F-150,XLT
1204748,"10,000-14999",92725.0,2014.0,Dodge,Grand Caravan,American Value Package
4598761,"50,000-99,999",51057.0,2019.0,Chevrolet,Silverado 2500HD,LTZ
4062198,"15,000-24,999",101787.0,2015.0,Jeep,Cherokee,Trailhawk
...,...,...,...,...,...,...
5941428,"25,000-49,999",23471.0,2019.0,RAM,Ram 1500 Pickup,Big Horn/Lone Star
4547085,"25,000-49,999",17064.0,2019.0,RAM,Ram 1500 Pickup,Big Horn/Lone Star
5826303,"25,000-49,999",66600.0,2017.0,Dodge,Durango,GT
3181381,"15,000-24,999",13.0,2019.0,Ford,Ecosport,SE


In [27]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [28]:
X.describe()

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,trim_s Grand Touring,trim_sDrive28i,trim_sDrive35i,trim_xDrive28d,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive40e,trim_xDrive50i
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52829.1544,2016.65,0.0142,0.002,0.0002,0.0164,0.0434,0.0002,0.0104,0.0116,...,0.0002,0.0002,0.0006,0.0004,0.002,0.0022,0.0004,0.0016,0.0002,0.0004
std,43278.869171,3.397324,0.118326,0.044681,0.014142,0.127021,0.203776,0.014142,0.101459,0.107088,...,0.014142,0.014142,0.02449,0.019998,0.044681,0.046857,0.019998,0.039972,0.014142,0.019998
min,0.0,1999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22893.75,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39080.0,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,74024.75,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,310192.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1018)

In [30]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [31]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  7,   1,   0,   0,   0,   2,   0],
       [ 13,  67,   5,  25,   0,  51,   0],
       [  0,   0,   4,   0,   0,   0,   0],
       [  5,  87,  18, 223,  34,  25,  19],
       [  1,  28,  55, 108, 214,   5,  91],
       [ 19,  13,   2,   2,   0,  44,   0],
       [  0,   2,  19,   1,  15,   0,  45]], dtype=int64)

In [32]:
balanced_accuracy_score(y_test, y_pred)

0.5976862074672545

In [33]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.16      0.70      0.97      0.25      0.82      0.66        10
   10,000-14999       0.34      0.42      0.88      0.37      0.61      0.35       161
100,000-400,000       0.04      1.00      0.92      0.07      0.96      0.93         4
  15,000-24,999       0.62      0.54      0.84      0.58      0.67      0.44       411
  25,000-49,999       0.81      0.43      0.93      0.56      0.63      0.38       502
    5,000-9,999       0.35      0.55      0.93      0.43      0.71      0.49        80
  50,000-99,999       0.29      0.55      0.91      0.38      0.71      0.48        82

    avg / total       0.62      0.48      0.89      0.52      0.65      0.41      1250



In [34]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.13401994463029213, 'miles'),
 (0.10974095002207085, 'year'),
 (0.013500016278633791, 'make_Mercedes-Benz'),
 (0.013495838209286935, 'trim_Base'),
 (0.013052564631885337, 'make_Ford'),
 (0.012485967247521308, 'make_Chevrolet'),
 (0.011360106564712515, 'make_Honda'),
 (0.011105330611009401, 'trim_SE'),
 (0.010818506644296802, 'make_Toyota'),
 (0.008669566867271508, 'make_Nissan'),
 (0.008288536840559305, 'trim_LX'),
 (0.00817759200931279, 'make_Jeep'),
 (0.008161433613304556, 'make_BMW'),
 (0.007863597820631849, 'trim_S'),
 (0.007421946518717577, 'trim_Limited'),
 (0.007300052485573562, 'make_GMC'),
 (0.00729341133993624, 'make_Dodge'),
 (0.007050083856341386, 'make_Porsche'),
 (0.006740080701081175, 'model_911'),
 (0.00673615821342588, 'model_F-150'),
 (0.006359882481837229, 'make_Kia'),
 (0.006162045566053071, 'make_Hyundai'),
 (0.005636620295054441, 'make_RAM'),
 (0.005545148768298139, 'trim_EX'),
 (0.005513463798182704, 'trim_1LT'),
 (0.005456654532010472, 'model_Ram 1500 Pickup'

# Luxury Makes
After the model focusing on makes, the next step was to process the data even more so that the makes are split on luxury and non luxury makes. This is because the pricing and quality of luxury makes create a bias when comparing to regular makes.

In [35]:
clean_df = pd.read_csv('luxury_makes.csv')

In [36]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [37]:
clean_df = clean_df.dropna()

In [38]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [39]:
clean_df['make'].unique()

array(['BMW', 'Mercedes-Benz', 'Lexus', 'Porsche', 'Audi', 'Lamborghini',
       'Alfa Romeo', 'Ferrari', 'Maserati', 'Jaguar', 'Aston Martin',
       'Bentley', 'Land Rover', 'Cadillac', 'Rolls-Royce', 'INFINITI',
       'Maybach', 'Acura', 'Fisker', 'McLaren', 'Lotus'], dtype=object)

In [40]:

luxury_make_df = clean_df.sample(n=5000)

In [41]:
X = pd.get_dummies(luxury_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = luxury_make_df['price']

In [42]:
X

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Cadillac,make_Ferrari,...,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive40e,trim_xDrive48i,trim_xDrive50i,transmission_Automatic,transmission_Manual
9932,32891.0,2019.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
89993,39600.0,2018.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
840273,124944.0,2007.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1050898,43719.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
891833,11469.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92174,22336.0,2017.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1057277,41561.0,2017.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
422294,43851.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
114611,19075.0,2018.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 559)

In [44]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

ValueError: Input contains NaN

In [None]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

# Non-Luxury Makes

In [45]:
clean_df = pd.read_csv('regular_makes.csv')

In [46]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [47]:
clean_df = clean_df.dropna()

In [48]:
clean_df['make'].unique()

array(['Chevrolet', 'Mitsubishi', 'Dodge', 'RAM', 'Ford', 'Mercury',
       'GMC', 'smart', 'Jeep', 'Pontiac', 'Volvo', 'Scion', 'Buick',
       'Toyota', 'Lincoln', 'Honda', 'FIAT', 'Saturn', 'Oldsmobile',
       'Kia', 'Chrysler', 'Saab', 'Volkswagen', 'Isuzu', 'Subaru',
       'Am General', 'Hummer', 'Mazda', 'MINI', 'Hyundai', 'Nissan',
       'Suzuki', 'GENESIS', 'KARMA', 'Plymouth', 'Geo', 'Eagle'],
      dtype=object)

In [49]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [50]:
regular_make_df = clean_df.sample(n=5000)

In [51]:
X = pd.get_dummies(regular_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = regular_make_df['price']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 716)

In [53]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [54]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  5,   1,   1,   0,   0,   3,   0],
       [ 12, 108,   2,   6,   9,  18,   3],
       [  0,   0,   2,   0,   0,   0,   0],
       [ 10, 113,  40, 111, 163,  12,  43],
       [  1,  41,  79,  31, 190,   3,  73],
       [ 43,  24,   1,   0,   0,  32,   1],
       [  0,   4,  21,   3,  16,   0,  25]], dtype=int64)

In [55]:
balanced_accuracy_score(y_test, y_pred)

0.5061214340269301

In [56]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.07      0.50      0.95      0.12      0.69      0.45        10
   10,000-14999       0.37      0.68      0.83      0.48      0.75      0.56       158
100,000-400,000       0.01      1.00      0.88      0.03      0.94      0.89         2
  15,000-24,999       0.74      0.23      0.95      0.35      0.46      0.20       492
  25,000-49,999       0.50      0.45      0.77      0.48      0.59      0.34       418
    5,000-9,999       0.47      0.32      0.97      0.38      0.55      0.29       101
  50,000-99,999       0.17      0.36      0.90      0.23      0.57      0.31        69

    avg / total       0.55      0.38      0.87      0.40      0.56      0.31      1250



In [57]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.12951188496544289, 'miles'),
 (0.10980796489652513, 'year'),
 (0.032971074537560895, 'make_Chevrolet'),
 (0.026461198616886374, 'model_Corvette'),
 (0.023662342412183332, 'make_Ford'),
 (0.01843895961138068, 'make_Toyota'),
 (0.017231509950165345, 'transmission_Automatic'),
 (0.015411531279255626, 'model_Ram 1500 Pickup'),
 (0.015297946997911769, 'model_F-150'),
 (0.014702922053003172, 'trim_3ZR'),
 (0.014480654578421175, 'trim_2LT'),
 (0.01387947540826918, 'make_Honda'),
 (0.013672090058439086, 'make_Dodge'),
 (0.01255989877570844, 'transmission_Manual'),
 (0.012174947494564553, 'trim_Base'),
 (0.011796563953845748, 'make_RAM'),
 (0.011485384961557554, 'make_GMC'),
 (0.011170870794059285, 'make_Nissan'),
 (0.010860555463286665, 'trim_SE'),
 (0.01016186666613428, 'trim_S'),
 (0.009633049233344255, 'trim_LX'),
 (0.008899776640824832, 'make_Jeep'),
 (0.00888603507280981, 'model_F-250 Super Duty'),
 (0.008211014638543733, 'trim_XLT'),
 (0.007854672943799032, 'model_Silverado 1500'),
 

In [66]:
db_string = f"postgresql://postgres:{password}@localhost:5432/final_project_db"

In [67]:
engine=create_engine(db_string)
clean_df.to_sql(name='used_car_sample', con=engine)
make_df.to_sql(name='make_sample', con=engine)
# luxury_make_df.to_sql(name='luxury_sample', con=engine)
regular_make_df.to_sql(name='regular_sample', con=engine)

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  password authentication failed for user "postgres"

(Background on this error at: https://sqlalche.me/e/14/e3q8)