# Supervised Learning Model
This notebook contains supervised learning model using Balanced Random Forest Classifier to see what features from the used car dataset influence the target of price.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from config import password
import psycopg2

The first model used is BRFC used on all features besides identifier and geographical columns. The BRFC is then used on a sample of 5000. It is also worth mentioning that the price column is converted into 7 seperate bins to reduce the amount of unique prices.

In [2]:
file_path = 'Used_Cars.csv'
df = pd.read_csv(file_path)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
0,0,38b2f52e-8f5d,1GCWGFCF3F1284719,20998.0,115879.0,W1T503168C,2015.0,Chevrolet,Express Cargo,Work Van,...,RWD,Automatic,E85 / Unleaded,4.8,V,nissan ellicott city,8569 Baltimore National Pike,Ellicott City,MD,21043
1,1,97ba4955-ccf0,WBY7Z8C59JVB87514,27921.0,7339.0,P33243,2018.0,BMW,i3,s,...,RWD,Automatic,Electric / Premium Unleaded,0.6,I,hendrick honda pompano beach,5381 N Federal Highway,Pompano Beach,FL,33064
2,2,be1da9fd-0f34,ML32F4FJ2JHF10325,11055.0,39798.0,WM2091A,2018.0,Mitsubishi,Mirage G4,SE,...,FWD,Automatic,Unleaded,1.2,I,russ darrow toyota,2700 West Washington St.,West Bend,WI,53095
3,3,84327e45-6cb6,1GCPTEE15K1291189,52997.0,28568.0,9U2Y425A,2019.0,Chevrolet,Colorado,ZR2,...,4WD,Automatic,Diesel,2.8,I,young kia,308 North Main Street,Layton,UT,84041
4,6,43847b9a-6fed,1B7HC16Y8YS543285,3995.0,137537.0,BP8246A,2000.0,Dodge,Ram Pickup,ST,...,RWD,Manual,Unleaded,5.2,V,baumann auto group,2379 W. State St.,Fremont,OH,43420


In [3]:
clean_df = df.drop(columns=['Unnamed: 0', 'id', 'vin', 'seller_name', 'street', 'stock_no', 'city', 'state', 'zip',])
clean_df.drop(clean_df.loc[clean_df['miles'] == 0].index, inplace=True)
clean_df = clean_df.sample(n=70000)
clean_df.head()

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
2540148,31700.0,26014.0,2018.0,Lexus,IS,300,Sedan,Car,4WD,Automatic,Premium Unleaded,3.5,V
1094832,41250.0,51102.0,2016.0,Ford,F-150,King Ranch,Pickup,Truck,4WD,Automatic,Unleaded,3.5,V
6144656,67995.0,55302.0,2017.0,GMC,Sierra 3500 Denali HD,Denali,Pickup,Truck,4WD,Automatic,Diesel,6.6,V
2445346,12999.0,96495.0,2016.0,Nissan,Altima,SL,Sedan,Car,FWD,Automatic,Unleaded,2.5,I
1926298,13358.0,134147.0,2010.0,Ford,Explorer Sport Trac,Limited,Pickup,Truck,RWD,Automatic,Unleaded,4.0,V


In [4]:
clean_df.describe()

Unnamed: 0,price,miles,year,engine_size
count,70000.0,70000.0,70000.0,70000.0
mean,27827.648614,52648.9666,2016.672929,3.081884
std,17466.978765,42930.038095,3.328332,1.331415
min,1295.0,1.0,1994.0,0.6
25%,17254.75,22611.0,2016.0,2.0
50%,24549.5,39308.0,2018.0,2.5
75%,34995.0,73910.75,2019.0,3.6
max,619988.0,999999.0,2022.0,8.4


In [5]:
bins = [0, 5000, 10000, 15000, 25000, 50000, 100000, 400000]
labels = ['0-4,999', '5,000-9,999', '10,000-14,999', '15,000-24,999', '25,000-49,999', '50,000-99,999', '100,000-400,000']

In [6]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [7]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
2540148,"25,000-49,999",26014.0,2018.0,Lexus,IS,300,Sedan,Car,4WD,Automatic,Premium Unleaded,3.5,V
1094832,"25,000-49,999",51102.0,2016.0,Ford,F-150,King Ranch,Pickup,Truck,4WD,Automatic,Unleaded,3.5,V
6144656,"50,000-99,999",55302.0,2017.0,GMC,Sierra 3500 Denali HD,Denali,Pickup,Truck,4WD,Automatic,Diesel,6.6,V
2445346,"10,000-14,999",96495.0,2016.0,Nissan,Altima,SL,Sedan,Car,FWD,Automatic,Unleaded,2.5,I
1926298,"10,000-14,999",134147.0,2010.0,Ford,Explorer Sport Trac,Limited,Pickup,Truck,RWD,Automatic,Unleaded,4.0,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5086392,"15,000-24,999",38892.0,2018.0,Chevrolet,Cruze,LT,Sedan,Car,FWD,Automatic,Unleaded,1.4,I
4885559,"25,000-49,999",4707.0,2020.0,Ford,Fusion,Titanium,Sedan,Car,FWD,Automatic,Unleaded,2.0,I
4810944,"25,000-49,999",889.0,2018.0,Volvo,S90,Inscription,Sedan,Car,4WD,Automatic,Electric / Premium Unleaded,2.0,I
2368102,"15,000-24,999",70487.0,2017.0,Mazda,CX-5,Touring,Crossover,Truck,FWD,Automatic,Unleaded,2.5,I


In [8]:
clean_df.nunique()

price               7
miles           49249
year               27
make               51
model             763
trim             1107
body_type          19
vehicle_type        2
drivetrain          3
transmission        2
fuel_type          26
engine_size        64
engine_block        3
dtype: int64

In [9]:
clean_df['price'].describe()

count             69992
unique                7
top       25,000-49,999
freq              28520
Name: price, dtype: object

In [10]:
clean_df = clean_df.dropna()

In [11]:
drop_columns = ['price_0-4,999', 'price_5,000-9,999', 'price_10,000-14,999', 'price_15,000-24,999', 'price_25,000-49,999', 'price_50,000-99,999', 'price_100,000-400,000']

In [12]:
X = pd.get_dummies(clean_df)
X = X.drop(columns=drop_columns)
y = clean_df['price']

In [13]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Buick,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
2540148,26014.0,2018.0,3.5,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1094832,51102.0,2016.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
6144656,55302.0,2017.0,6.6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2445346,96495.0,2016.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1926298,134147.0,2010.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5086392,38892.0,2018.0,1.4,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4885559,4707.0,2020.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4810944,889.0,2018.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2368102,70487.0,2017.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [14]:
X.describe()

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Buick,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
count,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,...,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0,69992.0
mean,52654.530475,2016.672663,3.081556,0.014216,0.001414,0.000229,0.018716,0.037919,0.000229,0.012601,...,0.017816,0.207552,0.000286,0.0013,0.665933,0.000229,0.000386,0.029075,0.510658,0.460267
std,42929.312299,3.328408,1.331078,0.118381,0.037583,0.015118,0.135523,0.191001,0.015118,0.111547,...,0.132284,0.405557,0.016902,0.036034,0.471667,0.015118,0.019637,0.168017,0.49989,0.498422
min,1.0,1994.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22616.75,2016.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39314.0,2018.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,73920.5,2019.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,999999.0,2022.0,8.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(52494, 1975)

In [16]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [17]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 146,    6,    0,    1,    0,   35,    0],
       [  56, 1268,   14,  169,   32,  344,   18],
       [   0,    0,   74,    0,    0,    0,    3],
       [  36,  893,   44, 3988,  848,   90,   41],
       [   2,   17,  138,  900, 5182,    9,  839],
       [ 270,  168,    1,    7,    2,  669,    1],
       [   0,    0,  114,    1,   91,    0,  981]], dtype=int64)

In [18]:
balanced_accuracy_score(y_test, y_pred)

0.7474391042967633

In [19]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.29      0.78      0.98      0.42      0.87      0.74       188
  10,000-14,999       0.54      0.67      0.93      0.60      0.79      0.60      1901
100,000-400,000       0.19      0.96      0.98      0.32      0.97      0.94        77
  15,000-24,999       0.79      0.67      0.91      0.72      0.78      0.59      5940
  25,000-49,999       0.84      0.73      0.91      0.78      0.81      0.65      7087
    5,000-9,999       0.58      0.60      0.97      0.59      0.76      0.56      1118
  50,000-99,999       0.52      0.83      0.94      0.64      0.88      0.77      1187

    avg / total       0.74      0.70      0.92      0.71      0.80      0.63     17498



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.1463915055871372, 'miles'),
 (0.1295798418425014, 'year'),
 (0.06959122656373279, 'engine_size'),
 (0.021097221967439392, 'fuel_type_Unleaded'),
 (0.019755572306794464, 'fuel_type_Premium Unleaded'),
 (0.017911019028100855, 'engine_block_V'),
 (0.017437344554690443, 'drivetrain_FWD'),
 (0.01479968386002788, 'drivetrain_4WD'),
 (0.014472893065959298, 'engine_block_I'),
 (0.010431353607830618, 'trim_Base'),
 (0.010157349953460133, 'body_type_Sedan'),
 (0.009980844048508889, 'vehicle_type_Car'),
 (0.009326275220094197, 'vehicle_type_Truck'),
 (0.009073734393920986, 'drivetrain_RWD'),
 (0.008643120659605848, 'body_type_Pickup'),
 (0.008168976098740607, 'body_type_SUV'),
 (0.008034081946488105, 'transmission_Automatic'),
 (0.0080017845227511, 'transmission_Manual'),
 (0.007751964623634778, 'make_Ford'),
 (0.006751623323356918, 'trim_SE'),
 (0.006751188867091176, 'fuel_type_Diesel'),
 (0.0060482228440595165, 'make_Chevrolet'),
 (0.005973218494340114, 'make_Toyota'),
 (0.00590113012162823

Looking at the feature importances we can see that year and miles has the most influence over price, which is then followed by enginesize, fuel type, and the drivetrain. This makes sense as the year and miles on a car play a big part in deciding if the car is worth what they are selling it for. While this model is informatiive, it doesn't answer the question of what make has most influence on price.

# BRFC Focused on Make
The next model is a BRFC with the used car dataset, except this time the data is cleaned to focus on the make of the car. This means we drop some of the columns such as body_type, fuel_type and more, so the feature importances highlights the makes that influence price.

In [21]:
clean_df = pd.read_csv('make_model.csv')

In [22]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_size', 'engine_block'])

In [23]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van
1,27921.0,7339.0,2018.0,BMW,i3,s
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST
...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT


In [24]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [25]:
make_df = clean_df.sample(n=70000)

In [26]:
make_df = make_df.dropna()
make_df

Unnamed: 0,price,miles,year,make,model,trim
1036927,"15,000-24,999",91554.0,2018.0,Ford,Flex,Limited
1567326,"15,000-24,999",3992.0,2020.0,Ford,Ecosport,SE
1835553,"15,000-24,999",45413.0,2018.0,Hyundai,Elantra,Value Edition
5810767,"10,000-14,999",71238.0,2012.0,Cadillac,SRX,Luxury Collection
187676,"25,000-49,999",10985.0,2019.0,Subaru,WRX,Base
...,...,...,...,...,...,...
1187566,"10,000-14,999",88526.0,2016.0,Dodge,Journey,SXT
855380,"15,000-24,999",24346.0,2019.0,Toyota,Camry,SE
4533295,"25,000-49,999",16027.0,2019.0,Toyota,Tundra,SR5
2249600,"15,000-24,999",24611.0,2018.0,Honda,CR-V,LX


In [27]:
make_df.isnull().values.sum()

0

In [28]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [29]:
X.describe()

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,trim_tS,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive35i Sport Activity,trim_xDrive40e,trim_xDrive48i,trim_xDrive50i
count,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,...,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0,69997.0
mean,52545.08,2016.684358,0.014443,0.001429,0.0001,0.017972,0.038916,0.000143,0.012372,0.014515,...,2.9e-05,0.001171,0.002157,0.000314,0.002429,8.6e-05,1.4e-05,0.000429,1.4e-05,0.000543
std,43263.72,3.315897,0.119311,0.037771,0.01,0.132851,0.193396,0.011952,0.11054,0.119601,...,0.005345,0.034207,0.046396,0.017726,0.049222,0.009258,0.00378,0.020698,0.00378,0.023294
min,0.0,1991.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22558.0,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39404.0,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,73911.0,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1111111.0,2022.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(52497, 1896)

In [31]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [32]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 150,    3,    0,    2,    0,   18,    0],
       [ 102, 1238,    3,  170,   21,  338,    3],
       [   0,    0,   68,    1,    0,    0,    3],
       [  59, 1081,   32, 3874,  558,  138,   58],
       [  12,  197,  135,  922, 4947,   26, 1054],
       [ 335,  181,    0,    9,    2,  594,    2],
       [   0,    3,   87,    8,   76,    2,  988]], dtype=int64)

In [33]:
balanced_accuracy_score(y_test, y_pred)

0.7422504908964832

In [34]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.23      0.87      0.97      0.36      0.92      0.83       173
  10,000-14,999       0.46      0.66      0.91      0.54      0.77      0.58      1875
100,000-400,000       0.21      0.94      0.99      0.34      0.96      0.93        72
  15,000-24,999       0.78      0.67      0.90      0.72      0.78      0.59      5800
  25,000-49,999       0.88      0.68      0.94      0.77      0.80      0.62      7293
    5,000-9,999       0.53      0.53      0.97      0.53      0.72      0.49      1123
  50,000-99,999       0.47      0.85      0.93      0.60      0.89      0.78      1164

    avg / total       0.74      0.68      0.92      0.69      0.79      0.61     17500



In [35]:
# List the features sorted in descending order by feature importance
make_dict = sorted(zip(clf.feature_importances_, X.columns), reverse=True)
make_dict

[(0.19423778322109211, 'miles'),
 (0.1566970908148274, 'year'),
 (0.012492461866693476, 'make_Porsche'),
 (0.011087498200271743, 'trim_Base'),
 (0.00970042874014797, 'make_Mercedes-Benz'),
 (0.008642601548119664, 'trim_SE'),
 (0.007090999721443319, 'model_911'),
 (0.006962692267791231, 'model_Corvette'),
 (0.0060709332346621, 'trim_S'),
 (0.005875068880230954, 'make_Ford'),
 (0.00569443046672021, 'trim_Limited'),
 (0.005673941963976561, 'model_G-Class'),
 (0.005415344812787318, 'make_Toyota'),
 (0.005211488609855929, 'make_Nissan'),
 (0.0049776246937308045, 'trim_LX'),
 (0.0048361641040257156, 'make_Honda'),
 (0.004828471848960021, 'make_GMC'),
 (0.004701671879638847, 'make_Hyundai'),
 (0.0046302621666952145, 'model_Silverado 1500'),
 (0.004543747690627085, 'model_F-150'),
 (0.004450066679374592, 'model_Equinox'),
 (0.004322760867116932, 'trim_Sport'),
 (0.004278105338120597, 'make_Cadillac'),
 (0.004243688760120556, 'trim_LS'),
 (0.004217440042630287, 'trim_XLT'),
 (0.0040820528876654

# Luxury Makes
After the model focusing on makes, the next step was to process the data even more so that the makes are split on luxury and non luxury makes. This is because the pricing and quality of luxury makes create a bias when comparing to regular makes.

In [36]:
clean_df = pd.read_csv('luxury_makes.csv')

In [37]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [38]:
clean_df = clean_df.dropna()

In [39]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [40]:
clean_df['make'].unique()

array(['BMW', 'Mercedes-Benz', 'Lexus', 'Porsche', 'Audi', 'Lamborghini',
       'Alfa Romeo', 'Ferrari', 'Maserati', 'Jaguar', 'Aston Martin',
       'Bentley', 'Land Rover', 'Cadillac', 'Rolls-Royce', 'INFINITI',
       'Maybach', 'Acura', 'Fisker', 'McLaren', 'Lotus'], dtype=object)

In [41]:
luxury_make_df = clean_df.sample(n=70000)

In [42]:
luxury_make_df = luxury_make_df.dropna()

In [43]:
X = pd.get_dummies(luxury_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = luxury_make_df['price']

In [44]:
X

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Cadillac,make_Ferrari,...,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive35i Sport Activity,trim_xDrive40e,trim_xDrive48i,trim_xDrive50i,transmission_Automatic,transmission_Manual
423656,22891.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
735937,38947.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1092586,38090.0,2014.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
270636,140563.0,2012.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
446396,1341.0,2021.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032796,56092.0,2016.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
894744,57512.0,2012.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
207452,26140.0,2017.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
527526,42465.0,2018.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(52482, 954)

In [46]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [47]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  41,    0,    0,    0,    1,   11,    0],
       [  29,  586,    0,   58,    0,  116,    1],
       [   0,    0,  307,    0,    0,    0,   13],
       [  17,  532,    5, 2138,  210,   28,   22],
       [  15,   76,  181, 1287, 7622,   12, 1176],
       [ 103,   62,    0,    0,    0,  280,    0],
       [   0,    3,  241,   23,  218,    1, 2079]], dtype=int64)

In [48]:
balanced_accuracy_score(y_test, y_pred)

0.7676860436392449

In [49]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.20      0.77      0.99      0.32      0.88      0.75        53
  10,000-14,999       0.47      0.74      0.96      0.57      0.84      0.70       790
100,000-400,000       0.42      0.96      0.98      0.58      0.97      0.93       320
  15,000-24,999       0.61      0.72      0.91      0.66      0.81      0.64      2952
  25,000-49,999       0.95      0.74      0.94      0.83      0.83      0.68     10369
    5,000-9,999       0.62      0.63      0.99      0.63      0.79      0.60       445
  50,000-99,999       0.63      0.81      0.92      0.71      0.86      0.74      2565

    avg / total       0.80      0.75      0.93      0.76      0.83      0.68     17494



In [50]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.23133207916877274, 'miles'),
 (0.2007073678383578, 'year'),
 (0.018927657695716575, 'trim_Base'),
 (0.011386184962354174, 'make_Porsche'),
 (0.008379231741499668, 'trim_300'),
 (0.007864654085507336, 'transmission_Automatic'),
 (0.007844364013128281, 'model_911'),
 (0.007817624695109526, 'make_Lexus'),
 (0.007622051585642476, 'make_BMW'),
 (0.0074527757778625895, 'make_Acura'),
 (0.006924800546200745, 'trim_350'),
 (0.00691076618728705, 'model_G-Class'),
 (0.006811069467987801, 'make_INFINITI'),
 (0.0065624684897543275, 'make_Cadillac'),
 (0.006330851089345984, 'trim_Premium'),
 (0.0063219160266074865, 'transmission_Manual'),
 (0.006066760587083725, 'make_Land Rover'),
 (0.006047697217167418, 'make_Mercedes-Benz'),
 (0.005783238950459013, 'make_Audi'),
 (0.005576701080422684, 'model_3 Series'),
 (0.005373314405317104, 'make_Ferrari'),
 (0.005029100983786311, 'model_Escalade'),
 (0.004554703840202048, 'model_Range Rover'),
 (0.004199045159140861, 'model_RX'),
 (0.0039768100335295, '

# Non-Luxury Makes

In [51]:
clean_df = pd.read_csv('regular_makes.csv')

In [52]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [53]:
clean_df = clean_df.dropna()

In [54]:
clean_df['make'].unique()

array(['Chevrolet', 'Mitsubishi', 'Dodge', 'RAM', 'Ford', 'Mercury',
       'GMC', 'smart', 'Jeep', 'Pontiac', 'Volvo', 'Scion', 'Buick',
       'Toyota', 'Lincoln', 'Honda', 'FIAT', 'Saturn', 'Oldsmobile',
       'Kia', 'Chrysler', 'Saab', 'Volkswagen', 'Isuzu', 'Subaru',
       'Am General', 'Hummer', 'Mazda', 'MINI', 'Hyundai', 'Nissan',
       'Suzuki', 'GENESIS', 'KARMA', 'Plymouth', 'Geo', 'Eagle'],
      dtype=object)

In [55]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [56]:
regular_make_df = clean_df.sample(n=70000)

In [57]:
regular_make_df = regular_make_df.dropna()

In [58]:
X = pd.get_dummies(regular_make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14,999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = regular_make_df['price']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(52500, 1259)

In [60]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [61]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 165,    5,    0,    2,    1,   20,    0],
       [ 138, 1116,    1,  311,   36,  563,   13],
       [   0,    0,   12,    0,    0,    0,    0],
       [  96,  877,   55, 4131, 1003,  287,  193],
       [  32,  167,  167,  960, 3573,   60, 1320],
       [ 432,  204,    0,   11,    3,  678,    0],
       [   1,    3,   50,   12,   71,    0,  731]], dtype=int64)

In [62]:
balanced_accuracy_score(y_test, y_pred)

0.7015739881154579

In [63]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.19      0.85      0.96      0.31      0.91      0.81       193
  10,000-14,999       0.47      0.51      0.92      0.49      0.69      0.45      2178
100,000-400,000       0.04      1.00      0.98      0.08      0.99      0.99        12
  15,000-24,999       0.76      0.62      0.88      0.68      0.74      0.53      6642
  25,000-49,999       0.76      0.57      0.90      0.65      0.72      0.50      6279
    5,000-9,999       0.42      0.51      0.94      0.46      0.69      0.46      1328
  50,000-99,999       0.32      0.84      0.91      0.47      0.87      0.76       868

    avg / total       0.67      0.59      0.90      0.62      0.73      0.52     17500



In [64]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.152785083756469, 'miles'),
 (0.14802311291679057, 'year'),
 (0.032333265411882924, 'model_Corvette'),
 (0.015590972610530763, 'make_Chevrolet'),
 (0.014289549654389971, 'transmission_Manual'),
 (0.013447557782711702, 'transmission_Automatic'),
 (0.012399669967019465, 'make_Ford'),
 (0.011156479417564025, 'trim_SE'),
 (0.010128275173448045, 'trim_3LT'),
 (0.009208487344630869, 'make_RAM'),
 (0.008819086249392306, 'make_Toyota'),
 (0.008818770392786422, 'trim_TRX'),
 (0.00873344814285008, 'trim_Limited'),
 (0.008085773563074439, 'make_Nissan'),
 (0.007949911628300546, 'model_Silverado 1500'),
 (0.007944253070206147, 'make_Honda'),
 (0.007882452017780596, 'model_Ram 1500 Pickup'),
 (0.007702330055444665, 'make_GMC'),
 (0.007304738175790921, 'model_F-150'),
 (0.00717370721148962, 'trim_S'),
 (0.007049146086162185, 'trim_Base'),
 (0.0068065855788983, 'trim_XLT'),
 (0.006482008262857437, 'make_Kia'),
 (0.006358617914326335, 'trim_1LT'),
 (0.006239845337508998, 'make_Hyundai'),
 (0.006140

# Database

In [65]:
db_string = f"postgresql://postgres:{password}@localhost:5432/final_project_db"

In [66]:
engine=create_engine(db_string)

In [67]:
clean_df.to_sql(name='used_car_sample', con=engine, if_exists='replace')
make_df.to_sql(name='make_sample', con=engine, if_exists='replace')
luxury_make_df.to_sql(name='luxury_sample', con=engine, if_exists='replace')
regular_make_df.to_sql(name='regular_sample', con=engine, if_exists='replace')