# Supervised Learning Model
This notebook contains supervised learning model using Balanced Random Forest Classifier to see what features from the used car dataset influence the target of price.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

The first model used is BRFC used on all features besides identifier and geographical columns. The BRFC is then used on a sample of 5000. It is also worth mentioning that the price column is converted into 7 seperate bins to reduce the amount of unique prices.

In [2]:
file_path = 'Used_Cars.csv'
df = pd.read_csv(file_path)
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0.1,Unnamed: 0,id,vin,price,miles,stock_no,year,make,model,trim,...,drivetrain,transmission,fuel_type,engine_size,engine_block,seller_name,street,city,state,zip
0,0,38b2f52e-8f5d,1GCWGFCF3F1284719,20998.0,115879.0,W1T503168C,2015.0,Chevrolet,Express Cargo,Work Van,...,RWD,Automatic,E85 / Unleaded,4.8,V,nissan ellicott city,8569 Baltimore National Pike,Ellicott City,MD,21043
1,1,97ba4955-ccf0,WBY7Z8C59JVB87514,27921.0,7339.0,P33243,2018.0,BMW,i3,s,...,RWD,Automatic,Electric / Premium Unleaded,0.6,I,hendrick honda pompano beach,5381 N Federal Highway,Pompano Beach,FL,33064
2,2,be1da9fd-0f34,ML32F4FJ2JHF10325,11055.0,39798.0,WM2091A,2018.0,Mitsubishi,Mirage G4,SE,...,FWD,Automatic,Unleaded,1.2,I,russ darrow toyota,2700 West Washington St.,West Bend,WI,53095
3,3,84327e45-6cb6,1GCPTEE15K1291189,52997.0,28568.0,9U2Y425A,2019.0,Chevrolet,Colorado,ZR2,...,4WD,Automatic,Diesel,2.8,I,young kia,308 North Main Street,Layton,UT,84041
4,6,43847b9a-6fed,1B7HC16Y8YS543285,3995.0,137537.0,BP8246A,2000.0,Dodge,Ram Pickup,ST,...,RWD,Manual,Unleaded,5.2,V,baumann auto group,2379 W. State St.,Fremont,OH,43420


In [3]:
clean_df = df.drop(columns=['Unnamed: 0', 'id', 'vin', 'seller_name', 'street', 'stock_no', 'city', 'state', 'zip',])
clean_df.drop(clean_df.loc[clean_df['miles'] == 0].index, inplace=True)
clean_df = clean_df.sample(n=5000)
clean_df.head()

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
1559219,15995.0,82102.0,2012.0,BMW,7 Series,750i,Sedan,Car,RWD,Automatic,Premium Unleaded,4.4,V
3316495,24991.0,14411.0,2020.0,Honda,Civic,Sport,Sedan,Car,FWD,Automatic,Unleaded,2.0,I
3979927,19038.0,31191.0,2018.0,Nissan,Rogue,S,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
4093102,27462.0,31738.0,2018.0,Nissan,Pathfinder,SL,SUV,Truck,FWD,Automatic,Unleaded,3.5,V
994957,7481.0,170632.0,2010.0,Nissan,Murano,SL,Crossover,Truck,4WD,Automatic,Unleaded,3.5,V


In [4]:
clean_df.describe()

Unnamed: 0,price,miles,year,engine_size
count,5000.0,5000.0,5000.0,5000.0
mean,27789.0054,52882.846,2016.6038,3.08504
std,17947.566589,43252.89107,3.443428,1.332033
min,2000.0,1.0,1997.0,0.6
25%,16999.0,22136.75,2015.0,2.0
50%,24326.0,39361.5,2018.0,2.5
75%,34948.25,74299.5,2019.0,3.6
max,388800.0,371298.0,2021.0,8.1


In [5]:
bins = [0, 5000, 10000, 15000, 25000, 50000, 100000, 400000]
labels = ['0-4,999', '5,000-9,999', '10,000-14999', '15,000-24,999', '25,000-49,999', '50,000-99,999', '100,000-400,000']

In [6]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )


In [7]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim,body_type,vehicle_type,drivetrain,transmission,fuel_type,engine_size,engine_block
1559219,"15,000-24,999",82102.0,2012.0,BMW,7 Series,750i,Sedan,Car,RWD,Automatic,Premium Unleaded,4.4,V
3316495,"15,000-24,999",14411.0,2020.0,Honda,Civic,Sport,Sedan,Car,FWD,Automatic,Unleaded,2.0,I
3979927,"15,000-24,999",31191.0,2018.0,Nissan,Rogue,S,SUV,Truck,4WD,Automatic,Unleaded,2.5,I
4093102,"25,000-49,999",31738.0,2018.0,Nissan,Pathfinder,SL,SUV,Truck,FWD,Automatic,Unleaded,3.5,V
994957,"5,000-9,999",170632.0,2010.0,Nissan,Murano,SL,Crossover,Truck,4WD,Automatic,Unleaded,3.5,V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352435,"50,000-99,999",9796.0,2020.0,Lexus,GX,PREMIUM,SUV,Truck,4WD,Automatic,Premium Unleaded,4.6,V
5649346,"10,000-14999",85034.0,2010.0,Toyota,Avalon,XLS,Sedan,Car,FWD,Automatic,Unleaded,3.5,V
4558149,"10,000-14999",165779.0,2009.0,Dodge,Ram 1500 Pickup,Laramie,Pickup,Truck,4WD,Automatic,E85 / Unleaded; Unleaded,5.7,V
5413560,"15,000-24,999",24915.0,2018.0,Kia,Sportage,EX,SUV,Truck,4WD,Automatic,Unleaded,2.4,I


In [8]:
clean_df.nunique()

price              7
miles           4837
year              25
make              44
model            473
trim             494
body_type         17
vehicle_type       2
drivetrain         3
transmission       2
fuel_type         14
engine_size       52
engine_block       3
dtype: int64

In [9]:
clean_df['price'].describe()

count              5000
unique                7
top       25,000-49,999
freq               1997
Name: price, dtype: object

In [10]:
clean_df = clean_df.dropna()

In [11]:
drop_columns = ['price_0-4,999', 'price_5,000-9,999', 'price_10,000-14999', 'price_15,000-24,999', 'price_25,000-49,999', 'price_50,000-99,999', 'price_100,000-400,000']

In [12]:

X = pd.get_dummies(clean_df)
X = X.drop(columns=drop_columns)
y = clean_df['price']

In [13]:
X

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Buick,make_Cadillac,make_Chevrolet,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
1559219,82102.0,2012.0,4.4,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3316495,14411.0,2020.0,2.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3979927,31191.0,2018.0,2.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
4093102,31738.0,2018.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
994957,170632.0,2010.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352435,9796.0,2020.0,4.6,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
5649346,85034.0,2010.0,3.5,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4558149,165779.0,2009.0,5.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5413560,24915.0,2018.0,2.4,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [14]:
X.describe()

Unnamed: 0,miles,year,engine_size,make_Acura,make_Alfa Romeo,make_Audi,make_BMW,make_Buick,make_Cadillac,make_Chevrolet,...,fuel_type_Electric / Unleaded,fuel_type_Premium Unleaded,fuel_type_Premium Unleaded / Unleaded,fuel_type_Premium Unleaded; Unleaded,fuel_type_Unleaded,fuel_type_Unleaded / E85,fuel_type_Unleaded / Unleaded,engine_block_H,engine_block_I,engine_block_V
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52882.846,2016.6038,3.08504,0.0164,0.002,0.0164,0.0334,0.0146,0.0154,0.1164,...,0.0164,0.207,0.0006,0.0014,0.6644,0.0004,0.0004,0.0334,0.5002,0.4664
std,43252.89107,3.443428,1.332033,0.127021,0.044681,0.127021,0.179697,0.119957,0.12315,0.320736,...,0.127021,0.405196,0.02449,0.037394,0.472247,0.019998,0.019998,0.179697,0.50005,0.49892
min,1.0,1997.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22136.75,2015.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39361.5,2018.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,74299.5,2019.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
max,371298.0,2021.0,8.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1055)

In [16]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [17]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 10,   1,   0,   0,   0,   2,   0],
       [ 13,  83,   3,  12,   0,  29,   1],
       [  0,   0,   6,   0,   0,   0,   0],
       [  8,  73,  11, 253,  42,  17,   9],
       [  0,   4,  50,  92, 258,   1,  96],
       [ 27,  24,   0,   1,   0,  32,   0],
       [  0,   0,  15,   1,   4,   0,  72]], dtype=int64)

In [18]:
balanced_accuracy_score(y_test, y_pred)

0.6641435981452206

In [19]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.17      0.77      0.96      0.28      0.86      0.73        13
   10,000-14999       0.45      0.59      0.91      0.51      0.73      0.52       141
100,000-400,000       0.07      1.00      0.94      0.13      0.97      0.94         6
  15,000-24,999       0.70      0.61      0.87      0.66      0.73      0.52       413
  25,000-49,999       0.85      0.51      0.94      0.64      0.70      0.46       501
    5,000-9,999       0.40      0.38      0.96      0.39      0.60      0.34        84
  50,000-99,999       0.40      0.78      0.91      0.53      0.84      0.70        92

    avg / total       0.68      0.57      0.91      0.60      0.72      0.50      1250



In [20]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.0931624874173323, 'miles'),
 (0.09277516043061482, 'year'),
 (0.056888318216152284, 'engine_size'),
 (0.025508516788784522, 'fuel_type_Unleaded'),
 (0.024466945031446915, 'fuel_type_Premium Unleaded'),
 (0.023549805673164923, 'drivetrain_FWD'),
 (0.023279604358973117, 'engine_block_I'),
 (0.01999069741838262, 'drivetrain_4WD'),
 (0.017823215115976662, 'engine_block_V'),
 (0.0167972879687903, 'vehicle_type_Car'),
 (0.01628684721193326, 'body_type_SUV'),
 (0.01508326356520572, 'vehicle_type_Truck'),
 (0.012987466176202486, 'body_type_Sedan'),
 (0.012940519628616537, 'transmission_Manual'),
 (0.012770670479774583, 'body_type_Pickup'),
 (0.012308687304548559, 'transmission_Automatic'),
 (0.010938921833546413, 'drivetrain_RWD'),
 (0.010532545229530567, 'trim_SE'),
 (0.010342402126860882, 'make_Ford'),
 (0.010331416664744717, 'trim_Base'),
 (0.010276987796955487, 'make_Chevrolet'),
 (0.008785425007467863, 'make_Porsche'),
 (0.008684458636309731, 'make_Mercedes-Benz'),
 (0.008415126224107

Looking at the feature importances we can see that year and miles has the most influence over price, which is then followed by enginesize, fuel type, and the drivetrain. This makes sense as the year and miles on a car play a big part in deciding if the car is worth what they are selling it for. While this model is informatiive, it doesn't answer the question of what make has most influence on price.

# BRFC Focused on Make
The next model is a BRFC with the used car dataset, except this time the data is cleaned to focus on the make of the car. This means we drop some of the columns such as body_type, fuel_type and more, so the feature importances highlights the makes that influence price.

In [21]:
clean_df = pd.read_csv('make_model.csv')

In [22]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_size', 'engine_block'])

In [23]:
clean_df

Unnamed: 0,price,miles,year,make,model,trim
0,20998.0,115879.0,2015.0,Chevrolet,Express Cargo,Work Van
1,27921.0,7339.0,2018.0,BMW,i3,s
2,11055.0,39798.0,2018.0,Mitsubishi,Mirage G4,SE
3,52997.0,28568.0,2019.0,Chevrolet,Colorado,ZR2
4,3995.0,137537.0,2000.0,Dodge,Ram Pickup,ST
...,...,...,...,...,...,...
6167618,69900.0,15270.0,2019.0,Ford,F-250 Super Duty,Lariat
6167619,32991.0,143026.0,2011.0,Ford,F-250 Super Duty,King Ranch
6167620,82900.0,3686.0,2021.0,Ford,F-250 Super Duty,King Ranch
6167621,59995.0,39111.0,2019.0,Ford,F-250 Super Duty,XLT


In [24]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [25]:

make_df = clean_df.sample(n=5000)


In [26]:
make_df

Unnamed: 0,price,miles,year,make,model,trim
2113319,"5,000-9,999",130810.0,2012.0,MINI,Countryman,Countryman S
3229938,"50,000-99,999",29909.0,2019.0,Porsche,Macan,Base
5314150,"15,000-24,999",49940.0,2019.0,Acura,ILX,Base
5607317,"0-4,999",108000.0,2004.0,Toyota,Camry Solara,SE
3060696,"50,000-99,999",725.0,2020.0,Chevrolet,Silverado 3500HD,LTZ
...,...,...,...,...,...,...
302425,"25,000-49,999",24786.0,2017.0,BMW,3 Series,330i
2070644,"15,000-24,999",18605.0,2019.0,Ford,Escape,SEL
1896084,"100,000-400,000",7224.0,2020.0,Porsche,911,S
1405421,"25,000-49,999",32125.0,2016.0,Chevrolet,Silverado 1500,High Country


In [27]:
X = pd.get_dummies(make_df)
X = X.drop(columns= ["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [28]:
X.describe()

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Buick,make_Cadillac,...,trim_s Grand Touring,trim_s Touring,trim_sDrive28i,trim_sDrive35i,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive40e,trim_xDrive50i
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,52542.65,2016.609,0.0138,0.0014,0.0002,0.0196,0.0424,0.0002,0.0124,0.0168,...,0.0004,0.0004,0.0004,0.0006,0.0018,0.0026,0.0006,0.003,0.0004,0.0002
std,47234.64,3.442861,0.116672,0.037394,0.014142,0.138635,0.20152,0.014142,0.110674,0.128534,...,0.019998,0.019998,0.019998,0.02449,0.042392,0.050929,0.02449,0.054696,0.019998,0.014142
min,0.0,1997.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22106.75,2015.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,39572.0,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,72873.25,2019.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1534761.0,2021.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 1014)

In [30]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [31]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 10,   1,   0,   1,   0,   3,   0],
       [ 10,  68,   0,  10,   1,  21,   0],
       [  0,   1,   6,   0,   0,   0,   2],
       [  6, 117,  21, 195,  60,  22,   3],
       [  4,  33,  70,  79, 277,   4,  53],
       [ 27,  22,   0,   0,   0,  40,   1],
       [  0,   1,  25,   2,  18,   0,  36]], dtype=int64)

In [32]:
balanced_accuracy_score(y_test, y_pred)

0.5467974220390235

In [33]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.18      0.67      0.96      0.28      0.80      0.62        15
   10,000-14999       0.28      0.62      0.85      0.39      0.72      0.51       110
100,000-400,000       0.05      0.67      0.91      0.09      0.78      0.59         9
  15,000-24,999       0.68      0.46      0.89      0.55      0.64      0.39       424
  25,000-49,999       0.78      0.53      0.89      0.63      0.69      0.46       520
    5,000-9,999       0.44      0.44      0.96      0.44      0.65      0.40        90
  50,000-99,999       0.38      0.44      0.95      0.41      0.65      0.40        82

    avg / total       0.64      0.51      0.90      0.55      0.67      0.43      1250



In [34]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.1413487249648312, 'miles'),
 (0.11467768617881925, 'year'),
 (0.013648483866792802, 'trim_SE'),
 (0.013340708723422096, 'make_Ford'),
 (0.012324426686536527, 'trim_Base'),
 (0.01178028343903708, 'make_Toyota'),
 (0.011524149132563699, 'make_Chevrolet'),
 (0.011340267031042578, 'model_Range Rover'),
 (0.011218238188312288, 'model_Corvette'),
 (0.009966939576004136, 'make_Porsche'),
 (0.009692973877380946, 'trim_S'),
 (0.009578510636700406, 'make_Land Rover'),
 (0.009117939425286053, 'make_Honda'),
 (0.00877785061030197, 'make_Nissan'),
 (0.00877047986549571, 'model_911'),
 (0.00807464814081832, 'make_Hyundai'),
 (0.007822870204788314, 'make_GMC'),
 (0.007627705291700283, 'trim_Limited'),
 (0.0074573670954058355, 'trim_LX'),
 (0.007390047645730695, 'model_F-150'),
 (0.006870742842260516, 'make_Kia'),
 (0.006643072995855347, 'make_BMW'),
 (0.006572266455640206, 'make_Ferrari'),
 (0.0059485849892428775, 'make_Mercedes-Benz'),
 (0.005905072205538538, 'trim_SV'),
 (0.005651840395167623, 

# Luxury Makes
After the model focusing on makes, the next step was to process the data even more so that the makes are split on luxury and non luxury makes. This is because the pricing and quality of luxury makes create a bias when comparing to regular makes.

In [35]:
clean_df = pd.read_csv('luxury_makes.csv')

In [36]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [37]:
clean_df = clean_df.dropna()

In [38]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [39]:
clean_df['make'].unique()

array(['BMW', 'Mercedes-Benz', 'Lexus', 'Porsche', 'Audi', 'Lamborghini',
       'Alfa Romeo', 'Ferrari', 'Maserati', 'Jaguar', 'Aston Martin',
       'Bentley', 'Land Rover', 'Cadillac', 'Rolls-Royce', 'INFINITI',
       'Maybach', 'Acura', 'Fisker', 'McLaren', 'Lotus'], dtype=object)

In [40]:

make_df = clean_df.sample(n=5000)

In [66]:
X = pd.get_dummies(make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [42]:
X

Unnamed: 0,miles,year,make_Acura,make_Alfa Romeo,make_Aston Martin,make_Audi,make_BMW,make_Bentley,make_Cadillac,make_Ferrari,...,trim_xDrive28i,trim_xDrive30i,trim_xDrive35d,trim_xDrive35i,trim_xDrive35i Premium,trim_xDrive35i Sport Activity,trim_xDrive40e,trim_xDrive50i,transmission_Automatic,transmission_Manual
1093057,30088.0,2018.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
324562,19692.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
534750,140544.0,2013.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
86203,60792.0,2019.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
708631,40147.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664787,38662.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
716417,30540.0,2018.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
166774,70110.0,2016.0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1045002,48528.0,2015.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 706)

In [68]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [69]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  7,   0,   0,   0,   0,   5,   0],
       [  4,  48,   6,  27,   0,  63,  10],
       [  0,   0,   0,   0,   0,   0,   0],
       [  6,  44,  10, 303,   0,  31,  81],
       [  0,  15,  48, 191,   9,   4, 184],
       [ 19,   7,   2,   5,   0,  64,   0],
       [  0,   0,  12,   2,   0,   0,  43]], dtype=int64)

In [None]:
balanced_accuracy_score(y_test, y_pred)

In [None]:
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

# Non-Luxury Makes

In [50]:
clean_df = pd.read_csv('regular_makes.csv')

In [51]:
clean_df = clean_df.drop(columns=['id', 'vin', 'body_type', 'vehicle_type', 'drivetrain', 'fuel_type', 'engine_size', 'engine_block'])

In [52]:
clean_df = clean_df.dropna()

In [53]:
clean_df['make'].unique()

array(['Chevrolet', 'Mitsubishi', 'Dodge', 'RAM', 'Ford', 'Mercury',
       'GMC', 'smart', 'Jeep', 'Pontiac', 'Volvo', 'Scion', 'Buick',
       'Toyota', 'Lincoln', 'Honda', 'FIAT', 'Saturn', 'Oldsmobile',
       'Kia', 'Chrysler', 'Saab', 'Volkswagen', 'Isuzu', 'Subaru',
       'Am General', 'Hummer', 'Mazda', 'MINI', 'Hyundai', 'Nissan',
       'Suzuki', 'GENESIS', 'KARMA', 'Plymouth', 'Geo', 'Eagle'],
      dtype=object)

In [54]:
clean_df['price'] = pd.cut(clean_df['price'], bins, labels=labels )

In [55]:
make_df = clean_df.sample(n=5000)

In [56]:
X = pd.get_dummies(make_df)
X = X.drop(columns=["price_0-4,999", "price_5,000-9,999", "price_10,000-14999", "price_15,000-24,999", "price_25,000-49,999", "price_50,000-99,999", "price_100,000-400,000"])
y = make_df['price']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_train.shape

(3750, 706)

In [58]:
clf = BalancedRandomForestClassifier(random_state=1, n_estimators=100).fit(X_train, y_train)

In [59]:
# Display the confusion matrix
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[  7,   0,   0,   0,   0,   5,   0],
       [  4,  48,   6,  27,   0,  63,  10],
       [  0,   0,   0,   0,   0,   0,   0],
       [  6,  44,  10, 303,   0,  31,  81],
       [  0,  15,  48, 191,   9,   4, 184],
       [ 19,   7,   2,   5,   0,  64,   0],
       [  0,   0,  12,   2,   0,   0,  43]], dtype=int64)

In [60]:
balanced_accuracy_score(y_test, y_pred)



0.4931934953295225

In [61]:
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

        0-4,999       0.19      0.58      0.98      0.29      0.75      0.55        12
   10,000-14999       0.42      0.30      0.94      0.35      0.53      0.27       158
100,000-400,000       0.00      0.00      0.94      0.00      0.00      0.00         0
  15,000-24,999       0.57      0.64      0.71      0.60      0.67      0.45       475
  25,000-49,999       1.00      0.02      1.00      0.04      0.14      0.02       451
    5,000-9,999       0.38      0.66      0.91      0.48      0.78      0.59        97
  50,000-99,999       0.14      0.75      0.77      0.23      0.76      0.58        57

    avg / total       0.67      0.38      0.86      0.34      0.48      0.29      1250



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# List the features sorted in descending order by feature importance
sorted(zip(clf.feature_importances_, X.columns), reverse=True)

[(0.12026883950412143, 'miles'),
 (0.0924291199552008, 'year'),
 (0.030896044703387155, 'make_Ford'),
 (0.029959396335557064, 'trim_TRX'),
 (0.028965386192492094, 'make_RAM'),
 (0.02816962511000784, 'model_Ram 1500 Pickup'),
 (0.027229846092819473, 'make_Chevrolet'),
 (0.021679866856524916, 'trim_SE'),
 (0.020790691659625347, 'transmission_Automatic'),
 (0.015238254765756163, 'make_GMC'),
 (0.014223992794176889, 'trim_S'),
 (0.012606515025239897, 'trim_LT'),
 (0.01260172213846967, 'make_Honda'),
 (0.012392898244993551, 'make_Toyota'),
 (0.011774298529127926, 'model_F-150'),
 (0.011563381247827935, 'make_Nissan'),
 (0.01152612437447313, 'make_Dodge'),
 (0.010897343813664901, 'make_Hyundai'),
 (0.01071688181818662, 'trim_Base'),
 (0.009941603777124228, 'model_Camry'),
 (0.00990258470192944, 'make_Jeep'),
 (0.009129043177774352, 'trim_2LT'),
 (0.008947918619488399, 'trim_Sport'),
 (0.008944056606254578, 'model_Corvette'),
 (0.008776202050813274, 'transmission_Manual'),
 (0.008530112114456