In [2]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [3]:
# Load the vehicle listings from the CSV that sits next to the notebook.
vehicles_csv = 'clean_vehicles.csv'
dt = pd.read_csv(vehicles_csv)

In [4]:
# Preview the first five rows of the freshly loaded frame.
dt.head(n=5)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,image_url,description,state,lat,long,posting_date
0,7316814884,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,180874.674367,2014.0,gmc,sierra 1500 crew cab slt,good,8 cylinders,gas,57923.0,clean,other,3GTP1VEC4EG551563,fwd,full-size,pickup,white,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,2021-05-04T12:31:18-0500
1,7316814758,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,180874.674367,2010.0,chevrolet,silverado 1500,good,8 cylinders,gas,71229.0,clean,other,1GCSCSE06AZ123805,fwd,full-size,pickup,blue,https://images.craigslist.org/00R0R_lwWjXSEWNa...,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,2021-05-04T12:31:08-0500
2,7316814989,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,180874.674367,2020.0,chevrolet,silverado 1500 crew,good,8 cylinders,gas,19160.0,clean,other,3GCPWCED5LG130317,fwd,full-size,pickup,red,https://images.craigslist.org/01212_jjirIWa0y0...,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,2021-05-04T12:31:25-0500
3,7316743432,https://auburn.craigslist.org/ctd/d/auburn-uni...,auburn,https://auburn.craigslist.org,180874.674367,2017.0,toyota,tundra double cab sr,good,8 cylinders,gas,41124.0,clean,other,5TFRM5F17HX120972,fwd,full-size,pickup,red,https://images.craigslist.org/00x0x_1y9kIOzGCF...,Carvana is the safer way to buy a car During t...,al,32.59,-85.48,2021-05-04T10:41:31-0500
4,7316356412,https://auburn.craigslist.org/cto/d/auburn-uni...,auburn,https://auburn.craigslist.org,27910.346076,2013.0,ford,f-150 xlt,excellent,6 cylinders,gas,128000.0,clean,automatic,missing,rwd,full-size,truck,black,https://images.craigslist.org/00404_l4loxHvdQe...,2013 F-150 XLT V6 4 Door. Good condition. Leve...,al,32.592,-85.5189,2021-05-03T14:02:03-0500


In [5]:
# Columns that are not carried forward into the feature table.
unused_columns = [
    'url', 'region', 'region_url', 'model', 'VIN',
    'image_url', 'description', 'posting_date', 'state', 'paint_color',
]
dt = dt.drop(columns=unused_columns)

In [6]:
# Confirm which columns are left after the drop.
dt.head(n=5)

Unnamed: 0,id,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,lat,long
0,7316814884,180874.674367,2014.0,gmc,good,8 cylinders,gas,57923.0,clean,other,fwd,full-size,pickup,32.59,-85.48
1,7316814758,180874.674367,2010.0,chevrolet,good,8 cylinders,gas,71229.0,clean,other,fwd,full-size,pickup,32.59,-85.48
2,7316814989,180874.674367,2020.0,chevrolet,good,8 cylinders,gas,19160.0,clean,other,fwd,full-size,pickup,32.59,-85.48
3,7316743432,180874.674367,2017.0,toyota,good,8 cylinders,gas,41124.0,clean,other,fwd,full-size,pickup,32.59,-85.48
4,7316356412,27910.346076,2013.0,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,32.592,-85.5189


## Scaling Year using MinMaxScaler

In [7]:
# Min-max scaler with its default output range spelled out.
scaler = MinMaxScaler(feature_range=(0, 1))

In [8]:
# Replace the raw `year` values with their min-max scaled version.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split made later in the notebook.
year_scaled = scaler.fit_transform(dt[['year']])
dt['year'] = year_scaled

In [9]:
# Check that `year` now holds scaled values.
dt.head(n=5)

Unnamed: 0,id,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,lat,long
0,7316814884,180874.674367,0.934426,gmc,good,8 cylinders,gas,57923.0,clean,other,fwd,full-size,pickup,32.59,-85.48
1,7316814758,180874.674367,0.901639,chevrolet,good,8 cylinders,gas,71229.0,clean,other,fwd,full-size,pickup,32.59,-85.48
2,7316814989,180874.674367,0.983607,chevrolet,good,8 cylinders,gas,19160.0,clean,other,fwd,full-size,pickup,32.59,-85.48
3,7316743432,180874.674367,0.959016,toyota,good,8 cylinders,gas,41124.0,clean,other,fwd,full-size,pickup,32.59,-85.48
4,7316356412,27910.346076,0.92623,ford,excellent,6 cylinders,gas,128000.0,clean,automatic,rwd,full-size,truck,32.592,-85.5189


## Label-encoding the categorical columns: manufacturer, condition, cylinders, fuel, title_status, transmission, drive, size, type

In [10]:
# Categorical columns that get label-encoded further down.
cols = [
    'manufacturer', 'condition', 'cylinders',
    'fuel', 'title_status', 'transmission',
    'drive', 'size', 'type',
]

# One line per column: how many distinct values it holds.
for name, n_unique in dt[cols].nunique().items():
    print(f'{name} --> {n_unique}')

manufacturer --> 41
condition --> 6
cylinders --> 8
fuel --> 5
title_status --> 6
transmission --> 3
drive --> 3
size --> 4
type --> 13


In [11]:
# Encoder instance reused by the label-encoding loop in the next cell.
labeler = LabelEncoder()

In [12]:
# LabelEncoder works on a 1-D array of labels, so hand it the column as a
# Series (dt[col]) rather than a one-column DataFrame (dt[[col]]); the 2-D
# form only works through an implicit reshape that emits a warning.
label_classes = {}
for col in cols:
    dt[col] = labeler.fit_transform(dt[col])
    # The shared encoder is refitted on every pass, so keep each column's
    # classes: label_classes[col][code] gives back the original label.
    label_classes[col] = labeler.classes_

In [13]:
# Check that the categorical columns now hold integer codes.
dt.head(n=5)

Unnamed: 0,id,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,lat,long
0,7316814884,180874.674367,0.934426,14,2,6,2,57923.0,0,2,1,1,8,32.59,-85.48
1,7316814758,180874.674367,0.901639,7,2,6,2,71229.0,0,2,1,1,8,32.59,-85.48
2,7316814989,180874.674367,0.983607,7,2,6,2,19160.0,0,2,1,1,8,32.59,-85.48
3,7316743432,180874.674367,0.959016,38,2,6,2,41124.0,0,2,1,1,8,32.59,-85.48
4,7316356412,27910.346076,0.92623,13,0,5,2,128000.0,0,0,2,1,10,32.592,-85.5189


# Random Forest feature importance

In [14]:
# (rows, columns) of the frame at this point in the notebook.
dt.shape

(383487, 15)

In [15]:
def feature_selection(data, n_iterations=100, n_estimators=10, n_jobs=-1, random_state=None):
    """Rank the columns of ``data`` by random-forest importance against random labels.

    Each iteration draws one random integer label per row (uniform over
    ``[0, len(data))``), fits a ``RandomForestClassifier`` to predict those
    labels from ``data`` and accumulates ``feature_importances_``. The
    importances are then averaged over all iterations.

    NOTE(review): the labels are pure noise, so the scores cannot measure a
    relationship to any real target; presumably they mostly reflect how many
    distinct split points each column offers -- confirm this is the intent.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature table; every column is ranked.
    n_iterations : int, default 100
        Number of independent label draws / forests to average over.
    n_estimators : int, default 10
        Trees per forest.
    n_jobs : int or None, default -1
        Parallel jobs handed to the forest (-1 means all processors). It
        affects only speed, not the result.
    random_state : int or None, default None
        Seed for both the random labels and the forests. With ``None`` the
        function draws from NumPy's global generator, as it always did.

    Returns
    -------
    list of (column name, average importance) tuples, highest importance first.

    Raises
    ------
    ValueError
        If ``n_iterations`` is smaller than 1 (the average would be 0/0).
    """
    if n_iterations < 1:
        raise ValueError(f'n_iterations must be a positive integer, got {n_iterations!r}')

    n_rows = len(data)
    # Seeded runs use a private RandomState; unseeded runs keep using the
    # global NumPy generator so existing callers see no change.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    importance_sum = np.zeros(data.shape[1])
    for _ in range(n_iterations):
        random_labels = rng.randint(0, n_rows, size=n_rows)
        # Derive a per-forest seed so a seeded run is repeatable end to end.
        forest_seed = None if random_state is None else int(rng.randint(0, 2**31 - 1))

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            n_jobs=n_jobs,
            random_state=forest_seed,
        )
        rf.fit(data, random_labels)

        importance_sum += rf.feature_importances_

    avg_importance = importance_sum / n_iterations

    return sorted(
        zip(list(data.columns), avg_importance),
        key=lambda pair: pair[1],
        reverse=True,
    )

In [16]:
# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
dt_train, dt_test = train_test_split(
    dt,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Row/column counts of the training and test splits, in that order.
for split in (dt_train, dt_test):
    print(split.shape)

(287615, 15)
(95872, 15)


In [18]:
# Skip the first column (positional slice) and run a single round on the test split.
candidate_features = dt_test.iloc[:, 1:]
feature_importances = feature_selection(candidate_features, n_iterations=1)

In [19]:
# One (column, importance) pair per line, in the order returned.
for name, score in feature_importances:
    print((name, score))

('odometer', 0.2082950467068729)
('lat', 0.18672127488614793)
('long', 0.18662450097696767)
('year', 0.1340447120256896)
('manufacturer', 0.09996135786509319)
('size', 0.03852721862934799)
('cylinders', 0.026785944291381005)
('condition', 0.02234244217426378)
('drive', 0.02129349729558164)
('price', 0.02118579882858278)
('type', 0.020777713965165846)
('transmission', 0.015602771089974167)
('fuel', 0.01313232161847703)
('title_status', 0.004705399646454622)
