In [101]:
import pandas as pd
from pathlib import Path

In [102]:
# Notebook-wide settings, read by the sampling and train/test split cells below.
random_state = 1024  # seed shared by DataFrame.sample and train_test_split
use_sample = True    # True: work on a 10% random sample; False: use every row

In [103]:
# Load the vehicles table, preferring the cached Feather copy when it exists.
path_data = Path('data/vehicles.feather')
if path_data.exists():
    df_cars = pd.read_feather(path_data)
else:
    # First run: parse the CSV once, then save it as Feather so that
    # later runs can load it faster.
    df_cars = pd.read_csv("data/vehicles.csv")
    df_cars.to_feather(path_data)

In [104]:
# Count missing values per column (axis=0) to decide which columns are usable.
df_cars.isna().sum(axis=0)

id                   0
url                  0
region               0
region_url           0
price                0
year              1205
manufacturer     17646
model             5277
condition       174104
cylinders       177678
fuel              3013
odometer          4400
title_status      8242
transmission      2556
VIN             161042
drive           130567
size            306361
type             92858
paint_color     130203
image_url           68
description         70
county          426880
state                0
lat               6549
long              6549
posting_date        68
dtype: int64

Because many columns have a large number of missing values, we keep only a handful of columns that are nearly complete.

In [105]:
# Keep the listing id, the four predictor columns and the target (price).
cols = ['id', 'year', 'fuel', 'odometer', 'transmission', 'price']
df_cars = df_cars.loc[:, cols]

In [106]:
# Discard every row that has a missing value in any of the kept columns.
df_cars = df_cars.dropna(axis=0, how='any')

In [107]:
# Work on a reproducible 10% sample when use_sample is set; otherwise take an
# independent copy of the full frame so df_cars itself is left untouched.
df = (
    df_cars.sample(frac=.1, random_state=random_state)
    if use_sample
    else df_cars.copy()
)

Ensure that the ids are unique in the dataset.

If they are not, keep only the first row for each id.

In [108]:
# Guarantee one row per listing id.
# An explicit check replaces the previous assert-inside-bare-except, which
# swallowed every exception and would be skipped entirely under `python -O`.
if not df['id'].is_unique:
    print("there are multiple rows for the same ID")
    print("selecting the first row for each ID")
    # drop_duplicates returns a new frame, so the result must be re-bound to
    # df; the earlier groupby(...).reset_index(inplace=True) call modified a
    # temporary object and left df unchanged.
    df = df.drop_duplicates(subset='id', keep='first')

Convert all string dtypes to categorical (currently disabled: the cell below is commented out).

In [109]:
# NOTE(review): this whole cell is commented out, so no conversion to
# categorical dtype is performed. Kept for reference only; delete it or
# re-enable it deliberately.
# from pandas.api.types import is_string_dtype
# for col in df.columns:
#     if is_string_dtype(df[col]):
#         df[col] = pd.Categorical(df[col])
# df.dtypes

Split into train and test sets

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
# Modelling schema: the column to predict and the two groups of predictors.
target = 'price'
numeric = ['year', 'odometer']
categorical = ['fuel', 'transmission']

In [118]:
# Cast every categorical column to str.
# NOTE(review): presumably so DictVectorizer one-hot encodes them - confirm.
df[categorical] = df[categorical].astype(dict.fromkeys(categorical, str))

In [119]:
# Reproducible 75% / 25% split into train and test rows.
dftrain, dftest = train_test_split(df, random_state=random_state, train_size=.75)

# Feature frames (numeric + categorical columns only) and target arrays.
xtrain, ytrain = dftrain[numeric + categorical], dftrain[target].values
xtest, ytest = dftest[numeric + categorical], dftest[target].values

# Kept as the last expression so the notebook actually displays the shapes;
# as a mid-cell expression the value was evaluated and silently discarded.
dftest.shape, dftrain.shape

Fit a DictVectorizer to one-hot encode the categorical columns (numeric values are passed through unchanged)

In [126]:
from sklearn.feature_extraction import DictVectorizer


In [127]:
# Vectorize the predictor columns only. Every key of a record dict becomes a
# feature, so passing the full dftrain/dftest rows would put the target
# (price) and the row id into the feature matrix, i.e. target leakage.
dv = DictVectorizer()

# Fit the vocabulary on the training rows only.
train_dicts = dftrain[numeric + categorical].to_dict(orient='records')
xtrain = dv.fit_transform(train_dicts)

# Re-use the fitted vocabulary for the test rows (transform, never re-fit).
test_dicts = dftest[numeric + categorical].to_dict(orient='records')
xtest = dv.transform(test_dicts)

Train Models

In [128]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb