In [1]:
import category_encoders as ce
from sklearn.pipeline import make_pipeline 
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import r2_score
from pdpbox.pdp import pdp_isolate, pdp_plot
from pdpbox.pdp import pdp_interact,pdp_interact_plot
import eli5
from eli5.sklearn import PermutationImportance
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import shap
from joblib import dump
from joblib import load
%config IPCompleter.greedy=True

In [2]:
plt.rcParams['figure.dpi'] = 72
df1 = pd.read_csv('vehicles_trimmed1.csv')
df2 = df1
df2['price'] = df2['price'].replace(0,np.NaN)
df2 = df2.dropna(subset=['price'])
df2 = df2[(df2['price'] >= np.percentile(df2['price'], 0.05)) &
          (df2['price'] <= np.percentile(df2['price'], 99.95))]
train_orig, test = train_test_split(df2.drop(columns=['county','Unnamed: 0']) , test_size = 0.1, train_size = 0.9)
train, val = train_test_split(train_orig, train_size = 0.9, test_size = 0.1)
train

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,image_url,state,lat,long
272270,7042969687,western massachusetts,1900.0,2009.0,hyundai,sonata,good,6 cylinders,gas,176000.0,clean,automatic,fwd,,sedan,blue,https://images.craigslist.org/00E0E_4t4u7wuVmd...,ma,42.0853,-72.5584
270916,7036736893,south coast,9995.0,2006.0,,hummer h3 luxury,excellent,5 cylinders,gas,113000.0,clean,automatic,4wd,mid-size,SUV,white,https://images.craigslist.org/00M0M_yUxa9GW5ZF...,ma,41.6409,-70.8825
139376,7045087645,fort collins / north CO,15800.0,2015.0,honda,cr-v,,,gas,118982.0,clean,automatic,,,,,https://images.craigslist.org/01515_gLvDR8klwc...,co,40.3907,-104.7550
94856,7047539800,modesto,39980.0,2016.0,ford,e-350,,,gas,23857.0,clean,automatic,,,other,white,https://images.craigslist.org/00Y0Y_4e3o3tTAU1...,ca,38.7966,-121.2160
17778,7044596043,bellingham,16999.0,2014.0,audi,a6,good,4 cylinders,gas,77968.0,clean,automatic,,,sedan,grey,https://images.craigslist.org/00z0z_2iQJmkkyJB...,wa,48.7548,-122.4700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72620,7040313035,fayetteville,33995.0,2018.0,ram,5500 regular cab 2wd,good,,diesel,99881.0,clean,automatic,,,other,white,https://images.craigslist.org/00r0r_33hP2dZ40t...,ar,36.2392,-94.1376
449238,7046639948,williamsport,4995.0,2004.0,toyota,prius,excellent,4 cylinders,hybrid,157000.0,clean,automatic,fwd,compact,sedan,blue,https://images.craigslist.org/00p0p_4ymIVbeuXo...,pa,40.9044,-76.7908
154216,7038913554,delaware,8600.0,1979.0,chevrolet,camaro,excellent,8 cylinders,gas,500000.0,clean,automatic,rwd,full-size,coupe,silver,https://images.craigslist.org/01111_c8LjiU0N32...,de,38.5734,-75.3263
456722,7042943020,columbia,1900.0,2005.0,,Scion xa,,,gas,,clean,automatic,,,,,https://images.craigslist.org/00k0k_9FNce5AWhN...,sc,34.0138,-81.0437


In [3]:
## Final model for app usage

target = 'price'
features1 = ['year', 'manufacturer', 'cylinders','fuel', 'odometer']
X_train1 = train[features1]
y_train = train[target]
X_val1 = val[features1]
y_val = val[target]
X_test1 = test[features1]
y_test = test[target]

In [9]:
pipeline1 = make_pipeline(
    ce.TargetEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestRegressor(max_depth=30,
                                      n_estimators=50, n_jobs=-3,
                                        random_state=30)
)
pipeline1.fit(X_train1, y_train)

y_pred = pipeline1.predict(X_val1)
print('MAE:', mean_absolute_error(y_val, y_pred))
print('R^2:', r2_score(y_val, y_pred))

MAE: 2886.2397449386103
R^2: 0.7651232850307356


In [5]:
# dump(pipeline1, 'pipeline_drive.joblib', compress=True)

In [14]:
X_train1['manufacturer'].unique()

array(['gmc', 'volvo', 'chevrolet', 'ford', 'mercedes-benz', 'kia',
       'jeep', 'mazda', 'acura', 'ram', 'bmw', 'hyundai', 'chrysler',
       'honda', 'subaru', nan, 'volkswagen', 'toyota', 'nissan',
       'infiniti', 'lincoln', 'mini', 'audi', 'dodge', 'cadillac',
       'buick', 'lexus', 'pontiac', 'rover', 'ferrari', 'fiat', 'saturn',
       'mitsubishi', 'harley-davidson', 'mercury', 'jaguar', 'alfa-romeo',
       'porche', 'tesla', 'datsun', 'aston-martin', 'land rover',
       'morgan', 'hennessey'], dtype=object)