In [43]:
import numpy as np
import pandas as pd
import scipy.stats as scs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, precision_score




def eval(model, X, y):
    y_prdict = model.predict(X)
    rsme = np.sqrt(mean_squared_error(y,y_prdict))
    
    #precision = precision_score(y,y_prdict)
    
    y_base = np.ones(len(y))*y.median()
    rmse_base = np.sqrt(mean_squared_error(y,y_base))
    
    print(model)
    
    print('R2 of model: {:2.2f}'.format(r2_score(y,y_prdict)))
    
    print('RMSE of model: {:2.2f}'.format(mean_squared_error(y,y_prdict)))
    print('Base RMSE: {}'.format(rmse_base))
    
    print('Normalized RSME (RMSE of predict/RMSE of base) : {:2.2f}'.format(rsme/rmse_base))
    return rsme

In [44]:
all_data = pd.read_csv('comb_all_data.csv')
print(len(all_data))
no_dups = all_data.drop_duplicates()
no_nan = no_dups.dropna()
print(len(no_dups))
print(len(no_nan))
no_nan = no_nan.sample(frac=1).reset_index(drop=True)
no_nan = no_nan.dropna()
config = no_nan.head(100000)
print(len(config))
config

1184606
1184606
1184606
100000


Unnamed: 0,Price,Make,Model,Body Style,City,State,Milage,Color,Age
0,24990,Chevrolet,TrailBlazer,SUV,BROOKLYN CENTER,MN,4.199617e-06,Blue,0.000000
1,19500,Chevrolet,Avalanche,Crew Cab Pickup,Carmel,IN,2.941412e-02,Silver,0.089286
2,14990,Toyota,Corolla,Sedan,Warr Acres,OK,9.564029e-03,Red,0.017857
3,52980,Dodge,Charger,Sedan,Commerce,CA,9.410143e-04,Black,0.044643
4,65725,Land Rover,Discovery,SUV,Mt Kisco,NY,3.509680e-05,Blue,0.008929
...,...,...,...,...,...,...,...,...,...
99995,59030,RAM,1500,Crew Cab Pickup,Nappanee,IN,2.999727e-07,Blue,0.008929
99996,35743,Chevrolet,Traverse,SUV,Feasterville-Trevose,PA,7.298635e-03,White,0.017857
99997,44995,Ford,F-150,Crew Cab Pickup,Pikeville,KY,8.444531e-03,Black,0.026786
99998,16998,Hyundai,Sonata,Sedan,Canoga Park,CA,9.903298e-03,White,0.035714


In [45]:
no_nan['Color'].unique()

array(['Blue', 'Silver', 'Red', 'Black', 'White', 'Gray', 'Orange',
       'Brown', 'Beige', 'Yellow', 'Purple', 'Other', 'Green', 'Gold',
       'Pink'], dtype=object)

In [46]:
dummies = config.select_dtypes('object').columns
y = config.pop('Price')
X = pd.get_dummies(config, columns = dummies, dummy_na = True, prefix=dummies)

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.10, random_state = 10)

In [47]:
#print X-train for make sure it doesn't have price
#X_train['Price'] error ---> no price in it

#double check Milage
X_train['Milage']

40051    0.017965
99400    0.005700
38475    0.001577
82212    0.025715
59706    0.000002
           ...   
9372     0.011830
93553    0.000003
50496    0.004348
94735    0.000003
83209    0.000001
Name: Milage, Length: 90000, dtype: float64

# Gradient Boosting Regressor

In [48]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=1500, max_depth=5, verbose=1 , learning_rate=0.01)
gbr.fit(X_train, y_train)
eval(gbr, X_test,y_test)

      Iter       Train Loss   Remaining Time 
         1   379871991.1974          158.19m
         2   376145441.1458          157.31m
         3   372467441.7417          157.08m
         4   368886389.2870          156.87m
         5   365375845.5860          156.54m
         6   361909939.4781          156.50m
         7   358525562.8446          156.54m
         8   355189452.5520          156.39m
         9   351907232.8532          156.55m
        10   348689939.7801          156.66m
        20   319862069.3715          156.23m
        30   295894385.7100          155.17m
        40   275874415.1116          154.07m
        50   258828562.4308          153.21m
        60   244216292.5523          152.36m
        70   231689971.3959          151.45m
        80   220934718.0616          150.42m
        90   211271827.2413          149.03m
       100   203178018.0063          147.63m
       200   152360013.7602          131.81m
       300   129591035.1554          115.91m
       40

8396.158346286451

In [49]:
gbFI=pd.DataFrame({'feature': X.columns.values})
gbFI['weight']=gbr.feature_importances_
gbFI.sort_values(by='weight', ascending=False, inplace=True)
gbFI

Unnamed: 0,feature,weight
0,Milage,0.280601
17,Make_Ferrari,0.093255
1026,Body Style_Crew Cab Pickup,0.071274
1,Age,0.061905
48,Make_Porsche,0.051841
...,...,...
1178,City_Beloit,0.000000
1179,City_Belton,0.000000
1180,City_Belvidere,0.000000
1181,City_Bensalem,0.000000


In [50]:
y_train

40051     9800
99400    28590
38475    30289
82212    31960
59706    50167
         ...  
9372     15199
93553    25980
50496    32999
94735    53565
83209    20476
Name: Price, Length: 90000, dtype: int64

# Linear Regrssor

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

linear = LinearRegression(normalize = True)

linear.fit(X_train, y_train)
eval(linear, X_test,y_test)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
R2 of model: -4481714632736346427686387712.00
RMSE of model: 1595129406083560880867481268986576896.00
Base RMSE: 19298.145644594973
Normalized RSME (RMSE of predict/RMSE of base) : 65445890429644.88


1.262984325351491e+18

# Random Forest

In [52]:
from sklearn.ensemble import  RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


rfr = RandomForestRegressor(max_depth=10, n_estimators=500)
rfr.fit(X_train, y_train)
eval(rfr, X_test,y_test)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
R2 of model: 0.60
RMSE of model: 140828582.32
Base RMSE: 19298.145644594973
Normalized RSME (RMSE of predict/RMSE of base) : 0.61


11867.12190566267

In [53]:
rfrFI=pd.DataFrame({'feature': X.columns.values})
rfrFI['weight']=rfr.feature_importances_
rfrFI.sort_values(by='weight', ascending=False, inplace=True)
rfrFI

Unnamed: 0,feature,weight
0,Milage,0.325294
17,Make_Ferrari,0.110458
1026,Body Style_Crew Cab Pickup,0.090525
1,Age,0.070885
48,Make_Porsche,0.062745
...,...,...
1196,City_Bethel Park,0.000000
1197,City_Bettendorf,0.000000
1203,City_Bixby,0.000000
1205,City_Blacksburg,0.000000


In [56]:
#save models as pkl for later use
from sklearn.externals import joblib

joblib.dump(rfr, 'rfr.pkl')
joblib.dump(gbr, 'gbr.pkl') 

['gbr.pkl']