In [1]:
import numpy as np
import pandas as pd
import scipy.stats as scs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, precision_score




def eval(model, X, y):
    y_prdict = model.predict(X)
    rsme = np.sqrt(mean_squared_error(y,y_prdict))
    
    #precision = precision_score(y,y_prdict)
    
    y_base = np.ones(len(y))*y.median()
    rmse_base = np.sqrt(mean_squared_error(y,y_base))
    
    print(model)
    
    print('R2 of model: {:2.2f}'.format(r2_score(y,y_prdict)))
    
    print('RMSE of model: {:2.2f}'.format(mean_squared_error(y,y_prdict)))
    print('Base RMSE: {}'.format(rmse_base))
    
    print('Normalized RSME (RMSE of predict/RMSE of base) : {:2.2f}'.format(rsme/rmse_base))
    return rsme

In [8]:
all_data = pd.read_csv('comb_all_data.csv')
print(len(all_data))
no_dups = all_data.drop_duplicates()
no_nan = no_dups.dropna()
print(len(no_dups))
print(len(no_nan))

config = no_nan.head(100000)
print(len(config))
config

1184606
1184606
1184606
100000


Unnamed: 0,Price,Make,Model,Body Style,City,State,Milage,Color,Age
0,49876,AC,Shelby Cobra,Convertible,Matthews,NC,0.000322,Red,0.500000
1,44999,AC,Shelby Cobra,Convertible,Nashville,TN,0.003643,Blue,0.500000
2,7500,Acura,Integra,Coupe,College Station,TX,0.045896,Red,0.241071
3,7500,Acura,Integra,Coupe,College Station,TX,0.045896,Black,0.241071
4,7500,Acura,Integra,Coupe,College Station,TX,0.045896,White,0.241071
...,...,...,...,...,...,...,...,...,...
99995,32699,Dodge,Charger,Sedan,Spartanburg,SC,0.000000,White,0.008929
99996,24190,Dodge,Charger,Sedan,Chandler,OK,0.000100,Black,0.008929
99997,18950,Dodge,Journey,SUV,Eden,NC,0.000002,Gray,0.008929
99998,17749,Dodge,Journey,SUV,Merrillville,IN,0.000004,Gray,0.008929


In [9]:
print(len(all_data))
no_dups = all_data.drop_duplicates()
no_nan = no_dups.dropna()
print(len(no_dups))
print(len(no_nan))

1184606
1184606
1184606


In [4]:
no_nan['Color'].unique()

array(['White', 'Black', 'Silver', 'Gray', 'Red', 'Other', 'Blue',
       'Green', 'Gold', 'Brown', 'Beige', 'Orange', 'Yellow', 'Purple',
       'Pink'], dtype=object)

In [10]:
dummies = config.select_dtypes('object').columns
y = config.pop('Price')
X = pd.get_dummies(config, columns = dummies, dummy_na = True, prefix=dummies)

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.20, random_state = 10)

# Gradient Boosting Regressor

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=1500, max_depth=5, verbose=1 , learning_rate=0.01)
gbr.fit(X_train, y_train)
eval(gbr, X_test,y_test)

      Iter       Train Loss   Remaining Time 
         1   390942857.9533           66.71m
         2   387555579.1339           66.40m
         3   384234500.5158           66.29m
         4   380976640.9717           66.25m
         5   377782613.4166           66.21m
         6   374648326.7073           66.17m
         7   371575084.2665           66.21m
         8   368583710.6922           66.16m
         9   365566735.4777           66.08m
        10   362656282.3098           66.06m
        20   335768965.3350           66.00m
        30   312998091.1099           65.84m
        40   293829787.3344           65.60m
        50   273689535.6558           65.51m
        60   258801758.1280           65.25m
        70   246543080.0053           64.89m
        80   235295177.9052           64.48m
        90   225470968.0755           64.01m
       100   217274611.9972           63.53m
       200   168498516.8583           57.32m
       300   143211175.5431           51.01m
       40

10778.075063127995

In [12]:
gbFI=pd.DataFrame({'feature': X.columns.values})
gbFI['weight']=gbr.feature_importances_
gbFI.sort_values(by='weight', ascending=False, inplace=True)
gbFI

Unnamed: 0,feature,weight
0,Milage,0.269005
1,Age,0.108113
9,Make_Bentley,0.061173
8,Make_BMW,0.059753
1526,Color_White,0.053612
...,...,...
616,City_Covina,0.000000
615,City_Courtland,0.000000
614,City_Cottonwood,0.000000
613,City_Cottondale,0.000000


In [13]:
y_train

31152     8995
33499     3995
18594    24000
97132    24700
56015    14999
         ...  
9372     21495
93553    27185
50496    13990
94735    17250
83209    22999
Name: Price, Length: 80000, dtype: int64

# Linear Regrssor

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

linear = LinearRegression(normalize = True)

linear.fit(X_train, y_train)
eval(linear, X_test,y_test)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
R2 of model: -481357502996395831559454720.00
RMSE of model: 187522749422631366650724586293821440.00
Base RMSE: 20113.33450881529
Normalized RSME (RMSE of predict/RMSE of base) : 21529944210704.40


4.3303896986602874e+17

# Random Forest

In [None]:
from sklearn.ensemble import  RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


rfr = RandomForestRegressor(max_depth=10, n_estimators=500)
rfr.fit(X_train, y_train)
eval(rfr, X_test,y_test)