In [1]:
import numpy as np
import pandas as pd
import scipy.stats as scs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, precision_score


In [2]:
 def eval(model, X, y):
    y_prdict = model.predict(X)
    rsme = np.sqrt(mean_squared_error(y,y_prdict))
    
    #precision = precision_score(y,y_prdict)
    
    y_base = np.ones(len(y))*y.median()
    rmse_base = np.sqrt(mean_squared_error(y,y_base))
    
    print(model)
    
    print('R2 of model: {:2.2f}'.format(r2_score(y,y_prdict)))
    
    print('RMSE of model: {:2.2f}'.format(rsme))
    print('Base RMSE: {}'.format(rmse_base))
    
    pass


In [4]:
## adding what type of model the car is would have inscreased by column count by an extra 20k
# I'm curiouse to see what these columns will do to my modeling
all_data = pd.read_csv('fiftyK.csv', low_memory=False)
all_data.head()

Unnamed: 0,Price,Make,Milage,Year,Body_Sytle,City,State
0,42990.0,Audi,51699.0,2018.0,Convertible,Tampa,FL
1,30221.0,Ford,36.0,2020.0,Crew Cab Pickup,Spartanburg,SC
2,19500.0,Dodge,37426.0,2019.0,Minivan,Franklin,TN
3,66245.0,Ford,12.0,2020.0,Crew Cab Pickup,Dickson City,PA
4,25060.0,Kia,0.0,2020.0,Sedan,DeLand,FL


In [4]:
all_data.dtypes

Price         float64
Make           object
Milage        float64
Year          float64
Body_Sytle     object
City           object
State          object
dtype: object

In [6]:
y = all_data.pop('Price')

dummies = all_data.select_dtypes('object').columns
X = pd.get_dummies(all_data, columns = dummies, dummy_na = True, prefix=dummies)

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.20)

In [7]:
X.dtypes

Milage             float64
Year               float64
Make_AC              uint8
Make_Acura           uint8
Make_Alfa Romeo      uint8
                    ...   
State_TN             uint8
State_VA             uint8
State_VT             uint8
State_WV             uint8
State_nan            uint8
Length: 1753, dtype: object

# Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

linear = LinearRegression(normalize = True)


## I think I should see a really good score...
model = linear.fit(X_train, y_train)
model.score(X_test, y_test)


-1.923700025346116e+26

In [9]:
eval(model, X_test,y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
R2 of model: -192370002534611605354708992.00
RMSE of model: 55731267473477670296321136342859776.00
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 13680336273070.03


2.36074707398903e+17

# Ridge

In [10]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0, normalize=True)
ridge.fit(X_train,y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)

In [11]:
eval(ridge, X_test,y_test)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)
R2 of model: 0.47
RMSE of model: 152825514.00
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 0.72


12362.26168623251

# Lasso

In [12]:
from sklearn import linear_model

lasso = linear_model.Lasso(alpha=0.1, normalize = True)

lasso.fit(X_train,y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [13]:
eval(lasso, X_test,y_test)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)
R2 of model: 0.43
RMSE of model: 165647292.82
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 0.75


12870.40375520464

# RandomForest Regressor

In [14]:
from sklearn.ensemble import  RandomForestRegressor

rfr = RandomForestRegressor(max_depth=10, n_estimators=200)
rfr.fit(X_train, y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [15]:
eval(rfr, X_test,y_test)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
R2 of model: 0.49
RMSE of model: 148675480.61
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 0.71


12193.255537705456

In [20]:
rfr = RandomForestRegressor(max_depth=30, n_estimators=300)
rfr.fit(X_train, y_train)
eval(rfr, X_test,y_test)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=30, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
R2 of model: 0.68
RMSE of model: 93075522.61
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 0.56


9647.56563128475

# Gradient Boosting Regressor

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=2000, max_depth=10, verbose=1 , learning_rate=0.05)
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1  1092204312.2259           86.22m
         2  1001012518.9843           86.14m
         3   918899455.7291           86.11m
         4   844518107.8124           86.13m
         5   777076826.1748           85.95m
         6   716382577.9894           86.02m
         7   660921665.5797           86.00m
         8   610965664.4752           86.11m
         9   565938947.6760           86.16m
        10   524885582.4557           86.19m
        20   272110304.7371           85.69m
        30   168901674.7730           84.65m
        40   124356659.8413           82.26m
        50   102246856.6666           80.89m
        60    89681880.9327           79.69m
        70    81737297.1173           78.27m
        80    76226330.1038           77.13m
        90    72504207.2916           76.10m
       100    69259859.6375           75.12m
       200    57231309.0170           68.66m
       300    53520704.5195           64.05m
       40

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls',
                          max_depth=10, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)

In [18]:
eval(gbr, X_test,y_test)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.05, loss='ls',
                          max_depth=10, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=1, warm_start=False)
R2 of model: 0.67
RMSE of model: 97010741.21
Base RMSE: 17256.49886718209
Normalized RSME (RMSE of predict/RMSE of base) : 0.57


9849.403088948995

In [19]:
gbFI=pd.DataFrame({'feature': X.columns.values})
gbFI['weight']=gbr.feature_importances_
gbFI.sort_values(by='weight', ascending=False, inplace=True)
gbFI

Unnamed: 0,feature,weight
32,Make_Land Rover,0.235816
1,Year,0.215832
0,Milage,0.196652
515,City_Edison,0.098071
9,Make_Bugatti,0.075183
...,...,...
604,City_Franklin Square,0.000000
1335,City_Royersford,0.000000
618,City_Gaffney,0.000000
623,City_Garden City,0.000000


# Tensor Flow

In [None]:
from tensorflow import keras
from 