In [1]:
import numpy as np
import pandas as pd
import scipy.stats as scs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, precision_score

In [2]:
 def eval(model, X, y):
    y_prdict = model.predict(X)
    rsme = np.sqrt(mean_squared_error(y,y_prdict))
    
    #precision = precision_score(y,y_prdict)
    
    y_base = np.ones(len(y))*y.median()
    rmse_base = np.sqrt(mean_squared_error(y,y_base))
    
    print(model)
    
    print('R2 of model: {:2.2f}'.format(r2_score(y,y_prdict)))
    
    print('RMSE of model: {:2.2f}'.format(mean_squared_error(y,y_prdict)))
    print('Base RMSE: {}'.format(rmse_base))
    
    print('Normalized RSME (RMSE of predict/RMSE of base) : {:2.2f}'.format(rsme/rmse_base))
    return rsme

In [3]:
all_data = pd.read_csv('all_data_clean.csv', low_memory=False)
all_data.head()

Unnamed: 0,Price,Make,Model,Year,Body_Style,City,State,Milage
0,17985,Toyota,86,2017,Coupe,Triadelphia,WV,32817
1,18470,Toyota,86,2017,Coupe,Stone Mountain,GA,34120
2,19400,Toyota,86,2017,Coupe,Glen Burnie,MD,33467
3,19490,Toyota,86,2017,Coupe,North Wilkesboro,NC,56212
4,20000,Toyota,86,2017,Coupe,East Stroudsburg,PA,37486


In [4]:
all_data.dtypes

Price          int64
Make          object
Model         object
Year          object
Body_Style    object
City          object
State         object
Milage         int64
dtype: object

In [4]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987554 entries, 0 to 987553
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Price       987554 non-null  int64 
 1   Make        987554 non-null  object
 2   Model       987554 non-null  object
 3   Year        987554 non-null  object
 4   Body_Style  986839 non-null  object
 5   City        986842 non-null  object
 6   State       987552 non-null  object
 7   Milage      987554 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 60.3+ MB


In [5]:
y = all_data.pop('Price')

dummies = all_data.select_dtypes('object').columns
X = pd.get_dummies(all_data, columns = dummies, dummy_na = True, prefix=dummies)

X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.20)
len(X_train)

790043

In [7]:
X_train.head()

Unnamed: 0,Milage,Make_AC,Make_Acura,Make_Alfa Romeo,Make_Am General,Make_American Motors,Make_Aston Martin,Make_Audi,Make_Austin-Healey,Make_BMW,...,State_PA,State_PR,State_RI,State_SC,State_TN,State_VA,State_VI,State_VT,State_WV,State_nan
7823,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
203793,33136,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
875307,41733,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
512922,28339,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
879522,3948,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Linear Modeling

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

linear = LinearRegression(normalize = True)

model = linear.fit(X_train, y_train)

In [9]:
eval(model, X_test,y_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
R2 of model: -55989033157250816279576576.00
RMSE of model: 38055603347263955523670069710159872.00
Base RMSE: 26359.52446642024
Normalized RSME (RMSE of predict/RMSE of base) : 7400681851732.11


1.9507845433892477e+17

# Ridge

In [10]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0, normalize=True)
ridge.fit(X_train,y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)

In [11]:
eval(ridge, X_test,y_test)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=True,
      random_state=None, solver='auto', tol=0.001)
R2 of model: 0.44
RMSE of model: 379328636.37
Base RMSE: 26359.52446642024
Normalized RSME (RMSE of predict/RMSE of base) : 0.74


19476.36096332305

# Lasso

In [12]:
from sklearn import linear_model

lasso = linear_model.Lasso(alpha=0.1, normalize = True)

lasso.fit(X_train,y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)

In [13]:
eval(lasso, X_test,y_test)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000, normalize=True,
      positive=False, precompute=False, random_state=None, selection='cyclic',
      tol=0.0001, warm_start=False)
R2 of model: 0.51
RMSE of model: 333694914.15
Base RMSE: 26359.52446642024
Normalized RSME (RMSE of predict/RMSE of base) : 0.69


18267.318198214496

# Random Forest Regressor

In [6]:
from sklearn.ensemble import  RandomForestRegressor

rfr = RandomForestRegressor(max_depth=30, n_estimators=400, verbose=1=)
rfr.fit(X_train, y_train)
eval(rfr, X_test,y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(n_estimators=2000, max_depth=10, verbose=1 , learning_rate=0.01)
gbr.fit(X_train, y_train)
eval(gbr, X_test,y_test)

In [None]:
gbFI=pd.DataFrame({'feature': X.columns.values})
gbFI['weight']=gbr.feature_importances_
gbFI.sort_values(by='weight', ascending=False, inplace=True)
gbFI