In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train and test splits from CSV (file names suggest they were
# preprocessed upstream — confirm against the preprocessing notebook).
train = pd.read_csv('preprocessedtraindata.csv')
test = pd.read_csv('preprocessedtestdata.csv')

In [3]:
# Separate the 'Price' target column from the remaining feature columns
# in each split.
X_train = train.drop(columns=['Price'])
y_train = train['Price']

X_test = test.drop(columns=['Price'])
y_test = test['Price']

In [4]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score
def predict(ml_model):
    """Fit ``ml_model`` on the training split and print an evaluation report.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables. Prints the estimator, its ``score`` on both splits,
    the test-split predictions, and the R², MAE, MSE and RMSE computed on the
    test split.

    Parameters
    ----------
    ml_model : estimator
        Object exposing ``fit``, ``score`` and ``predict``.

    Returns
    -------
    The object returned by ``ml_model.fit(X_train, y_train)``.
    """
    # Printed before fitting, so this shows the estimator as constructed.
    print(f'Model is: {ml_model}')
    fitted = ml_model.fit(X_train, y_train)

    train_score = fitted.score(X_train, y_train)
    print(f"Training score: {train_score}")
    test_score = fitted.score(X_test, y_test)
    print(f"Test score:{test_score}")

    y_pred = fitted.predict(X_test)
    print(f"Predictions are: {y_pred}")
    print('\n')

    r2score = r2_score(y_test, y_pred)
    print(f"r2 score is: {r2score}")

    mae = mean_absolute_error(y_test, y_pred)
    print(f'MAE:{mae}')
    mse = mean_squared_error(y_test, y_pred)
    print(f'MSE:{mse}')
    # RMSE is the square root of the MSE computed on the same predictions.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'RMSE:{rmse}')

    return fitted

***Importing Machine Learning Model Libraries***

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

**Random Forest Regressor**

In [6]:
# Seed the forest: bootstrap sampling and per-split feature selection are
# random, so an unseeded model reports different scores on every run.
# Re-run the cell to refresh the recorded output after this change.
predict(RandomForestRegressor(random_state=42))

Model is: RandomForestRegressor()
Training score: 0.9514370023920196
Test score:0.8056525147153096
Predictions are: [7393.27       9280.31       8307.175      ... 4856.51       3863.02666667
 4205.13      ]


r2 score is: 0.8056525147153096
MAE:1190.7641523841926
MSE:4011116.333012196
RMSE:2002.777155105429


RandomForestRegressor()

**Decision Tree Regressor**

In [7]:
# Seed the tree: scikit-learn permutes features randomly at each split, so
# ties between equally good splits can resolve differently on every run.
# Re-run the cell to refresh the recorded output after this change.
predict(DecisionTreeRegressor(random_state=42))

Model is: DecisionTreeRegressor()
Training score: 0.9705440821762137
Test score:0.6857414949113263
Predictions are: [7295. 8018. 8476. ... 4804. 3873. 4409.]


r2 score is: 0.6857414949113263
MAE:1376.681557572862
MSE:6485946.657363168
RMSE:2546.7521782386225


DecisionTreeRegressor()

**XGBoost Regressor**

In [8]:
# Gradient-boosted trees (XGBoost) constructed with no arguments, i.e. library defaults.
predict(XGBRegressor())

Model is: XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Training score: 0.9376103949977539
Test score:0.8397650861649033
Predictions are: [8700.959  9433.354  8514.881  ... 4763.8467 3753.632  4421.112 ]


r2 score is: 0.8397650861649033
MAE:1159.9651990545121
MSE:3307070.729838705
RMSE:1818.5353254305248


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

**Linear Regression**

In [9]:
# Linear-regression baseline constructed with no arguments, i.e. library defaults.
predict(LinearRegression())

Model is: LinearRegression()
Training score: 0.5691534771979423
Test score:0.5914029336672325
Predictions are: [ 6403.55235171 11415.55235171 10075.55235171 ...  3239.55235171
  5675.55235171  4196.05235171]


r2 score is: 0.5914029336672325
MAE:1961.2558075050044
MSE:8432989.82740857
RMSE:2903.9610581770153


LinearRegression()

**Support Vector Regression**

In [10]:
# Support vector regressor with default settings.
# NOTE(review): the recorded R² is about 0.05 on both splits. SVR is sensitive
# to feature/target scale, so presumably the inputs are unscaled — TODO confirm.
predict(SVR())

Model is: SVR()
Training score: 0.050371821275320805
Test score:0.05653888321299427
Predictions are: [7831.20733187 8446.98978809 8377.76457094 ... 7919.36665243 7945.64429697
 7931.90439889]


r2 score is: 0.05653888321299427
MAE:3403.751032292937
MSE:19471990.026332457
RMSE:4412.70778845965


SVR()

**K Nearest Neighbours**

In [11]:
# k-nearest-neighbours regressor with default settings.
# NOTE(review): distance-based, so results depend on feature scaling — confirm
# that the preprocessing step scales the features.
predict(KNeighborsRegressor())

Model is: KNeighborsRegressor()
Training score: 0.8186724338770556
Test score:0.7509124328730765
Predictions are: [6156.2 9088.4 9696.2 ... 4888.  3860.2 4450. ]


r2 score is: 0.7509124328730765
MAE:1402.750215002389
MSE:5140890.850167225
RMSE:2267.3532698208332


KNeighborsRegressor()

**XGBRegressor gave the best test-set results (highest R², lowest MAE and RMSE), so we choose it as the model to tune.**

In [12]:
# Refit a default XGBRegressor and keep the fitted estimator returned by
# `predict`; it is passed as the base estimator to the randomized search below.
model=predict(XGBRegressor())

Model is: XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
Training score: 0.9376103949977539
Test score:0.8397650861649033
Predictions are: [8700.959  9433.354  8514.881  ... 4763.8467 3753.632  4421.112 ]


r2 score is: 0.8397650861649033
MAE:1159.9651990545121
MSE:3307070.729838705
RMSE:1818.5353254305248


# Hyperparameter Optimization

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
# Candidate values for each XGBoost hyperparameter; the randomized search
# samples combinations from these lists.
random_grid = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7, 10],
    min_child_weight=[1, 3, 5, 7, 9],
    subsample=[0.3, 0.5, 0.7, 0.9],
    colsample_bytree=[0.5, 0.7, 0.9, 1],
    n_estimators=[100, 200],
    objective=['reg:squarederror'],
)

In [15]:
# Randomized search over `random_grid`: 10 sampled parameter combinations
# (n_iter=10), each evaluated with 5-fold cross-validation and ranked by
# negative mean absolute error. `random_state=1` fixes which combinations
# are sampled; `n_jobs=-1` runs the fits on all available cores.
xgb_random = RandomizedSearchCV(
    estimator=model,
    param_distributions=random_grid,
    scoring='neg_mean_absolute_error',
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=1,
    n_jobs=-1,
)

In [16]:
# Run the search on the training split only; the test split is not used here.
xgb_random.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   42.3s finished


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=100, n_jobs=0,
                                          num_par...
                                          tree_method='exact',
                                          validate_parameters=1,
                   

In [17]:
# Parameter combination that obtained the best cross-validated score in the search.
xgb_random.best_params_

{'subsample': 0.9,
 'objective': 'reg:squarederror',
 'n_estimators': 200,
 'min_child_weight': 1,
 'max_depth': 5,
 'learning_rate': 0.1,
 'colsample_bytree': 0.9}

In [18]:
# Predict on the test split through the search object (with the default
# `refit` setting, scikit-learn delegates to the refitted best estimator).
prediction = xgb_random.predict(X_test)

In [19]:
# Test-split metrics for the tuned model: the same R², MAE, MSE and RMSE
# report that `predict` prints for the baseline models.
r2score = r2_score(y_test, prediction)
print(f"r2 score is: {r2score}")

print(f'MAE:{mean_absolute_error(y_test, prediction)}')
print(f'MSE:{mean_squared_error(y_test, prediction)}')
print(f'RMSE:{np.sqrt(mean_squared_error(y_test, prediction))}')

r2 score is: 0.8416873063026332
MAE:1190.6310822276823
MSE:3267398.2402317575
RMSE:1807.594600631391


# Saving Model 

In [22]:
import pickle

# Persist the fitted search object. The `with` block closes (and therefore
# flushes) the file, so the pickle is complete on disk before it is read back.
with open('optimizedmodel.pkl', 'wb') as file:
    pickle.dump(xgb_random, file)

In [23]:
# Reload the saved search object to check the save/load round trip.
# A dedicated handle name keeps `model` bound to the fitted XGBRegressor
# instead of rebinding it to a file object, and `with` closes the file.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('optimizedmodel.pkl', 'rb') as model_file:
    xgb = pickle.load(model_file)