In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import cross_val_score

from joblib import dump,load
import plotly.express as px

In [7]:
df=pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12


In [8]:
X =df[['Beds','Baths','SquareFeet']]
y =df['Price']
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.2,random_state=1)

In [9]:
print('Random  forest tree')
model2=RandomForestRegressor(max_depth=15)
model2.fit(xtrain,ytrain)
print("score:",model2.score(xtest,ytest) *100)
pred=model2.predict(X)
print("mse",mean_squared_error(y,pred))
print("mae",mean_absolute_error(y,pred))


Random  forest tree
score: 72.23947294229306
mse 819907023.5817709
mae 15229.960955681312


In [10]:
forest_score=cross_val_score(model2,X,y,cv=10)
print(forest_score,f"average:{forest_score.mean():.2f}",f"std:{forest_score.std():.2f}")

[ 0.55703072  0.76911224  0.51941237  0.72903885  0.00475218  0.57705946
 -0.06968475  0.24423209  0.55969438  0.46992734] average:0.44 std:0.27


In [11]:

params={'n_estimators':list(range(100,501,200)),
'criterion' : ["squared_error", "absolute_error", "poisson"],
'max_depth':list(range(5,51,25)),
}
params

{'n_estimators': [100, 300, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 30]}

In [12]:
grid=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=1)

In [13]:
grid.fit(scaledX,y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 30],
                         'n_estimators': [100, 300, 500]},
             verbose=1)

In [14]:
gf=pd.DataFrame(grid.cv_results_)


In [15]:
gf.sort_values(by='rank_test_score',inplace=True)


In [16]:
gf.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,7.190246,0.021949,0.236491,0.056778,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824887,0.767583,0.61526,0.73591,0.088462,1
7,3.361909,0.713057,0.090945,0.027016,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824556,0.767079,0.615225,0.73562,0.088307,2


In [18]:
#grid._select_best_index#
grid.best_estimator_
grid._select_best_index

<function sklearn.model_selection._search.BaseSearchCV._select_best_index(refit, refit_metric, results)>

In [19]:
grid._select_best_index
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=500)

In [20]:
from joblib import dump,load

In [21]:
dump(grid.best_estimator_,"house_priceing_model_.pkl")

['house_priceing_model_.pkl']