In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.model_selection import cross_val_score

from joblib import dump,load
import plotly.express as px

In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12


In [3]:
X =df[['Beds','Baths','SquareFeet']]
y =df['Price']
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.2,random_state=1)

In [4]:
print('Random  forest tree')
model2=RandomForestRegressor(max_depth=15)
model2.fit(xtrain,ytrain)
print("score:",model2.score(xtest,ytest) *100)
pred=model2.predict(X)
print("mse",mean_squared_error(y,pred))
print("mae",mean_absolute_error(y,pred))


Random  forest tree
score: 71.51544019747233
mse 876471289.0730479
mae 15428.160291733742


In [5]:
forest_score=cross_val_score(model2,X,y,cv=10)
print(forest_score,f"average:{forest_score.mean():.2f}",f"std:{forest_score.std():.2f}")

[ 0.56092262  0.77170406  0.47925086  0.71722501  0.04794284  0.5728398
 -0.13584899  0.25922977  0.56860984  0.39894635] average:0.42 std:0.27


In [13]:

params={'n_estimators':list(range(100,501,100)),
'criterion' : ["squared_error", "absolute_error", "poisson"],
'max_depth':list(range(5,51,15)),
}
params

{'n_estimators': [100, 200, 300, 400, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 20, 35, 50]}

In [11]:
grid=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=3)

In [12]:
grid.fit(X,y)

Fitting 3 folds for each of 75 candidates, totalling 225 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 15, 25, 35, 45],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=3)

In [15]:
gf=pd.DataFrame(grid.cv_results_)


In [27]:
gf.sort_values(by='rank_test_score',inplace=True)


In [37]:
gf.head(2)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
29,4.023973,0.011822,0.093324,0.009975,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824195,0.766496,0.614471,0.735054,0.088459,1
28,3.315952,0.302442,0.119989,0.045719,absolute_error,5,400,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824411,0.766263,0.614088,0.734921,0.088678,2


In [39]:
#grid._select_best_index#
grid.best_estimator_
grid._select_best_index]

<function sklearn.model_selection._search.BaseSearchCV._select_best_index(refit, refit_metric, results)>

In [40]:
grid._select_best_index
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=500)