In [45]:
import pandas as pd
import numpy as np 

from sklearn.datasets import load_wine, load_boston
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import seaborn as sns 
import matplotlib.pyplot as plt 

### Problem Statement

Analyzing the house prices in USA.

### Data Gathering

In [46]:
df = pd.read_csv("USA_Housing.csv")
df

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.45857,5.682861,7.009188,4.09,23086.80050,1.059034e+06,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.002900,6.730821,3.09,40173.07217,1.505891e+06,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.865890,8.512727,5.13,36882.15940,1.058988e+06,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1.260617e+06,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,6.309435e+05,USNS Raymond\nFPO AE 09386
...,...,...,...,...,...,...,...
4995,60567.94414,7.830362,6.137356,3.46,22837.36103,1.060194e+06,USNS Williams\nFPO AP 30153-7653
4996,78491.27543,6.999135,6.576763,4.02,25616.11549,1.482618e+06,"PSC 9258, Box 8489\nAPO AA 42991-3352"
4997,63390.68689,7.250591,4.805081,2.13,33266.14549,1.030730e+06,"4215 Tracy Garden Suite 076\nJoshualand, VA 01..."
4998,68001.33124,5.534388,7.130144,5.44,42625.62016,1.198657e+06,USS Wallace\nFPO AE 73316


### Exploratory Data Analysis

In [47]:
df.isna().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
Address                         0
dtype: int64

In [48]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
Avg. Area Income,79545.45857,79248.64245,61287.06718,63345.24005,59982.19723
Avg. Area House Age,5.682861,6.0029,5.86589,7.188236,5.040555
Avg. Area Number of Rooms,7.009188,6.730821,8.512727,5.586729,7.839388
Avg. Area Number of Bedrooms,4.09,3.09,5.13,3.26,4.23
Area Population,23086.8005,40173.07217,36882.1594,34310.24283,26354.10947
Price,1059033.558,1505890.915,1058987.988,1260616.807,630943.4893
Address,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...","188 Johnson Views Suite 079\nLake Kathleen, CA...","9127 Elizabeth Stravenue\nDanieltown, WI 06482...",USS Barnett\nFPO AP 44820,USNS Raymond\nFPO AE 09386


### Feature Selection

In [49]:
df.drop("Address",axis=1,inplace=True)

### Model Training

In [52]:
x = df.drop('Price',axis=1)
y = df['Price']

x_train,x_test, y_train,y_test = train_test_split(x,y, random_state=1, test_size=0.3)
x_train.shape,x_test.shape, y_train.shape,y_test.shape

((3500, 5), (1500, 5), (3500,), (1500,))

In [53]:
model_rf = RandomForestRegressor()
model_rf.fit(x_train, y_train)

### Model Evaluation

In [54]:
### Testing Data 

def model_eval(actual,pred):
    
    mse = mean_squared_error(actual,pred)
    print(f"MSE = {mse}")
    print(f"RMSE = {np.sqrt(mse)}")
    
    mae = mean_absolute_error(actual,pred)
    print(f"MAE = {mae}")
    
    r2 = r2_score(actual,pred)
    print(f"R-Squared Value = {r2}")
    

y_pred = model_rf.predict(x_test)
model_eval(y_test,y_pred)

MSE = 14465203816.78152
RMSE = 120271.37571667467
MAE = 95505.10985980586
R-Squared Value = 0.8859631659567686


In [55]:
### Training Data 

y_pred = model_rf.predict(x_train)
model_eval(y_train,y_pred)

MSE = 2010543155.3192081
RMSE = 44839.0806698711
MAE = 35354.75183870372
R-Squared Value = 0.9837509047340985


#### Random Forest

In [56]:
model_rf = RandomForestRegressor(random_state=3)
model_rf.fit(x_train,y_train)

In [57]:
y_pred = model_rf.predict(x_test)
y_pred[0:6]

array([1564793.50186 , 1562677.74432 ,  985759.268816,  985215.469476,
       1183739.616015,  635585.637188])

In [58]:
y_test.head(6)

2764    1.413580e+06
4767    1.618721e+06
3814    8.413925e+05
3499    8.814439e+05
2735    1.174748e+06
3922    2.662989e+05
Name: Price, dtype: float64

In [59]:
model_eval(y_test,y_pred)

MSE = 14613045126.108156
RMSE = 120884.4287991971
MAE = 96021.8770972296
R-Squared Value = 0.8847976549090185


In [60]:
model_eval(y_train,y_train)

MSE = 0.0
RMSE = 0.0
MAE = 0.0
R-Squared Value = 1.0


In [61]:
model_rf.score(x_train,y_train)

0.9835638384164156

In [62]:
model_rf.score(x_test,y_test)

0.8847976549090185

### Hyperparameter Tuning

In [63]:
model_rf_hyp = RandomForestRegressor(random_state=3)

hyp = {
    'n_estimators':np.arange(50,250,10),
    'criterion':['squared_error','absolute_error'],
    'max_depth':np.arange(5,15),
    'min_samples_split':np.arange(2,15),
    'min_samples_leaf':np.arange(2,15)
}

rscv = RandomizedSearchCV(model_rf_hyp,hyp, cv = 5)
rscv.fit(x_train,y_train)
rscv.best_estimator_

KeyboardInterrupt: 

##### rscv.best_params_

In [36]:
model_rf_tune = RandomForestRegressor(criterion='absolute_error', max_depth=9,
                      min_samples_leaf=6, min_samples_split=11, n_estimators=100,
                      random_state=3)
model_rf_tune

In [37]:
model_rf_tune.fit(x_train,y_train)

### Model Evaluation 

In [38]:
# Testing Data 

y_pred = model_rf_tune.predict(x_test)
model_eval(y_test,y_pred)

MSE = 15838676778.787994
RMSE = 125851.80482928321
MAE = 98834.36558672231
R-Squared Value = 0.8751353539041317


In [39]:
# Training Data 

y_pred = model_rf_tune.predict(x_train)
model_eval(y_train,y_pred)

MSE = 9951462906.924007
RMSE = 99757.01933660611
MAE = 76600.31127317714
R-Squared Value = 0.9195728435960782


In [40]:
model_rf_tune.score(x_train,y_train) 

0.9195728435960782

In [41]:
model_rf_tune.score(x_test,y_test) 

0.8751353539041317

In [43]:
import pickle 

with open('model_random_forest.pkl','wb') as file:
    pickle.dump(model_rf_tune,file)