In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error



In [44]:
#Loading data

data = pd.read_csv("housing.csv")

In [45]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [47]:
#Dropping null rows
data = data.dropna()
data = data[data['median_house_value'] <= 500000]

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19475 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           19475 non-null  float64
 1   latitude            19475 non-null  float64
 2   housing_median_age  19475 non-null  float64
 3   total_rooms         19475 non-null  float64
 4   total_bedrooms      19475 non-null  float64
 5   population          19475 non-null  float64
 6   households          19475 non-null  float64
 7   median_income       19475 non-null  float64
 8   median_house_value  19475 non-null  float64
 9   ocean_proximity     19475 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


Like the main file we will remove median_house_value to "y" and change ocean_proximity to be one-hot encoded instead of being a category like it is now

In [49]:
data_encoded = pd.get_dummies(data, columns=['ocean_proximity'],dtype=int)

In [50]:
data_encoded

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0


In [51]:
X = data_encoded.drop('median_house_value', axis=1)
y = data_encoded['median_house_value']

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
random_clf = RandomForestRegressor()

In [54]:

#param_grid = {
   # 'n_estimators': [100, 200, 300],
   # 'max_leaf_nodes': [10, 20, 30],
   # 'max_depth': [None, 10, 20, 30]
#}

#grid_random_clf = GridSearchCV(random_clf, param_grid, cv=5)

#grid_random_clf.fit(X_train, y_train)


#best_model = grid_random_clf.best_estimator_


In [55]:
random_clf.fit(X_train, y_train)

In [56]:
#y_pred_grid = best_model.predict(X_test)

In [57]:
y_pred = random_clf.predict(X_test)

In [58]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = random_clf.score(X_test, y_test)

print(f"Root Mean Squared Error for untuned model: {rmse}")
print(f"R^2 Score for untuned model: {score}")


Root Mean Squared Error for untuned model: 44061.27936329118
R^2 Score for untuned model: 0.7995474660818847


In [59]:
#grid_random_clf.best_params_

First iteration:

Root Mean Squared Error for untuned model: 44061.27936329118

R^2 Score for untuned model: 0.7995474660818847

Root Mean Squared Error for tuned model: 57423.541379810034

R^2 Score for tuned model: 0.6595312242555647

Best Params: {'max_depth': None, 'max_leaf_nodes': 30, 'n_estimators': 100}
Will try to change max_leaf_nodes higher and try again. Setting n_estimators and max_depth back to default since that was the best outcome from the Grid Search
   

In [60]:
#param_grid = {
 #   'max_leaf_nodes': [30,40,50,60,70]
#}

#grid_random_clf = GridSearchCV(random_clf, param_grid, cv=5)

#grid_random_clf.fit(X_train, y_train)


#best_model = grid_random_clf.best_estimator_

In [61]:
#mse_grid = mean_squared_error(y_test, y_pred_grid)
#rmse_grid = np.sqrt(mse_grid)
#score_grid = best_model.score(X_test, y_test)
#print(grid_random_clf.best_params_)

#print(f"Root Mean Squared Error for tuned model: {rmse_grid}")
#print(f"R^2 Score for tuned model: {score_grid}")

Second iteration on tuned model made it a bit better but still not better than default it seems.

{'max_leaf_nodes': 70}

Root Mean Squared Error for tuned model: 57423.541379810034

R^2 Score for tuned model: 0.6999112487451159

Lets try higher values in max_leaf_nodes as well as default "None". 

In [62]:
#param_grid = {
 #   'max_leaf_nodes': [None,70,100,150]
#}

#grid_random_clf = GridSearchCV(random_clf, param_grid, cv=5)

#grid_random_clf.fit(X_train, y_train)


#best_model = grid_random_clf.best_estimator_

In [63]:
#mse_grid = mean_squared_error(y_test, y_pred_grid)
#rmse_grid = np.sqrt(mse_grid)
#score_grid = best_model.score(X_test, y_test)
#print(grid_random_clf.best_params_)

#print(f"Root Mean Squared Error for tuned model: {rmse_grid}")
#print(f"R^2 Score for tuned model: {score_grid}")

Third iteration on tuned model: 

{'max_leaf_nodes': None}

Root Mean Squared Error for tuned model: 53910.85083935326

R^2 Score for tuned model: 0.7986513812917744

Going over to feature engineering to further improve the model, this iteration showed that standard settings is the best apparently since max_leaf_nodes are "none" which is default value



In Sweden atleast, number of rooms usually gives a higher median value so I want to find out if the model can perform better if I create a value called "mean rooms/household"

In [67]:
data_encoded["rooms_per_household"] = data["total_rooms"] / data["households"]



In [68]:
data_encoded["rooms_per_household"]

0        6.984127
1        6.238137
2        8.288136
3        5.817352
4        6.281853
           ...   
20635    5.045455
20636    6.114035
20637    5.205543
20638    5.329513
20639    5.254717
Name: rooms_per_household, Length: 19475, dtype: float64

In [77]:
data_encoded

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,rooms_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0,6.984127
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0,6.238137
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0,8.288136
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0,5.817352
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0,6.281853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0,1,0,0,0,5.045455
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0,1,0,0,0,6.114035
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0,1,0,0,0,5.205543
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0,1,0,0,0,5.329513


In [69]:
X_new = data_encoded.drop('median_house_value', axis=1)
y_new = data_encoded["median_house_value"]

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new,y_new,test_size=0.2,random_state=42)

In [70]:
rf_new = RandomForestRegressor()

rf_new.fit(X_train_new,y_train_new)

In [72]:
y_pred_new = rf_new.predict(X_test_new)

In [75]:
#mse_new = mean_squared_error(y_test_new, y_pred_new)
#rmse_new = np.sqrt(mse_new)
#score_new = rf_new.score(X_test_new, y_test_new)

#print(f"Root Mean Squared Error for new model: {rmse_new}")
#print(f"R^2 Score for new model: {score_new}")


Root Mean Squared Error for new model: 44531.34390959089
R^2 Score for new model: 0.7952476250237799


New model first iteration: 

Root Mean Squared Error for new model: 44531.34390959089
R^2 Score for new model: 0.7952476250237799

It got a bit worse... Standardscale the data maybe?


In [78]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [79]:
X_new_scaled = scaler.fit_transform(X_new)

In [81]:
X_train_scaled, X_test_scaled, y_train_s, y_test_s = train_test_split(X_new_scaled, y_new,test_size=0.2, random_state=42)

In [83]:
scaled_rf = RandomForestRegressor()

scaled_rf.fit(X_train_scaled, y_train_s)

In [85]:
y_pred_scaled = scaled_rf.predict(X_test_scaled)

mse_scaled = mean_squared_error(y_test_s, y_pred_scaled)
rmse_scaled = np.sqrt(mse_scaled)
score_scaled = scaled_rf.score(X_test_scaled, y_test_s)

print(f"Root Mean Squared Error for new model: {rmse_scaled}")
print(f"R^2 Score for new model: {score_scaled}")

Root Mean Squared Error for new model: 44532.794168913555
R^2 Score for new model: 0.7952342884006337


Scaled data model: 

Root Mean Squared Error for new model: 44532.794168913555

R^2 Score for new model: 0.7952342884006337

I can maybe drop some columns and try again. 

In [93]:
X_new_data = data_encoded.drop(columns=['housing_median_age', 'total_rooms', 'population', 'households', 'median_house_value'], axis=1)
y_new_data = data_encoded["median_house_value"]

In [95]:
X_new_data_s = scaler.fit_transform(X_new_data)

In [96]:
X_new_data_s_train, X_new_data_s_test, y_new_data_s_train, y_new_data_s_test = train_test_split(X_new_data_s,y_new_data,test_size=0.2,random_state=42)

In [97]:
rf_clf_last = RandomForestRegressor()

rf_clf_last.fit(X_new_data_s_train, y_new_data_s_train)

In [98]:
#y_pred_scaled_last = rf_clf_last.predict(X_new_data_s_test)

#mse_scaled_last = mean_squared_error(y_new_data_s_test, y_pred_scaled_last)
#rmse_scaled_last = np.sqrt(mse_scaled_last)
#score_scaled_last = rf_clf_last.score(X_new_data_s_test, y_new_data_s_test)

#print(f"Root Mean Squared Error for new model: {rmse_scaled_last}")
#print(f"R^2 Score for new model: {score_scaled_last}")

Root Mean Squared Error for new model: 44177.92862930322
R^2 Score for new model: 0.7984846919966677


Dropped columns and scaled data: 

Root Mean Squared Error for new model: 44177.92862930322

R^2 Score for new model: 0.7984846919966677

