In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
data = pd.read_csv('files/housing.csv')

In [34]:
data = data.dropna()
data = data.dropna()

In [35]:
from sklearn.model_selection import train_test_split

X = data.drop(['median_house_value'], axis=1)
y = data['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [36]:
train_data = X_train.join(y_train)

In [37]:
train_data['total_rooms'] = np.log(train_data['total_rooms'] + 1)
train_data['total_bedrooms'] = np.log(train_data['total_bedrooms'] + 1)
train_data['population'] = np.log(train_data['population'] + 1)
train_data['households'] = np.log(train_data['households'] + 1)


In [38]:
train_data = train_data.join(pd.get_dummies(train_data.ocean_proximity)).drop(['ocean_proximity'],axis=1)

In [39]:
train_data['bedroom_ratio'] = train_data['total_bedrooms'] / train_data['total_rooms']
train_data['household_rooms'] = train_data['total_rooms'] / train_data['households']
# plt.figure(figsize=(15,8))
# sns.heatmap( train_data.corr(), annot=True, cmap="YlGnBu")

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


X_train, y_train = train_data.drop(['median_house_value'], axis=1), train_data['median_house_value']


reg = LinearRegression().fit(X_train, y_train)



In [41]:
test_data = X_test.join(y_test)

test_data['total_rooms'] = np.log(test_data['total_rooms'] + 1)
test_data['total_bedrooms'] = np.log(test_data['total_bedrooms'] + 1)
test_data['population'] = np.log(test_data['population'] + 1)
test_data['households'] = np.log(test_data['households'] + 1)

#fix test data so ocean proximity is one hot encoded
test_data = test_data.join(pd.get_dummies(test_data.ocean_proximity)).drop(['ocean_proximity'],axis=1)

test_data['bedroom_ratio'] = test_data['total_bedrooms'] / test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms'] / test_data['households']

In [42]:
X_test, y_test = test_data.drop(['median_house_value'], axis=1), test_data['median_house_value']
reg.score(X_train, y_train)

0.6732371413324731

In [43]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
forest.fit(X_train, y_train)


In [44]:
forest.score(X_test, y_test)

0.8102567970265357

In [45]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200]

}

grid_search = GridSearchCV(forest, 
                           param_grid, 
                           cv=5, 
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train, y_train)

In [46]:
best_forest = grid_search.best_estimator_
best_forest

In [47]:
best_forest.score(X_test,y_test)

0.8116683523455726

In [48]:
prediction = best_forest.predict(X_test)
prediction[0]
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,bedroom_ratio,household_rooms
2305,-119.77,36.83,19.0,8.082711,6.230481,7.229114,6.236370,4.7804,False,True,False,False,False,0.770841,1.296060
10193,-117.95,33.86,36.0,7.620215,5.840642,6.972606,5.849325,5.1970,True,False,False,False,False,0.766467,1.302751
6096,-117.87,34.12,34.0,6.912743,5.398163,6.650279,5.384495,3.8571,True,False,False,False,False,0.780900,1.283824
5401,-118.42,34.03,44.0,6.807935,5.176150,5.883322,5.068904,3.3542,True,False,False,False,False,0.760311,1.343078
13005,-121.28,38.68,16.0,8.151333,6.423247,7.299121,6.400257,3.7500,False,True,False,False,False,0.788000,1.273595
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17802,-121.81,37.36,20.0,8.067776,6.042633,7.118826,5.966147,7.5813,True,False,False,False,False,0.748984,1.352259
18070,-122.01,37.29,31.0,8.051022,6.068426,7.082549,6.023448,7.5000,True,False,False,False,False,0.753746,1.336614
5188,-118.27,33.94,34.0,6.582025,5.111988,6.495266,5.147494,2.0789,True,False,False,False,False,0.776659,1.278685
7834,-118.16,33.91,35.0,7.247081,5.826000,7.255591,5.908083,3.0967,True,False,False,False,False,0.803910,1.226638
