In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [31]:
# Path to the housing CSV and the subset of its columns that the notebook uses.
PATH = "./housing.csv"
select_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "ocean_proximity",
]
df = pd.read_csv(PATH, usecols=select_cols)
# Missing bedroom counts are replaced with 0 instead of dropping the rows.
df["total_bedrooms"] = df["total_bedrooms"].fillna(0)

In [32]:
# Replace every remaining missing value, in any column, with 0.
df = df.fillna(value=0)

In [33]:
# Add the regression target on a log scale: log1p(x) = log(1 + x).
df['median_house_value_log'] = df['median_house_value'].pipe(np.log1p)

In [34]:
# Hold out 20% for test, then 25% of the remaining 80% (i.e. 20% overall) for
# validation, leaving 60% for training. Both splits use the same fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original index so each frame is numbered from 0.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the log-target column and removes it from the frame in one step.
y_train, y_val, y_test = (
    part.pop('median_house_value_log').values for part in (df_train, df_val, df_test)
)

# The raw target must not remain among the features either.
for part in (df_train, df_val, df_test):
    del part['median_house_value']

In [48]:
# Columns fed to the models, grouped by type.
categorical = ["ocean_proximity"]
numerical = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

# One dict per row; DictVectorizer one-hot encodes the string-valued column
# and passes the numeric ones through. It is fitted on the training rows only.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Test set is not encoded yet. When it is needed, reuse the fitted vectorizer:
# test_dict = df_test[categorical + numerical].to_dict(orient='records')
# X_test = dv.transform(test_dict)

In [49]:
# A depth-1 tree makes a single split, which exposes the one feature it splits on.
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_depth=1)

In [50]:
# Print the tree's split rules with readable column names.
# dv.feature_names_ is the vectorizer's list of output column names; it replaces
# DictVectorizer.get_feature_names(), which was deprecated in scikit-learn 1.0
# and removed in 1.2.
print(export_text(dt, feature_names=dv.feature_names_))

|--- ocean_proximity=INLAND <= 0.50
|   |--- value: [12.31]
|--- ocean_proximity=INLAND >  0.50
|   |--- value: [11.61]



In [53]:
# Baseline forest: 10 trees with a fixed seed; n_jobs=-1 uses all CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Encode the validation rows with the vectorizer that is already fitted.
# fit_transform() here would re-learn the vocabulary from df_val, so the columns
# of X_val could differ in number or order from the X_train columns the forest
# was trained on.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

y_pred = rf.predict(X_val)

In [55]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth values (list, tuple, NumPy array, ...).
    y_pred : array-like
        Predicted values; must be broadcast-compatible with ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y - y_pred) ** 2))``.

    Notes
    -----
    Inputs are compared positionally after conversion to float arrays, so
    plain Python sequences work and integer inputs cannot overflow or wrap
    when differenced and squared.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    squared_error = (y - y_pred) ** 2
    return np.sqrt(squared_error.mean())

In [56]:
# Validation RMSE (on the log target) of the most recent predictions.
rmse(y=y_val, y_pred=y_pred)

0.24579193759466622

In [62]:
# Forest sizes to try: 10, 20, ..., 200.
n_estimators = np.arange(10, 210, 10)

# Encode validation rows with the already-fitted vectorizer. Using
# fit_transform() would refit it on df_val and could change the column layout
# relative to X_train.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Refit a forest for each size and report its validation RMSE.
for n_estimator in n_estimators:
    rf = RandomForestRegressor(n_estimators=n_estimator, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)

    print('n_estimator', n_estimator, 'rmse', rmse(y_val, y_pred))

n_estimator 10 rmse 0.24579193759466625
n_estimator 20 rmse 0.2391504517187264
n_estimator 30 rmse 0.23565070217521217
n_estimator 40 rmse 0.2345424867125084
n_estimator 50 rmse 0.23305414151573647
n_estimator 60 rmse 0.23291531086861533
n_estimator 70 rmse 0.23249901407006357
n_estimator 80 rmse 0.2325668463305913
n_estimator 90 rmse 0.23221983354701684
n_estimator 100 rmse 0.23194347424665712
n_estimator 110 rmse 0.23148387282624047
n_estimator 120 rmse 0.23160217090037064
n_estimator 130 rmse 0.2317045731826061
n_estimator 140 rmse 0.23168170881570188
n_estimator 150 rmse 0.23171250374776223
n_estimator 160 rmse 0.2316734190979443
n_estimator 170 rmse 0.23160899199591195
n_estimator 180 rmse 0.2317810286045522
n_estimator 190 rmse 0.23178794216224186
n_estimator 200 rmse 0.23166252305257326


In [65]:
# Grid over forest size (10..200 in steps of 10) and maximum tree depth.
n_estimators = np.arange(10, 210, 10)
max_depths = [10, 15, 20, 25]

# Enumerate the combinations up front, depth varying fastest, so the rows in
# `scores` keep the same order as a nested size-then-depth loop.
grid = [(size, depth) for size in n_estimators for depth in max_depths]

scores = []
for n_estimator, max_depth in grid:
    rf = RandomForestRegressor(
        n_estimators=n_estimator,
        max_depth=max_depth,
        random_state=1,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    # One row per combination: [n_estimator, max_depth, validation RMSE].
    scores.append([n_estimator, max_depth, rmse(y_val, y_pred)])

In [69]:
# Tabulate the grid-search results, lowest validation RMSE first.
scores_df = pd.DataFrame(scores, columns=['n_estimator', 'max_depth', 'rmse'])
scores_df.sort_values(by='rmse')

Unnamed: 0,n_estimator,max_depth,rmse
43,110,25,0.231179
47,120,25,0.231206
55,140,25,0.231378
79,200,25,0.231437
67,170,25,0.231449
...,...,...,...
16,50,10,0.250117
12,40,10,0.250963
8,30,10,0.251514
4,20,10,0.252599


In [70]:
# Refit a small forest (10 trees, depth capped at 20) for the
# feature-importance table.
rf = RandomForestRegressor(
    max_depth=20,
    n_estimators=10,
    n_jobs=-1,
    random_state=1,
)
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(max_depth=20, n_estimators=10, n_jobs=-1, random_state=1)

In [97]:
# Pair each encoded column name with the forest's importance score and list
# them in ascending order, so the most important feature is in the last row.
importances = pd.DataFrame({
    'feature': dv.feature_names_,
    'feature_importance': rf.feature_importances_,
})
importances.sort_values(by='feature_importance')

Unnamed: 0,feature,feature_importance
7,ocean_proximity=ISLAND,0.000357
8,ocean_proximity=NEAR BAY,0.000453
5,ocean_proximity=<1H OCEAN,0.002765
9,ocean_proximity=NEAR OCEAN,0.004452
0,households,0.016141
11,total_bedrooms,0.019373
12,total_rooms,0.020465
10,population,0.030999
1,housing_median_age,0.033197
3,longitude,0.096341


In [108]:
# Take the column names from the DictVectorizer instead of a hand-copied list,
# so they cannot drift from the columns it produced. XGBoost rejects feature
# names containing '[', ']' or '<', so those characters are stripped
# (e.g. 'ocean_proximity=<1H OCEAN' -> 'ocean_proximity=1H OCEAN').
features = [re.sub(r"[\[\]<]", "", name) for name in dv.feature_names_]

# Wrap the train/validation matrices and their log-targets for XGBoost.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

# (DMatrix, name) pairs in the form xgb.train accepts for its `evals` argument.
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [114]:
# Boosting run with learning rate eta = 0.3.
xgb_params = dict(
    # tree growth / learning rate
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    # task and parallelism
    objective='reg:squarederror',
    nthread=8,
    # reproducibility and logging
    seed=1,
    verbosity=1,
)
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

# Score the 100-round booster on the validation set.
y_pred = model.predict(dval)
rmse(y_val, y_pred)

0.2260957847928297

In [113]:
# Same configuration as the eta = 0.3 run, but with learning rate eta = 0.1.
xgb_params = {'eta': 0.1, 'max_depth': 6, 'min_child_weight': 1}
xgb_params.update({'objective': 'reg:squarederror', 'nthread': 8})
xgb_params.update({'seed': 1, 'verbosity': 1})

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

# Validation RMSE after 100 boosting rounds.
y_pred = model.predict(dval)
rmse(y_val, y_pred)

0.2305416021993131