In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [14]:
# Load the housing CSV from the alexeygrigorev/datasets GitHub repository (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')

In [6]:
# Column dtypes and non-null counts: a quick way to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [15]:
# Row count per ocean_proximity category in the full dataset.
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [16]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [17]:
# Re-count the categories to confirm which ones remain after filtering.
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN    9136
INLAND       6551
Name: count, dtype: int64

In [20]:
# Zero-fill every missing value, then show the per-column null count as a check.
df = df.fillna(value=0)
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 20% of the rows as the test set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
# 25% of the remaining 80% is 20% of the whole, giving a 60/20/20 train/val/test split.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)


In [23]:
def extract_y(df_train, df_val, df_test):
    """Separate the log-transformed target from the features of each split.

    The target is ``log1p(median_house_value)``. The input frames are NOT
    modified: the target column is removed with ``DataFrame.drop``, which
    returns new frames, so the caller's originals stay intact.

    Parameters
    ----------
    df_train, df_val, df_test : pandas.DataFrame
        Frames that each contain a ``median_house_value`` column.

    Returns
    -------
    tuple
        ``(y_train, y_val, y_test, df_train, df_val, df_test)`` where the
        ``y_*`` items are NumPy arrays of ``log1p`` targets and the ``df_*``
        items are copies of the inputs without the target column.

    Raises
    ------
    KeyError
        If any input frame has no ``median_house_value`` column.
    """
    target = 'median_house_value'

    y_train = np.log1p(df_train[target].values)
    y_val = np.log1p(df_val[target].values)
    y_test = np.log1p(df_test[target].values)

    # drop() builds new frames instead of deleting the column in place, so the
    # objects the caller passed in are left untouched.
    df_train = df_train.drop(columns=target)
    df_val = df_val.drop(columns=target)
    df_test = df_test.drop(columns=target)
    return y_train, y_val, y_test, df_train, df_val, df_test

# Rebind the split frames to their target-free versions and keep the targets separately.
(y_train, y_val, y_test,
 df_train, df_val, df_test) = extract_y(df_train, df_val, df_test)

In [29]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [26]:
def pre_train(df_train):
    """Vectorize the training frame with a freshly fitted DictVectorizer.

    Missing values are zero-filled, each row is turned into a dict record,
    and a dense (``sparse=False``) DictVectorizer is fitted on those records.

    Returns
    -------
    tuple
        ``(X_train, dv)``: the transformed training matrix and the fitted
        vectorizer.
    """
    records = df_train.fillna(0).to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    return vectorizer.fit_transform(records), vectorizer


X_train, dv = pre_train(df_train)

In [28]:
def train_model(X_train, y_train, depth=1, random_state=None):
    """Fit a DecisionTreeRegressor on the given training data.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training targets.
    depth : int, default 1
        Passed to the tree as ``max_depth``.
    random_state : int or None, default None
        Passed to the tree as ``random_state``. ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible fits.

    Returns
    -------
    DecisionTreeRegressor
        The fitted tree.
    """
    dt = DecisionTreeRegressor(max_depth=depth, random_state=random_state)
    dt.fit(X_train, y_train)
    return dt


# Seeded with the same random_state=1 used by the other estimators in this notebook.
dt = train_model(X_train, y_train, depth=1, random_state=1)

In [30]:
# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print(tree_rules)

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



In [31]:
##############################################################################################################
################################################ RANDOM FOREST ###############################################
from sklearn.ensemble import RandomForestRegressor

In [32]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y_pred`` and ``y``.

    Both arguments must support element-wise subtraction and the result must
    provide ``.mean()`` (e.g. NumPy arrays).
    """
    residual = y_pred - y
    return np.sqrt((residual * residual).mean())

In [33]:
# Baseline random forest: 10 trees with a fixed seed.
rf = RandomForestRegressor(random_state=1, n_estimators=10)
rf.fit(X_train, y_train)

In [36]:
# Encode the validation split with the vectorizer that was fitted on the training data.
# Use transform(), never fit_transform(): refitting would rebuild the feature vocabulary
# from the validation rows, so the columns could silently stop matching the matrix the
# models were trained on (e.g. when a category is missing from the validation split).
val_dict = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dict)

In [40]:
# Predict on the validation matrix and report the RMSE against y_val.
y_pred = rf.predict(X_val)
rmse(y_val, y_pred)

0.24459962783255346

In [42]:
# Sweep the forest size from 10 to 200 trees in steps of 10 and print the validation RMSE.
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=n).fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    val_rmse = rmse(y_val, y_pred)
    print(f'{n} => {val_rmse}')

10 => 0.24459962783255346
20 => 0.237804442607681
30 => 0.23620701021138027
40 => 0.23471508514267608
50 => 0.2345764456257561
60 => 0.23419687366924186
70 => 0.23418713713506895
80 => 0.23438796698423955
90 => 0.2343044555927512
100 => 0.23413013466629137
110 => 0.2341738662455083
120 => 0.233895354809254
130 => 0.23379933611544426
140 => 0.23363722554944075
150 => 0.23352796531734998
160 => 0.2333145993842164
170 => 0.23332583790298186
180 => 0.23356357894056615
190 => 0.2338395596994813
200 => 0.23376607999078744


In [44]:
# Grid over max_depth x n_estimators; each row of `score` is [n_estimators, max_depth, validation RMSE].
score = []
for d in (10, 15, 20, 25):
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            max_depth=d,
            n_estimators=n,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        score.append([n, d, rmse(y_val, y_pred)])

dfsrf = pd.DataFrame(score, columns=['n_estimator', 'depth', 'rmsescore'])

In [47]:
# Print every grid result, best (lowest RMSE) first, without pandas truncating rows or columns.
ranked_scores = dfsrf.sort_values(by='rmsescore')  # ascending is the default
display_opts = ('display.max_rows', None,
                'display.max_columns', None,
                'display.precision', 4)
with pd.option_context(*display_opts):
    print(ranked_scores)


    n_estimator  depth  rmsescore
76          170     25     0.2334
75          160     25     0.2334
74          150     25     0.2335
73          140     25     0.2336
77          180     25     0.2336
56          170     20     0.2336
55          160     20     0.2336
79          200     25     0.2337
54          150     20     0.2337
57          180     20     0.2337
72          130     25     0.2338
71          120     25     0.2338
78          190     25     0.2338
59          200     20     0.2339
53          140     20     0.2339
69          100     25     0.2340
58          190     20     0.2340
70          110     25     0.2340
66           70     25     0.2340
52          130     20     0.2341
65           60     25     0.2341
68           90     25     0.2342
36          170     15     0.2342
67           80     25     0.2342
35          160     15     0.2343
51          120     20     0.2343
63           40     25     0.2344
50          110     20     0.2344
64           5

In [48]:
# Refit a 10-tree, depth-20 forest and show its per-feature importances
# (same order as the columns of X_train).
rf = RandomForestRegressor(max_depth=20, n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
rf.feature_importances_

array([0.01504314, 0.03006329, 0.10266505, 0.08585341, 0.33548579,
       0.21881985, 0.14745955, 0.02811472, 0.01543366, 0.02106153])

In [54]:
# Pair each feature name (column 0) with its importance (column 1), most important first.
importance_pairs = zip(dv.get_feature_names_out(), rf.feature_importances_)
pd.DataFrame(list(importance_pairs)).sort_values(by=1, ascending=False)

Unnamed: 0,0,1
4,median_income,0.335486
5,ocean_proximity=<1H OCEAN,0.21882
6,ocean_proximity=INLAND,0.14746
2,latitude,0.102665
3,longitude,0.085853
1,housing_median_age,0.030063
7,population,0.028115
9,total_rooms,0.021062
8,total_bedrooms,0.015434
0,households,0.015043


In [55]:
################################################################################################################
#################################################### XGBOOST ###################################################

import xgboost as xgb

In [62]:
# XGBoost has historically rejected feature names containing '[', ']' or '<'
# (NOTE(review): confirm against the installed xgboost version).
# Strip those characters by content rather than patching a hard-coded list position,
# so this keeps working if the vectorizer's feature order or count changes.
forbidden_chars = str.maketrans('', '', '[]<')
features = [str(name).translate(forbidden_chars) for name in dv.get_feature_names_out()]
features

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=1H OCEAN',
 'ocean_proximity=INLAND',
 'population',
 'total_bedrooms',
 'total_rooms']

In [63]:
# Wrap the train/validation matrices and their targets in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [64]:
# XGBoost run with learning rate eta=0.3: 100 boosting rounds, scored by validation RMSE.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
rmse(y_val, y_pred)

0.228623199980106

In [65]:
# Same XGBoost setup with a smaller learning rate (eta=0.1), for comparison on validation RMSE.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

model = xgb.train(xgb_params, dtrain, num_boost_round=100)
y_pred = model.predict(dval)
rmse(y_val, y_pred)

0.23208927121609343