<a href="https://colab.research.google.com/github/HannaPo/ML-zoomcamp/blob/master/06_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
!wget $data

--2023-10-23 11:28:22--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.6’


2023-10-23 11:28:22 (18.7 MB/s) - ‘housing.csv.6’ saved [1423529/1423529]



In [3]:
# Load the downloaded CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='housing.csv')

**Preparing the dataset**

In [4]:
# Collect the names of all object-dtype (text) columns.
strings = [name for name, dtype in df.dtypes.items() if dtype == 'object']

# Normalise their values: lowercase first, then spaces -> underscores.
for col in strings:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [5]:
# Keep only rows whose ocean_proximity is '<1h_ocean' or 'inland', then renumber the index.
keep_mask = df['ocean_proximity'].isin(['<1h_ocean', 'inland'])
df = df.loc[keep_mask].reset_index(drop=True)

In [6]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1h_ocean
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1h_ocean
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1h_ocean
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1h_ocean
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1h_ocean


In [7]:
# Count missing values per column.
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [8]:
# Replace missing total_bedrooms values with 0.
df['total_bedrooms'] = df['total_bedrooms'].fillna(0)

In [9]:
# Summary statistics for the numeric columns, rounded to whole numbers.
df.describe().round(decimals=0)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,15687.0,15687.0,15687.0,15687.0,15687.0,15687.0,15687.0,15687.0,15687.0
mean,-119.0,35.0,27.0,2666.0,536.0,1466.0,501.0,4.0,191943.0
std,2.0,2.0,12.0,2258.0,437.0,1180.0,393.0,2.0,108801.0
min,-124.0,33.0,1.0,2.0,0.0,3.0,2.0,0.0,14999.0
25%,-121.0,34.0,17.0,1441.0,291.0,802.0,278.0,3.0,111300.0
50%,-118.0,34.0,27.0,2118.0,429.0,1195.0,406.0,3.0,166900.0
75%,-118.0,37.0,36.0,3172.0,642.0,1777.0,602.0,5.0,241100.0
max,-114.0,42.0,52.0,39320.0,6445.0,35682.0,6082.0,15.0,500001.0


In [10]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (20% of the whole) for validation. Both splits use random_state=1.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

# Renumber each part from 0.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# The target is log1p(median_house_value); pop() returns the column and
# removes it from the feature frame in the same step.
y_train = np.log1p(df_train.pop('median_house_value').values)
y_val = np.log1p(df_val.pop('median_house_value').values)
y_test = np.log1p(df_test.pop('median_house_value').values)

In [11]:
# Turn the train and validation frames into lists of per-row dicts for DictVectorizer.
dict_train, dict_val = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

In [12]:
# Fit the vectorizer on the training records only, then reuse it for validation.
dv = DictVectorizer(sparse=True)

# The tuple is evaluated left to right, so the fit happens before the validation transform.
X_train, X_val = dv.fit_transform(dict_train), dv.transform(dict_val)

**Question 1. Which feature is used for splitting the data?**

In [13]:
# Train a depth-1 regression tree (a single split); fit() returns the estimator itself.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dt

In [14]:
# Render the fitted stump as text to see which feature its single split uses.
# export_text is imported in the top import cell with the other sklearn imports.
feature = export_text(dt, feature_names=dv.feature_names_)
print(feature)

|--- ocean_proximity=<1h_ocean <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1h_ocean >  0.50
|   |--- value: [12.30]



**Answer:** ocean_proximity

**Question 2. What's the RMSE of a random forest model (`n_estimators=10`, `random_state=1`) on the validation set?**

In [18]:
# Fit a 10-tree random forest and score it on the validation set.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)
y_pred = rf.predict(X_val)

# RMSE is the square root of the mean squared error (targets are log1p-transformed).
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE:{rmse:.3f}")

RMSE:0.245


**Answer:** 0.245

**Question 3. After which value of n_estimators does RMSE stop improving?**

Now let's experiment with the `n_estimators` parameter.

Try different values of this parameter from 10 to 200 with a step of 10.
Set `random_state` to 1.
Evaluate the model on the validation dataset.

In [None]:
# Sweep n_estimators from 10 to 200 (step 10) and record the validation RMSE
# of each forest in rmse_values, in sweep order.
rmse_values = []

for i in range(10, 201, 10):
    # n_jobs=-1 matches the single-forest cell above and trains the trees on
    # all cores; random_state=1 keeps each forest reproducible.
    rf = RandomForestRegressor(n_estimators=i, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    print('%s -> %.3f' % (i, rmse))
    rmse_values.append(rmse)

10 -> 0.245
20 -> 0.238
30 -> 0.237
40 -> 0.235
50 -> 0.235
60 -> 0.235
70 -> 0.234
80 -> 0.235
90 -> 0.234
100 -> 0.234


In [None]:
# Plot validation RMSE against the number of trees.
# y must be the per-run list rmse_values; rmse holds only the last loop value
# (a scalar), which cannot be plotted against the 20 x positions.
plt.figure(figsize=(6, 4))

plt.plot(range(10, 201, 10), rmse_values, color='black')
plt.xticks(range(0, 201, 50))

plt.title('Number of trees vs RMSE')
plt.xlabel('Number of trees')
plt.ylabel('RMSE')

plt.show()