# Random Forest

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm

In [2]:
# Name of the CSV file to load; it is passed to getDataset further down.
filename = 'kc_house_data.csv'

In [3]:
# Dataset reference: https://www.kaggle.com/harlfoxem/housesalesprediction

# Don't think too much about the getDataset helper. It is used for convenience.
def getDataset(filename, **kwargs):
    """Load a dataset CSV into a DataFrame, downloading it first if needed.

    If `filename` is not present locally it is fetched from the GitHub
    datasets folder and saved under the same name, then read with
    `pd.read_csv`. If anything in that process fails, the error is printed
    and the CSV is read straight from the URL instead (best-effort fallback).

    Args:
        filename: Name of the CSV file, both locally and in the remote folder.
        **kwargs: Forwarded unchanged to `pd.read_csv`.

    Returns:
        The DataFrame produced by `pd.read_csv`.

    Raises:
        Whatever `pd.read_csv(url, ...)` raises if the fallback read fails too.
    """
    # Local imports keep this copy-paste helper self-contained.
    import os
    import urllib.request

    url = f'https://github.com/IvanReznikov/mdx-msc-data-science/raw/main/datasets/{filename}'
    try:
        if not os.path.exists(filename):
            # Download with the standard library (no `pip install` side effect).
            # Writing to a temporary name first means an interrupted download
            # is never mistaken for a complete file on the next run.
            partial_name = f'{filename}.part'
            urllib.request.urlretrieve(url, partial_name)
            os.replace(partial_name, filename)
            print(f'{filename} file downloaded')
        else:
            print(f'{filename} already exists')
        return pd.read_csv(filename, **kwargs)

    except Exception as e:
        # Deliberate best-effort: report the problem, then let pandas read
        # the remote file directly.
        print(e)
        return pd.read_csv(url, **kwargs)

In [4]:
# Load the CSV into a DataFrame (getDataset downloads it first if it is not already local).
df = getDataset(filename)
# Preview the first rows to check which columns were read.
df.head()

kc_house_data.csv already exists


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
# Columns removed before modelling, so they never end up in the feature list.
drop_columns = ['id', 'date']
# Reassign instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=drop_columns, errors='ignore')

In [6]:
def adjustedR2(r2,n,k):
    """Return `r2` reduced by a penalty that grows with model size.

    Evaluates ``r2 - (k - 1) / (n - k) * (1 - r2)``.

    Args:
        r2: Coefficient of determination to adjust.
        n: Number of observations (callers in this notebook pass the row count).
        k: Model-size term (callers in this notebook pass the number of
            feature columns).

    Returns:
        The adjusted value as a float.

    Raises:
        ZeroDivisionError: If ``n == k`` with plain Python numbers.

    NOTE(review): this expression equals the textbook adjusted R-squared,
    ``1 - (1 - r2) * (n - 1) / (n - k)``, when `k` counts fitted parameters
    including the intercept. Confirm that passing the feature count is intended.
    """
    size_penalty = (k - 1) / (n - k)
    unexplained = 1 - r2
    return r2 - size_penalty * unexplained

# Empty scoreboard: later cells append one row per fitted model, with the
# values given in exactly this column order.
evaluation = pd.DataFrame({
    column: []
    for column in (
        'Model',
        'Details',
        'Root Mean Squared Error (RMSE)',
        'R-squared (training)',
        'Adjusted R-squared (training)',
        'R-squared (test)',
        'Adjusted R-squared (test)',
        '5-Fold Cross Validation',
    )
})

In [7]:
# Column to predict; every other column still in df is used as a feature.
target = "price"
feature_names = [column for column in df.columns if column != target]

# Full feature matrix and target vector (also reused for cross-validation later).
y = df[target]
X = df[feature_names]

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [8]:
# Baseline: a single decision tree limited to depth 10, fitted on the training split.
clf_tree = DecisionTreeRegressor(max_depth=10, random_state=42)
clf_tree.fit(X_train, y_train)
pred = clf_tree.predict(X_test)

# Score each split once and reuse the value for both the plain and the
# adjusted R-squared instead of calling score() twice per split.
r2_train = clf_tree.score(X_train, y_train)
r2_test = clf_tree.score(X_test, y_test)
n_features = len(feature_names)

# Every metric is rounded to 3 decimals before it is stored.
rmsecm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
rtrcm = float(format(r2_train, '.3f'))
artrcm = float(format(adjustedR2(r2_train, X_train.shape[0], n_features), '.3f'))
rtecm = float(format(r2_test, '.3f'))
artecm = float(format(adjustedR2(r2_test, X_test.shape[0], n_features), '.3f'))
# 5-fold cross-validation runs on the full X / y (which include the held-out
# test rows). Using y rather than a hard-coded df['price'] keeps this in step
# with the `target` variable.
cv = float(format(cross_val_score(clf_tree, X, y, cv=5).mean(), '.3f'))

# Append as the next row of the scoreboard.
# NOTE(review): the label says 'selected features', but feature_names holds
# every non-target column — confirm the label.
r = evaluation.shape[0]
evaluation.loc[r] = ['Trees-10','selected features',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]

In [9]:
# Random Forest: sweep the number of trees while max_depth stays fixed at 10.
# Note that `d` is the number of estimators in this loop, not a depth.
for d in tqdm([1,5,10,25,50,100]):
    rf = RandomForestRegressor(n_estimators=d, max_depth=10, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)

    # Score each split once and reuse the value for both the plain and the
    # adjusted R-squared instead of calling score() twice per split.
    r2_train = rf.score(X_train, y_train)
    r2_test = rf.score(X_test, y_test)
    n_features = len(feature_names)

    # Every metric is rounded to 3 decimals before it is stored.
    rmsecm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
    rtrcm = float(format(r2_train, '.3f'))
    artrcm = float(format(adjustedR2(r2_train, X_train.shape[0], n_features), '.3f'))
    rtecm = float(format(r2_test, '.3f'))
    artecm = float(format(adjustedR2(r2_test, X_test.shape[0], n_features), '.3f'))
    # 5-fold cross-validation runs on the full X / y (which include the
    # held-out test rows). Using y rather than a hard-coded df['price'] keeps
    # this in step with the `target` variable.
    cv = float(format(cross_val_score(rf, X, y, cv=5).mean(), '.3f'))

    # Append as the next row of the scoreboard.
    r = evaluation.shape[0]
    evaluation.loc[r] = [f'Random Forest-{d}','--depth=10',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:29<00:00,  4.87s/it]


In [10]:
# Rank every model evaluated so far by its mean 5-fold cross-validation score, best first.
evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)
# Reference row for the best linear model (presumably copied from a separate
# linear-regression notebook — TODO confirm). The values appear to follow the
# column order of `evaluation`:
# Multiple Regression-4	all features	191879.550	0.701	0.7	0.713	0.711	0.698

Unnamed: 0,Model,Details,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
6,Random Forest-100,--depth=10,152826.226,0.935,0.935,0.844,0.844,0.863
5,Random Forest-50,--depth=10,155117.369,0.933,0.933,0.839,0.839,0.86
4,Random Forest-25,--depth=10,156201.248,0.932,0.932,0.837,0.837,0.859
3,Random Forest-10,--depth=10,158865.663,0.929,0.929,0.831,0.831,0.856
2,Random Forest-5,--depth=10,167485.227,0.918,0.918,0.813,0.812,0.843
0,Trees-10,selected features,181690.514,0.918,0.918,0.78,0.779,0.792
1,Random Forest-1,--depth=10,203660.771,0.865,0.865,0.723,0.722,0.769


In [11]:
# Random Forest: sweep max_depth (`d`) while the number of trees stays fixed at 100.
for d in tqdm([1,5,20,50]):
    rf = RandomForestRegressor(n_estimators=100, max_depth=d, random_state=42, n_jobs=-1).fit(X_train, y_train)
    pred = rf.predict(X_test)

    # Score each split once and reuse the value for both the plain and the
    # adjusted R-squared instead of calling score() twice per split.
    r2_train = rf.score(X_train, y_train)
    r2_test = rf.score(X_test, y_test)
    n_features = len(feature_names)

    # Every metric is rounded to 3 decimals before it is stored.
    rmsecm = float(format(np.sqrt(metrics.mean_squared_error(y_test, pred)), '.3f'))
    rtrcm = float(format(r2_train, '.3f'))
    artrcm = float(format(adjustedR2(r2_train, X_train.shape[0], n_features), '.3f'))
    rtecm = float(format(r2_test, '.3f'))
    artecm = float(format(adjustedR2(r2_test, X_test.shape[0], n_features), '.3f'))
    # 5-fold cross-validation runs on the full X / y (which include the
    # held-out test rows). Using y rather than a hard-coded df['price'] keeps
    # this in step with the `target` variable.
    cv = float(format(cross_val_score(rf, X, y, cv=5).mean(), '.3f'))

    # Append as the next row of the scoreboard.
    r = evaluation.shape[0]
    evaluation.loc[r] = ['Random Forest-100',f'--depth:{d}',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:56<00:00, 14.13s/it]


In [12]:
# Re-rank the scoreboard, which now also contains the max_depth sweep rows, best CV score first.
evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)

Unnamed: 0,Model,Details,Root Mean Squared Error (RMSE),R-squared (training),Adjusted R-squared (training),R-squared (test),Adjusted R-squared (test),5-Fold Cross Validation
9,Random Forest-100,--depth:20,147368.362,0.981,0.981,0.855,0.855,0.877
10,Random Forest-100,--depth:50,148309.973,0.982,0.982,0.853,0.853,0.877
6,Random Forest-100,--depth=10,152826.226,0.935,0.935,0.844,0.844,0.863
5,Random Forest-50,--depth=10,155117.369,0.933,0.933,0.839,0.839,0.86
4,Random Forest-25,--depth=10,156201.248,0.932,0.932,0.837,0.837,0.859
3,Random Forest-10,--depth=10,158865.663,0.929,0.929,0.831,0.831,0.856
2,Random Forest-5,--depth=10,167485.227,0.918,0.918,0.813,0.812,0.843
0,Trees-10,selected features,181690.514,0.918,0.918,0.78,0.779,0.792
8,Random Forest-100,--depth:5,197765.005,0.795,0.795,0.739,0.738,0.77
1,Random Forest-1,--depth=10,203660.771,0.865,0.865,0.723,0.722,0.769
