# Random Forest Model

In [1]:
# Import libraries
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample

In [2]:
# Load the cleaned training and test tables from the data/ folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_clean.csv", "data/test_clean.csv")
)

In [3]:
# Preview the loaded training frame; pandas abbreviates the middle rows when displaying it.
df_train

Unnamed: 0,carat,cut,color,clarity,depth,x,y,z,price
0,1.50,5,3,4,61.5,7.32,7.34,4.51,9.588
1,2.01,4,1,2,60.6,8.11,8.25,4.96,9.748
2,0.50,3,1,3,61.6,5.13,5.09,3.15,7.255
3,0.25,4,3,6,61.6,4.05,4.08,2.50,6.450
4,0.52,3,4,4,62.0,5.16,5.19,3.21,7.721
...,...,...,...,...,...,...,...,...,...
40450,1.04,4,6,2,59.6,6.60,6.62,3.94,8.190
40451,0.51,4,1,3,63.3,5.09,5.05,3.21,7.246
40452,1.51,3,5,4,62.6,7.37,7.33,4.60,9.277
40453,2.02,5,5,4,61.3,8.16,8.11,4.99,9.680


In [4]:
# Separate the response ("price") from the predictors (every other column).
y = df_train["price"]
X = df_train.drop(columns="price")

In [5]:
# Hold out 20% of the labelled rows for validation. Note that X_test / y_test are a
# slice of the training data, not the df_test frame loaded earlier.
# A fixed random_state keeps the split, and every score computed on it, identical
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Hyperopt search space for the random forest. Every dimension is an integer grid
# sampled with hp.quniform, which returns round(uniform(low, high) / q) * q as a float.
params = {
    # The lower bound must be a multiple of q: with low=10 and q=25, draws below 12.5
    # round to 0 trees, which scikit-learn rejects. Starting at 25 yields 25, 50, ..., 1000.
    "n_estimators": hp.quniform("n_estimators", 25, 1000, 25),
    "max_depth": hp.quniform("max_depth", 4, 16, 1),
    "max_features": hp.quniform("max_features", 2, 8, 1),
    "min_samples_split": hp.quniform("min_samples_split", 2, 15, 1),
    "min_samples_leaf": hp.quniform("min_samples_leaf", 1, 5, 1),
}

In [11]:
def objetive(param):
    """Hyperopt objective: fit one random forest configuration and score it.

    Parameters
    ----------
    param : dict
        One sample from the search space with the keys ``n_estimators``,
        ``max_depth``, ``max_features``, ``min_samples_split`` and
        ``min_samples_leaf``. Each value is cast with ``int`` before it is
        handed to scikit-learn.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}`` where ``rmse`` is the root mean
        squared error of the fitted forest on the notebook-level
        ``X_test`` / ``y_test`` split.
    """
    rf = RandomForestRegressor(
        n_estimators=int(param["n_estimators"]),
        max_depth=int(param["max_depth"]),
        max_features=int(param["max_features"]),
        min_samples_split=int(param["min_samples_split"]),
        min_samples_leaf=int(param["min_samples_leaf"]),
        # Fixed seed: two trials then differ only through their hyperparameters,
        # not through the forest's own bootstrap / feature sampling.
        random_state=42,
        # Build the trees on all available cores.
        n_jobs=-1,
    )

    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    rmse = mse(y_test, y_pred) ** 0.5
    # NOTE(review): the search minimises the error on X_test / y_test, the same split
    # that is scored again after tuning, so that later RMSE is an optimistic estimate.
    return {'loss': rmse, 'status': STATUS_OK}

In [12]:
# Run 10 rounds of TPE search over the space; trials_reg records each evaluation
# and best receives the sampled values that fmin reports for the winning trial.
trials_reg = Trials()
best = fmin(objetive, params, algo=tpe.suggest, max_evals=10, trials=trials_reg)

100%|██████████| 10/10 [04:13<00:00, 25.34s/trial, best loss: 0.0938841522170268]


In [13]:
# Values picked by the search. They come back as floats, so they are cast to int
# before being passed to RandomForestRegressor.
best

{'max_depth': 14.0,
 'max_features': 6.0,
 'min_samples_leaf': 2.0,
 'min_samples_split': 6.0,
 'n_estimators': 550.0}

In [14]:
# Rebuild the forest from the winning trial. Every entry in `best` is keyed by a
# RandomForestRegressor argument name and holds an integer-valued float, so one
# cast per entry replaces the five hand-written keyword lines.
rf_model = RandomForestRegressor(
    **{name: int(value) for name, value in best.items()},
    # Fixed seed so the reported score and the saved predictions are reproducible.
    random_state=42,
    # Build the trees on all available cores.
    n_jobs=-1,
)

In [15]:
# Fit the tuned forest on the training split and report its RMSE on the hold-out rows.
# This is the same split the hyperparameter search minimised, so read it as an
# optimistic figure rather than an independent estimate.
y_pred = rf_model.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator
mse(y_test, y_pred) ** 0.5

0.09398553895336466

In [16]:
# Refit the tuned forest on every labelled row (training + hold-out) before predicting.
# fit() returns the estimator itself, so rf_fit and rf_model refer to the same object.
rf_fit = rf_model.fit(X, y)

In [17]:
# Predict the target for every row of df_test with the forest refit on all of X / y.
# NOTE(review): assumes df_test carries the same feature columns as X — confirm.
final_pred = rf_fit.predict(df_test)

In [18]:
# Assemble the output table: the row label of each df_test row as "id",
# next to the prediction made for that row.
test = pd.DataFrame(df_test.index, columns=["id"]).assign(price=final_pred)

In [19]:
# Inspect the assembled id / price table before it is written to disk.
test

Unnamed: 0,id,price
0,0,7.075627
1,1,8.360111
2,2,7.616833
3,3,8.477716
4,4,9.542260
...,...,...
13480,13480,8.498387
13481,13481,6.447960
13482,13482,6.673078
13483,13483,6.758508


In [20]:
# Save the predictions. to_csv does not create missing folders, so make sure the
# output directory exists first (a fresh checkout may not have it).
submission_path = Path("output/RandomForest.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
test.to_csv(submission_path, index=False, header=True)