# Random Forest Model

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse, make_scorer
from sklearn.ensemble import RandomForestRegressor
import numpy as np
#import libraries

In [2]:
df_train = pd.read_csv("input/train.csv")
df_test = pd.read_csv("input/test.csv")
#read csv

In [3]:
df_train
#check data

Unnamed: 0,carat,cut,color,clarity,table,x,price
0,1.50,5,3,4,58.0,7.32,9.588
1,2.01,4,1,2,59.0,8.11,9.748
2,0.50,3,1,3,57.0,5.13,7.255
3,0.25,4,3,6,57.0,4.05,6.450
4,0.52,3,4,4,55.0,5.16,7.721
...,...,...,...,...,...,...,...
40450,1.04,4,6,2,57.0,6.60,8.190
40451,0.51,4,1,3,59.0,5.09,7.246
40452,1.51,3,5,4,57.0,7.37,9.277
40453,2.02,5,5,4,60.0,8.16,9.680


In [4]:
X = df_train.drop("price", axis=1)
y = df_train["price"]
#take predictors and response variable

In [5]:
sta = StandardScaler()
X = sta.fit_transform(X)
#Standarize data

In [6]:
model = RandomForestRegressor()
params = {
        "n_estimators":[150, 200],
        "max_features":[5,6,"auto"],
        "min_samples_split":[2, 3, 4],
        "min_samples_leaf":[1,2,3]
}
#set the model and the hyperparameters of the model
clf = GridSearchCV(
                estimator=model,
                param_grid=params
)
#set Grid Search CV params to select the best choice among the Random Forest hyperparameters
grid = clf.fit(X, y)
#train the grid

In [7]:
grid.best_params_
#check the best hiperparameters

{'max_features': 5,
 'min_samples_leaf': 3,
 'min_samples_split': 4,
 'n_estimators': 200}

In [8]:
best = grid.best_estimator_
#check the best estimator and keep in a variable

In [9]:
best

RandomForestRegressor(max_features=5, min_samples_leaf=3, min_samples_split=4,
                      n_estimators=200)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=444)
#split data

In [11]:
y_pred_train = best.predict(X_train)
y_pred_test = best.predict(X_test)
#predict y

In [12]:
df_y = pd.DataFrame(y_train)

In [13]:
df_y["y_pred"] = y_pred_train
df_y.head()
#display dataframe and check data

Unnamed: 0,price,y_pred
29816,9.395,9.394231
14427,7.21,7.125465
39546,9.321,9.356537
7814,6.964,6.932722
13427,7.671,7.696048


In [14]:
mse_train = mse(y_train, y_pred_train)
mse_test = mse(y_test, y_pred_test)
#compute the mean squared error

In [15]:
mse_train, mse_test
#check data

(0.005312981586151274, 0.005359073404042015)

In [17]:
score_mse = make_scorer(mse)
cross_validate(estimator=best, X=X, y=y, scoring=score_mse, cv=10)
#check the model with another tests

{'fit_time': array([19.01824498, 17.31972957, 16.04028058, 16.37665009, 18.45945311,
        15.35753345, 15.76928091, 16.4392643 , 15.98669362, 16.15506005]),
 'score_time': array([0.332623  , 0.28426123, 0.25546265, 0.2547009 , 0.25232601,
        0.27227163, 0.29122114, 0.31262255, 0.26281714, 0.27526379]),
 'test_score': array([0.01132344, 0.01228094, 0.0107154 , 0.01002448, 0.01057624,
        0.01028736, 0.01046729, 0.01082415, 0.01041409, 0.01069976])}

In [18]:
rf_fit = best.fit(
                X=X,
                y=y)
#set parameters to train the model

In [19]:
df_test_s = sta.fit_transform(df_test)
#standarize the test dataframe

In [20]:
final_pred = rf_fit.predict(df_test_s)
#predict y from test dataframe

In [21]:
test = pd.DataFrame(df_test.index, columns=["id"])
test["price"] = final_pred
#display in a dataframe

In [22]:
test
#check data

Unnamed: 0,id,price
0,0,7.134210
1,1,8.362783
2,2,7.682005
3,3,8.449184
4,4,9.503668
...,...,...
13480,13480,8.462870
13481,13481,6.501990
13482,13482,6.609425
13483,13483,6.641827


In [23]:
test.to_csv("RandomForest.csv", index = False, header = True)
#save results