# Random Forest Regression Model
---

In [1]:
import time
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import metrics
from sklearn import tree
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, cross_validate

### Config

In [2]:
# Can we get better labels (?)
n = 100
m = 450
l = 1
k = 5

### Setup

In [3]:
start = time.time()

# reading already split data
xtrain = pd.read_csv("../data/84206/xTrain.csv")
xtest = pd.read_csv("../data/84206/xTest.csv")
ytrain = pd.read_csv("../data/84206/yTrain.csv")
ytest = pd.read_csv("../data/84206/yTest.csv")

# Get our data to numpy arrays
xtrain = xtrain.to_numpy()
ytrain = ytrain.to_numpy()
xtest = xtest.to_numpy()
ytest = ytest.to_numpy()

# Print the shape of our data
print("xtrain shape:", xtrain.shape)
print("ytrain shape:", ytrain.shape)

xtrain shape: (7001, 4)
ytrain shape: (7001, 1)


### Model

In [4]:
# Create the model and fit our training data to it
forest = RandomForestRegressor(n_estimators=n, max_depth=m, min_samples_leaf=l)
forest.fit(xtrain, ytrain.ravel())

# Use the trained model on the testing and training data
testPred = forest.predict(xtest)
trainPred = forest.predict(xtrain)

### Measuring Accuracy

#### R2 Score
How well the variation of the predicted matches the variation in the actual (0-1)

In [5]:
trainr2 = r2_score(trainPred, ytrain)
testr2 = r2_score(testPred, ytest)
timeElapsed = time.time() - start

print("Time Elapsed: ", timeElapsed)
print("n_estimators: ", n)
print("max_depth: ", m)
print("min_samples_leaf: ", l)

print("Train R2: ", trainr2)
print("Test R2: ", testr2)

Time Elapsed:  3.842583656311035
n_estimators:  100
max_depth:  450
min_samples_leaf:  1
Train R2:  0.9331227466104666
Test R2:  0.7139395254456623
