In [None]:
import numpy as np
from scipy.sparse import load_npz
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from scipy import stats

#### Load data

In [None]:
# Training inputs: the targets come from a NumPy .npy file, the feature matrix
# from a SciPy sparse .npz file. Both paths are relative to the notebook's
# working directory.
labels = np.load(file="../data/labels.npy")
prepared_data = load_npz(file="../data/prepared_data.npz")

In [None]:
# Test inputs, stored the same way as the training inputs: targets in a NumPy
# .npy file, features in a SciPy sparse .npz file.
test_labels = np.load(file="../data/test_labels.npy")
prepared_test_data = load_npz(file="../data/prepared_test_data.npz")

#### Train model

In [None]:
# Pin the forest's random seed. Random forests draw random bootstrap samples of
# the training rows, so without a fixed seed the fitted model -- and every score
# computed from it later in the notebook -- changes on each
# Restart Kernel -> Run All.
RANDOM_STATE = 42

# 100 trees, fitted on the sparse training matrix and its target values.
forest_regr = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
forest_regr.fit(prepared_data, labels)

###### Results based on training set

In [None]:
# In-sample predictions: the forest scores the same matrix it was fitted on.
# NOTE: the name `predictions` is reassigned further down for the test set, so
# this value is only valid until that cell runs.
predictions = forest_regr.predict(X=prepared_data)

In [None]:
# RMSE of the current `predictions` against the training labels. When the cells
# are run top to bottom these are the in-sample predictions, so the figure is
# measured on data the model has already seen.
forest_mse = mean_squared_error(y_true=labels, y_pred=predictions)
# Root of the MSE; left as the cell's last expression so it is displayed.
np.sqrt(forest_mse)

###### Results based on training set with cross-validation

In [None]:
# 10-fold cross-validation on the training data. The scorer returns the
# *negated* MSE for each fold, so the sign is flipped before taking the square
# root to obtain one RMSE per fold.
scores = cross_val_score(
    estimator=forest_regr,
    X=prepared_data,
    y=labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_regr_rmse_scores = np.sqrt(-scores)

# Per-fold RMSEs followed by their mean and standard deviation.
print(f"Scores: {forest_regr_rmse_scores}\nMean: {forest_regr_rmse_scores.mean()}\nStd: {forest_regr_rmse_scores.std()}")

###### Results based on test set

In [None]:
# Predictions for the test matrix.
# NOTE: this overwrites the `predictions` produced earlier for the training
# set; the cells below read the test-set values from this name.
predictions = forest_regr.predict(X=prepared_test_data)

In [None]:
# RMSE of the current `predictions` against the test labels. This reuses (and
# overwrites) the `forest_mse` name from the training-set evaluation.
forest_mse = mean_squared_error(y_true=test_labels, y_pred=predictions)
# Root of the MSE; left as the cell's last expression so it is displayed.
np.sqrt(forest_mse)

In [None]:
# Confidence interval for the RMSE of the current `predictions` against the
# test labels: build a Student-t interval around the mean squared error, then
# take the square root of both bounds.
confidence = 0.95
squared_errors = (predictions - test_labels) ** 2
mse_interval = stats.t.interval(
    confidence,
    len(squared_errors) - 1,  # degrees of freedom
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),  # standard error of the mean
)
# With few or heavily skewed errors the lower t-bound can fall below zero, which
# the square root would turn into NaN (plus a RuntimeWarning). An MSE is never
# negative, so clipping at 0 keeps a valid lower bound; the result is unchanged
# whenever the bound is already non-negative.
np.sqrt(np.maximum(mse_interval, 0.0))