# Random Forest Regression

In [5]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# all the functions from sklearn needed for this workshop
import sklearn
from sklearn.datasets import make_moons, make_circles, load_iris, load_wine
from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.tree.export import export_text
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn import svm

import pandas as pd

## Mordred descriptors RF

In [6]:
# Load the Mordred descriptor table for the E isomer.
mdE = pd.read_csv("mordred_descriptors_E_isomer.csv")

# Inputs (X): every column from the third one up to, but excluding, the last two.
# Output (y): the final column (the E-isomer value we want to predict, per the original notebook).
X_md, y_md = mdE.iloc[:, 2:-2], mdE.iloc[:, -1]

In [7]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_md_train, X_md_test, y_md_train, y_md_test = train_test_split(
    X_md,
    y_md,
    test_size=0.20,
    random_state=1,
)

# Build a 1000-tree random forest and train it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
random_forest = RandomForestRegressor(n_estimators=1000, random_state=1).fit(X_md_train, y_md_train)

# Score of the fitted forest on the held-out test split
print("RF success:", random_forest.score(X_md_test, y_md_test))

RF success: 0.8153060054744579


In [8]:
# Predict the output variable for each row of the Mordred test split (X_md_test) with the fitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_md_pred = random_forest.predict(X_md_test)
y_md_pred

array([321.1       , 447.279     , 346.25308333, 333.44533333,
       422.30183333, 555.6974    , 319.979     , 326.215     ,
       321.66      , 402.38516667, 395.1805    , 458.10353333,
       399.8355    , 503.50416667, 382.2555    , 327.13033333,
       424.75633333, 414.05466667, 336.473     , 399.01016667,
       346.328     , 395.18085   , 332.50066667, 555.407     ,
       339.797     , 403.12386667, 425.12766667, 407.105     ,
       407.2555    , 472.89733333, 413.546     , 530.28558333,
       338.744     , 444.82326667, 503.84466667, 380.0895    ,
       349.50883333, 440.82925   , 402.6975    , 342.15866667,
       555.828     , 332.1075    , 329.32591667, 328.43766667,
       371.09833333, 439.00716667, 326.869     , 397.155     ,
       385.63058333, 326.8       , 341.21953333, 449.58775   ,
       369.4375    , 545.1908    , 428.69408333, 422.64625   ,
       452.321     , 335.02533333, 456.93011667, 412.16366667,
       440.04376667, 443.303     , 339.344     , 416.86

In [11]:
# Performance metrics of the Mordred RF: predicted y values (y_md_pred) compared
# with the actual values of the test split (y_md_test).
print("Performance metrics of RF:")
print("")
print("RF success score:", random_forest.score(X_md_test, y_md_test))
print("Explained variance score:", explained_variance_score(y_md_test, y_md_pred))
print("Mean absolute error:", mean_absolute_error(y_md_test, y_md_pred))
print("R squared:", r2_score(y_md_test, y_md_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_md_test, y_md_pred)))

Performance metrics of RF:

RF success score: 0.8153060054744579
Explained variance score: 0.8155145436112394
Mean absolute error: 13.788672362869201
R squared: 0.8153060054744579
RMSE 29.25647009158427


In [13]:
# Rank all Mordred input variables by random-forest feature importance and
# print them, most important first, together with their column labels.

lst = list(mdE.iloc[:, 2:-2].columns.values)  # feature names (same column slice as X_md)
header_names = np.array(lst)

random_forest = RandomForestRegressor(n_estimators=1000, random_state=1)
# Fit on the Mordred training split. The names X_train / y_train used here
# before are not defined anywhere in the visible notebook, so the cell raised a
# NameError on a fresh kernel (or silently used stale data from a deleted cell).
random_forest_train = random_forest.fit(X_md_train, y_md_train)  # fit() returns the estimator

importances = random_forest_train.feature_importances_
ordered = importances.argsort()[::-1]  # column positions sorted by descending importance
for (i, name, score) in zip(ordered, header_names[ordered], importances[ordered]):
    # NOTE: `i` is the column position of the feature within X_md, not its rank.
    print("{}th feature:".format(str(i).rjust(2)) , str(round(score,5)).rjust(5),  name)

571th feature: 0.15217 GATS3s
92th feature: 0.06745 ATS3se
1146th feature: 0.05565 PEOE_VSA8
263th feature: 0.04256 ATSC3s
371th feature: 0.0386 AATSC3s
1139th feature: 0.03729 PEOE_VSA1
1099th feature: 0.02988 TIC4
1039th feature: 0.02364 ECIndex
108th feature: 0.02312 ATS1are
130th feature: 0.02146 ATS5i
121th feature: 0.01707 ATS5p
1097th feature: 0.01674 TIC2
855th feature: 0.01419 Sp
1100th feature: 0.01335 TIC5
126th feature: 0.01329 ATS1i
1199th feature: 0.01159 MID_N
128th feature: 0.01092 ATS3i
851th feature: 0.00957 Sv
90th feature: 0.00952 ATS1se
99th feature: 0.0092 ATS1pe
1157th feature: 0.009 SMR_VSA6
856th feature: 0.00886 Si
1226th feature: 0.00748 apol
1168th feature: 0.00722 SlogP_VSA8
1369th feature: 0.00698 SMR
1098th feature: 0.00534 TIC3
1161th feature: 0.00528 SlogP_VSA1
771th feature: 0.00526 nBonds
840th feature: 0.0052 Xp-7dv
665th feature: 0.00467 BalabanJ
800th feature: 0.0046 Xch-5dv
499th feature: 0.00432 MATS3v
288th feature: 0.0043 ATSC1v
567th feature: 

1339th feature: 2e-05 n9FaHRing
741th feature: 2e-05 VR1_Dzare
1118th feature: 2e-05 CIC5
910th feature: 2e-05 NsOH
113th feature: 2e-05 ATS6are
28th feature: 2e-05 nS
857th feature: 2e-05 MZ
25th feature: 2e-05 nC
1171th feature: 2e-05 SlogP_VSA11
1401th feature: 2e-05 MWC04
1399th feature: 2e-05 MWC02
1412th feature: 2e-05 SRW05
1085th feature: 2e-05 fragCpx
1213th feature: 2e-05 MPC10
109th feature: 2e-05 ATS2are
726th feature: 2e-05 VE2_Dzpe
1005th feature: 2e-05 SssS
52th feature: 2e-05 ATS8d
911th feature: 2e-05 NdO
1059th feature: 2e-05 ETA_eta_RL
875th feature: 2e-05 VR2_D
654th feature: 2e-05 BCUTv-1l
 7th feature: 2e-05 SpAD_A
1366th feature: 2e-05 nRot
1172th feature: 2e-05 EState_VSA1
1409th feature: 1e-05 SRW02
862th feature: 1e-05 Mare
772th feature: 1e-05 nBondsO
1041th feature: 1e-05 AETA_alpha
1205th feature: 1e-05 MPC2
1394th feature: 1e-05 Radius
1268th feature: 1e-05 n6aHRing
 2th feature: 1e-05 nAcid
833th feature: 1e-05 Xp-0dv
1413th feature: 1e-05 SRW06
774th fea

In [18]:
# Remove features with an importance of < 0.0001 from X_md and create the X_md_new dataset.
# The cut-off is applied to the importances directly instead of a hand-counted
# number of columns, so the selection stays correct if the data or the model changes.
IMPORTANCE_THRESHOLD = 0.0001
importances_md = random_forest_train.feature_importances_
ranked_md = importances_md.argsort()[::-1]  # column positions, most important first
top_predictors = ranked_md[importances_md[ranked_md] >= IMPORTANCE_THRESHOLD]
assert len(top_predictors) > 0, "no feature reaches the importance threshold"
X_md_new = X_md.iloc[:, top_predictors]

In [20]:
# Re-split using the reduced feature table X_md_new (same 80:20 ratio and seed as before).
X_md_train_new, X_md_test_new, y_md_train_new, y_md_test_new = train_test_split(
    X_md_new,
    y_md,
    test_size=0.20,
    random_state=1,
)

# Train a fresh 1000-tree forest on the reduced training split.
# The fit call is the last expression, so the notebook displays the estimator.
random_forest_improved = RandomForestRegressor(n_estimators=1000, random_state=1)
random_forest_improved.fit(X_md_train_new, y_md_train_new)

RandomForestRegressor(n_estimators=1000, random_state=1)

In [22]:
# Predict the output variable for the reduced-feature test split (X_md_test_new) with the refitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_md_pred_new = random_forest_improved.predict(X_md_test_new)
y_md_pred_new

array([321.082     , 447.76458333, 346.40283333, 333.714     ,
       423.26783333, 555.472     , 319.787     , 326.186     ,
       321.748     , 402.45333333, 395.33046667, 457.43341667,
       400.044     , 503.61961667, 382.90523333, 327.5       ,
       424.36416667, 414.47108333, 336.25933333, 398.65426667,
       346.3685    , 394.26233333, 332.30466667, 553.1945    ,
       340.204     , 403.21      , 425.061     , 407.094     ,
       407.2698    , 471.17266667, 413.004     , 528.03625   ,
       338.361     , 444.98333333, 503.44418333, 380.59033333,
       349.72133333, 440.877     , 403.0635    , 341.736     ,
       555.5955    , 331.67      , 329.64691667, 328.85      ,
       371.85516667, 439.44566667, 326.872     , 397.544     ,
       384.29933333, 326.765     , 341.7311    , 450.07158333,
       369.58416667, 541.7885    , 428.85566667, 422.936     ,
       452.32166667, 335.385     , 457.60295   , 412.378     ,
       440.85083333, 442.31266667, 339.33383333, 416.32

In [23]:
# Performance metrics of the improved Mordred RF: predicted y values (y_md_pred_new)
# compared with the actual values of the reduced-feature test split (y_md_test_new).
print("Performance metrics of improved RF:")
print("")
print("RF success score:", random_forest_improved.score(X_md_test_new, y_md_test_new))
print("Explained variance score:", explained_variance_score(y_md_test_new, y_md_pred_new))
print("Mean absolute error:", mean_absolute_error(y_md_test_new, y_md_pred_new))
print("R squared:", r2_score(y_md_test_new, y_md_pred_new))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_md_test_new, y_md_pred_new)))

Performance metrics of improved RF:

RF success score: 0.8180520566636535
Explained variance score: 0.8182082879472086
Mean absolute error: 13.795813803496083
R squared: 0.8180520566636535
RMSE 29.0381613406367


## Rdkit descriptors RF

In [24]:
# Load the RDKit descriptor table for the E isomer.
rdE = pd.read_csv("rdkit_descriptors_E_isomer.csv")

# Inputs (X): every column from the third one up to, but excluding, the last two.
# Output (y): the final column (the E-isomer value, per the original notebook).
X_rd, y_rd = rdE.iloc[:, 2:-2], rdE.iloc[:, -1]

In [25]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_rd_train, X_rd_test, y_rd_train, y_rd_test = train_test_split(
    X_rd,
    y_rd,
    test_size=0.20,
    random_state=1,
)

# Build a 1000-tree random forest and train it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
random_forest_rd = RandomForestRegressor(n_estimators=1000, random_state=1).fit(X_rd_train, y_rd_train)

# Score of the fitted forest on the held-out test split
print("RF success:", random_forest_rd.score(X_rd_test, y_rd_test))

RF success: 0.7936775912572477


In [26]:
# Predict the output variable for each row of the RDKit test split (X_rd_test) with the fitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_rd_pred = random_forest_rd.predict(X_rd_test)
y_rd_pred

array([322.082     , 433.05493333, 344.88933333, 334.917     ,
       502.46416667, 551.283     , 318.714     , 326.09916667,
       320.223     , 408.20216667, 396.5281    , 478.62195   ,
       413.96633333, 468.22648333, 390.67155   , 325.988     ,
       443.77033333, 423.8745    , 333.483     , 408.2006    ,
       338.32166667, 385.95556667, 327.796     , 547.857     ,
       337.962     , 404.44375   , 422.305     , 403.815     ,
       409.66766667, 476.62716667, 417.663     , 506.29408333,
       335.731     , 450.74933333, 497.11811667, 374.80866667,
       347.6675    , 425.14633333, 413.933     , 341.0285    ,
       551.138     , 329.21525   , 323.364     , 323.276     ,
       367.514     , 432.7761    , 351.58733333, 405.70933333,
       377.64483333, 323.592     , 341.96183333, 426.33166667,
       370.78325   , 476.1275    , 432.62133333, 423.45906667,
       452.79533333, 329.097     , 450.794     , 414.34833333,
       441.594     , 448.19518333, 339.391     , 412.96

In [27]:
# Performance metrics of the RDKit RF: predicted y values (y_rd_pred) compared
# with the actual values of the test split (y_rd_test).
print("Performance metrics of RF:")
print("")
print("RF success score:", random_forest_rd.score(X_rd_test, y_rd_test))
print("Explained variance score:", explained_variance_score(y_rd_test, y_rd_pred))
print("Mean absolute error:", mean_absolute_error(y_rd_test, y_rd_pred))
print("R squared:", r2_score(y_rd_test, y_rd_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_rd_test, y_rd_pred)))

Performance metrics of RF:

RF success score: 0.7936775912572477
Explained variance score: 0.793799423169075
Mean absolute error: 15.024261603375528
R squared: 0.7936775912572477
RMSE 30.92208276078525


In [29]:
# Rank all RDKit input variables by random-forest feature importance and
# print them, most important first, together with their column labels.

# A fresh forest with the same settings, trained on the RDKit training split.
random_forest_rd = RandomForestRegressor(n_estimators=1000, random_state=1)
random_forest_rd_train = random_forest_rd.fit(X_rd_train, y_rd_train)  # fit() returns the estimator

# Feature names taken from the same column slice that defines X_rd.
lst_rd = list(rdE.iloc[:, 2:-2].columns.values)
header_names_rd = np.array(lst_rd)

importances_rd = random_forest_rd_train.feature_importances_
ordered_rd = importances_rd.argsort()[::-1]  # column positions sorted by descending importance
for i in ordered_rd:
    # `i` is the column position of the feature within X_rd, not its rank.
    print("{}th feature:".format(str(i).rjust(2)), str(round(importances_rd[i], 5)).rjust(5), header_names_rd[i])

31th feature: 0.13045 Chi1n
57th feature: 0.12651 PEOE_VSA8
24th feature: 0.06534 BCUT2D_MRLOW
122th feature: 0.06277 MolMR
45th feature: 0.06274 PEOE_VSA1
46th feature: 0.03136 PEOE_VSA10
25th feature: 0.02636 BalabanJ
138th feature: 0.02607 fr_NH0
65th feature: 0.02442 SMR_VSA6
101th feature: 0.02434 VSA_EState8
79th feature: 0.02277 SlogP_VSA8
152th feature: 0.02144 fr_aniline
39th feature: 0.01595 HallKierAlpha
33th feature: 0.01306 Chi2n
37th feature: 0.01284 Chi4n
28th feature: 0.01208 Chi0n
87th feature: 0.01159 EState_VSA4
183th feature: 0.01145 fr_nitro_arom_nonortho
35th feature: 0.01106 Chi3n
18th feature: 0.0105 BCUT2D_MWLOW
19th feature: 0.01008 BCUT2D_CHGHI
97th feature: 0.00987 VSA_EState4
96th feature: 0.00919 VSA_EState3
42th feature: 0.00827 Kappa2
69th feature: 0.00824 SlogP_VSA1
40th feature: 0.00804 Ipc
 4th feature: 0.00734 qed
95th feature: 0.00733 VSA_EState2
20th feature: 0.00691 BCUT2D_CHGLO
99th feature: 0.00665 VSA_EState6
11th feature: 0.00593 MinPartialCha

In [34]:
# Remove features with an importance of < 0.0001 from X_rd and create the X_rd_new dataset.
# The cut-off is applied to the importances directly instead of a hand-counted
# number of columns, so the selection stays correct if the data or the model changes.
IMPORTANCE_THRESHOLD = 0.0001
importances_rd = random_forest_rd_train.feature_importances_
ranked_rd = importances_rd.argsort()[::-1]  # column positions, most important first
top_rd_predictors = ranked_rd[importances_rd[ranked_rd] >= IMPORTANCE_THRESHOLD]
assert len(top_rd_predictors) > 0, "no feature reaches the importance threshold"
X_rd_new = X_rd.iloc[:, top_rd_predictors]

In [35]:
# Re-split using the reduced feature table X_rd_new (same 80:20 ratio and seed as before).
X_rd_train_new, X_rd_test_new, y_rd_train_new, y_rd_test_new = train_test_split(
    X_rd_new,
    y_rd,
    test_size=0.20,
    random_state=1,
)

# Train a fresh 1000-tree forest on the reduced training split.
# The fit call is the last expression, so the notebook displays the estimator.
random_forest_rd_improved = RandomForestRegressor(n_estimators=1000, random_state=1)
random_forest_rd_improved.fit(X_rd_train_new, y_rd_train_new)

RandomForestRegressor(n_estimators=1000, random_state=1)

In [36]:
# Predict the output variable for the reduced-feature test split (X_rd_test_new) with the refitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_rd_pred_new = random_forest_rd_improved.predict(X_rd_test_new)
y_rd_pred_new

array([322.354     , 432.80288333, 344.33341667, 334.196     ,
       504.784     , 551.459     , 318.866     , 326.196     ,
       320.249     , 407.73616667, 396.4442    , 479.09108333,
       413.26916667, 467.73228333, 389.95086667, 325.746     ,
       443.683     , 423.12766667, 332.48933333, 408.49766667,
       338.47966667, 386.32086667, 327.89433333, 547.476     ,
       337.62766667, 404.48675   , 421.65833333, 403.4734    ,
       409.27053333, 473.83823333, 418.22466667, 505.40273333,
       335.522     , 449.979     , 497.6144    , 372.80433333,
       347.2045    , 425.12958333, 413.966     , 341.79216667,
       551.635     , 329.059     , 323.162     , 323.29      ,
       368.233     , 432.261     , 351.917     , 405.384     ,
       378.2475    , 323.905     , 341.77083333, 425.77725   ,
       371.07166667, 476.0095    , 432.4765    , 422.68393333,
       453.43066667, 329.277     , 450.09116667, 414.142     ,
       441.674     , 447.8089    , 339.614     , 413.89

In [37]:
# Performance metrics of the improved RDKit RF: predicted y values (y_rd_pred_new)
# compared with the actual values of the reduced-feature test split (y_rd_test_new).
print("Performance metrics of improved RF:")
print("")
print("RF success score:", random_forest_rd_improved.score(X_rd_test_new, y_rd_test_new))
print("Explained variance score:", explained_variance_score(y_rd_test_new, y_rd_pred_new))
print("Mean absolute error:", mean_absolute_error(y_rd_test_new, y_rd_pred_new))
print("R squared:", r2_score(y_rd_test_new, y_rd_pred_new))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_rd_test_new, y_rd_pred_new)))

Performance metrics of improved RF:

RF success score: 0.7976235534014412
Explained variance score: 0.797700310672537
Mean absolute error: 14.953239451476794
R squared: 0.7976235534014412
RMSE 30.62495939562273


## Morgan fingerprints RF

In [38]:
# Load the Morgan fingerprint table for the E isomer.
mfE = pd.read_csv("morgan_fingerprints_E_isomer.csv")

# Inputs (X): every column from the third one up to, but excluding, the last two.
# Output (y): the final column (the E-isomer value, per the original notebook).
X_mf, y_mf = mfE.iloc[:, 2:-2], mfE.iloc[:, -1]

In [39]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_mf_train, X_mf_test, y_mf_train, y_mf_test = train_test_split(
    X_mf,
    y_mf,
    test_size=0.20,
    random_state=1,
)

# Build a 1000-tree random forest and train it on the training split
# (fit() returns the estimator itself, so the calls can be chained).
random_forest_mf = RandomForestRegressor(n_estimators=1000, random_state=1).fit(X_mf_train, y_mf_train)

# Score of the fitted forest on the held-out test split
print("RF success:", random_forest_mf.score(X_mf_test, y_mf_test))

RF success: 0.8769946678243178


In [40]:
# Predict the output variable for each row of the Morgan test split (X_mf_test) with the fitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_mf_pred = random_forest_mf.predict(X_mf_test)
y_mf_pred

array([333.55      , 458.51653333, 341.09367857, 360.23553333,
       436.45266667, 558.743     , 318.404     , 345.08483333,
       320.971     , 400.377     , 396.866     , 493.78028333,
       391.6031    , 524.50453333, 399.356     , 327.236     ,
       453.247     , 446.506     , 327.332     , 434.081     ,
       332.695     , 382.4438619 , 329.13866667, 541.994     ,
       322.448     , 396.453     , 454.63463333, 407.172     ,
       437.275     , 390.884     , 415.135     , 534.9065    ,
       336.3465    , 447.964     , 506.7861    , 344.176     ,
       333.4275    , 408.734     , 437.026     , 333.55025   ,
       559.36      , 332.535     , 324.514     , 328.39366667,
       365.40566667, 478.5293    , 317.822     , 403.11      ,
       372.03533333, 321.054     , 337.814     , 412.863     ,
       384.80226667, 481.02      , 434.739     , 429.97086667,
       453.396     , 332.742     , 482.7042    , 410.808     ,
       445.282     , 440.95475   , 332.469     , 411.18

In [41]:
# Performance metrics of the Morgan RF: predicted y values (y_mf_pred) compared
# with the actual values of the test split (y_mf_test).
print("Performance metrics of RF:")
print("")
print("RF success score:", random_forest_mf.score(X_mf_test, y_mf_test))
print("Explained variance score:", explained_variance_score(y_mf_test, y_mf_pred))
print("Mean absolute error:", mean_absolute_error(y_mf_test, y_mf_pred))
print("R squared:", r2_score(y_mf_test, y_mf_pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_mf_test, y_mf_pred)))

Performance metrics of RF:

RF success score: 0.8769946678243178
Explained variance score: 0.877256447245377
Mean absolute error: 14.513858740204935
R squared: 0.8769946678243178
RMSE 23.875776650306218


In [42]:
# Rank all Morgan fingerprint inputs by random-forest feature importance and
# print them, most important first, together with their column labels.

# A fresh forest with the same settings, trained on the Morgan training split.
random_forest_mf = RandomForestRegressor(n_estimators=1000, random_state=1)
random_forest_mf_train = random_forest_mf.fit(X_mf_train, y_mf_train)  # fit() returns the estimator

# Feature names taken from the same column slice that defines X_mf.
lst_mf = list(mfE.iloc[:, 2:-2].columns.values)
header_names_mf = np.array(lst_mf)

importances_mf = random_forest_mf_train.feature_importances_
ordered_mf = importances_mf.argsort()[::-1]  # column positions sorted by descending importance
for i in ordered_mf:
    # `i` is the column position of the feature within X_mf, not its rank.
    print("{}th feature:".format(str(i).rjust(2)), str(round(importances_mf[i], 5)).rjust(5), header_names_mf[i])

180th feature: 0.26335 180
881th feature: 0.07841 881
1357th feature: 0.05122 1357
1384th feature: 0.04411 1384
1951th feature: 0.03075 1951
843th feature: 0.02821 843
618th feature: 0.01675 618
1573th feature: 0.0159 1573
875th feature: 0.01297 875
1096th feature: 0.01283 1096
1457th feature: 0.01187 1457
960th feature: 0.01165 960
80th feature: 0.01105 80
1274th feature: 0.01052 1274
1809th feature: 0.01045 1809
1504th feature: 0.01016 1504
1120th feature: 0.01015 1120
1171th feature: 0.01005 1171
202th feature: 0.00851 202
1687th feature: 0.00794 1687
92th feature: 0.0078 92
1416th feature: 0.00706 1416
1136th feature: 0.0068 1136
42th feature: 0.00677 42
407th feature: 0.00675 407
1379th feature: 0.00654 1379
1047th feature: 0.00617 1047
737th feature: 0.00602 737
932th feature: 0.00564 932
1814th feature: 0.00564 1814
675th feature: 0.00534 675
1319th feature: 0.00521 1319
769th feature: 0.00503 769
684th feature: 0.00474 684
910th feature: 0.00446 910
126th feature: 0.00431 126
3

970th feature:   0.0 970
1146th feature:   0.0 1146
1739th feature:   0.0 1739
129th feature:   0.0 129
685th feature:   0.0 685
1985th feature:   0.0 1985
1828th feature:   0.0 1828
658th feature:   0.0 658
331th feature:   0.0 331
1892th feature:   0.0 1892
327th feature:   0.0 327
142th feature:   0.0 142
1597th feature:   0.0 1597
587th feature:   0.0 587
423th feature:   0.0 423
1481th feature:   0.0 1481
1252th feature:   0.0 1252
595th feature:   0.0 595
1365th feature:   0.0 1365
1905th feature:   0.0 1905
1634th feature:   0.0 1634
1173th feature:   0.0 1173
931th feature:   0.0 931
2043th feature:   0.0 2043
754th feature:   0.0 754
1606th feature:   0.0 1606
1977th feature:   0.0 1977
273th feature:   0.0 273
217th feature:   0.0 217
676th feature:   0.0 676
1581th feature:   0.0 1581
583th feature:   0.0 583
1281th feature:   0.0 1281
1061th feature:   0.0 1061
105th feature:   0.0 105
777th feature:   0.0 777
1045th feature:   0.0 1045
2008th feature:   0.0 2008
666th feat

In [51]:
# Remove features with an importance of < 0.0001 from X_mf and create the X_mf_new dataset.
# The cut-off is applied to the importances directly instead of a hand-counted
# number of columns, so the selection stays correct if the data or the model changes.
IMPORTANCE_THRESHOLD = 0.0001
importances_mf = random_forest_mf_train.feature_importances_
ranked_mf = importances_mf.argsort()[::-1]  # column positions, most important first
top_mf_predictors = ranked_mf[importances_mf[ranked_mf] >= IMPORTANCE_THRESHOLD]
assert len(top_mf_predictors) > 0, "no feature reaches the importance threshold"
X_mf_new = X_mf.iloc[:, top_mf_predictors]

In [53]:
# Re-split using the reduced feature table X_mf_new (same 80:20 ratio and seed as before).
X_mf_train_new, X_mf_test_new, y_mf_train_new, y_mf_test_new = train_test_split(
    X_mf_new,
    y_mf,
    test_size=0.20,
    random_state=1,
)

# Train a fresh 1000-tree forest on the reduced training split.
# The fit call is the last expression, so the notebook displays the estimator.
random_forest_mf_improved = RandomForestRegressor(n_estimators=1000, random_state=1)
random_forest_mf_improved.fit(X_mf_train_new, y_mf_train_new)

RandomForestRegressor(n_estimators=1000, random_state=1)

In [54]:
# Predict the output variable for the reduced-feature test split (X_mf_test_new) with the refitted forest.
# The bare name on the last line makes the notebook display the whole prediction array.
y_mf_pred_new = random_forest_mf_improved.predict(X_mf_test_new)
y_mf_pred_new

array([333.86      , 459.0364    , 341.37033333, 359.7423    ,
       437.68466667, 558.892     , 318.736     , 344.50166667,
       320.467     , 399.445     , 397.5967    , 493.26678333,
       391.4686    , 523.96241667, 399.155     , 326.95      ,
       455.668     , 446.195     , 325.816     , 433.43      ,
       333.771     , 381.45622857, 328.977     , 542.00225   ,
       322.962     , 396.2065    , 455.1378    , 407.809     ,
       437.4       , 389.67566667, 414.332     , 533.949     ,
       336.655     , 445.691     , 506.45398333, 344.518     ,
       333.513     , 406.765     , 437.599     , 334.18025   ,
       559.359     , 332.836     , 323.955     , 328.47866667,
       366.72333333, 479.20346667, 317.367     , 403.204     ,
       372.65833333, 320.888     , 339.236     , 411.757     ,
       384.413     , 482.52433333, 434.149     , 430.13181667,
       453.647     , 332.6       , 482.27055   , 410.961     ,
       442.831     , 441.741     , 331.591     , 411.74

In [55]:
# Performance metrics of the improved Morgan RF: predicted y values (y_mf_pred_new)
# compared with the actual values of the reduced-feature test split (y_mf_test_new).
print("Performance metrics of improved RF:")
print("")
print("RF success score:", random_forest_mf_improved.score(X_mf_test_new, y_mf_test_new))
print("Explained variance score:", explained_variance_score(y_mf_test_new, y_mf_pred_new))
print("Mean absolute error:", mean_absolute_error(y_mf_test_new, y_mf_pred_new))
print("R squared:", r2_score(y_mf_test_new, y_mf_pred_new))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is not accepted by every scikit-learn release.
print("RMSE", np.sqrt(mean_squared_error(y_mf_test_new, y_mf_pred_new)))

Performance metrics of improved RF:

RF success score: 0.8767471496892426
Explained variance score: 0.8770509345130493
Mean absolute error: 14.661175467148881
R squared: 0.8767471496892426
RMSE 23.899786657200814
