In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import joblib
import sys
sys.path.append("..")
from randomforest import random_forest_CV
from tools import feature_selection
from tools import data_parser as dp
from tools import feature_selection

In [2]:
# Feature table: read from the pre-computed CSV (presumably BERT-derived
# embeddings, going by the file name — TODO confirm).
bert_data = "../data/combined_bert_df.csv"
df = pd.read_csv(bert_data)

# data_extract yields three values; in the cells shown here only the third
# (`temp`) is used, as the regression target.
light, heavy, temp = dp.data_extract('../data/combined_datasets.csv')

# Conventional names for the model inputs / target used by the later cells.
X, y = df, temp

In [3]:
# Run the project's feat_select helper on the full feature table and target
# (presumably keeps 100 features — TODO confirm the meaning of the last argument).
# NOTE(review): selection sees every row before the train/test split in the
# next cell, so held-out rows may influence which features are kept — confirm
# this is intended.
reduced_data = feature_selection.feat_select(X,y,100)

In [4]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    reduced_data,
    y,
    test_size=0.1,
    random_state=7,
)


In [5]:
# Load a previously saved model from disk.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
model = joblib.load('../models/120323_RF.joblib')

# Evaluate the loaded model on the held-out split with the project's helper,
# then print the first two entries of what it returns.
result = random_forest_CV.eval_model(
    model,
    x_test,
    y_test,
)
print(result[0], result[1])

(4.018190684627044, 0.314434529888034)


In [13]:
# Human-readable report of the two evaluation values, rounded to two decimals.
print(
    "MAE: {:0.2f} C\nR2: {:0.2f}".format(
        result[0],  # printed under the "MAE" label
        result[1],  # printed under the "R2" label
    )
)

MAE: 4.02 C
R2: 0.31


In [3]:
# Run the project's rfe_select helper on the full feature table and target
# (presumably recursive feature elimination down to 72 features, going by the
# name and argument — TODO confirm). The recorded output of the next cell shows
# the result as an Index of 72 column labels.
# NOTE(review): selection uses every row, and the later grid search
# cross-validates on the selected columns, so CV scores may be optimistic.
X_reduced_72 = feature_selection.rfe_select(X,y,72)

In [6]:
# Display the column labels returned by rfe_select.
X_reduced_72

Index(['4', '7', '13', '16', '26', '30', '31', '34', '52', '53', '56', '75',
       '86', '88', '96', '100', '116', '128', '134', '138', '141', '142',
       '150', '151', '152', '159', '160', '161', '168', '183', '188', '192',
       '195', '200', '217', '221', '228', '237', '240', '257', '258', '262',
       '271', '280', '300', '304', '305', '318', '330', '333', '347', '351',
       '358', '370', '378', '390', '401', '404', '408', '411', '413', '417',
       '425', '426', '434', '451', '464', '472', '476', '485', '508', '511'],
      dtype='object')

In [5]:
# Sanity check: (rows, columns) of the full feature table.
X.shape

(177, 512)

In [7]:
# Display the feature table (the rendering is truncated for a frame this
# large; X.head() would give a shorter preview).
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-66.829450,55.086930,-142.77501,-104.066020,-29.141940,2.080520,26.443787,37.419388,0.505268,7.853431,...,-94.306350,190.136750,5.538248,79.781480,-2.710631,159.979130,13.266889,42.356010,7.799280,101.221860
1,-2.196232,34.973366,-262.48148,-231.196200,84.637405,-33.108974,63.094994,-101.796555,-91.397600,76.100650,...,-27.862946,282.756200,-65.029090,91.264280,-99.976265,47.220090,10.331673,157.601840,-9.925574,36.679665
2,-74.652880,115.951310,-268.60522,-23.576153,31.603592,-0.576147,23.305748,-92.553840,15.400765,-153.168300,...,43.194527,201.255360,-80.560040,160.630700,-124.985634,-32.365753,54.983738,124.715480,71.650050,25.961790
3,-146.263920,-33.680030,-141.58617,-45.430990,265.647060,-19.823807,-88.542046,-63.923300,42.481440,-70.851290,...,54.162766,107.496970,78.053210,28.235650,-8.911081,60.006187,178.540830,78.480800,-17.681578,92.937530
4,-55.485330,-118.674280,-170.51550,-115.696130,-5.213447,-107.310585,-20.264847,-111.669850,6.848226,-16.880430,...,-10.708040,292.483760,-48.263004,55.001590,-42.737675,-32.717537,-78.872030,120.030860,-85.479000,-32.607390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,-164.749300,-37.677784,-175.17010,-63.817940,62.723007,-111.921950,-58.907074,-48.409400,21.922743,-86.693690,...,133.752460,214.765990,21.048810,59.170803,-54.557465,-19.103552,10.877684,119.878510,-52.312065,-75.318214
173,-197.480350,35.186302,-106.18173,-18.017980,169.700040,46.517500,3.988994,-72.692180,-18.464344,-6.450912,...,65.108990,154.484400,-16.596622,53.027428,-231.130370,16.976940,-41.386845,46.103794,-13.606163,-6.366283
174,-123.724945,-94.087810,-163.17024,-92.267490,184.618960,-73.993866,-18.981827,-56.036110,-40.685673,-16.836780,...,17.359219,218.069700,-5.133991,41.102386,-123.317890,-49.204285,1.562822,181.895770,-68.505660,-0.364858
175,-113.388530,-39.550217,-213.04642,-80.400795,-76.434875,1.729343,-66.994240,-47.455166,-14.285555,-129.914430,...,-28.272913,110.786705,-62.065520,134.582290,108.233150,78.622780,55.832733,199.224380,-14.540217,-23.999693


In [19]:
# Keep only the columns whose labels were returned by rfe_select.
X_new = X[X_reduced_72]

In [20]:
# Sanity check: the reduced table should keep all rows and have 72 columns.
X_new.shape

(177, 72)

In [28]:
# Candidate values for each RandomForestRegressor hyperparameter.

# Forest sizes: 10 evenly spaced values from 200 to 2000 trees.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()

# Features considered per split: all of them (1.0) or the square root.
max_features = [1.0, 'sqrt']

# Tree depth limits: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]

# Minimum samples needed to split a node / to remain in a leaf.
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Parameter grid keyed by the estimator's constructor argument names.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# 5-fold cross-validation splitter; rows are shuffled before splitting and the
# fixed seed keeps the fold assignment repeatable between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)

In [31]:
# Exhaustive search over every combination in `random_grid`, scored by R^2 on
# the `kfold` splits.
# - random_state pins the forest's internal randomness so that scores (and the
#   chosen best parameters) are repeatable between runs; the seed matches the
#   one used for the data splits in this notebook.
# - n_jobs=-1 runs candidate fits on all available CPU cores. With the grid as
#   defined in this notebook there are 4,320 combinations x 5 folds, and the
#   recorded single-process run had to be interrupted.
# NOTE(review): if the search is still too slow, consider shrinking the grid or
# sampling it with RandomizedSearchCV instead.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=7),
    random_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
)

In [32]:
# Fit every hyperparameter candidate on the reduced feature table.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# best_params_ / best_score_ are not available until a run completes.
grid_search.fit(X_new,y)

KeyboardInterrupt: 

In [None]:
# Report the winning hyperparameter combination and its cross-validated score
# (requires the search above to have been fitted).
for label, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best score:", grid_search.best_score_),
):
    print(label, value)