In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import joblib
import sys
sys.path.append("..")
from randomforest import random_forest_CV
from tools import feature_selection
from tools import data_parser as dp
from tools import feature_selection

In [2]:
n_estimators = [int(x) for x in np.linspace(start = 50 , stop = 800, num = 10)]
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

kfold = KFold(n_splits=5, shuffle=True, random_state=7)

In [3]:
bert_data = "../data/combined_bert_df.csv"
df = pd.read_csv(bert_data)

light, heavy, temp = dp.data_extract('../data/combined_datasets.csv')

X = df
y = temp

In [4]:
X_reduced_72 = feature_selection.rfe_select(X,y,72)
X_new = X.loc[:,X_reduced_72]

In [5]:
X_new.to_csv('../data/combined_datasets_72.csv',index=False)

In [7]:
random_search = random_forest_CV.random_search(X_new, y, random_grid, 100, kfold)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [8]:
random_search

In [27]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import time

def random_search2(x, y, params, iters, cv_num):
    """
    :param x: input features
    :param y: target variable
    :param params: random search parameters
    :param iters: number of iteration for the search
    :param cv_num: number of cross validations
    :return: model with the best cross-validation score
    """
    start_time = time.time()
    # Define the scoring metrics to use
    scoring = {'r2': make_scorer(r2_score), 'mae': make_scorer(mean_absolute_error)}

    # Define the random search object
    rf_random = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions=params,
        n_iter=iters,
        cv=KFold(n_splits=cv_num),
        verbose=2,
        n_jobs=-1,
        scoring=scoring,
        refit='r2'
    )

    # Perform the random search
    rf_random.fit(x, y)

    # Extract the best hyperparameters from the random search
    best_params = rf_random.best_params_

    # Define the final model with the best hyperparameters
    final_model = RandomForestRegressor(**best_params)

    # Evaluate the final model using cross-validation
    mae_cv_scores = cross_val_score(final_model, x, y, cv=KFold(n_splits=cv_num), scoring='neg_mean_absolute_error')
    r2_cv_scores = cross_val_score(final_model, x, y, cv=KFold(n_splits=cv_num), scoring='r2')


    # Print the cross-validation scores
    print(f"MAE Cross-validation scores: {-mae_cv_scores}")
    print(f"MAE Mean cross-validation score: {-mae_cv_scores.mean()}")
    print(f"r2 Cross-validation scores: {r2_cv_scores}")
    print(f"r2 Mean cross-validation score: {r2_cv_scores.mean()}")
    print(f"Total Elapsed Time: {time.time() - start_time:.3f} seconds for {iters*cv_num} fits")

    # Fit the final model to the data
    final_model.fit(x, y)

    # Return the final model
    return final_model


In [28]:
result2 = random_search2(X_new,y,random_grid, 100, 10)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
MAE Cross-validation scores: [3.50112086 2.79784183 4.15388471 3.96066555 3.85694096 3.80446255
 5.59120718 3.7086245  3.98351762 5.36403509]
MAE Mean cross-validation score: 4.0722300850164626
r2 Cross-validation scores: [0.18602893 0.21093729 0.05587314 0.12329552 0.32950709 0.45040437
 0.28296815 0.13969944 0.27013444 0.10609101]
r2 Mean cross-validation score: 0.21549393868397654
Total Elapsed Time: 174.687 seconds for 1000 fits


In [29]:
result3 = random_search2(X_new,y,random_grid, 500, 10)

Fitting 10 folds for each of 500 candidates, totalling 5000 fits
MAE Cross-validation scores: [3.76772139 2.90056391 3.98190267 3.76585213 3.90389515 3.96907149
 5.20482456 3.79477001 4.09373065 5.13600177]
MAE Mean cross-validation score: 4.051833373700357
r2 Cross-validation scores: [0.14624269 0.25631843 0.17159683 0.17099842 0.28631602 0.43639614
 0.22131086 0.18070066 0.12979939 0.13745167]
r2 Mean cross-validation score: 0.2137131111141009
Total Elapsed Time: 941.967 seconds for 5000 fits


In [30]:
random_forest_CV.eval_avg(y)

4.650694244948769

In [34]:
joblib.dump(result3,'../models/ab72_rfkf_28032023.joblib')

['../models/ab72_rfkf_28032023.joblib']