# Random Forest Model for Smart PMI

In [2]:
import numpy as np
import pandas as pd
import sys, os
import random

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor,RandomForestClassifier, HistGradientBoostingClassifier

from ipynb.fs.full.smartpmi_preprocessing import *

## Train / Validation Split

In [1]:
# -- test train split
# `s` is the number of leading rows held out for validation: rf_regress fits on
# rows [s:] and scores on rows [:s].
# NOTE(review): the original note read "500/1775 = .296", but 500/1775 is about
# 0.282 -- confirm the dataset size / intended hold-out fraction.
s = 500  # held-out (validation) row count

## Modeling

In [10]:
def _search_space(rf):
    """Return the candidate hyper-parameter values for the randomized search.

    rf: True for the random-forest grid, False for the histogram
        gradient-boosting grid. Both share `max_depth` and `min_samples_leaf`.
    """
    grid = {
        # Maximum number of levels in a tree -> [3, 6, 10]
        'max_depth': [int(depth) for depth in np.linspace(3, 10, num=3)],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [1, 2, 4, 6, 10],
    }
    if rf:
        # Number of trees in the forest
        grid['n_estimators'] = [
            int(n) for n in np.linspace(start=50, stop=1000, num=10)
        ]
        # Number of features to consider at every split.
        # 1.0 (= all features) replaces the former 'auto', which meant the same
        # thing for regressors but was deprecated in scikit-learn 1.1 and
        # removed in 1.3, where candidates using it fail to fit.
        grid['max_features'] = [1.0, 'sqrt', 'log2']
        # Minimum number of samples required to split a node
        grid['min_samples_split'] = [5, 10, 20, 30]
        # Whether each tree is trained on a bootstrap sample
        grid['bootstrap'] = [True, False]
    else:
        # Maximum number of leaves for each tree
        grid['max_leaf_nodes'] = [3, 5, 10, 20, 30]
        # L2 regularization strength
        grid['l2_regularization'] = [0, 1.5]
    return grid


def rf_regress(x, rf=True, search=False, classify=False):
    """Fit a tree-ensemble regressor and report hold-out metrics.

    The model is fitted on rows ``[s:]`` of `x` against ``train_y[s:]`` and
    evaluated on rows ``[:s]``; `s` and `train_y` are read from module scope.

    Parameters
    ----------
    x : feature table, sliced as ``x[:s]`` / ``x[s:]``
        (presumably row-aligned with `train_y` -- confirm against caller).
    rf : bool
        True -> RandomForestRegressor; False -> HistGradientBoostingRegressor
        with fixed hyper-parameters.
    search : bool
        If True, wrap the estimator in a RandomizedSearchCV (40 candidates,
        3-fold CV) over the grid from `_search_space`.
    classify : bool
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    dict with keys 'MAE', 'MSE', 'RMSE', 'R^2', 'Predictions' (hold-out
    predictions) and 'Model' (the fitted estimator or search object).
    """
    # -- Pick model
    if rf:
        model = RandomForestRegressor(random_state=42)
    else:
        model = HistGradientBoostingRegressor(
            random_state=42,
            min_samples_leaf=6,
            max_leaf_nodes=5,
            max_depth=6,
            l2_regularization=1.5,
        )

    if search:
        # -- Parameter tuning
        model = RandomizedSearchCV(
            estimator=model,
            param_distributions=_search_space(rf),
            n_iter=40,
            cv=3,
            verbose=2,
            random_state=42,
            n_jobs=-1,
        )

    model.fit(x[s:], train_y[s:])

    print('...Validation...')
    val_x, val_y = x[:s], train_y[:s]
    pred = model.predict(val_x)
    mae = mean_absolute_error(val_y, pred)
    mse = mean_squared_error(val_y, pred)
    rmse = mse**.5
    r2 = r2_score(val_y, pred)
    # NOTE(review): this cross-validation refits clones of `model` on folds of
    # the held-out rows only (not the training rows); with search=True each
    # fold reruns the whole randomized search. Confirm this is intended.
    scores = cross_val_score(model, val_x, val_y, scoring='r2', cv=5)

    print(' MAE', mae)
    print(' MSE', mse)
    print(' RMSE', rmse)
    print(' R^2', r2)
    print(' CV-R^2', scores)
    print(' avg CV-R^2', np.mean(scores))

    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R^2': r2,
            'Predictions': pred, 'Model': model}


In [11]:
# Baseline run: rf_regress defaults (random forest, no hyper-parameter search)
# on `no_moe` (presumably provided by the preprocessing import -- confirm).
# print('\n no moe (baseline) .................')
base = rf_regress(no_moe)

# train_x + rdkit + mordred
# Second run: histogram gradient boosting with its fixed hyper-parameters
# (rf=False) and no search, on `big_train`.
print('\n the whole enchilada (everything) ..')
everything = rf_regress(big_train, rf=False, search=False)


 the whole enchilada (everything) ..
...Validation...
 MAE 0.23396576829364774
 MSE 0.0846153676457081
 RMSE 0.29088720777254556
 R^2 0.8769786224970348
 CV-R^2 [0.86954788 0.80482568 0.79702635 0.84504659 0.85376981]
 avg CV-R^2 0.8340432634915109


In [12]:
# Show the fitted estimator stored under the 'Model' key of the result dict.
everything['Model']