# Random Forest (RF) For Enhancer Activity Classification

**Authorship:**
Adam Klie, *MM/DD/YYYY*
***
**Description:**
Notebook to train random forest classifiers for enhancer activity
***
**TODOs:**
 - <font color='red'> Add RandomCV training section </font>
 - <font color='red'> Maybe integrate hyperopt code here, as a callable function </font>
***

# Set-up

In [119]:
# Classics
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pickle

# Figure style
plt.style.use('presentation')

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

import sys
sys.path.append('/cellar/users/aklie/projects/EUGENE/bin/')
import project_utils

In [130]:
# Define parameters of the notebook (CHANGE THESE FOR USE CASE)
DATASET = "2021_OLS_Library"  # Which dataset to look at
FEATURES = "mixed-1.0"  # What features to use to train the model
LABELS = "binary"
PREPROCESS = "0.18-0.4"  # Preprocessing steps, separated by "-"
TRAIN = True
SPLIT = 0.9
SUBSET = False
HYPERPARAM = "100-trials-optuna"
OUTDIR="{}_{}_{}_RF".format(PREPROCESS, FEATURES, HYPERPARAM)  # CHANGE THIS!!!
if not os.path.exists(OUTDIR):
    os.makedirs(OUTDIR)

# Load train and validation data

In [121]:
# Load train and val
X_train = np.load('../data/{0}/{1}/{2}_X-train-{3}_{4}.npy'.format(DATASET, FEATURES.replace("-", "_"), PREPROCESS, SPLIT, FEATURES))
X_test = np.load('../data/{0}/{1}/{2}_X-test-{3}_{4}.npy'.format(DATASET, FEATURES.replace("-", "_"), PREPROCESS, round(1-SPLIT, 1), FEATURES))
y_train = np.loadtxt('../data/{0}/{1}/{2}_y-train-{3}_{1}.txt'.format(DATASET, LABELS, PREPROCESS, SPLIT), dtype=int)
y_test = np.loadtxt('../data/{0}/{1}/{2}_y-test-{3}_{1}.txt'.format(DATASET, LABELS, PREPROCESS, round(1-SPLIT, 1)), dtype=int)

# Check em
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((272591, 21), (272591,), (30288, 21), (30288,))

# Train an LR model

In [122]:
from sklearn.ensemble import RandomForestClassifier

#### <u> **Train new model from scratch using out-of-box hyperparams** </u> 

In [124]:
if TRAIN == True and HYPERPARAM == "baseline":
    clf = RandomForestClassifier(random_state=13, n_jobs=-1, verbose=1)
    print("Training new RF model with the following params: {}".format(clf.get_params()))
    clf.fit(X_train, y_train)
else:
    print("Not fitting baseline model")

Not fitting baseline model


#### <u> **Train new model from scratch using hand-crafted hyperparams** </u> 

In [125]:
if TRAIN == True and HYPERPARAM == "selected":
    clf = RandomForestClassifier(n_estimators=1000,
                                 max_depth=10,
                                 min_samples_split=10, 
                                 min_samples_leaf=10, 
                                 bootstrap=True, 
                                 max_features="auto",
                                 class_weight="balanced",
                                 random_state=13, 
                                 n_jobs=-1,
                                 verbose=1)
    print("Training new RF model with the following params: {}".format(clf.get_params()))
    clf.fit(X_train, y_train)
else:
    print("Not fitting selected model")

Not fitting selected model


#### <u> **Train a new model using Optuna identified hyperparameters** </u>

In [136]:
## Run the following first before the next cell. 
## Note, will probably take a few hours to run
if TRAIN == True and "optuna" in HYPERPARAM:
    print("screen -S opt\nblue_pill 32 128G opt\npython RF_clf_hyperopt.py \\\n \
    --preprocess {} \\\n \
    --features {} \\\n \
    --out {} \\\n \
    --num_trials 100".format(PREPROCESS, FEATURES, OUTDIR))
else:
    print("Not fitting selected model")

screen -S opt
blue_pill 32 128G opt
python RF_clf_hyperopt.py \
     --preprocess 0.18-0.4 \
     --features mixed-1.0 \
     --out 0.18-0.4_mixed-1.0_100-trials-optuna_RF \
     --num_trials 100


In [127]:
if TRAIN == True and "optuna" in HYPERPARAM:
    #study = joblib.load("{1}_{2}_RF-study_{3}.pickle".format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM))
    #params = study.best_trial.params
    params = pickle.load(open("{0}/{1}_{2}_RF-study_{3}.params.pickle".format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM), "rb"))
    for key in list(params.keys()):
        params[key.split("RF_")[1]] = params.pop(key) 
    clf = RandomForestClassifier(random_state=13, n_jobs=-1, verbose=1, **params)
    clf.fit(X_train, y_train)
else:
    print("Not fitting optuna model")

Not fitting selected model


#### <u> **Train new model using random search cross validation** </u> 

In [128]:
from sklearn.model_selection import RandomizedSearchCV

In [129]:
if TRAIN == True and HYPERPARAM == "random-CV":
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    # Instatiate the model
    clf = RandomForestClassifier(random_state=13, n_jobs=-1, verbose=1)
    rf_random = RandomizedSearchCV(estimator=clf, 
                               param_distributions=random_grid, 
                               n_iter=10, 
                               cv = 5, 
                               verbose=2, 
                               random_state=13, 
                               n_jobs = -1)

    # Fit the random search model
    rf_random.fit(X_train, y_train)
    clf = rf_random.best_estimator_
    print("Trained new RF model with the following params: {}".format(clf.get_params()))
else:
    print("Not fitting random-CV model")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.8min


Trained new RF model with the following params: {'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 110, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1400, 'n_jobs': -1, 'oob_score': False, 'random_state': 13, 'verbose': 1, 'warm_start': False}


[Parallel(n_jobs=-1)]: Done 1400 out of 1400 | elapsed:  3.1min finished


#### <u> **Save hyperpameters of model** </u>

In [93]:
if TRAIN == True:
    file = "{0}/{1}_{2}_RF-clf_{3}.params.csv".format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    print("Saving hyperparams to {}".format(file))
    pd.DataFrame(pd.Series(clf.get_params())).T.to_csv(file, index=False)
else:
    print("Not training a new model, load in cell below")

Saving hyperparams to 0.18-0.4_mixed-1.0_random-CV_RF/0.18-0.4_mixed-1.0_RF-clf_random-CV.params.csv


#### <u> **Load previously trained model** </u>

In [114]:
if TRAIN == False:
    file = '{}/{}_{}_RF-clf_{}.pickle'.format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    print("Loading model from {}".format(file))
    clf = pickle.load(open(file, 'rb'))
else:
    print("Already trained a model in cell above (run if not run already)")

Already trained a model in cell above (run if not run already)


# Validation Set Performance

In [115]:
# Get the classification results on validation set
y_tr_preds = clf.predict(X_train)
y_preds = clf.predict(X_test)
y_tr_probs = clf.predict_proba(X_train)[:, 1]
y_probs = clf.predict_proba(X_test)[:, 1]

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    7.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:   20.7s
[Parallel(n_jobs=4)]: Done 1400 out of 1400 | elapsed:   23.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 1400 out of 1400 | elapsed:    2.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Par

In [117]:
# Generate a report
project_utils.classification_report(out_path="{}_{}_{}_RF".format(PREPROCESS, FEATURES, HYPERPARAM),
                                    train_X=X_train, test_X=X_test, 
                                    train_y=y_train, test_y=y_test,
                                    train_preds=y_tr_preds, test_preds=y_preds,
                                    train_probs=y_tr_probs, test_probs=y_probs)

Predictions provided, skipping them
Generating confusion matrix
Calculating classification metrics
Metric	Train	Test
Accuracy	0.8628	0.7149
Precision	0.8824	0.7175
Recall	0.8219	0.6571
F0.1-Score	0.8818	0.7168
F0.5-Score	0.8696	0.7045
F1-Score	0.8511	0.6859
F2-Score	0.8333	0.6683
F10-Score	0.8225	0.6576
Plotting PR Curve
Plotting ROC Curve
Generating report


# Save the classifier

In [118]:
if TRAIN == True:
    file = '{}/{}_{}_RF-clf_{}.pickle'.format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    pickle.dump(clf, open(file, 'wb'))
else:
    print("No need to save, loaded trained model already")

# Scratch

# References

 1. Ref 1
 2. Ref 2