# Logistic Regression (LR) For Enhancer Activity Classification

**Authorship:**
Adam Klie, *07/25/2021*
***
**Description:**
Notebook to train Logistic Regression classifiers for enhancer activity
***
<div class="alert alert-block alert-warning">
<b>TODOs</b>:
<ul>
    <b><li></li></b>
    </ul>
</div>

# Set-up

In [75]:
# Classics
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pickle

# Figure style
# NOTE(review): 'presentation' does not look like one of matplotlib's bundled
# style names -- presumably a custom .mplstyle available on the author's
# machine; confirm it exists before running in a fresh environment.
plt.style.use('presentation')

# Autoreload extension
# Load the extension only if this kernel has not loaded it yet, then select
# autoreload mode 2 so edited modules are re-imported between cell runs.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

# Extend the import path so the project-local helper module can be imported.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# find project_utils on another machine unless this is adjusted.
import sys
sys.path.append('/cellar/users/aklie/projects/EUGENE/bin/')
import project_utils

In [76]:
# Define parameters of the notebook
DATASET = "2021_OLS_Library"  # Which dataset to look at
FEATURES = "mixed-1.0"  # What features to use to train the model
LABELS = "binary"  # Label type; names the label directory and file suffix when loading y
PREPROCESS = "0.18-0.4"  # Preprocessing steps, separated by "-"
TRAIN = True  # True: fit and save a new model; False: load a previously pickled one
SPLIT = 0.9  # Train fraction as it appears in the data file names (test files use 1 - SPLIT)
SUBSET = False  # Not referenced by the cells visible in this notebook
HYPERPARAM = "baseline"  # Tag for the hyperparameter set; part of every output file name
OUTDIR = "{}_{}_{}_LR".format(PREPROCESS, FEATURES, HYPERPARAM)

# Create the output directory in one call. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first, and keeps the
# cell safe to re-run.
os.makedirs(OUTDIR, exist_ok=True)

# Load train and test data

In [77]:
# Read the feature matrices (.npy) and label vectors (.txt) for both splits.
# The test files are named with the complementary split fraction, and the
# feature directory uses "_" where the FEATURES tag uses "-".
test_split = round(1-SPLIT, 1)
features_dirname = FEATURES.replace("-", "_")

X_train_path = '../data/{0}/{1}/{2}_X-train-{3}_{4}.npy'.format(DATASET, features_dirname, PREPROCESS, SPLIT, FEATURES)
X_test_path = '../data/{0}/{1}/{2}_X-test-{3}_{4}.npy'.format(DATASET, features_dirname, PREPROCESS, test_split, FEATURES)
y_train_path = '../data/{0}/{1}/{2}_y-train-{3}_{1}.txt'.format(DATASET, LABELS, PREPROCESS, SPLIT)
y_test_path = '../data/{0}/{1}/{2}_y-test-{3}_{1}.txt'.format(DATASET, LABELS, PREPROCESS, test_split)

X_train = np.load(X_train_path)
X_test = np.load(X_test_path)
y_train = np.loadtxt(y_train_path, dtype=int)
y_test = np.loadtxt(y_test_path, dtype=int)

# Display the shapes as a quick sanity check
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))

((272591, 21), (272591,), (30288, 21), (30288,))

# Train an LR model

In [78]:
from sklearn.linear_model import LogisticRegression

#### <u> **Train new model from scratch using out-of-box or hand-crafted hyperparams** </u> 

In [79]:
# Fit a logistic regression with scikit-learn's default hyperparameters
# (fixed random_state so repeated runs are comparable).
# "baseline" is the only hyperparameter set this notebook knows how to train.
if TRAIN and HYPERPARAM == "baseline":
    clf = LogisticRegression(random_state=13, verbose=1)
    print("Training new LR model with the following params: {}".format(clf.get_params()))
    clf.fit(X_train, y_train)
elif TRAIN:
    # Fail fast: without this, the cells below would either hit a NameError
    # or silently evaluate and save a stale `clf` left over in the kernel.
    raise ValueError(
        "TRAIN is set but no training recipe is defined for HYPERPARAM={!r}".format(HYPERPARAM)
    )

Training new LR model with the following params: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 13, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 1, 'warm_start': False}


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished


#### <u> **Save hyperparameters of model** </u>

In [80]:
# Write the fitted model's hyperparameters next to the other outputs as a
# single-row CSV (one column per parameter).
if TRAIN:
    params_path = "{0}/{1}_{2}_LR-clf_{3}.params.csv".format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    print("Saving hyperparams to {}".format(params_path))
    # Series -> one-column frame -> transpose gives one row keyed by parameter name
    pd.DataFrame(pd.Series(clf.get_params())).T.to_csv(params_path, index=False)
else:
    print("Not training a new model, load in cell below")

Saving hyperparams to 0.18-0.4_mixed-1.0_baseline_LR/0.18-0.4_mixed-1.0_LR-clf_baseline.params.csv


#### <u> **Load previously trained model** </u>

In [81]:
# When not training, restore the classifier pickled by an earlier run of this
# notebook with the same PREPROCESS / FEATURES / HYPERPARAM settings.
if not TRAIN:
    model_path = '{}/{}_{}_LR-clf_{}.pickle'.format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    print("Loading model from {}".format(model_path))
    # SECURITY: unpickling can execute arbitrary code -- only load pickle
    # files that were produced by a trusted run of this notebook.
    # The context manager guarantees the file handle is closed.
    with open(model_path, 'rb') as model_file:
        clf = pickle.load(model_file)
else:
    print("Already trained a model in cell above (run if not run already)")

Already trained a model in cell above (run if not run already)


# Train and Test Set Performance

In [82]:
# Score the train and test splits with the classifier, then binarize the
# scores at a fixed probability cutoff.
prob_thresh = 0.5

# Column index 1 of predict_proba (the second entry of clf.classes_) is kept
# as the score for each sample; train is scored first, then test.
y_tr_probs, y_probs = (clf.predict_proba(X)[:, 1] for X in (X_train, X_test))

# Label is 1 where the score reaches the cutoff, otherwise 0
y_tr_preds, y_preds = ((scores >= prob_thresh).astype(int) for scores in (y_tr_probs, y_probs))

In [83]:
# Generate a report for both splits from the precomputed predictions and scores.
# Reuse OUTDIR (same value as the previously duplicated format expression) so
# the report location cannot drift from where the model and params are saved.
project_utils.classification_report(out_path=OUTDIR,
                                    train_X=X_train, test_X=X_test, 
                                    train_y=y_train, test_y=y_test,
                                    train_preds=y_tr_preds, test_preds=y_preds,
                                    train_probs=y_tr_probs, test_probs=y_probs)

Predictions provided, skipping them
Generating confusion matrix
Calculating classification metrics
Metric	Train	Test
Accuracy	0.6970	0.6948
Precision	0.5752	0.5587
Recall	0.0979	0.0924
F0.1-Score	0.5487	0.5321
F0.5-Score	0.2912	0.2780
F1-Score	0.1673	0.1586
F2-Score	0.1173	0.1109
F10-Score	0.0987	0.0932
Plotting PR Curve
Plotting ROC Curve
Generating report


# Save the classifier

In [84]:
# Persist the freshly trained classifier so a later run with TRAIN = False
# can reload it instead of refitting.
if TRAIN:
    model_path = '{}/{}_{}_LR-clf_{}.pickle'.format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
    # Use a context manager so the file is flushed and closed as soon as the
    # dump finishes, rather than whenever the handle is garbage-collected.
    with open(model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
else:
    print("No need to save, loaded trained model already")