# Evaluation of Models For Enhancer Activity Prediction

**Authorship:**
Adam Klie, *MM/DD/20YY*
***
**Description:**
Notebook to evaluate a trained model on validation, holdout, and genomic sequences
***
**TODOs:**
 - <font color='red'> Add TODOs here </font>
***

# Set-up

In [None]:
# Classics
import os
import tqdm
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns

# For stats
from scipy import stats

# scikit-learn
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import pickle

# Autoreload extension
# Only load the extension when it is not already registered with this IPython session.
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
# Mode 2, so edits to the imported helper modules are picked up without a kernel restart.
%autoreload 2

# Extend the module search path before importing the two project helper modules below.
# NOTE(review): this is a hard-coded absolute path to one user's directory; the
# imports below presumably depend on it -- consider making it configurable.
import sys
sys.path.append('/cellar/users/aklie/projects/EUGENE/bin/')
import project_utils
import otx_enhancer_utils

In [None]:
# Define parameters of the notebook
TRAIN_DATASET = "2021_OLS_Library"  # Which dataset to look at
FEATURES = "mixed-1.0"  # What features to use to train the model
LABELS = "binary"  # Label encoding; selects the label sub-directory/file suffix
PREPROCESS = "0.09-0.4"  # Preprocessing steps, separated by "-"
SPLIT = 0.9  # 1 - SPLIT is the fraction that appears in the test-file names
HYPERPARAM = "baseline"  # Hyperparameter tag used in the model file name

# Results directory for this model.
# TODO: replace the CHANGE_ME placeholder with the identifier of the model being evaluated.
# This assignment used to be commented out, which made the next statement fail with a
# NameError on a fresh kernel.
OUTDIR = "{}_{}_{}_CHANGE_ME".format(PREPROCESS, FEATURES, HYPERPARAM)

# exist_ok avoids the separate existence check and makes re-running the cell safe.
os.makedirs(OUTDIR, exist_ok=True)

# Load model and training dataset

In [None]:
# Load model
# TODO: replace the "CHANGE ME" placeholder with the name used when the model was saved.
# The path used to be defined only in a comment, so `file` was undefined on a fresh kernel.
file = '{}/{}_{}_CHANGE ME-clf_{}.pickle'.format(OUTDIR, PREPROCESS, FEATURES, HYPERPARAM)
print("Loading model from {}".format(file))

# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
# The context manager closes the file handle even if unpickling fails.
with open(file, 'rb') as model_handle:
    clf = pickle.load(model_handle)

In [None]:
# Load training dataframe
# Build the path from TRAIN_DATASET so this table always matches the feature, label and
# id files loaded further down, which are all keyed on the same constant.
OLS_dataset = pd.read_csv("../data/{0}/{0}.tsv".format(TRAIN_DATASET), sep="\t")
OLS_dataset.head(1)

# Evaluate on validation sequences

In [None]:
# Load the validation split: feature matrix, labels and sequence ids.
# The file names carry the test fraction (1 - SPLIT, rounded to one decimal).
val_fraction = round(1-SPLIT, 1)
feature_dir = FEATURES.replace("-", "_")
X_val = np.load(
    '../data/{0}/{1}/{2}_X-test-{3}_{4}.npy'.format(TRAIN_DATASET, feature_dir, PREPROCESS, val_fraction, FEATURES)
)
y_val = np.loadtxt(
    '../data/{0}/{1}/{2}_y-test-{3}_{1}.txt'.format(TRAIN_DATASET, LABELS, PREPROCESS, val_fraction),
    dtype=int,
)
id_val = np.loadtxt(
    "../data/{}/id/{}_id-test-{}.txt".format(TRAIN_DATASET, PREPROCESS, val_fraction),
    dtype=str,
)

# Score the validation sequences with the loaded classifier.
y_val_preds = clf.predict(X_val)
y_val_probs = clf.predict_proba(X_val)

# One row per sequence id; SCORES is the second predict_proba column.
val_predictions = {"NAME": id_val, "PREDS": y_val_preds, "SCORES": y_val_probs[:, 1]}
clf_val_df = pd.DataFrame(data=val_predictions)

# Attach the predictions to the library table by sequence name.
val_df = OLS_dataset.merge(clf_val_df, on="NAME")

# The plotting cells that follow read from `data`.
data = val_df

## <u> **Confusion matrix** </u>

In [None]:
# Confusion-matrix plot via the project helper, using the MPRA_FXN column as labels.
project_utils.cf_plot_from_df(
    data,
    label_col="MPRA_FXN",
    title="2021 OLS Validation Set Sequences",
    ylab="MPRA Activity",
)

## <u> **Correlation w/ activity** </u>

In [None]:
# Scatter of measured activity against the classifier score
fig, ax = plt.subplots(1, 1, figsize=(8,8))
sns.scatterplot(data=data, x="ACTIVITY_SUMRNA_NUMDNA", y="SCORES", ax=ax)

# Spearman rank correlation, computed only on rows that have an activity value
no_na = (~data["ACTIVITY_SUMRNA_NUMDNA"].isna())
r, p = stats.spearmanr(data.loc[no_na, "ACTIVITY_SUMRNA_NUMDNA"], data.loc[no_na, "SCORES"])
print(r, p)

# Annotate with the statistic that was actually computed (Spearman, not Pearson) and
# with the computed values instead of hand-edited placeholders.
ax.annotate(r'Spearman $\rho$ = ' + '{:.2f}'.format(r) + '\n' + 'p = {:.2e}'.format(p), (6, 0.8), fontsize=16);

## <u> **Threshold plot** </u>

In [None]:
# Threshold plot via the project helper, using the MPRA_FXN column as labels.
project_utils.threshold_plot(
    data,
    label_col="MPRA_FXN",
)

## <u> **Score distribution plot** </u>

In [None]:
# Histogram of classifier scores, coloured by the MPRA_FXN column
fig, ax = plt.subplots(1,1, figsize=(8,8))
sns.histplot(data=data, x="SCORES", hue="MPRA_FXN", ax=ax);

# Mark the 0.5 score threshold. axvline spans the full height of the axes; the previous
# vlines(0.5, 0, 1) only covered y in [0, 1] in data units, i.e. a single count on a
# count-scaled histogram.
ax.axvline(0.5, color="red", linestyle="dashed", label="Threshold");

# Evaluate on holdout sequences

In [None]:
# Holdout feature matrix and the matching sequence ids
X_holdout = np.load(
    "../data/{}/{}/{}_X-holdout_{}.npy".format(TRAIN_DATASET, FEATURES.replace("-", "_"), PREPROCESS, FEATURES)
)
id_holdout = np.loadtxt(
    "../data/{}/id/{}_id-holdout.txt".format(TRAIN_DATASET, PREPROCESS),
    dtype=str,
)

# Score the holdout sequences with the loaded classifier
y_holdout_preds = clf.predict(X_holdout)
y_holdout_probs = clf.predict_proba(X_holdout)

# One row per sequence id; SCORES is the second predict_proba column
holdout_predictions = {"NAME": id_holdout, "PREDS": y_holdout_preds, "SCORES": y_holdout_probs[:, 1]}
clf_holdout_df = pd.DataFrame(data=holdout_predictions)

# Attach the predictions to the library table by sequence name
holdout_df = OLS_dataset.merge(clf_holdout_df, on="NAME")

## Microscope sequences

In [None]:
# Holdout rows that have a MICROSCOPE_FXN value
microscope_mask = (~holdout_df["MICROSCOPE_FXN"].isna())

# Take an explicit copy: a column is added below, and assigning into a filtered slice
# triggers SettingWithCopyWarning and may not write where expected.
microscope_df = holdout_df[microscope_mask].copy()

# Binary label: 0 for "Non-Functional", 1 for every other category
microscope_df["microscope_label"] = (microscope_df["MICROSCOPE_FXN"] != "Non-Functional").astype(int)
data = microscope_df

# NOTE(review): colours are paired with categories in order of first appearance in the
# data, so the category -> colour mapping depends on row order -- confirm this is intended.
microscope_colors = dict(zip(data["MICROSCOPE_FXN"].unique(), ["darkgreen", "lightgreen", "gold", "red"]))
data["microscope_label"].value_counts()

### <u> **Confusion matrix** </u>

In [None]:
# Confusion-matrix plot via the project helper, using the microscope_label column as labels.
project_utils.cf_plot_from_df(
    data,
    label_col="microscope_label",
    title="2021 OLS Microscope Validated Set",
    ylab="Microscope Activity",
)

### <u> **Activity boxplot grouped by function** </u>

In [None]:
# Category order along the x axis
microscope_order = ['Non-Functional', 'Weak Neural Enhancer', 'Neural Enhancer', 'Neural + Ectopic Expression']
fig, ax = plt.subplots(1, 1, figsize=(8,8))

# Arguments shared by the box layer and the point layer drawn on top of it
shared_kwargs = dict(
    y=data["SCORES"],
    x=data["MICROSCOPE_FXN"],
    order=microscope_order,
    palette=microscope_colors,
    ax=ax,
)
sns.boxplot(**shared_kwargs)
sns.swarmplot(size=10, edgecolor="black", linewidth=2, **shared_kwargs)

# Dashed line at the 0.5 score threshold across the current x range
x_min, x_max = ax.get_xlim()
ax.hlines(0.5, x_min, x_max, color="red", linestyle="dashed")

# Axis cosmetics
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=16);
ax.set_xlabel("Microscope Classification", fontsize=20)
ax.set_ylabel("SCORES", fontsize=20);

### <u> **Swarmplot of scores** </u>

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8,8))

# Single unlabeled x position so all points share one swarm, coloured by category
single_column = [""]*len(data)
sns.swarmplot(
    y=data["SCORES"],
    hue=data["MICROSCOPE_FXN"],
    x=single_column,
    palette=microscope_colors,
    ax=ax,
)
plt.legend(bbox_to_anchor=(1,1))

# Dashed line at the 0.5 score threshold across the current x range
x_min, x_max = ax.get_xlim()
ax.hlines(0.5, x_min, x_max, color="red", linestyle="dashed");

### <u> **Scatterplot with activity** </u>

In [None]:
# Rows with an activity value. The mask is built first so that the linear fit and the
# correlation use the same rows; fitting on NaNs would fail or return NaN coefficients.
no_na = (~data["ACTIVITY_SUMRNA_NUMDNA"].isna())
fit_x = data.loc[no_na, "ACTIVITY_SUMRNA_NUMDNA"]
fit_y = data.loc[no_na, "SCORES"]

# Degree-1 least-squares fit of score against activity (slope m, intercept b)
m, b = np.polyfit(fit_x, fit_y, 1)

fig, ax = plt.subplots(1, 1, figsize=(16,8))
sns.scatterplot(data=data, x="ACTIVITY_SUMRNA_NUMDNA", y="SCORES", hue="MICROSCOPE_FXN", palette=microscope_colors, s=50, edgecolor="black", ax=ax)
ax.hlines(0.5, ax.get_xlim()[0], ax.get_xlim()[1], color="red", linestyle="dashed");
ax.set_xlabel("MPRA Activity", fontsize=20)
ax.set_ylabel("Score", fontsize=20);

# Add trendline
x=np.arange(ax.get_xlim()[0], ax.get_xlim()[1], 0.01)
ax.plot(x, m*x + b, '-', color = "grey", alpha=0.8)

# One legend call: a second ax.legend(...) replaces the first, which previously
# discarded the legend title.
ax.legend(title='Microscope Classification', loc="lower right", fontsize=16)

# Add Spearman rank correlation, annotated with the computed values
r, p = stats.spearmanr(fit_x, fit_y)
print(r, p)
ax.annotate(r'Spearman $\rho$ = ' + '{:.2f}'.format(r) + '\n' + 'p = {:.2e}'.format(p), (1.25, 0.8), fontsize=16);

## No label sequences

In [None]:
# Holdout rows without a MICROSCOPE_FXN value (the complement of microscope_mask)
ambiguous_df = holdout_df.loc[~microscope_mask]
data = ambiguous_df

In [None]:
# Scatter of activity against classifier score for the sequences without a microscope label
fig, ax = plt.subplots(1, 1, figsize=(16,8))
sns.scatterplot(data=ambiguous_df, x="ACTIVITY_SUMRNA_NUMDNA", y="SCORES", ax=ax)

# Pearson correlation on rows that have an activity value.
# pearsonr returns (correlation, p-value) in that order; the names used to be swapped.
no_na = (~ambiguous_df["ACTIVITY_SUMRNA_NUMDNA"].isna())
r, p = stats.pearsonr(ambiguous_df.loc[no_na, "ACTIVITY_SUMRNA_NUMDNA"], ambiguous_df.loc[no_na, "SCORES"])
print(r, p)

# Evaluate on genomic sequences

## Full sequences

In [None]:
# Load genomic sequences
X_genomic = np.load("../data/All_Genomic_Sequences/mixed/{}_{}-split_X-test_{}.npy".format(PREPROCESS, SPLIT, FEATURES))
y_genomic = np.loadtxt("../data/All_Genomic_Sequences/{0}/y_{0}.txt".format(LABELS))
id_genomic = np.loadtxt("../data/All_Genomic_Sequences/id/id.txt", dtype=str)
valid_idx = np.loadtxt("../data/All_Genomic_Sequences/mixed/id-valid.txt", dtype=str)

# Row positions of the ids that are listed as valid.
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
valid_indices = np.where(np.isin(id_genomic, valid_idx))[0]

# valid_indices are positions, so select with .iloc rather than label-based .loc.
# .copy() makes the selection an independent frame before columns are added to it.
genomic_dataset = pd.read_csv("../data/All_Genomic_Sequences/All_Genomic_Sequences.tsv", sep="\t").iloc[valid_indices].copy()

# Make predictions on sequences
y_genomic_preds = clf.predict(X_genomic)
y_genomic_probs = clf.predict_proba(X_genomic)[:, 1]
genomic_dataset["PREDS"] = y_genomic_preds
genomic_dataset["SCORES"] = y_genomic_probs

# Set data variable
data = genomic_dataset

### <u> **Confusion matrix** </u>

In [None]:
# Confusion-matrix plot via the project helper; no label_col is passed here,
# so the helper's default label column applies.
project_utils.cf_plot_from_df(
    data,
    title="Genomic Sequences",
    ylab="Validated Activity",
)

### <u> **Feature importances** </u>
This will probably vary significantly between model types. Below is a simple example for a linear model (e.g. plotting the coefficients of a logistic regression).

In [None]:
#feature_names = pd.read_csv("../data/2021_OLS_Library/mixed_1.0/mixed-1.0_header.txt", header=None)[0]
#project_utils.coefficient_plot(clf, feature_names, title="{} {} {} Encoded Coefficients".format(PREPROCESS, TRAIN_DATASET, FEATURES))

## Tiled Sequences

In [None]:
# Load tiled genomic sequences
X_genomic = np.load("../data/All_Genomic_Sequences/mixed/{}_{}-split_X-test_{}-tiled.npy".format(PREPROCESS, SPLIT, FEATURES))
y_genomic = np.loadtxt("../data/All_Genomic_Sequences/{0}/y-tiled_{0}.txt".format(LABELS))
id_genomic = np.loadtxt("../data/All_Genomic_Sequences/id/id-tiled.txt", dtype=str)
valid_idx = np.loadtxt("../data/All_Genomic_Sequences/mixed/id-valid-tiled.txt", dtype=str)

# Row positions of the ids that are listed as valid.
# np.isin replaces np.in1d, which is deprecated in recent NumPy releases.
valid_indices = np.where(np.isin(id_genomic, valid_idx))[0]

# valid_indices are positions, so select with .iloc rather than label-based .loc.
# .copy() makes the selection an independent frame before columns are added to it.
genomic_dataset = pd.read_csv("../data/All_Genomic_Sequences/All_Genomic_Sequences-tiled.tsv", sep="\t").iloc[valid_indices].copy()

# Make predictions on sequences
y_genomic_preds = clf.predict(X_genomic)
y_genomic_probs = clf.predict_proba(X_genomic)[:, 1]
genomic_dataset["PREDS"] = y_genomic_preds
genomic_dataset["SCORES"] = y_genomic_probs

# Set data variable: keep every row whose TILE value is not "full" (case-insensitive)
data = genomic_dataset[genomic_dataset["TILE"].str.lower() != "full"]

### <u> **Tile plot** </u>

In [None]:
# Tile plot of the tiled predictions via the project helper
otx_enhancer_utils.tile_plot(
    data,
)

### <u> **Sequence tracks** </u>

In [None]:
# RdYlGn colormap, normalised to the observed range of scores in `data`
score_values = data["SCORES"]
cmap = mpl.cm.RdYlGn
norm = mpl.colors.Normalize(
    vmin=score_values.min(),
    vmax=score_values.max(),
)

In [None]:
# Rows of `data` for one named sequence
target_name = "scaffold_48:226447:226527"
test = data[data["NAME"] == target_name]

# Upper-cased sequences with their names and scores, as plain arrays
seqs = test["SEQ"].map(str.upper).values
names = test["NAME"].values
scores = test["SCORES"].values

# Only the first matching row is plotted
first_seq, first_name, first_score = seqs[0], names[0], scores[0]
otx_enhancer_utils.otxGenomeTracks(first_seq, seq_name=first_name, model_pred=first_score, cmap=cmap, norm=norm)
otx_enhancer_utils.defineTFBS(first_seq)

# Scratch

# References

 1. Ref 1
 2. Ref 2