In [5]:
from importlib import reload
import os
import pickle
import random
random.seed(1337)
import re

import numpy as np
np.random.seed(1337)
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals import joblib
from sklearn import preprocessing

import keras
from keras import backend as K
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG

import model
import utils

In [None]:
""" EVALUATION """

def rSquared(predictions, targets):
    """Coefficient of determination, 1 - SS_res / SS_tot.

    Args:
        predictions: array-like of predicted values.
        targets: array-like of observed values.

    Returns:
        1 - sum((predictions - targets)^2) / sum((targets - mean(targets))^2).
        If all targets are equal the denominator is zero and the result
        follows numpy's division-by-zero semantics.
    """
    predictions = np.asarray(predictions)
    targets = np.asarray(targets)
    # A column vector (n, 1) minus a flat vector (n,) broadcasts to an (n, n)
    # matrix and silently yields a meaningless score. When both arrays hold the
    # same number of elements, align the shapes before subtracting.
    if predictions.shape != targets.shape and predictions.size == targets.size:
        predictions = predictions.reshape(targets.shape)
    ssr = np.sum(np.square(predictions - targets))
    ybar = np.average(targets)
    sst = np.sum(np.square(targets - ybar))
    return 1 - (ssr/sst)

def r2_paper(x,y):
    """Squared Pearson correlation of x and y, taken from a linear regression fit.

    Fits y against x with ``scipy.stats.linregress`` and returns the square of
    the correlation coefficient it reports.
    """
    fit = stats.linregress(x, y)
    return fit.rvalue ** 2

def evaluate(model, data):
    """Predict on the validation split, print its R-squared, return the pairs.

    Args:
        model: object exposing ``predict(inputs)``.
        data: dict with ``"val_input"`` (holding ``"seq"`` and ``"indicator"``),
            ``"val_output"`` and ``"name"`` entries.

    Returns:
        DataFrame with flattened ``predicted`` and ``actual`` columns.
    """
    val_in = data["val_input"]
    predicted = model.predict([val_in["seq"], val_in["indicator"]])
    label = "Rsquared on validation set " + data["name"] + ": "
    print(label + str(rSquared(predicted, data["val_output"])))
    frame = pd.DataFrame({
        "predicted": predicted.reshape(-1),
        "actual": data["val_output"].reshape(-1),
    })
    return frame

def test(model, data):
    """Predict on the test split, print its R-squared, return the pairs.

    Args:
        model: object exposing ``predict(inputs)``.
        data: dict with ``"test_input"`` (holding ``"seq"`` and ``"indicator"``),
            ``"test_output"`` and ``"name"`` entries.

    Returns:
        DataFrame with flattened ``predicted`` and ``actual`` columns.
    """
    test_in = data["test_input"]
    predicted = model.predict([test_in["seq"], test_in["indicator"]])
    label = "Rsquared on test set " + data["name"] + ": "
    print(label + str(rSquared(predicted, data["test_output"])))
    frame = pd.DataFrame({
        "predicted": predicted.reshape(-1),
        "actual": data["test_output"].reshape(-1),
    })
    return frame

def train(model, data, batch_size=128, epochs=3, val_data=None):
    """Fit ``model`` on the training split of ``data``.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        data: dict with ``"train_input"`` (holding ``"seq"`` and
            ``"indicator"``) and ``"train_output"`` entries.
        batch_size: mini-batch size forwarded to ``fit``.
        epochs: number of epochs forwarded to ``fit``.
        val_data: optional dict with ``"val_input"`` (holding ``"seq"`` and
            ``"indicator"``) and ``"val_output"`` entries. When given, it is
            passed to ``fit`` as ``validation_data``.
    """
    inputs = [data["train_input"]["seq"], data["train_input"]["indicator"]]
    validation = None
    if val_data is not None:
        # Inputs AND targets must both come from val_data; taking the targets
        # from `data` pairs validation inputs with another dataset's outputs.
        validation = (
            [val_data["val_input"]["seq"], val_data["val_input"]["indicator"]],
            val_data["val_output"],
        )
    model.fit(inputs, data["train_output"], batch_size=batch_size, epochs=epochs,
              verbose=1, validation_data=validation)
    
def train_scale(model, data, batch_size=128, epochs=3):
    """Fit ``model`` on the validation split of ``data``.

    Uses ``data["val_input"]`` (``"seq"`` and ``"indicator"``) as inputs and
    ``data["val_output"]`` as targets.
    """
    val_in = data["val_input"]
    features = [val_in["seq"], val_in["indicator"]]
    labels = data["val_output"]
    model.fit(features, labels, batch_size, epochs, verbose=1)

def freeze_all_except_scaling(model):
    """Freeze every layer except ``"scaling_regression"`` and recompile.

    Sets ``trainable = False`` on each layer whose name is not
    ``"scaling_regression"``, then compiles the model with mean squared error
    loss and an Adam optimizer.

    Args:
        model: Keras model (modified in place).

    Returns:
        The same model object, recompiled.
    """
    for layer in model.layers:
        if layer.name != "scaling_regression":
            layer.trainable = False
    # Build the optimizer once, outside the loop: constructing it per layer was
    # wasteful and left `adam` undefined for a model with no layers.
    adam = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='mean_squared_error', optimizer=adam)
    return model
    
def check_layer(model, data, layer_names, node=0):
    """Evaluate the outputs of the named layers on the validation inputs.

    For each name in ``layer_names`` the layer's output at ``node`` is fetched
    through a backend function fed by the ``"input_seq"`` and
    ``"input_experiment"`` layers, and evaluated on
    ``data["val_input"]["seq"]`` / ``data["val_input"]["indicator"]``.

    Returns:
        dict mapping each layer name to the list returned by the backend
        function.
    """
    outputs_by_layer = {}
    for layer_name in layer_names:
        layer_output = model.get_layer(layer_name).get_output_at(node)
        # A layer may expose several output tensors (a list) or a single one.
        fetches = list(layer_output) if type(layer_output) == list else [layer_output]
        feed_tensors = [
            model.get_layer("input_seq").input,
            model.get_layer("input_experiment").input,
        ]
        probe = K.function(feed_tensors, fetches)
        feed_values = [data["val_input"]["seq"], data["val_input"]["indicator"]]
        outputs_by_layer[layer_name] = probe(feed_values)
    return outputs_by_layer

def plot(evaluation):
    """Scatter predicted vs. actual values with an identity line.

    Args:
        evaluation: DataFrame with ``predicted`` and ``actual`` columns.
    """
    c1 = (0.3, 0.45, 0.69)
    # NOTE(review): `size` was renamed to `height` in seaborn 0.9; kept as-is
    # for the seaborn version this notebook was written against.
    g = sns.JointGrid(x='predicted', y="actual", data=evaluation, space=0, xlim=(0,10), ylim=(0,10), ratio=6, size=7)
    # alpha must be numeric: recent matplotlib rejects the string '0.5'.
    g.plot_joint(plt.scatter, s=20, color=c1, linewidth=0.2, alpha=0.5, edgecolor='white')
    # Draw y = x on the joint axes explicitly instead of relying on whichever
    # axes happens to be current.
    ax = g.ax_joint
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, x)

def plot_bivar(evaluation_egfp, evaluation_mcherry):
    """Overlay predicted vs. actual scatters for two evaluations.

    Args:
        evaluation_egfp: DataFrame with ``predicted`` and ``actual`` columns,
            drawn in blue.
        evaluation_mcherry: DataFrame with ``predicted`` and ``actual``
            columns, drawn in red on the same joint axes.
    """
    c1 = (0.3, 0.45, 0.69)
    c2 = 'r'
    # NOTE(review): `size` was renamed to `height` in seaborn 0.9; kept as-is
    # for the seaborn version this notebook was written against.
    g = sns.JointGrid(x='predicted', y="actual", data=evaluation_egfp, space=0, xlim=(0,10), ylim=(0,10), ratio=6, size=7)
    # alpha must be numeric: recent matplotlib rejects the string '0.5'.
    g.plot_joint(plt.scatter, s=20, color=c1, linewidth=0.2, alpha=0.5, edgecolor='white')
    # Point the grid at the second dataset and draw it on the same joint axes.
    g.x = evaluation_mcherry['predicted'].values
    g.y = evaluation_mcherry['actual'].values
    g.plot_joint(plt.scatter, s=20, linewidth=0.2, alpha=0.5, color=c2, edgecolor='white')
    # Draw y = x on the joint axes explicitly instead of relying on whichever
    # axes happens to be current.
    ax = g.ax_joint
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, x)
 
""" VALIDATIONS """

def _random_seq_without_atg(seq_length):
    """Draw a random A/C/T/G string of ``seq_length`` bases containing no "ATG"."""
    seq = ''.join(random.choices(["A","C","T","G"], k=seq_length))
    match = re.search('ATG', seq)
    while match is not None:
        idx = match.start()
        # Replace exactly the 3 matched bases so the length is preserved. The
        # replacement holds no "A", so each pass removes one "A" and the loop
        # terminates; re-searching catches an ATG formed across the boundary.
        seq = seq[:idx] + ''.join(random.choices(["C","T","G"], k=3)) + seq[idx+3:]
        match = re.search('ATG', seq)
    return seq


def check_uAUG_detection(trained_model, kozak=False, seq_length=200, samples=1000):
    """Average the model's prediction for an upstream ATG at every position.

    For each of ``samples`` random ATG-free background sequences, the motif
    ("ATG", or "GCCACCATG" when ``kozak`` is true) is written over the
    background at every possible offset. The variants are encoded with
    ``utils.encode_fromdf(df, 0)``, predicted, and averaged across samples.

    Args:
        trained_model: object exposing ``predict([seq, indicator])``.
        kozak: insert "GCCACCATG" instead of a bare "ATG".
        seq_length: length of the random background sequences.
        samples: number of random backgrounds to average over.

    Returns:
        DataFrame with columns ``idx`` (motif start counted from
        ``-seq_length``), ``in_frame`` (``idx % 3 == 0``) and ``prediction``
        (mean prediction per position).

    Raises:
        ValueError: if ``samples`` is smaller than 1.
    """
    if samples < 1:
        raise ValueError("samples must be a positive integer")
    motif = "GCCACCATG" if kozak else "ATG"
    n_positions = seq_length - len(motif) + 1
    positions = list(range(-seq_length, -seq_length + n_positions))
    out_df = pd.DataFrame({"idx": positions, "in_frame": [pos % 3 == 0 for pos in positions]})
    predictions = []
    for _ in range(samples):
        background = _random_seq_without_atg(seq_length)
        # One variant per offset, each overwriting len(motif) bases in place.
        uAUG_seqs = [
            background[:start] + motif + background[start + len(motif):]
            for start in range(n_positions)
        ]
        df = pd.DataFrame({"utr": uAUG_seqs})
        # Predict
        data = utils.encode_fromdf(df, 0)
        predictions.append(trained_model.predict([data["seq"], data["indicator"]]))
    # Average the per-position predictions over all sampled backgrounds.
    out_df["prediction"] = (sum(predictions)/samples).reshape(-1)
    return out_df

def uAUG_plot(uAUG_pred):
    """Plot prediction against position, in-frame and out-of-frame separately.

    Args:
        uAUG_pred: DataFrame with ``idx``, ``prediction`` and ``in_frame``
            columns. In-frame rows are drawn in sky blue, the rest in red.
    """
    is_in_frame = uAUG_pred['in_frame'] == True
    styled_subsets = (
        (uAUG_pred.loc[is_in_frame], 'skyblue'),
        (uAUG_pred.loc[~is_in_frame], 'red'),
    )
    for subset, colour in styled_subsets:
        plt.plot('idx', 'prediction', data=subset, color=colour, linewidth=2)
    
def prepare_ptr_data(df, seq_pad=0):
    """Encode ``df`` for the model and wrap the encoding in a dict.

    Returns:
        ``{"input": utils.encode_fromdf(df, 0, seq_pad=seq_pad, variable_len=True)}``.
    """
    encoded = utils.encode_fromdf(df, 0, seq_pad=seq_pad, variable_len=True)
    return {"input": encoded}
    
def get_ptr_corr(model, data, df):
    """Print the Pearson correlation between ``df["PTR"]`` and the predictions.

    Args:
        model: object exposing ``predict(inputs)``.
        data: dict whose ``"input"`` entry holds ``"seq"`` and ``"indicator"``.
        df: mapping/DataFrame with a ``"PTR"`` column.
    """
    encoded = data["input"]
    flat_predictions = model.predict([encoded["seq"], encoded["indicator"]]).reshape(-1)
    correlation = stats.pearsonr(df["PTR"], flat_predictions)
    print(correlation)
    
def prepare_data(df, col, seq_pad=0):
    """Build the input/output dict for ``df``.

    Returns:
        dict with ``"input"`` set to
        ``utils.encode_fromdf(df, 2, col=col, seq_pad=seq_pad)`` and
        ``"output"`` set to the ``"hrl"`` column as an (n, 1) array.
    """
    encoded = utils.encode_fromdf(df, 2, col=col, seq_pad=seq_pad)
    targets = np.array(df["hrl"]).reshape(-1,1)
    return {"input": encoded, "output": targets}

### Read in the prepared data

In [None]:
# Requires `import pickle` in the imports cell (it was missing, so this cell
# raised NameError on a fresh kernel).
# NOTE: pickle.load can execute arbitrary code from the file; only load
# data_dict.pkl from a trusted source.
with open("../Data/data_dict.pkl", 'rb') as handle:
    data_dict = pickle.load(handle)

# Split the loaded dictionary into the tables it holds.
data_df = data_dict["data"]
snv_df = data_dict["snv"]
ptr_df = data_dict["ptr"]

### Encode the data in the required input format

In [None]:
one_hot_egfp = 