In [98]:
from importlib import reload
import re
import random
random.seed(1337)
import os
import pickle

import numpy as np
np.random.seed(1337)
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

import keras
from keras import backend as K
from keras.models import load_model

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

import model
import utils

In [None]:
def plot_bivar(evaluation_egfp, evaluation_mcherry):
    """Overlay two predicted-vs-actual scatter plots on one seaborn JointGrid.

    Parameters
    ----------
    evaluation_egfp : table with 'predicted' and 'actual' columns
        Drawn first, in the RGB colour (0.3, 0.45, 0.69). It is handed to
        ``sns.JointGrid`` as ``data``.
    evaluation_mcherry : table with 'predicted' and 'actual' columns
        Drawn second, in red, on the same joint axes. Its columns are read
        through ``.values``.

    Returns
    -------
    None. The function only draws: both scatters plus an identity line
    (y = x) spanning the current x-limits of the figure's active axes.
    """
    c1 = (0.3, 0.45, 0.69)
    c2 = 'r'
    # NOTE(review): newer seaborn releases call the `size` argument `height`;
    # confirm against the installed seaborn version before upgrading.
    g = sns.JointGrid(x='predicted', y="actual", data=evaluation_egfp, space=0, xlim=(0,10), ylim=(0,10), ratio=6, size=7)
    # alpha must be numeric: matplotlib rejects a string such as '0.5'.
    g.plot_joint(plt.scatter, s=20, color=c1, linewidth=0.2, alpha=0.5, edgecolor='white')
    # Point the grid at the second table so the next plot_joint call draws it
    # on the same axes.
    g.x = evaluation_mcherry['predicted'].values
    g.y = evaluation_mcherry['actual'].values
    g.plot_joint(plt.scatter, s=20, linewidth=0.2, alpha=0.5, color=c2, edgecolor='white')
    # Draw the identity line on the figure's axes explicitly instead of going
    # through pyplot's implicit "current axes" state.
    ax = g.fig.gca()
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, x)
 
""" VALIDATIONS """
    
def prepare_ptr_data(df, seq_pad=0):
    """Wrap the variable-length encoding of `df` in a one-key dict.

    Calls ``utils.encode_fromdf(df, 0, seq_pad=seq_pad, variable_len=True)``
    and returns ``{"input": <that result>}``.
    """
    encoded_input = utils.encode_fromdf(df, 0, seq_pad=seq_pad, variable_len=True)
    return {"input": encoded_input}
    
def get_ptr_corr(model, data, df):
    """Print the Pearson correlation between df["PTR"] and the model's predictions.

    `data["input"]` must provide "seq" and "indicator" entries; they are passed
    to `model.predict` as a two-element list, in that order. The predictions
    are flattened to 1-D before the correlation is computed. Nothing is
    returned: the result of `stats.pearsonr` is printed.
    """
    encoded = data["input"]
    predictions = model.predict([encoded["seq"], encoded["indicator"]])
    observed = df["PTR"]
    correlation = stats.pearsonr(observed, predictions.reshape(-1))
    print(correlation)
    
def prepare_data(df, col, seq_pad=0):
    """Build an {"input", "output"} dict from `df`.

    "input" is whatever ``utils.encode_fromdf(df, 2, col=col, seq_pad=seq_pad)``
    returns. "output" is the "hrl" column of `df` copied into a NumPy array
    and reshaped to a single column (shape ``(n_rows, 1)``).
    """
    encoded_input = utils.encode_fromdf(df, 2, col=col, seq_pad=seq_pad)
    target_column = np.array(df["hrl"]).reshape(-1, 1)
    return {"input": encoded_input, "output": target_column}

### Read in the prepared data

In [5]:
# Load the pickled data bundle. Unpickling can execute arbitrary code, so this
# file should only ever come from a trusted source.
with open("../Data/data_dict.pkl", 'rb') as pickle_file:
    data_dict = pickle.load(pickle_file)

# Pull the three entries used by the rest of the notebook out of the bundle.
data_df, snv_df, ptr_df = data_dict["data"], data_dict["snv"], data_dict["ptr"]

### Encode the data in the required input format

In [26]:
# Encode every (set, library) slice of data_df separately; the result is a
# two-level dict: encoded_data[set][library] -> utils.encode_df(slice).
encoded_data = {}
for set_type in data_df["set"].unique():
    df_slice = data_df.loc[data_df["set"] == set_type]
    encoded_data[set_type] = {}
    for library in df_slice["library"].unique():
        library_rows = df_slice.loc[df_slice["library"] == library]
        encoded_data[set_type][library] = utils.encode_df(library_rows)

# --- SNV data -----------------------------------------------------------------
# Encode the variant ("utr") and the reference ("mother") sequence columns of
# snv_df, both paired with the "hrl" output column.
# NOTE: this assigns a "library" column on snv_df in place (presumably needed by
# utils.encode_df -- confirm there).
snv_encoded = {}
snv_df["library"] = "human"
snv_encoded["snv"] = utils.encode_df(snv_df, col="utr", output_col="hrl")
snv_encoded["wt"] = utils.encode_df(snv_df, col="mother", output_col="hrl")

# Split the SNV table by its `clin_sig` label. `sub` is only an alias of snv_df;
# it is kept because top-level names are visible to every later cell.
sub = snv_df
path_list = ['Pathogenic', 'Likely pathogenic', 'Pathogenic, other', 'Pathogenic/Likely pathogenic']
# NOTE(review): 'Likely Benign' is capitalised differently from the other labels
# (compare 'Likely pathogenic'); confirm it matches the values actually present
# in snv_df['clin_sig'], otherwise those rows are silently left out of `non`.
benign_list = ['Benign/Likely benign', 'Benign', 'Likely Benign']
uncertain_list = ['Conflicting interpretations of pathogenicity', 'Uncertain significance']
# isin() keeps each filter in sync with its label list; the previous
# hand-indexed `==` chains ignored any label appended to a list.
path = sub[sub['clin_sig'].isin(path_list)]
non = sub[sub['clin_sig'].isin(benign_list)]
unsure = sub[sub['clin_sig'].isin(uncertain_list)]

# --- PTR data -----------------------------------------------------------------
# Encode the "utr" column of ptr_df with no output column and variable-length
# sequences. As above, the "library" column is assigned on ptr_df in place.
ptr_df["library"] = "egfp_unmod_1"
ptr_encoded = utils.encode_df(ptr_df, col="utr", output_col=None, variable_len=True)

### Instantiate a basic model and train

In [90]:
# Re-import the local `model` module so that edits made to its source since the
# last import take effect without restarting the kernel.
reload(model)

<module 'model' from '/data/ouga04b/ag_gagneur/home/karollus/5UTRModel/Collab/Training/model.py'>

In [86]:
# Instantiate the model object through the local `model` module's factory.
utr_model = model.create_model_masked_bordered()

In [104]:
# Train on the "egfp_unmod_1" library for at most 10 epochs; the model is written
# to basic_model.h5.
utils.train(utr_model, encoded_data, libraries=["egfp_unmod_1"], epochs=10, file="basic_model.h5")
# Reload from the saved file, registering the custom FrameSliceLayer so Keras can
# deserialize it. Presumably this is done so that utr_model holds the best
# checkpoint and not the last-epoch weights (the training log shows
# save-on-improvement plus early stopping) -- confirm in utils.train.
utr_model = load_model("basic_model.h5", custom_objects={'FrameSliceLayer': model.FrameSliceLayer})

Train on 240000 samples, validate on 20000 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.27912, saving model to basic_model.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.27912 to 0.26651, saving model to basic_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.26651
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.26651
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.26651
Epoch 00005: early stopping


In [107]:
# Evaluate on "egfp_unmod_1" with do_test=True (presumably the held-out test
# split -- confirm in utils.evaluate) and keep the returned value in `pred`.
pred = utils.evaluate(utr_model, encoded_data, libraries=["egfp_unmod_1"], do_test=True)

Rsquared on set egfp_unmod_1 : 0.9184789401533968, Pearson: 0.9210255977813298


In [108]:
# Rebinds utr_model to whatever retrain_only_scaling returns, so this cell is
# not idempotent: running it again retrains the already-retrained model.
utr_model = utils.retrain_only_scaling(utr_model, encoded_data, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


In [109]:
# Evaluate the retrained model on the remaining libraries.
# NOTE(review): the recorded output also lists "egfp_unmod_1", which is not in
# `libraries` -- confirm whether utils.evaluate always includes it or whether the
# output is stale from an earlier run.
utils.evaluate(utr_model, encoded_data, libraries=['mcherry_1', 'mcherry_2', 'egfp_unmod_2', 'human'])

Rsquared on set egfp_unmod_1 : 0.909004145542096, Pearson: 0.9099067587126427
Rsquared on set mcherry_1 : 0.7279082654301345, Pearson: 0.7280978897307177
Rsquared on set mcherry_2 : 0.7635886128590488, Pearson: 0.7652201512338017
Rsquared on set egfp_unmod_2 : 0.856273726593918, Pearson: 0.8579424133934646
Rsquared on set human : 0.7617980195718617, Pearson: 0.7619559824295236


### Evaluate the model on the test set, human data, snv data and ptr data

In [79]:
# Show the distinct library labels present in data_df.
list(data_df["library"].unique())

['egfp_unmod_1', 'mcherry_1', 'mcherry_2', 'egfp_unmod_2', 'human', 'ga']