# **Model Training Replication Code**
This notebook provides code to train a Framepool model.

In [1]:
## Imports
# All imports and global configuration live in this cell. The statement order
# matters: seeds are set right after the corresponding library is imported, and
# CUDA_VISIBLE_DEVICES is set before TensorFlow/Keras are imported.

# base python
from importlib import reload
import re
import itertools
from pathlib import Path
import random
random.seed(1337)  # seed Python's built-in RNG
import os
import pickle
from decimal import Decimal
import collections

# numpy and similar
import numpy as np
np.random.seed(1337)  # seed NumPy's global RNG
import pandas as pd
# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None 
import scipy.stats as stats

# Don't use GPU: hide all CUDA devices so that the deep learning backend
# imported below runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Deep Learning packages
# NOTE(review): no TensorFlow-level seed is set in this cell, so training runs
# may not be exactly reproducible — confirm whether this is handled elsewhere.
import tensorflow as tf
import keras
from keras import backend as K
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping

# code scripts (presumably project-local modules next to this notebook)
import model
import utils
import utils_data

Using TensorFlow backend.


In [2]:
# Optional setting, left disabled: uncomment to turn off HDF5 file locking.
# Presumably a workaround for errors when reading/writing .h5 model files on
# shared or network filesystems — TODO confirm whether it is still needed.
#os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

#### Make utility directories

In [None]:
# Create the output directory for evaluation metrics, including any missing
# parents. `exist_ok=True` makes the cell idempotent on re-runs and avoids the
# check-then-create race of the `os.path.exists` + `os.makedirs` pattern.
Path("../Data/Metrics").mkdir(parents=True, exist_ok=True)

# **Reading in the data**
Read in the pickled data dictionary used for model training.

In [6]:
# Load the pickled dictionary of datasets.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
data_dict_path = Path("../Data/data_dict.pkl")
with data_dict_path.open('rb') as pickle_file:
    data_dict = pickle.load(pickle_file)

In [7]:
# Overview of the loaded data: print each dataset name followed by its length.
for dataset_name, dataset in data_dict.items():
    print(dataset_name)
    print(len(dataset))

mpra
980244
snv
1584
ptr
11575
wilhelm
5293
andreev
8003
pcr3
7672
eichhorn
8956
doudna
25831
varlen_mpra
102214


# **Training**
This code shows how to train a Framepool model from scratch.

### Prepare training data

In [None]:
# Encoders shared by the data generators below.
# Input encoder built on the "utr" column (presumably one-hot encodes the
# sequence — see utils_data.OneHotEncoder).
one_hot_fn = utils_data.OneHotEncoder("utr")
# Output encoder built on the "rl" column; passed as `output_encoding_fn` to
# the generators.
out_encoding_fn = utils_data.DataFrameExtractor("rl", method="direct")
# Encoder for the "library" column, given an integer code per library name.
library_codes = {"egfp_unmod_1":0, "random":1}
library_fn = utils_data.LibraryEncoder("library", library_codes)

In [8]:
# Length-50 MPRA data: keep the training split of the "egfp_unmod_1" library
# and wrap it in a shuffling data generator.
mpra_data = data_dict["mpra"]
train_mask_50 = (mpra_data["set"] == "train") & (mpra_data["library"] == "egfp_unmod_1")
train_data_50 = mpra_data[train_mask_50]
generator_50 = utils_data.DataSequence(
    train_data_50,
    encoding_functions=[one_hot_fn],
    output_encoding_fn=out_encoding_fn,
    shuffle=True,
)

In [None]:
# Variable-length (up to 100) MPRA data: keep the training split of the
# "random" library and wrap it in a shuffling data generator.
mpra_data_varlen = data_dict["varlen_mpra"]
train_mask_100 = (mpra_data_varlen["set"] == "train") & (mpra_data_varlen["library"] == "random")
train_data_100 = mpra_data_varlen[train_mask_100]
generator_100 = utils_data.DataSequence(
    train_data_100,
    encoding_functions=[one_hot_fn],
    output_encoding_fn=out_encoding_fn,
    shuffle=True,
)

In [11]:
# Combined training set: variable-length rows first, then the length-50 rows,
# restricted to the columns the encoders read. The generator additionally
# receives the library encoder.
combined_columns = ["utr", "rl", "library"]
train_data_combined = pd.concat(
    [train_data_100[combined_columns], train_data_50[combined_columns]]
)
generator_combined = utils_data.DataSequence(
    train_data_combined,
    encoding_functions=[one_hot_fn, library_fn],
    output_encoding_fn=out_encoding_fn,
    shuffle=True,
)

### Train a Model

In [None]:
# Build the frame-slice (Framepool) model for the combined training data.
# NOTE(review): the meaning of each option is defined in model.py;
# `library_size=2` presumably corresponds to the two libraries encoded by
# `library_fn` — confirm.
utr_model_combined = model.create_frame_slice_model(
    kernel_size=[7,7,7],
    only_max_pool=False,
    padding="same",
    skip_connections="residual",
    use_scaling_regression=True,
    library_size=2,
)

In [None]:
# Train the combined model for 6 epochs on the combined generator.
# The model is bound to `utr_model_combined` when it is created; the previous
# name `utr_model` is never defined and raised NameError on a fresh kernel.
utr_model_combined.fit_generator(generator_combined, epochs=6, verbose=2)

In [None]:
# Persist the trained model. Uses `utr_model_combined`, the name the model was
# created under (`utr_model` is not defined anywhere in this notebook).
utr_model_combined.save("../Models/utr_model_combined_residual.h5")

### Evaluate it

In [None]:
# Load the model to evaluate. The custom FrameSliceLayer must be passed via
# `custom_objects` so Keras can deserialize it.
# Fixed path: the original "..Models/" was missing the separator after "..".
# NOTE(review): this loads `Framepool_combined_residual.h5`, not the
# `utr_model_combined_residual.h5` written by the training cell above —
# presumably a pre-trained model; confirm this is intended.
utr_model_combined = load_model(
    "../Models/Framepool_combined_residual.h5",
    custom_objects={'FrameSliceLayer': model.FrameSliceLayer},
)

In [None]:
# Compute test metrics for the loaded model. The library encoder is passed as
# an extra input encoder, as in the combined training generator.
# Uses `utr_model_combined`; the previous `utr_model` is never defined.
# NOTE(review): the meaning of `noTG` is defined in utils.py — confirm.
metrics = utils.compute_all_test_metrics(
    data_dict,
    utr_model_combined,
    extra_encoding_fn=[library_fn],
    noTG = True,
)

In [None]:
# Serialize the metrics to disk with the highest available pickle protocol.
metrics_path = Path("../Data/Metrics/metrics_frameslice_combined_dict.pkl")
with metrics_path.open('wb') as metrics_file:
    pickle.dump(metrics, metrics_file, protocol=pickle.HIGHEST_PROTOCOL)