In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")
import os
os.chdir("..")
import numpy as np
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm
from tensorflow.keras.models import load_model, Model
import src.quad_model
from src.figutils import add_flanking, create_input_data
from src.vis_data import get_vis_data
from joblib import load
from json import dump
from src.generate_custom_model import generate_custom_model

pd.set_option('mode.chained_assignment', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def r2_score(y_true, y_pred):
    slope, intercept, r_value, p_value, std_err = stats.linregress(y_pred, y_true)
    return r_value

In [3]:
model_keras = load_model('model/custom_adjacency_regularizer_20210731_124_step3.h5')

# Ke et al. & Baeza et al

Code reference: 
- [regev-lab/splicing_library_analysis/2023_05_08_revision
/analyze_ke_et_al_for_cnnc.ipynb](https://github.com/regev-lab/splicing_library_analysis/blob/e17c9ff422b0a2270358cf347511253d045228e8/2023_05_08_revision/analyze_ke_et_al_for_cnnc.ipynb#L277)
- [regev-lab/splicing_library_analysis/2023_05_08_revision
/analyze_baeza_et_al_for_cnnc.ipynb](https://github.com/regev-lab/splicing_library_analysis/blob/e17c9ff422b0a2270358cf347511253d045228e8/2023_05_08_revision/analyze_baeza_et_al_for_cnnc.ipynb)

In [4]:
data = pd.read_excel('data/datasets/ke_et_al_Supplemental_Table_S2_no_empty_lines.xls')
data['PSI'] = 0.19*data.EI + 0.02
data['sequence'] = data.apply(lambda x: x.seq[16:-9], axis=1)
data['exon'] = data.apply(lambda x: x.sequence[7:-7], axis=1)
data = data[['exon', 'sequence', 'PSI']]
data.to_csv(f'data/datasets/WT1_exon_5.csv', index=False)
data.head()

Unnamed: 0,exon,sequence,PSI
0,AGTTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGAAGGGCAG...,TTTCTAGAGTTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGA...,0.055445
1,AAATGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGAAGGGCAG...,TTTCTAGAAATGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGA...,0.029533
2,AACTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGAAGGGCAG...,TTTCTAGAACTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGA...,0.022148
3,AAGTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGAAGGGCAG...,TTTCTAGAAGTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGA...,0.027447
4,AATTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGAAGGGCAG...,TTTCTAGAATTGCTGCTGGGAGCTCCAGCACAGTGAAATGGACAGA...,0.099695


In [5]:
data = pd.read_excel("data/datasets/baeza_et_al_untreated_only.xls")
data["PSI"] = data['Mean.PSI']
data = data[data["PSI.Standard.Deviation"] < 0.1]
data['sequence'] = data.Sequence
data['exon'] = data.Sequence
data = data[['exon', 'sequence', 'PSI']]
data.to_csv(f'data/datasets/FAS_exon_6.csv', index=False)
data.head()

Unnamed: 0,exon,sequence,PSI
21,GATCCAGATCTAACTTGCGGTGGCTGTGTCTCCTGCTTTTCCCGAT...,GATCCAGATCTAACTTGCGGTGGCTGTGTCTCCTGCTTTTCCCGAT...,0.936154
31,GATCCAGATCTAACTTGCGGTGGCTGTGTCTCCTGCTTTTGCCGAT...,GATCCAGATCTAACTTGCGGTGGCTGTGTCTCCTGCTTTTGCCGAT...,1.043201
83,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTCCCAAT...,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTCCCAAT...,0.821621
92,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTGCCGAT...,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTGCCGAT...,1.108504
93,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTGCCGAT...,GATCCAGATCTAACTTGCGGTGGCTGTGTCTTCTGCTTTTGCCGAT...,1.092884


# Rosenberg et al.

Code reference: 
- [Alex-Rosenberg/cell-2015/ipython.notebooks
/Cell2015_N11_Predicting_Cassette_Exon_SNP_Effects.ipynb](https://github.com/Alex-Rosenberg/cell-2015/blob/ca54d1117fd28375260bfde3d1b46f3d6074f306/ipython.notebooks/Cell2015_N11_Predicting_Cassette_Exon_SNP_Effects.ipynb#L712)

In [6]:
def get_sequence(ref_seq, pos, seq):
    return ref_seq[:pos-1] + seq[:min(len(seq),len(ref_seq)-pos+1)] + ref_seq[min(pos+len(seq)-1, len(ref_seq)):]

def get_mutant_sequence_for_row(ref_seq, row):
    assert(get_sequence(ref_seq, row.POS, row.REF) == ref_seq)
    return(get_sequence(ref_seq, row.POS, row.ALT))

In [7]:
data = pd.read_excel("data/datasets/rosenberg_et_al_supplementary_no_comments.xls")

SMN2_WT = "GGTTTTAGACAAAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAAATTAAGGA"
data = data[data.Gene == "SMN1/2"]
data = data[data.apply(lambda x: len(x.REF) == len(x.ALT), axis=1)] # ignore indels
data = data[data.REF_PSI < 0.5]  # take only SMN2, identified by its low REF_PSI; ignore SMN1
data = data[data.apply (lambda x : SMN2_WT[:3] == get_mutant_sequence_for_row(SMN2_WT, x)[:3], axis=1)] # first and last 3nt are the same (same filtering done in the Rosenberg notebook "genomic_predictions - Copy")
data = data[data.apply (lambda x : SMN2_WT[-3:] == get_mutant_sequence_for_row(SMN2_WT, x)[-3:], axis=1)]
data["exon"] = data.apply(lambda x: get_mutant_sequence_for_row(SMN2_WT, x), axis=1)
data["sequence"] = data.apply(lambda x: "AACTTCCTTTATTTTCCTTACAG"+get_mutant_sequence_for_row(SMN2_WT, x)+"GTAAGTCTGCC", axis=1)
data["PSI"] = data.ALT_PSI

data = data[['exon', 'sequence', 'PSI']]
data.to_csv(f'data/datasets/SMN2_exon_7.csv', index=False)
data.head()

Unnamed: 0,exon,sequence,PSI
0,GGTTTCAGACAAAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAA...,AACTTCCTTTATTTTCCTTACAGGGTTTCAGACAAAATCAAAAAGA...,0.97
4,GGTGGTAGACAAAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAA...,AACTTCCTTTATTTTCCTTACAGGGTGGTAGACAAAATCAAAAAGA...,0.59
6,GGTGTTAGACAAAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAA...,AACTTCCTTTATTTTCCTTACAGGGTGTTAGACAAAATCAAAAAGA...,0.21
7,GGTTGTAGACAAAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAA...,AACTTCCTTTATTTTCCTTACAGGGTTGTAGACAAAATCAAAAAGA...,0.47
8,GGTTTTAGACACAATCAAAAAGAAGGAAGGTGCTCACATTCCTTAA...,AACTTCCTTTATTTTCCTTACAGGGTTTTAGACACAATCAAAAAGA...,0.91


In [8]:
data = pd.read_excel("data/datasets/rosenberg_et_al_supplementary_no_comments.xls")

BRCA2 = "GGTCGTCAGACACCAAAACATATTTCTGAAAGTCTAGGAGCTGAGGTGGATCCTGATATGTCTTGGTCAAGTTCTTTAGCTACACCACCCACCCTTAGTTCTACTGTGCTCATAG"
data = data[data.Gene == "BRCA2"]
data = data.dropna()
data = data[data.apply(lambda x: len(x.REF) == len(x.ALT), axis=1)] # ignore indels
data = data[data.apply (lambda x : BRCA2[:3] == get_mutant_sequence_for_row(BRCA2, x)[:3], axis=1)] # first and last 4nt are the same (same filtering done in the Rosenberg notebook "genomic_predictions - Copy")
data = data[data.apply (lambda x : BRCA2[-3:] == get_mutant_sequence_for_row(BRCA2, x)[-3:], axis=1)]
data["exon"] = data.apply(lambda x: get_mutant_sequence_for_row(BRCA2, x), axis=1)
data["sequence"] = data.apply(lambda x: "TTTCTTTCCTCCCAG"+get_mutant_sequence_for_row(BRCA2, x)+"GTAATAATAGCAAAT", axis=1)
data["PSI"] = data.ALT_PSI

data = data[['exon', 'sequence', 'PSI']]
data.to_csv(f'data/datasets/BRCA2_exon_7.csv', index=False)
data.head()

Unnamed: 0,exon,sequence,PSI
153,GGTCATCAGACACCAAAACATATTTCTGAAAGTCTAGGAGCTGAGG...,TTTCTTTCCTCCCAGGGTCATCAGACACCAAAACATATTTCTGAAA...,0.8
154,GGTCGTCAGACACCACAACATATTTCTGAAAGTCTAGGAGCTGAGG...,TTTCTTTCCTCCCAGGGTCGTCAGACACCACAACATATTTCTGAAA...,0.89
155,GGTCGTCAGACACCAAAGCATATTTCTGAAAGTCTAGGAGCTGAGG...,TTTCTTTCCTCCCAGGGTCGTCAGACACCAAAGCATATTTCTGAAA...,0.87
157,GGTCGTCAGACACCAAAACATACTTCTGAAAGTCTAGGAGCTGAGG...,TTTCTTTCCTCCCAGGGTCGTCAGACACCAAAACATACTTCTGAAA...,0.9
158,GGTCGTCAGACACCAAAACATATTTCTGAAAGCCTAGGAGCTGAGG...,TTTCTTTCCTCCCAGGGTCGTCAGACACCAAAACATATTTCTGAAA...,0.9


In [9]:
data = pd.read_excel("data/datasets/rosenberg_et_al_supplementary_no_comments.xls")

CFTR = "AGCAGTATACAAAGATGCTGATTTGTATTTATTAGACTCTCCTTTTGGATACCTAGATGTTTTAACAGAAAAAGAAATATTTGAAAG"
data = data[data.Gene == "CFTR"]
data = data[data.apply(lambda x: len(x.REF) == len(x.ALT), axis=1)] # ignore indels
data = data[data.apply (lambda x : CFTR[:3] == get_mutant_sequence_for_row(CFTR, x)[:3], axis=1)] # first and last 4nt are the same (same filtering done in the Rosenberg notebook "genomic_predictions - Copy")
data = data[data.apply (lambda x : CFTR[-3:] == get_mutant_sequence_for_row(CFTR, x)[-3:], axis=1)]
data["exon"] = data.apply(lambda x: get_mutant_sequence_for_row(CFTR, x), axis=1)
data["sequence"] = data.apply(lambda x: "CCATTTTCTTTTTAG"+get_mutant_sequence_for_row(CFTR, x)+"GTATGTTCTTTGAAT", axis=1)
data["PSI"] = data.ALT_PSI

data = data[['exon', 'sequence', 'PSI']]
data.to_csv(f'data/datasets/CFTR_exon_13.csv', index=False)
data.head()

Unnamed: 0,exon,sequence,PSI
130,AGCAGTATATAAAGATGCTGATTTGTACCTATTAGATTCCCCTTTT...,CCATTTTCTTTTTAGAGCAGTATATAAAGATGCTGATTTGTACCTA...,0.95
131,AGCAGTATACAAAGATGCTGATTTGTATTTATTAGACTCTCCTTTT...,CCATTTTCTTTTTAGAGCAGTATACAAAGATGCTGATTTGTATTTA...,1.0
132,AGCAGTATATAAAGATGCTGATTTGTACCTATTAGATTCCCCTTTT...,CCATTTTCTTTTTAGAGCAGTATATAAAGATGCTGATTTGTACCTA...,0.05
133,AGCAGTATACAAGGATGCTGATTTGTATTTATTAGACTCTCCTTTT...,CCATTTTCTTTTTAGAGCAGTATACAAGGATGCTGATTTGTATTTA...,0.95
134,AGCAGTATACAAAGACGCTGATTTGTATTTATTAGACTCTCCTTTT...,CCATTTTCTTTTTAGAGCAGTATACAAAGACGCTGATTTGTATTTA...,0.95


# Save basal shift and padding sequences for each dataset

In [10]:
datasets_data = {
    "ES7": {
        "basal_shift": 0,
        "pre_flanking_sequence": "CATCCAGGTT",
        "post_flanking_sequence": "CAGGTCTGAC",
        "number_of_datapoints": 47962,
        "exon_length": 70,
        "sequence_length": 90,
        "r_squared": 0.9069368881066787
    }
}
datasets = [
    'WT1_exon_5',
    'FAS_exon_6',
    'SMN2_exon_7',
    'BRCA2_exon_7',
    'CFTR_exon_13'
]
for dataset in datasets:
    data = pd.read_csv(f'data/datasets/{dataset}.csv')
    input_to_model = create_input_data(list(data.sequence))
    seq_len = len(data.sequence.iloc[0])
    best_rmse = 1000
    r2 = None
    for basal_shift in tqdm(np.arange(-10,10,0.2)):
        basal_shift = np.round(basal_shift,1)
        custom_model = generate_custom_model(seq_len, basal_shift)
        predictions = custom_model(input_to_model).numpy().flatten()
        rmse = ((data.PSI-predictions)**2).mean()
        if (rmse<best_rmse):
            best_basal_shift = basal_shift
            best_rmse = rmse
            r2 = r2_score(data.PSI, predictions)
    print(dataset)
    print('Sequence length: ', seq_len)
    print('Exon length: ', len(data.exon.iloc[0]))
    print('Number of datapoints: ', len(data))
    print('Best shift: ', best_basal_shift)
    print("Best RMSE: ", best_rmse)
    print("R2: ", r2)
    print()
    sequence = data.sequence.iloc[0]
    exon = data.exon.iloc[0]
    pre_flanking_sequence = sequence[:sequence.find(exon)]
    post_flanking_sequence = sequence[(sequence.find(exon)+len(exon)):]
    datasets_data[dataset] = {
        "basal_shift": best_basal_shift,
        "pre_flanking_sequence": pre_flanking_sequence,
        "post_flanking_sequence": post_flanking_sequence,
        "number_of_datapoints": len(data),
        "exon_length": len(data.exon.iloc[0]),
        "sequence_length": seq_len,
        "r_squared": r2
    }
with open("data/datasets_data.json", "w") as f:
    dump(datasets_data, f, indent=2)

100%|██████████| 100/100 [00:37<00:00,  2.64it/s]


WT1_exon_5
Sequence length:  65
Exon length:  51
Number of datapoints:  5560
Best shift:  -5.8
Best RMSE:  0.02669870216950634
R2:  0.7756628909179489



100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


FAS_exon_6
Sequence length:  63
Exon length:  63
Number of datapoints:  794
Best shift:  2.4
Best RMSE:  0.02633237888327526
R2:  0.9020387508616958



100%|██████████| 100/100 [00:36<00:00,  2.76it/s]


SMN2_exon_7
Sequence length:  88
Exon length:  54
Number of datapoints:  56
Best shift:  5.2
Best RMSE:  0.05765770791180734
R2:  0.6485961639623267



100%|██████████| 100/100 [00:42<00:00,  2.37it/s]


BRCA2_exon_7
Sequence length:  145
Exon length:  115
Number of datapoints:  31
Best shift:  8.0
Best RMSE:  0.014833782753232546
R2:  0.6915058670445455



100%|██████████| 100/100 [00:38<00:00,  2.63it/s]

CFTR_exon_13
Sequence length:  117
Exon length:  87
Number of datapoints:  22
Best shift:  7.0
Best RMSE:  0.047352519377309416
R2:  0.8591799198918351




