In [5]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import shap
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, BatchNormalization, GRU, LayerNormalization, ReLU
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l1_l2
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from keras.utils import to_categorical
from itertools import product 
from matplotlib.legend_handler import HandlerBase
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.patches as patches
from sklearn.metrics import roc_auc_score, roc_curve, auc

2025-02-11 08:19:07.747079: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build a per-record annotation table from the GENCODE GTF: one row for every
# GTF line whose attribute column mentions a transcript_id.
gtf_path = 'Data/genomes/genome_m39/gencode.vM27.chr_patch_hapl_scaff.annotation.gtf'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', "strand", 'transcript_id']

transcript_data = []
with open(gtf_path, 'r') as gtf_file:
    for record in gtf_file:
        # Lines starting with '#' are skipped.
        if record.startswith('#'):
            continue

        fields = record.strip().split('\t')
        attribute_text = fields[8]  # 9th tab-separated column holds the attributes

        # Records without a transcript_id attribute are not kept.
        if 'transcript_id' not in attribute_text:
            continue

        # First ';'-separated attribute mentioning transcript_id; the ID itself is
        # the text between the first pair of double quotes.
        tagged = next(item for item in attribute_text.split(';') if 'transcript_id' in item)
        transcript_data.append(fields[:7] + [tagged.split('"')[1]])

annotation = pd.DataFrame(transcript_data, columns=column_names)
# Shift start down by one; 'end' is left as read from the file (still a string here).
annotation['start'] = annotation['start'].astype(int) - 1
annotation

Unnamed: 0,seqname,source,feature,start,end,score,strand,transcript_id
0,chr1,HAVANA,transcript,3143475,3144545,.,+,ENSMUST00000193812.2
1,chr1,HAVANA,exon,3143475,3144545,.,+,ENSMUST00000193812.2
2,chr1,ENSEMBL,transcript,3172238,3172348,.,+,ENSMUST00000082908.3
3,chr1,ENSEMBL,exon,3172238,3172348,.,+,ENSMUST00000082908.3
4,chr1,HAVANA,transcript,3276123,3286567,.,-,ENSMUST00000162897.2
...,...,...,...,...,...,...,...,...
1814260,JH584304.1,ENSEMBL,stop_codon,55479,55482,.,-,ENSMUST00000178343.2
1814261,JH584304.1,ENSEMBL,exon,52690,54867,.,-,ENSMUST00000178343.2
1814262,JH584304.1,ENSEMBL,UTR,58616,59690,.,-,ENSMUST00000178343.2
1814263,JH584304.1,ENSEMBL,UTR,55111,55482,.,-,ENSMUST00000178343.2


In [3]:
# Load the smOOPS/control master table and keep only the columns used downstream,
# in this order.
smoops_columns = [
    'gene_name', 'transcript_id',
    'smoops_naive', 'smoops_epi', 'smoops_diff', 'control_common',
    'semi_diff_mean_tpm', 'semi_epiSC_mean_tpm', 'semi_naive_mean_tpm',
    'semi_diff_mean_count', 'semi_epiSC_mean_count', 'semi_naive_mean_count',
]
smoops_master = pd.read_csv('Data/masterfile/all_smOOPS_and_controls.tsv', sep='\t')
smoops_df = smoops_master[smoops_columns]
smoops_df

Unnamed: 0,gene_name,transcript_id,smoops_naive,smoops_epi,smoops_diff,control_common,semi_diff_mean_tpm,semi_epiSC_mean_tpm,semi_naive_mean_tpm,semi_diff_mean_count,semi_epiSC_mean_count,semi_naive_mean_count
0,Xkr4,ENSMUST00000070533.5,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667
1,Gm37180,ENSMUST00000195335.2,False,True,True,False,0.521827,0.301747,0.012481,22.666667,26.666667,0.333333
2,Gm37363,ENSMUST00000192336.2,False,False,True,False,0.485116,0.206451,0.000000,14.333333,15.666667,0.000000
3,Gm37686,ENSMUST00000194099.2,False,False,True,False,0.627143,0.321620,0.000000,18.000000,24.666667,0.000000
4,Gm37329,ENSMUST00000192973.2,False,False,True,False,1.040409,0.393665,0.000000,30.333333,25.333333,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3055,Nhs,ENSMUST00000087085.10,False,True,True,False,0.707368,0.917609,0.077997,91.000000,245.000000,6.000000
3056,Ofd1,ENSMUST00000049501.9,False,True,True,False,2.596442,2.308879,1.399653,165.333333,326.725000,86.666667
3057,Hccs,ENSMUST00000033717.9,False,False,False,True,8.779636,9.323919,9.318443,270.333333,630.333333,302.000333
3058,Ddx3y,ENSMUST00000091190.12,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667


In [4]:
# Inner-join the annotation records with the smOOPS/control table on transcript_id,
# then keep only the rows whose feature is 'exon'.
merged_df = (
    pd.merge(annotation, smoops_df, on='transcript_id')
    .loc[lambda joined: joined['feature'] == 'exon']
)
merged_df

Unnamed: 0,seqname,source,feature,start,end,score,strand,transcript_id,gene_name,smoops_naive,smoops_epi,smoops_diff,control_common,semi_diff_mean_tpm,semi_epiSC_mean_tpm,semi_naive_mean_tpm,semi_diff_mean_count,semi_epiSC_mean_count,semi_naive_mean_count
1,chr1,HAVANA,exon,3740774,3741721,.,-,ENSMUST00000070533.5,Xkr4,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667
4,chr1,HAVANA,exon,3491924,3492124,.,-,ENSMUST00000070533.5,Xkr4,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667
6,chr1,HAVANA,exon,3284704,3287191,.,-,ENSMUST00000070533.5,Xkr4,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667
12,chr1,HAVANA,exon,3435953,3438772,.,-,ENSMUST00000195335.2,Gm37180,False,True,True,False,0.521827,0.301747,0.012481,22.666667,26.666667,0.333333
14,chr1,HAVANA,exon,3445778,3448011,.,-,ENSMUST00000192336.2,Gm37363,False,False,True,False,0.485116,0.206451,0.000000,14.333333,15.666667,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79116,chrY,HAVANA,exon,1264840,1264958,.,-,ENSMUST00000091190.12,Ddx3y,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667
79118,chrY,HAVANA,exon,1263795,1263949,.,-,ENSMUST00000091190.12,Ddx3y,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667
79120,chrY,HAVANA,exon,1263569,1263700,.,-,ENSMUST00000091190.12,Ddx3y,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667
79122,chrY,HAVANA,exon,1260770,1263367,.,-,ENSMUST00000091190.12,Ddx3y,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667


In [5]:
# Convert start and end to integer.
# merged_df is a boolean-filtered slice of the merge result, so assigning columns
# on it (merged_df['start'] = ...) triggers pandas' SettingWithCopyWarning.
# Rebinding to the frame returned by astype() gives the same dtypes without
# writing into a slice, and is safe to re-run.
merged_df = merged_df.astype({'start': int, 'end': int})

# Column order for the exported table: the first six columns are
# chrom/start/end/name/score/strand, followed by the per-transcript metadata.
bed_columns = [
    'seqname', 'start', 'end', 'transcript_id', 'score', 'strand',
    'gene_name', 'feature',
    'smoops_naive', 'smoops_epi', 'smoops_diff', 'control_common',
    'semi_diff_mean_tpm', 'semi_epiSC_mean_tpm', 'semi_naive_mean_tpm',
    'semi_diff_mean_count', 'semi_epiSC_mean_count', 'semi_naive_mean_count',
]
smoops_transcripts_for_fasta = merged_df[bed_columns]

# Written without header or index so the file can be passed to bedtools as a BED file.
smoops_transcripts_for_fasta.to_csv("Data/machine_learning_input_prep/all_smOOPS_and_controls_for_fasta.bed", sep='\t', index=False, header=False)

In [6]:
!bedtools getfasta -s -fi genomes/genome_m39/GRCm39.primary_assembly.genome.fa -bed machine_learning_input_prep/all_smOOPS_and_controls_for_fasta.bed | grep -v ">" | paste -d "\t" machine_learning_input_prep/all_smOOPS_and_controls_for_fasta.bed - > machine_learning_input_prep/all_smOOPS_and_controls_with_fasta.bed

In [7]:
# Load the BED-plus-sequence table, give it column names, and write it back to the
# same path with a header row.
with_fasta_path = 'Data/machine_learning_input_prep/all_smOOPS_and_controls_with_fasta.bed'
with_fasta_columns = [
    'chr', 'start', 'end', 'transcript_id', 'score', 'strand',
    'gene_name', 'feature',
    'smoops_naive', 'smoops_epi', 'smoops_diff', 'control_common',
    'semi_diff_mean_tpm', 'semi_epiSC_mean_tpm', 'semi_naive_mean_tpm',
    'semi_diff_mean_count', 'semi_epiSC_mean_count', 'semi_naive_mean_count',
    'sequence',
]

# The file is headerless the first time this cell runs, but the to_csv call below
# rewrites it WITH a header. Detect that case so re-running the cell does not read
# the header line in as a data row.
with open(with_fasta_path) as bed_file:
    first_line_fields = bed_file.readline().rstrip('\r\n').split('\t')
already_has_header = first_line_fields == with_fasta_columns

smoops_with_fasta = pd.read_csv(
    with_fasta_path,
    sep='\t',
    names=with_fasta_columns,
    # header=0 replaces an existing header line with `names`; None means "no header".
    header=0 if already_has_header else None,
)
smoops_with_fasta.to_csv(with_fasta_path, sep='\t', index=False)
smoops_with_fasta

Unnamed: 0,chr,start,end,transcript_id,score,strand,gene_name,feature,smoops_naive,smoops_epi,smoops_diff,control_common,semi_diff_mean_tpm,semi_epiSC_mean_tpm,semi_naive_mean_tpm,semi_diff_mean_count,semi_epiSC_mean_count,semi_naive_mean_count,sequence
0,chr1,3740774,3741721,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCG...
1,chr1,3491924,3492124,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GTATTTGCACACAATATACTTAGGTATCCGGAGCCGGCAGAGTGGG...
2,chr1,3284704,3287191,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GTTTCACAGCAGCAGCCTCCCTTGTGTCCTTGGCTTGGGCCCTAGC...
3,chr1,3435953,3438772,ENSMUST00000195335.2,.,-,Gm37180,exon,False,True,True,False,0.521827,0.301747,0.012481,22.666667,26.666667,0.333333,TGCAGCTGCAGCAGAAACTCCACCTAAATGCTCCTGCTGCAGCAGC...
4,chr1,3445778,3448011,ENSMUST00000192336.2,.,-,Gm37363,exon,False,False,True,False,0.485116,0.206451,0.000000,14.333333,15.666667,0.000000,GCCATTTTTCAACGTGTAGCTTGGAACTCCTTGTGTAATGAGCAAA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34699,chrY,1264840,1264958,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GTGGCAGCAAGAGGTCTAGACATTTCAAATGTGAAACATGTTATCA...
34700,chrY,1263795,1263949,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GACTTGCCACCTCTTTTTTCAATGAGAGGAATTTGAACATCACAAA...
34701,chrY,1263569,1263700,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,CAGATTCAGTGGAGGATTTGGAGCCAGAGACTATCGACAGAGCAGT...
34702,chrY,1260770,1263367,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GTGGCTATGGAGGTTTCTACAATAATGATGGTTATGGAGGAAATTA...


In [3]:
# Reload the tab-separated table with sequences; the first line of the file is used
# as the column header (read_csv default). No row filtering happens in this cell.
with_fasta_table = "Data/machine_learning_input_prep/all_smOOPS_and_controls_with_fasta.bed"
naive_transcripts = pd.read_csv(
    with_fasta_table,
    sep='\t',
)
naive_transcripts

Unnamed: 0,chr,start,end,transcript_id,score,strand,gene_name,feature,smoops_naive,smoops_epi,smoops_diff,control_common,semi_diff_mean_tpm,semi_epiSC_mean_tpm,semi_naive_mean_tpm,semi_diff_mean_count,semi_epiSC_mean_count,semi_naive_mean_count,sequence
0,chr1,3740774,3741721,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCG...
1,chr1,3491924,3492124,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GTATTTGCACACAATATACTTAGGTATCCGGAGCCGGCAGAGTGGG...
2,chr1,3284704,3287191,ENSMUST00000070533.5,.,-,Xkr4,exon,False,True,True,False,3.872970,2.324972,0.014051,194.333000,235.666667,0.666667,GTTTCACAGCAGCAGCCTCCCTTGTGTCCTTGGCTTGGGCCCTAGC...
3,chr1,3435953,3438772,ENSMUST00000195335.2,.,-,Gm37180,exon,False,True,True,False,0.521827,0.301747,0.012481,22.666667,26.666667,0.333333,TGCAGCTGCAGCAGAAACTCCACCTAAATGCTCCTGCTGCAGCAGC...
4,chr1,3445778,3448011,ENSMUST00000192336.2,.,-,Gm37363,exon,False,False,True,False,0.485116,0.206451,0.000000,14.333333,15.666667,0.000000,GCCATTTTTCAACGTGTAGCTTGGAACTCCTTGTGTAATGAGCAAA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34699,chrY,1264840,1264958,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GTGGCAGCAAGAGGTCTAGACATTTCAAATGTGAAACATGTTATCA...
34700,chrY,1263795,1263949,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GACTTGCCACCTCTTTTTTCAATGAGAGGAATTTGAACATCACAAA...
34701,chrY,1263569,1263700,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,CAGATTCAGTGGAGGATTTGGAGCCAGAGACTATCGACAGAGCAGT...
34702,chrY,1260770,1263367,ENSMUST00000091190.12,.,-,Ddx3y,exon,False,True,True,False,12.061610,9.830595,1.813766,633.994667,1141.029333,96.344667,GTGGCTATGGAGGTTTCTACAATAATGATGGTTATGGAGGAAATTA...
