In [44]:
import numpy as np
import pandas as pd
import os
import re
import pickle
pd.options.mode.chained_assignment = None 

In [28]:
def import_data_and_sort(path):
    # print(path)
    df = pd.read_csv(path)
    # drop the suffix
    df["utr"] = df["utr"].str[:50]
    # reorder
    df.drop(['Unnamed: 0'], axis=1, inplace=True)  # drop first column
    if 'total_reads' in df:
        df.sort_values(by=['total_reads'], inplace=True, ascending=False)
    else:
        df.sort_values(by=['total'], inplace=True, ascending=False)
    df.reset_index(inplace=True, drop=True)  # necessary as sorting creates an extra index
    return df
    

### We begin by reading in all the raw data, sorting it and removing suffixes not part of the UTR

In [29]:
""" Read in all Data """
path = "../Data/RawData/"
files = [os.path.join(path,file) for file in os.listdir(path) if file.startswith("GSM")]
df_list = {re.search("_(.*)\.",file).group(1):import_data_and_sort(file) for file in files}
print(df_list.keys())
# Remove the nonstandard chemistries
entriesToRemove = ['egfp_pseudo_1', 'egfp_pseudo_2', 'egfp_m1pseudo_1', 'egfp_m1pseudo_2']
for k in entriesToRemove:
    df_list.pop(k, None)
print(df_list.keys())

  exec(code_obj, self.user_global_ns, self.user_ns)


dict_keys(['egfp_unmod_1', 'egfp_pseudo_2', 'egfp_m1pseudo_1', 'egfp_m1pseudo_2', 'mcherry_1', 'mcherry_2', 'designed_library', 'egfp_unmod_2', 'egfp_pseudo_1'])
dict_keys(['egfp_unmod_1', 'mcherry_1', 'mcherry_2', 'designed_library', 'egfp_unmod_2'])


### We subset the data using the same cutoffs as in the sample code (except for the genetic algorithm data, where I impose a cutoff of minimum 200 reads (as I couldnt find which value is actually used)

In [30]:
# Subsetting the random mpra data
keys = ['egfp_unmod_1', 'egfp_unmod_2', 'mcherry_1', 'mcherry_2']
cuts = [280000, 300000, 180000, 170000]
for key, cutoff in zip(keys, cuts):
    df_list[key] = df_list[key].iloc[:cutoff].copy()

# Subsetting the human data    
human = df_list["designed_library"][(df_list["designed_library"]['library'] == 'human_utrs') | 
                                    (df_list["designed_library"]['library'] == 'snv')]
human = human.sort_values('total', ascending=False).reset_index(drop=True)
human = human.iloc[:25000].copy()

# Subsetting the genetic algorithm data
GA_types = ['step_random_to_best_allow_uatg',
 'step_random_to_best_no_uatgs',
 'step_worst_to_best_allow_uatg',
 'step_worst_to_best_no_uatg',
 'target_allow_uaug_allow_stop',
 'target_no_uaug_allow_stop',
 'target_no_uaug_no_stop']
GA = df_list['designed_library'][df_list['designed_library']["library"].isin(GA_types)]
GA = GA.iloc[:sum(GA["total"] >= 200)].copy()

df_list.pop("designed_library", None)
df_list["human"] = human
df_list["ga"] = GA

### We reduce to the needed columns (utr and rl) and add a library column

In [31]:
for key, df in df_list.items():
    df = df.filter(regex=("rl|utr"))
    df["library"] = key
    df_list[key] = df

### We prepare the test (20k), val (20k) and train (rest) split for all the sets except ga (only for training) and human (only for validation)

In [32]:
for key, df in df_list.items():
    df["set"] = ""
    if key == "human":
        df["set"] = "val"
    elif key == "ga":
        df["set"] = "train"
    else:
        df.loc[:20000, "set"] = "test"
        df.loc[20000:40000, "set"] = "val"
        df.loc[40000:, "set"] = "train"
    df_list[key] = df

### We combine the data into one large frame

In [118]:
combined_df = pd.concat(df_list.values())    
combined_df.reset_index(inplace=True, drop=True)

### We add the TIS context

In [99]:
context_dict = {"egfp_unmod_1": 'ATGGGCGAATTAAGTAAGGGCGAGGAGCTGTTCACCGGGGTG', 
                "egfp_unmod_2": 'ATGGGCGAATTAAGTAAGGGCGAGGAGCTGTTCACCGGGGTG', 
                "mcherry_1": "ATGCCTCCCGAGAAGAAGATCAAGAGCGTGAGCAAGGGCGA", 
                "mcherry_2": "ATGCCTCCCGAGAAGAAGATCAAGAGCGTGAGCAAGGGCGA", 
                "ga": 'ATGGGCGAATTAAGTAAGGGCGAGGAGCTGTTCACCGGGGTG', 
                "human": 'ATGGGCGAATTAAGTAAGGGCGAGGAGCTGTTCACCGGGGTG'}
combined_df["context"] = [context_dict[x] for x in combined_df["library"]]

In [119]:
combined_df["cds"] = "XXX"
combined_df["3utr"] = "XXX"

### We collect the snv data

In [120]:
snv_df = pd.read_csv("../Data/SNV/snv_phenotype_log_diff.csv")
snv_df.drop(['Unnamed: 0'], axis=1, inplace=True)  # drop first column
snv_df = snv_df[snv_df['obs_diff'] != 0.0]
snv_df = snv_df[snv_df['total'] >= 620]
snv_df["cds"] = 'XXX'
snv_df["3utr"] = "XXX"

### We collect the PTR data

In [121]:
# We get the PTR data
ptr_df = pd.read_csv("../Data/PTR/ptr.tsv", sep='\t')
# We average over all tissues
ptr_vals = ptr_df.select(lambda col: col.endswith('PTR'), axis=1).apply(pd.to_numeric, errors='coerce').mean(axis=1)
ptr_vals_df = pd.DataFrame({"GeneName":ptr_df["GeneName"], "PTR":ptr_vals})

# We get the sequences
seq_df = pd.read_csv("../Data/PTR/seq.tsv", sep='\t')

# We combine
combined_df_ptr = seq_df[["GeneName","UTR5_Sequence", "CDS_Sequence", "UTR3_Sequence"]].merge(ptr_vals_df)
combined_df_ptr = combined_df_ptr.rename(index=str, columns={"UTR5_Sequence": "utr", 
                                                            "CDS_Sequence":"cds",
                                                            "UTR3_Sequence":"3utr"})

  after removing the cwd from sys.path.


### Combine all the data in a dict and pickle

In [122]:
data_dict = {"data":combined_df, "snv":snv_df, "ptr":combined_df_ptr}
with open("../Data/data_dict.pkl", 'wb') as handle:
    pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Collect the Riboseq data and combine it with sequence data

In [123]:
# We get the sequences
seq_df = pd.read_csv("../Data/PTR/seq.tsv", sep='\t')
seq_df = seq_df.rename(index=str, columns={"UTR5_Sequence": "utr",
                                          "CDS_Sequence":"cds",
                                          "UTR3_Sequence":"3utr"})

#We get the andreev riboseq data and combine
andreev_df = pd.read_csv("../Data/RiboSeq/andreev_counts.tsv", sep='\t', decimal=",")
andreev_df = andreev_df.rename(index=str, columns={"Gene name": "GeneName", 
                                                   "Riboseq control reads, coding": "rpf",
                                                   "RNAseq control, (normalised)": "rnaseq_norm"})

andreev_merged = andreev_df[["GeneName","rpf","rnaseq_norm"]].merge(seq_df)

In [3]:
#We get the xtail pcr3 riboseq data and combine
pcr3_df = pd.read_csv("../Data/RiboSeq/xtail_counts_pcr3.tsv", sep='\t', decimal=",")
pcr3_df = pcr3_df.rename(index=str, columns={"Ensembl_ID": "EnsemblGeneID"})
pcr3_df = pcr3_df.rename(columns=lambda x: re.sub('.1$','_normalized',x))

pcr3_merged = pcr3_df.merge(seq_df)

In [124]:
#We get the Eichhorn hek293 riboseq data and combine
eichhorn_df = pd.read_csv("../Data/RiboSeq/Eichhorn_GSE60426_MockHEK293T.tsv", sep='\t', decimal=".")

eichhorn_merged = eichhorn_df.merge(seq_df)
eichhorn_merged = eichhorn_merged.dropna()

In [125]:
ribo_dict = {"andreev":andreev_merged, "pcr3":pcr3_merged, "eichhorn": eichhorn_merged}
with open("../Data/ribo_dict.pkl", 'wb') as handle:
    pickle.dump(ribo_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Prepare more PTR data

In [23]:
# We get the eraslan PTR data
eraslan_ptr_df = pd.read_csv("../Data/PTR/ptr.tsv", sep='\t')

# We get the sequences
seq_df = pd.read_csv("../Data/PTR/seq.tsv", sep='\t')
seq_df = seq_df.rename(index=str, columns={"UTR5_Sequence": "utr"})

# We combine
combined_eraslan = seq_df.merge(eraslan_ptr_df, on="GeneName")

In [24]:
# We get the Zheng PTR data
zheng_ptr_df = pd.read_csv("../Data/PTR/Zheng_ptr.tsv", sep='\t', decimal=",")

zheng_ptr_df = zheng_ptr_df.rename(index=str, columns={"GeneSymbol": "GeneName"})
combined_zheng = seq_df.merge(zheng_ptr_df, on="GeneName")

In [25]:
# We get the wilhelm PTR data
wilhelm_ptr_df = pd.read_csv("../Data/PTR/wilhelm_ptr.tsv", sep='\t', decimal=",")
wilhelm_ptr_df = wilhelm_ptr_df.dropna()

wilhelm_ptr_df = wilhelm_ptr_df.rename(index=str, columns={"Accessions": "EnsemblGeneID",
                                                          "protein/mRNA ratio": "ptr"})
combined_wilhelm = seq_df.merge(wilhelm_ptr_df, on="EnsemblGeneID")

In [26]:
ptr_dict = {"eraslan":combined_eraslan, "zheng":combined_zheng, "wilhelm": combined_wilhelm}
with open("../Data/ptr_dict.pkl", 'wb') as handle:
    pickle.dump(ptr_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Add the polysome profiling data

#### By gene

In [74]:
doudna_df = pd.read_csv("../Data/TrIP-Seq/doudna_polysome_tripseq_gene_tpm_ensembl_v75.csv")
doudna_df = doudna_df.rename(index=str, columns={"gene_id": "EnsemblGeneID",
                                                 "isoform_id": "EnsemblTranscriptID",
                                                      "gene_name": "GeneName"})


# replicate 1
fractions_1 = doudna_df.select(lambda col: re.match("poly._1|80S_1", col), axis=1)
doudna_df["count_1"] = fractions_1.sum(axis=1)
doudna_df["rl_1"] = np.sum(np.array(fractions_1) * np.arange(1,9), axis=1)/np.sum(np.array(fractions_1),axis=1)
# replicate 2
fractions_2 = doudna_df.select(lambda col: re.match("poly._2|80S_2", col), axis=1)
doudna_df["count_2"] = fractions_2.sum(axis=1)
doudna_df["rl_2"] = np.sum(np.array(fractions_2) * np.arange(1,9), axis=1)/np.sum(np.array(fractions_2),axis=1)
# replicate mean
fractions = (np.array(fractions_1) + np.array(fractions_2))/2
doudna_df["count_mean"] = np.sum(fractions, axis=1)
doudna_df["rl_mean"] = np.sum(fractions * np.arange(1,9), axis=1)/np.sum(fractions,axis=1)

  import sys
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [81]:
seq_df = pd.read_csv("../Data/PTR/seq.tsv", sep='\t')
seq_df = seq_df.rename(index=str, columns={"UTR5_Sequence": "utr"})
combined_doudna = seq_df.merge(doudna_df, on="GeneName")

In [82]:
with open("../Data/doudna_polysome.pkl", 'wb') as handle:
    pickle.dump(combined_doudna, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### By isoform: hopefully more high quality data

In [132]:
doudna_df = pd.read_csv("../Data/TrIP-Seq/doudna_polysome_tripseq_isoform_tpm_ensembl_v75.csv")
doudna_df = doudna_df.rename(index=str, columns={"gene_id": "EnsemblGeneID",
                                                 "isoform_id": "EnsemblTranscriptID",
                                                      "gene_name": "GeneName"})


# replicate 1
fractions_1 = doudna_df.select(lambda col: re.match("poly._1|80S_1|cyto_1", col), axis=1)
doudna_df["count_1"] = fractions_1.sum(axis=1)
doudna_df["rl_1"] = np.sum(np.array(fractions_1) * np.arange(0,9), axis=1)/np.sum(np.array(fractions_1),axis=1)
# replicate 2
fractions_2 = doudna_df.select(lambda col: re.match("poly._2|80S_2|cyto_2", col), axis=1)
doudna_df["count_2"] = fractions_2.sum(axis=1)
doudna_df["rl_2"] = np.sum(np.array(fractions_2) * np.arange(0,9), axis=1)/np.sum(np.array(fractions_2),axis=1)
# replicate mean
fractions = (np.array(fractions_1) + np.array(fractions_2))/2
doudna_df["count_mean"] = np.sum(fractions, axis=1)
doudna_df["rl_mean"] = np.sum(fractions * np.arange(0,9), axis=1)/np.sum(fractions,axis=1)

  
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  


In [133]:
seq_df = pd.read_csv("../Data/gencodev19_seq.csv")
combined_doudna = seq_df.merge(doudna_df, on="EnsemblTranscriptID")

In [115]:
with open("../Data/doudna_polysome_iso.pkl", 'wb') as handle:
    pickle.dump(combined_doudna, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [134]:
combined_doudna = combined_doudna[combined_doudna["count_1"] + combined_doudna["count_2"] > 2]
combined_doudna = combined_doudna.sample(frac=1).reset_index(drop=True)
set_vector = ["test"]*1500 + ["val"]*1500 + ["train"]*(len(combined_doudna) - 3000)
combined_doudna["set"] = set_vector
with open("../Data/doudna_polysome_iso_sub.pkl", 'wb') as handle:
    pickle.dump(combined_doudna, handle, protocol=pickle.HIGHEST_PROTOCOL)