In [70]:
import pandas as pd
import os
import re
import pickle
pd.options.mode.chained_assignment = None 

In [4]:
def import_data_and_sort(path):
    """Load one raw CSV and return it ordered by read count, highest first.

    Steps, in order:
      1. read the CSV at ``path``;
      2. keep only the first 50 characters of every ``utr`` entry
         (anything after that is a suffix, not part of the UTR);
      3. drop the ``Unnamed: 0`` column;
      4. sort descending by ``total_reads`` when that column exists,
         otherwise by ``total``;
      5. renumber the rows 0..n-1.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, sorted by read count with a fresh integer index.
    """
    frame = pd.read_csv(path)
    # Truncate the sequence to the 50 leading characters.
    frame["utr"] = frame["utr"].str[:50]
    frame = frame.drop(columns=['Unnamed: 0'])
    # The read-count column carries a different name depending on the file.
    read_count_col = 'total_reads' if 'total_reads' in frame else 'total'
    frame = frame.sort_values(by=[read_count_col], ascending=False)
    # Sorting keeps the old row labels; replace them with 0..n-1.
    return frame.reset_index(drop=True)

### We begin by reading in all the raw data, sorting it and removing suffixes not part of the UTR

In [56]:
""" Read in all Data """
path = "../Data/RawData/"
# Only files whose name starts with "GSM" are treated as raw data.
files = [os.path.join(path, file) for file in os.listdir(path) if file.startswith("GSM")]
# The dictionary key is the part of the file NAME between the first underscore
# and the last dot. The pattern is a raw string so that "\." is a regex escape
# rather than an (invalid) Python string escape, and it is matched against the
# base name only so that underscores or dots in the directory part of the path
# can never leak into the key.
name_pattern = re.compile(r"_(.*)\.")
df_list = {name_pattern.search(os.path.basename(file)).group(1): import_data_and_sort(file)
           for file in files}
print(df_list.keys())
# Remove the nonstandard chemistries
entriesToRemove = ['egfp_pseudo_1', 'egfp_pseudo_2', 'egfp_m1pseudo_1', 'egfp_m1pseudo_2']
for k in entriesToRemove:
    # pop with a default so a missing file does not raise
    df_list.pop(k, None)
print(df_list.keys())

  exec(code_obj, self.user_global_ns, self.user_ns)


dict_keys(['egfp_unmod_1', 'egfp_pseudo_2', 'egfp_m1pseudo_1', 'egfp_m1pseudo_2', 'mcherry_1', 'mcherry_2', 'designed_library', 'egfp_unmod_2', 'egfp_pseudo_1'])
dict_keys(['egfp_unmod_1', 'mcherry_1', 'mcherry_2', 'designed_library', 'egfp_unmod_2'])


### We subset the data using the same cutoffs as in the sample code, except for the genetic algorithm data, where I impose a cutoff of at least 200 reads (as I couldn't find which value is actually used)

In [57]:
# Subsetting the random mpra data
# Each frame is already sorted by read count (descending), so keeping the
# first `cutoff` rows keeps the most-read sequences.
keys = ['egfp_unmod_1', 'egfp_unmod_2', 'mcherry_1', 'mcherry_2']
cuts = [280000, 300000, 180000, 170000]
for key, cutoff in zip(keys, cuts):
    df_list[key] = df_list[key].iloc[:cutoff].copy()

designed_library = df_list["designed_library"]

# Subsetting the human data
# Keep the 'human_utrs' and 'snv' sub-libraries, then the 25000 rows with the
# highest 'total'.
human = designed_library[designed_library['library'].isin(['human_utrs', 'snv'])]
human = human.sort_values('total', ascending=False).reset_index(drop=True)
human = human.iloc[:25000].copy()

# Subsetting the genetic algorithm data
GA_types = ['step_random_to_best_allow_uatg',
 'step_random_to_best_no_uatgs',
 'step_worst_to_best_allow_uatg',
 'step_worst_to_best_no_uatg',
 'target_allow_uaug_allow_stop',
 'target_no_uaug_allow_stop',
 'target_no_uaug_no_stop']
GA = designed_library[designed_library["library"].isin(GA_types)]
# Keep rows with at least 200 reads. A boolean mask is used instead of
# "first N rows" so the result does not depend on the frame being sorted by
# 'total'; when it is sorted descending, both give the same rows.
GA = GA[GA["total"] >= 200].copy()

# Replace the combined designed library by its two subsets.
df_list.pop("designed_library", None)
del designed_library  # do not keep the full frame alive through this alias
df_list["human"] = human
df_list["ga"] = GA

### We reduce to the needed columns (utr and rl) and add a library column

In [61]:
# For every data set keep only the columns whose name matches "rl" or "utr"
# and tag each row with the name of the data set it came from.
for key in list(df_list):
    df_list[key] = df_list[key].filter(regex=("rl|utr")).assign(library=key)

### We prepare the test (20k), val (20k) and train (rest) split for all the sets except ga (only for training) and human (only for validation)

In [62]:
# Number of leading rows reserved for the test split and, right after them,
# for the validation split; everything else is training data.
N_TEST = 20000
N_VAL = 20000

for key, df in df_list.items():
    if key == "human":
        # human data is used for validation only
        df["set"] = "val"
    elif key == "ga":
        # genetic algorithm data is used for training only
        df["set"] = "train"
    else:
        # Positional, half-open slices: rows [0, N_TEST) are test,
        # rows [N_TEST, N_TEST + N_VAL) are val, the rest stays train.
        # (Label-based .loc slices include their end point, so adjacent
        # ranges overlap and only work through overwrite order on a clean
        # 0..n-1 index.)
        df["set"] = "train"
        set_pos = df.columns.get_loc("set")
        df.iloc[:N_TEST, set_pos] = "test"
        df.iloc[N_TEST:N_TEST + N_VAL, set_pos] = "val"
    df_list[key] = df

### We combine the data into one large frame

In [68]:
# Stack all data sets into one frame; ignore_index renumbers the rows 0..n-1
# so the per-set row labels are discarded.
combined_df = pd.concat(list(df_list.values()), ignore_index=True)

### We collect the snv data

In [73]:
# Load the SNV table and discard the first (unnamed) column.
snv_df = pd.read_csv("../Data/SNV/snv_phenotype_log_diff.csv").drop(columns=['Unnamed: 0'])
# Keep only rows with a non-zero observed difference and a 'total' of at
# least 620.
# NOTE(review): the origin of the 620 threshold is not documented here - confirm.
snv_keep = (snv_df['obs_diff'] != 0.0) & (snv_df['total'] >= 620)
snv_df = snv_df[snv_keep]

### We collect the PTR data

In [82]:
# We get the PTR data
ptr_df = pd.read_csv("../Data/PTR/ptr.tsv", sep='\t')
# We average over all tissues: every column whose name ends in 'PTR' is
# converted to numeric (entries that cannot be parsed become NaN) and the
# row-wise mean is taken.
# DataFrame.select was removed from pandas, so the columns are picked
# explicitly by name instead.
ptr_cols = [col for col in ptr_df.columns if col.endswith('PTR')]
ptr_vals = ptr_df[ptr_cols].apply(pd.to_numeric, errors='coerce').mean(axis=1)
ptr_vals_df = pd.DataFrame({"GeneName": ptr_df["GeneName"], "PTR": ptr_vals})

# We get the sequences
seq_df = pd.read_csv("../Data/PTR/seq.tsv", sep='\t')

# We combine: join sequences and averaged PTR values on the gene name (the
# only column the two frames share), then rename the sequence column to "utr".
combined_df_ptr = seq_df[["GeneName", "UTR5_Sequence"]].merge(ptr_vals_df, on="GeneName")
combined_df_ptr = combined_df_ptr.rename(index=str, columns={"UTR5_Sequence": "utr"})

  after removing the cwd from sys.path.


### Combine all the data in a dict and pickle

In [89]:
# Bundle the three prepared frames under fixed keys and write them to disk
# as a single pickle file.
data_dict = {
    "data": combined_df,
    "snv": snv_df,
    "ptr": combined_df_ptr,
}
with open("../Data/data_dict.pkl", 'wb') as handle:
    pickle.dump(data_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)