### Use TCGA expression data to train an RB1 signature

In [3]:
import pandas as pd
import numpy as np

#### 1. Preprocess the TCGA expression data and RB1 mutation data

In [49]:
# Tab-separated input files; each is loaded with pandas in the cells below.
# Expression table; it must provide a "gene_id" column.
mRNA_file = "./data/TCGA/mRNA.tsv"
# RB1 mutation records; the "Variant_Classification" and
# "Tumor_Sample_Barcode" columns are used.
mutation_file = "./data/TCGA/RB1_maf.tsv"
# Clinical annotations; the "bcr_patient_barcode" and "acronym" columns are used.
clinical_file = "./data/TCGA/clinical.tsv"

In [23]:
# Load the raw expression table (one row per "gene_id").
mRNA_df = pd.read_csv(mRNA_file, delimiter="\t")
# Drop every row that has a missing value in any column.
mRNA_df = mRNA_df.dropna()
# Transpose so the former columns become rows and each gene becomes a column.
mRNA_df_transposed = mRNA_df.set_index("gene_id").transpose()
# log(x + 1) transform. np.log1p is applied to the whole frame at once:
# DataFrame.applymap is deprecated in recent pandas and runs a Python-level
# call per element, and log1p is also more accurate for values close to zero.
mRNA_df_transposed_log = np.log1p(mRNA_df_transposed)
# Save the preprocessed matrix (dropna, transpose, log) with a named index so
# the row labels are written under a "sample_id" header.
mRNA_df_transposed_log.index.name = "sample_id"
mRNA_df_transposed_log.to_csv("./data/TCGA/mRNA_transpose_log.csv")

In [27]:
# get just prostate tissue
def get_label(barcode, dic):
    """Look up a label by the first 12 characters of a barcode.

    Parameters
    ----------
    barcode : str
        Sample barcode; only ``barcode[:12]`` is used as the lookup key.
    dic : dict
        Mapping from 12-character barcode prefix to label.

    Returns
    -------
    The value stored under the prefix, or ``None`` when the prefix is not a
    key of ``dic``.
    """
    return dic.get(barcode[:12])

# Reload the preprocessed expression matrix; "sample_id" comes back as an
# ordinary column because no index column is requested.
mRNA_df = pd.read_csv("./data/TCGA/mRNA_transpose_log.csv")
clinical_df = pd.read_csv(
    clinical_file, sep="\t", encoding="ISO-8859-1", low_memory=False
)

# Map each clinical patient barcode to its "acronym" value.
tissue_dict = dict(
    zip(clinical_df["bcr_patient_barcode"], clinical_df["acronym"])
)

# One row per expression sample, labelled through the 12-character barcode
# prefix; samples without a clinical record get None.
label_df = pd.DataFrame(
    {
        "tissue": [
            get_label(sample, tissue_dict) for sample in mRNA_df["sample_id"]
        ]
    },
    index=mRNA_df["sample_id"],
)

# Keep only the samples whose label is "PRAD" and write that subset out.
prostate_samples = label_df.index[label_df["tissue"] == "PRAD"].tolist()
prostate_mRNA_df = mRNA_df.loc[mRNA_df["sample_id"].isin(prostate_samples)]
prostate_mRNA_df.to_csv("./data/TCGA/mRNA_transpose_log_prostate.csv")

In [139]:
# get RB1 mutation status
def is_RB1_mutated(variant_classification):
    """Turn a variant classification into a 0/1 mutation flag.

    Parameters
    ----------
    variant_classification : str
        Value of a mutation record's "Variant_Classification" field.

    Returns
    -------
    int
        ``0`` for "Silent", "3'UTR" and "Intron"; ``1`` for anything else
        (the comparison is exact and case-sensitive).
    """
    unflagged = ("Silent", "3'UTR", "Intron")
    return 0 if variant_classification in unflagged else 1
    
    
def add_to_label(barcode):
    """Return the RB1 flag recorded for a sample barcode.

    The first 15 characters of ``barcode`` are looked up in the module-level
    ``RB1_dict``; a barcode whose prefix is not present yields ``0``.
    """
    return RB1_dict.get(barcode[:15], 0)
    
    
# Load the RB1 mutation records and flag each row as mutated (1) or not (0).
rb_df = pd.read_csv(mutation_file, delimiter="\t")
rb_df["RB1_mutated"] = rb_df.Variant_Classification.apply(is_RB1_mutated)

# Several rows can share the same 15-character barcode prefix (one row per
# variant). Collapse them with max so that a single flagged variant marks the
# sample as mutated no matter which row comes last; building the dict with
# dict(zip(...)) would keep only the last row and could reset the flag to 0.
RB1_dict = (
    rb_df.groupby(rb_df.Tumor_Sample_Barcode.str[:15])["RB1_mutated"]
    .max()
    .to_dict()
)

# Sanity check: the label rows must still line up with the expression rows.
assert list(label_df.index) == list(mRNA_df.sample_id), (
    "label_df index and mRNA_df.sample_id are out of sync"
)
# Samples whose barcode prefix is absent from RB1_dict are labelled 0.
label_df["RB_mutated"] = [add_to_label(sample) for sample in label_df.index]
label_df.to_csv("./data/TCGA/RB_labels.csv")