In [1]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
specie = "human"
# GENOMEDIR = "/genomics/users/marta/genomes"
GENOMEDIR = "/data/genomics/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
tcga_projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-KIRP","TCGA-LUAD","TCGA-BLCA"]#,"TCGA-LIHC"]
other_projects=["GSE102101_KIRC","GSE133624_BLCA","GSE22260_PRAD","PRJEB2449_PRAD","SRP238334_KIRC","GSE214846_LIHC","GSE229705_LUAD","TCGA_COAD","SRP107326_COAD"]
# deleted=["GSE103001_BRCA","GSE89223_PRAD"]
manuscript_projects = ["liver_adjacent_totalRNA_LIHC","hcc_normal_totalRNA_LIHC","GSE193567_LIHC","LIHC_TCGA_LIHC"]
all_projects = tcga_projects + other_projects + manuscript_projects

cancertypes = ["BRCA","BLCA","LUAD","KIRC","PRAD","LUSC","LIHC","COAD"]
## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

def count_greater_than_one(row):
    return (row > 1).sum()

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [7]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        proj = file.split("featureCounts_")[-1]
        cancer_type = proj[:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc = toc[toc['Geneid'].str.contains('PAR_')==False]
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            if proj in tcga_projects:
                new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

TCGA-PRAD
TCGA-LUSC
GSE89223_PRAD
TCGA-BLCA
GSE102101_KIRC
TCGA_COAD_SE
GSE103001_BRCA
TCGA-BRCA
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
GSE193567_LIHC
TCGA_COAD_PE
TCGA-KIRP
GSE22260_PRAD
GSE229705_LUAD
GSE133624_BLCA
PRJEB2449_PRAD
TCGA-LUAD
SRP107326_COAD
hcc_normal_totalRNA_LIHC
TCGA-KIRC
LIHC_TCGA_LIHC
SRP238334_KIRC


In [8]:
## merge single-end and paired-end TCGA COAD
SE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_SE.csv"))
print(len(SE_COAD.columns))
PE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_PE.csv"))
print(len(PE_COAD.columns))

COAD = SE_COAD.merge(PE_COAD, on=["transcript_id","Length"])
COAD.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD.csv"), index=None)
print(len(COAD.columns))

19
66
83


In [9]:
## filter patients we are interested in, from those not TCGA
for proj in all_projects:
    print(proj)
    if proj in tcga_projects:
        cancer_type = proj[5:]

        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
        print(len(patients)," patients")
        # patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
        # patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    elif proj in other_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        print(fc)
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))
        print(len(patients)," patients")

        samples_to_keep = patients.tumor.values.tolist() + patients.normal.values.tolist() + ['Length','transcript_id']
        fc_selected = fc[fc.columns.intersection(samples_to_keep)]

    elif proj in manuscript_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()

        if "liver_adjacent_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/liver_adjacent_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "hcc_normal_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/hcc_normal_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "GSE193567" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/GSE193567/results/clean_patients.csv"))        
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "LIHC_TCGA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/LIHC_TCGA/results/clean_patients.csv"))    
            patients['normal'] = patients['patient'] + "_normal"
            patients['tumor'] = patients['patient'] + "_tumor"
        print(len(patients)," patients")

    for index,patient in patients.iterrows():
        if proj in tcga_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())
        if proj in other_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())       
        if proj in manuscript_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())                        

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]
    # print(tumor1FPKM)
    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")
    merged = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    ## length limitations
    novel = novel[novel['Length'].astype(int) < 91667]
    novel = novel[novel['Length'].astype(int) > 300]

    print("NOVEL: ",len(novel))

    # # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    # merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    ## keep only tumor samples
    if proj in tcga_projects:
        tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    else:
        tumorONLY_merged = merged.drop(columns=patients.normal.values.tolist())

    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_id', 'gene_type', 'gene_name','Length', *patients.tumor.values.tolist()]]
    tumorONLY_merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+proj+".csv",index=False)

    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')
    
    # Create a new column with the count of int columns per row whose value is > 1
    # tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv",index=False)

TCGA-BRCA
109  patients
lncRNA:  10685
PROTEIN CODING:  16230
NOVEL:  478
TCGA-LUSC
49  patients
lncRNA:  9922
PROTEIN CODING:  16280
NOVEL:  456
TCGA-PRAD
52  patients
lncRNA:  7393
PROTEIN CODING:  15071
NOVEL:  385
TCGA-KIRC
71  patients
lncRNA:  7978
PROTEIN CODING:  15463
NOVEL:  391
TCGA-KIRP
32  patients
lncRNA:  7911
PROTEIN CODING:  15436
NOVEL:  390
TCGA-LUAD
56  patients
lncRNA:  10065
PROTEIN CODING:  16087
NOVEL:  431
TCGA-BLCA
18  patients
lncRNA:  7942
PROTEIN CODING:  15521
NOVEL:  373
GSE102101_KIRC
         transcript_id   SRR5885319   SRR5885320   SRR5885321   SRR5885322  \
0       TCONS_00002128   918.580901   259.244976   743.867429   460.021130   
1      ENST00000387314  2739.699672   347.541269  1844.771818   399.913980   
2      ENST00000389680  4460.676619  1093.170912  3362.743728   897.972000   
3       TCONS_00002129  8547.016124  2386.216764  6447.811071  1860.799667   
4      ENST00000387342  1675.129075    55.297117  1637.638926   179.115200   
...       

In [10]:
## create a merged table of counts per ctype and a big patients file
## CAN THEY BE CONSIDERED AS ONE?
for ctype in cancertypes:
    print(ctype)
    
    # Lists to store DataFrames for counts and patients
    fc_list = []
    patients_list = []
    
    for proj in all_projects:
        if proj in tcga_projects and ctype in proj:
            print(proj)
            # Read the CSV files for TCGA projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq", proj, "results/QC_patients1.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in other_projects and ctype in proj:
            print(proj)
            # Read the CSV files for other projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq", proj, "results/patients.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in manuscript_projects and ctype in proj:
            print(proj)
            # Read the CSV files for manuscript projects
            if proj == "LIHC_TCGA_LIHC":
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))
                patients['normal'] = patients['patient'] + "_normal"
                patients['tumor'] = patients['patient'] + "_tumor"

            else:
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))

            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
    
    # Merge all fc DataFrames by 'transcript_id' and 'Length'
    if fc_list:
        merged_fc = reduce(lambda left, right: pd.merge(left, right, on=["transcript_id", "Length"], how='outer'), fc_list)
        # Save the merged fc DataFrame
        merged_fc.to_csv(os.path.join(cancer_dir, f"merged_fc_{ctype}.csv"), index=False)
    
    # Combine all patient DataFrames into one big DataFrame
    if patients_list:
        merged_patients = pd.concat(patients_list, ignore_index=True)
        # Save the merged patients DataFrame
        merged_patients.to_csv(os.path.join(cancer_dir, f"merged_patients_{ctype}.csv"), index=False)


BRCA
TCGA-BRCA
BLCA
TCGA-BLCA
GSE133624_BLCA
LUAD
TCGA-LUAD
GSE229705_LUAD
KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
KIRP
TCGA-KIRP
PRAD
TCGA-PRAD
GSE22260_PRAD
PRJEB2449_PRAD
LUSC
TCGA-LUSC
LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC
COAD
TCGA_COAD
SRP107326_COAD


Here to run Q3.1 - TPM distribution to know if we can treat them as one per cancer type or if there is bias according to the dataset

Instead of selecting the tumor-specific, let's look first at the log2ratio between means


In [88]:
testisRestr = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")
testisRestr

Unnamed: 0.1,Unnamed: 0,gene_id,gene_name,gene_type,orfID,transcript_id,orfType,length,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedBrain
0,0,ENSG00000039600,SOX30,protein_coding,ENST00000265007.11:5:-|20|3283:361:2623|canoni...,ENST00000265007,canonical,2262,754,ATG,MERARPEPPPQPRPLRPAPPPLPVEGTSFWAAAMEPPPSSPTLSAA...,no,no
1,1,ENSG00000046774,MAGEC2,protein_coding,ENST00000247452.4:X:-|25|1994:349:1471|canonic...,ENST00000247452,canonical,1122,374,ATG,MPPVPGVPFRNVDNDSPTSVELEDWVDAQHPTDEEEEEASSASSTL...,no,no
2,2,ENSG00000054796,SPO11,protein_coding,ENST00000371263.8:20:+|4|1789:66:1257|canonica...,ENST00000371263,canonical,1191,397,ATG,MAFAPMGPEASFFDVLDRHRESLLAALRRGGREPPTGGSRLASSSE...,no,no
3,3,ENSG00000063515,GSC2,protein_coding,ENST00000086933.3:22:-|1|2625:10:628|canonical...,ENST00000086933,canonical,618,206,ATG,MAAAAGGAASRRGAGRPCPFSIEHILSSLPERSLPARAACPPQPAG...,no,no
4,4,ENSG00000068985,PAGE1,protein_coding,ENST00000376150.4:X:-|10|659:125:566|canonical...,ENST00000376150,canonical,441,147,ATG,MGFLRRLIYRRRPMIYVESSEESSDEQPDEVESPTQSQDSTPAEER...,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,981,XLOC_001809,XLOC_001809,novel,TCONS_00002050:9:-|489|10655:7119:7137|noncodi...,TCONS_00002050,noncoding,18,6,ATG,MHFFL*,no,no
957,982,XLOC_001809,XLOC_001809,novel,TCONS_00002050:9:-|172|10655:3074:3203|noncodi...,TCONS_00002050,noncoding,129,43,CTG,MTSLRLLKIDSTVCTTNKWWCKEFTPHFLCPPPFPLSLPPSY*,no,no
958,983,XLOC_001809,XLOC_001809,novel,TCONS_00002050:9:-|311|10655:5223:5259|noncodi...,TCONS_00002050,noncoding,36,12,ATG,MSRGTSARASF*,no,no
959,984,XLOC_001810,XLOC_001810,novel,TCONS_00002051:9:-|9|437:109:145|noncoding|ATG,TCONS_00002051,noncoding,36,12,ATG,MALPMEIFVAG*,no,no


In [23]:
## log2ratio3x & 1 TPM
log2ratio3x_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])
log2ratio3x_1TPM_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])
log2ratio3x_1TPM_general_5percent = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])
testisRestr = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")

for ctype in cancertypes:
    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = patients.melt(id_vars=['patient', 'project'], value_vars=['normal', 'tumor'], 
                    var_name='normal_tumor', value_name='sample')
    patients_long['normal_tumor'] = patients_long['normal_tumor'].replace({'normal': 'normal', 'tumor': 'tumor'})
    
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    means = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].mean().reset_index()
    means_pivot = means.pivot_table(index=['gene_name', 'gene_id', 'gene_type'], 
                          columns='normal_tumor', 
                          values='TPM').reset_index()
    means_pivot['log2ratio'] = np.log2(means_pivot['tumor']/means_pivot['normal'])

    log2ratio3x = means_pivot[means_pivot['log2ratio'] >= np.log2(3)]
    log2ratio3x_testisRestr = log2ratio3x[log2ratio3x['gene_id'].isin(testisRestr.gene_id.values.tolist())]
    # try:
    #     os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype))
    # except:
    #     continue
    log2ratio3x_testisRestr.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x.csv"), index=None)
    print(log2ratio3x_testisRestr.groupby(["gene_type"]).count())
    log2ratio3x_general = pd.concat([log2ratio3x_general, log2ratio3x_testisRestr[["gene_name","gene_id","gene_type"]]])

    ### > 1 TPM
    max = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].max().reset_index()
    ## tumor samples max has to be greater than 1TPM
    max = max[max['normal_tumor'] == "tumor"]
    max.rename(columns={'TPM':'max_TPM'}, inplace=True)

    TPM1 = max[max['max_TPM'] > 1]
    log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TPM1.gene_id.values.tolist())]
    log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"), index=None)
    log2ratio3x_1TPM_general = pd.concat([log2ratio3x_1TPM_general, log2ratio3x_1TPM[["gene_name","gene_id","gene_type"]]])

    ### > 1 TPM - 5%
    TAA_1TPM5percent = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/cancertypes/tumor_1FPKM_n5percent_pancancer.csv"))
    
    log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TAA_1TPM5percent.gene_id.values.tolist())]
    log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM5percent.csv"), index=None)
    log2ratio3x_1TPM_general_5percent = pd.concat([log2ratio3x_1TPM_general_5percent, log2ratio3x_1TPM[["gene_name","gene_id","gene_type"]]])



BRCA


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                       49       49      49     49         49
novel                         9        9       9      9          9
processed_pseudogene          4        4       4      4          4
protein_coding               96       96      96     96         96
BLCA


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                       90       90      90     90         90
novel                        36       36      36     36         36
processed_pseudogene          4        4       4      4          4
protein_coding              128      128     128    128        128
LUAD


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                       40       40      40     40         40
novel                        10       10      10     10         10
processed_pseudogene          1        1       1      1          1
protein_coding               81       81      81     81         81
KIRC


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                       57       57      57     57         57
novel                        17       17      17     17         17
processed_pseudogene          1        1       1      1          1
protein_coding               68       68      68     68         68
PRAD


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                       52       52      52     52         52
novel                        24       24      24     24         24
processed_pseudogene          1        1       1      1          1
protein_coding               81       81      81     81         81
LUSC


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                      110      110     110    110        110
novel                        36       36      36     36         36
processed_pseudogene          3        3       3      3          3
protein_coding              158      158     158    158        158
LIHC


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                      100      100     100    100        100
novel                        32       32      32     32         32
processed_pseudogene          7        7       7      7          7
protein_coding              187      187     187    187        187
COAD


  result = getattr(ufunc, method)(*inputs, **kwargs)


normal_tumor          gene_name  gene_id  normal  tumor  log2ratio
gene_type                                                         
lncRNA                      117      117     117    117        117
novel                        48       48      48     48         48
processed_pseudogene          5        5       5      5          5
protein_coding              136      136     136    136        136


In [36]:
log2ratio3x_general.drop_duplicates(inplace=True)
log2ratio3x_general.groupby("gene_type").count()


Unnamed: 0,gene_name,gene_id,gene_type
534,AC007131.2,ENSG00000233891,lncRNA
708,AC009411.1,ENSG00000228538,lncRNA
1011,AC019055.1,ENSG00000235911,lncRNA
1413,AC093716.1,ENSG00000235243,processed_pseudogene
1827,ACTL8,ENSG00000117148,protein_coding
...,...,...,...
43650,XLOC_001273,XLOC_001273,novel
43680,XLOC_001303,XLOC_001303,novel
43746,XLOC_001369,XLOC_001369,novel
43892,XLOC_001515,XLOC_001515,novel


In [35]:
log2ratio3x_1TPM_general.drop_duplicates(inplace=True)
log2ratio3x_1TPM_general.groupby("gene_type").count()
log2ratio3x_1TPM_general

Unnamed: 0,gene_name,gene_id,gene_type
1827,ACTL8,ENSG00000117148,protein_coding
1884,ADAM18,ENSG00000168619,protein_coding
4008,C10orf120,ENSG00000183559,protein_coding
4287,C4orf51,ENSG00000237136,protein_coding
4498,CALR3,ENSG00000269058,protein_coding
...,...,...,...
43650,XLOC_001273,XLOC_001273,novel
43746,XLOC_001369,XLOC_001369,novel
43892,XLOC_001515,XLOC_001515,novel
44004,XLOC_001627,XLOC_001627,novel


In [26]:
log2ratio3x_1TPM_general_5percent.drop_duplicates(inplace=True)
log2ratio3x_1TPM_general_5percent.groupby("gene_type").count()

Unnamed: 0_level_0,gene_name,gene_id
gene_type,Unnamed: 1_level_1,Unnamed: 2_level_1
lncRNA,61,61
novel,26,26
processed_pseudogene,2,2
protein_coding,101,101


In [None]:
## OVEREXPRESSED TUMOR 3X NORMAL - per cancer type - 5% or 10%
patients_overexpressed_5percent_general = pd.DataFrame(columns=["gene_name","gene_id"])
for ctype in cancertypes:
    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    log2ratio_1TPM = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"))
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]
    fc_info = fc_info[fc_info['gene_id'].isin(log2ratio_1TPM.gene_id.values.tolist())]
    # print("log2ratio 1TPM: ", len(fc_info))

    for index,patient in patients.iterrows():
        log2ratio_1TPM_TOv3x = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['normal']]] 
        # Selecting the rows where the value of tumor is double that of normal sample
        tumor1TPM_overexpressed = log2ratio_1TPM_TOv3x[log2ratio_1TPM_TOv3x.iloc[:, 5] >= 3 * log2ratio_1TPM_TOv3x.iloc[:, 6]]
        # print("log2ratio 1TPM Overexpressed 3x: ", len(tumor1TPM_overexpressed))
        try:
            os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient']))
        except:
            pass

        tumor1TPM_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient'],"tumor_3xnormal_TPMs.csv"), index=None)
        tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
        tumorspecific_candidates = pd.concat([tumorspecific_candidates,tumor1TPM_overexpressed[["transcript_id","gene_id","gene_name","gene_type","Length","patient_overexpr"]]])
    
    patients_overexpressed = tumorspecific_candidates.groupby(['gene_id','transcript_id','gene_name','gene_type','Length']).size().reset_index(name='num_patients_overexpr')
    patients_overexpressed['percentage_num_patients_overexpr'] = round(patients_overexpressed['num_patients_overexpr']/len(patients)*100,2)
    patients_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs.csv"), index=None)
    
    patients_overexpressed_5percent = patients_overexpressed[patients_overexpressed['percentage_num_patients_overexpr'] >= 5]
    patients_overexpressed_5percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs_5percent.csv"), index=None)
    print(len(patients_overexpressed_5percent)," genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients")
    patients_overexpressed_5percent['ctype'] = ctype
    patients_overexpressed_5percent_general = pd.concat([patients_overexpressed_5percent_general,patients_overexpressed_5percent])

    patients_overexpressed_10percent = patients_overexpressed[patients_overexpressed['percentage_num_patients_overexpr'] >= 10]
    patients_overexpressed_10percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs_10percent.csv"), index=None)
    print(len(patients_overexpressed_10percent)," genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients")
    
patients_overexpressed_5percent_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))

In [27]:
patients_overexpressed_5percent_general[['gene_name','gene_type']].drop_duplicates().groupby("gene_type").count()

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,108
novel,39
processed_pseudogene,6
protein_coding,172


In [20]:
## ORF level    
tumorReact = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))
testisRestr_ORFs = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")
tumorReact_ORFs = testisRestr_ORFs[testisRestr_ORFs['gene_id'].isin(tumorReact.gene_id.values.tolist())]
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEAN.csv"), index=None)
tumorReact_ORFs.groupby("gene_type").count()

Unnamed: 0_level_0,Unnamed: 0,gene_id,gene_name,orfID,transcript_id,orfType,length,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedBrain
gene_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
lncRNA,147,147,147,147,147,147,147,147,147,147,147,147
novel,60,60,60,60,60,60,60,60,60,60,60,60
processed_pseudogene,7,7,7,7,7,7,7,7,7,7,7,7
protein_coding,172,172,172,172,172,172,172,172,172,172,172,172


In [33]:
## OVEREXPRESSED TUMOR 3X NORMAL - per cancer type - 5% or 10%
patients_overexpressed_5percent_general = pd.DataFrame(columns=["gene_name","gene_id"])
for ctype in cancertypes:
    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    log2ratio_1TPM = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM5percent.csv"))
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]
    fc_info = fc_info[fc_info['gene_id'].isin(log2ratio_1TPM.gene_id.values.tolist())]
    # print("log2ratio 1TPM: ", len(fc_info))

    for index,patient in patients.iterrows():
        log2ratio_1TPM_TOv3x = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['normal']]] 
        # Selecting the rows where the value of tumor is double that of normal sample
        tumor1TPM_overexpressed = log2ratio_1TPM_TOv3x[log2ratio_1TPM_TOv3x.iloc[:, 5] >= 3 * log2ratio_1TPM_TOv3x.iloc[:, 6]]
        # print("log2ratio 1TPM Overexpressed 3x: ", len(tumor1TPM_overexpressed))
        try:
            os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient']))
        except:
            pass

        tumor1TPM_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient'],"tumor_3xnormal_TPMs.csv"), index=None)
        tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
        tumorspecific_candidates = pd.concat([tumorspecific_candidates,tumor1TPM_overexpressed[["transcript_id","gene_id","gene_name","gene_type","Length","patient_overexpr"]]])
    
    patients_overexpressed = tumorspecific_candidates.groupby(['gene_id','transcript_id','gene_name','gene_type','Length']).size().reset_index(name='num_patients_overexpr')
    patients_overexpressed['percentage_num_patients_overexpr'] = round(patients_overexpressed['num_patients_overexpr']/len(patients)*100,2)
    patients_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs.csv"), index=None)
    
    patients_overexpressed_5percent = patients_overexpressed[patients_overexpressed['percentage_num_patients_overexpr'] >= 5]
    patients_overexpressed_5percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs_5percent.csv"), index=None)
    print(len(patients_overexpressed_5percent)," genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients")
    patients_overexpressed_5percent['ctype'] = ctype
    patients_overexpressed_5percent_general = pd.concat([patients_overexpressed_5percent_general,patients_overexpressed_5percent])

    patients_overexpressed_10percent = patients_overexpressed[patients_overexpressed['percentage_num_patients_overexpr'] >= 10]
    patients_overexpressed_10percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs_10percent.csv"), index=None)
    print(len(patients_overexpressed_10percent)," genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients")
    
patients_overexpressed_5percent_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))

BRCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

79  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
79  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
BLCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

109  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
109  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
LUAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

74  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
74  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
KIRC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

46  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
46  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
PRAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

47  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
47  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
LUSC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

118  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
117  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
LIHC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

136  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
136  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients
COAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is

123  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
123  genes expressed 3x in tumor than in normal (>1TPM) in at least 10% of the patients


In [34]:
patients_overexpressed_5percent_general[['gene_name','gene_type']].drop_duplicates().groupby("gene_type").count()

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,61
novel,26
processed_pseudogene,2
protein_coding,101


In [None]:
## ORF level    
tumorReact = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))
testisRestr_ORFs = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")
tumorReact_ORFs = testisRestr_ORFs[testisRestr_ORFs['gene_id'].isin(tumorReact.gene_id.values.tolist())]
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEAN.csv"), index=None)
tumorReact_ORFs.groupby("gene_type").count()

Unnamed: 0_level_0,Unnamed: 0,gene_id,gene_name,orfID,transcript_id,orfType,length,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedBrain
gene_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
lncRNA,147,147,147,147,147,147,147,147,147,147,147,147
novel,60,60,60,60,60,60,60,60,60,60,60,60
processed_pseudogene,7,7,7,7,7,7,7,7,7,7,7,7
protein_coding,172,172,172,172,172,172,172,172,172,172,172,172
