In [2]:
import os,re,glob,csv
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47"
specie = "human"
# GENOMEDIR = "/genomics/users/marta/genomes"
GENOMEDIR = "/data/genomics/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
tcga_projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-LUAD","TCGA-BLCA"]#,"TCGA-LIHC"]
other_projects=["GSE102101_KIRC","GSE133624_BLCA","GSE22260_PRAD","PRJEB2449_PRAD","SRP238334_KIRC","GSE214846_LIHC","GSE229705_LUAD","SRP107326_COAD","TCGA_COAD"]
# deleted=["GSE103001_BRCA","GSE89223_PRAD"]
manuscript_projects = ["liver_adjacent_totalRNA_LIHC","hcc_normal_totalRNA_LIHC","GSE193567_LIHC","LIHC_TCGA_LIHC"]
all_projects = tcga_projects + other_projects + manuscript_projects

cancertypes = ["BRCA","BLCA","LUAD","KIRC","PRAD","LUSC","LIHC","COAD"]
## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")

def count_greater_than_one(row):
    return (row > 1).sum()

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [9]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        proj = file.split("featureCounts_")[-1]
        cancer_type = proj[:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc = toc[toc['Geneid'].str.contains('PAR_')==False]
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            if proj in tcga_projects:
                new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

TCGA-BRCA
GSE102101_KIRC
TCGA-LUSC
TCGA-PRAD
TCGA-BLCA
GSE229705_LUAD
PRJEB2449_PRAD
GSE133624_BLCA
TCGA_COAD_SE
GSE22260_PRAD
GSE214846_LIHC
TCGA_COAD_PE
liver_adjacent_totalRNA_LIHC
TCGA-KIRC
hcc_normal_totalRNA_LIHC
SRP107326_COAD
TCGA-LUAD
GSE193567_LIHC
SRP238334_KIRC
LIHC_TCGA_LIHC


In [10]:
## merge single-end and paired-end TCGA COAD
SE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_SE.csv"))
print(len(SE_COAD.columns))
PE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_PE.csv"))
print(len(PE_COAD.columns))

COAD = SE_COAD.merge(PE_COAD, on=["transcript_id","Length"])
COAD.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD.csv"), index=None)
print(len(COAD.columns))

19
66
83


In [11]:
## filter patients we are interested in, from those not TCGA
for proj in all_projects:
    print(proj)
    if proj in tcga_projects:
        cancer_type = proj[5:]

        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
        print(len(patients)," patients")
        # patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
        # patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    elif proj in other_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        print(fc)
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))
        print(len(patients)," patients")

        samples_to_keep = patients.tumor.values.tolist() + patients.normal.values.tolist() + ['Length','transcript_id']
        fc_selected = fc[fc.columns.intersection(samples_to_keep)]

    elif proj in manuscript_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()

        if "liver_adjacent_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/liver_adjacent_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "hcc_normal_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/hcc_normal_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "GSE193567" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/GSE193567/results/clean_patients.csv"))        
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "LIHC_TCGA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/LIHC_TCGA/results/clean_patients.csv"))    
            patients['normal'] = patients['patient'] + "_normal"
            patients['tumor'] = patients['patient'] + "_tumor"
        print(len(patients)," patients")

    for index,patient in patients.iterrows():
        if proj in tcga_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())
        if proj in other_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())       
        if proj in manuscript_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())                        

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]
    # print(tumor1FPKM)
    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")
    merged = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    ## length limitations
    novel = novel[novel['Length'].astype(int) < 91667]
    novel = novel[novel['Length'].astype(int) > 300]

    print("NOVEL: ",len(novel))

    # # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    # merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    ## keep only tumor samples
    if proj in tcga_projects:
        tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    else:
        tumorONLY_merged = merged.drop(columns=patients.normal.values.tolist())

    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_id', 'gene_type', 'gene_name','Length', *patients.tumor.values.tolist()]]
    tumorONLY_merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+proj+".csv",index=False)

    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')
    
    # Create a new column with the count of int columns per row whose value is > 1
    # tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv",index=False)

TCGA-BRCA
109  patients
lncRNA:  15782
PROTEIN CODING:  17312
NOVEL:  831
TCGA-LUSC
49  patients
lncRNA:  14551
PROTEIN CODING:  17327
NOVEL:  782
TCGA-PRAD
52  patients
lncRNA:  10397
PROTEIN CODING:  15946
NOVEL:  667
TCGA-KIRC
71  patients
lncRNA:  11394
PROTEIN CODING:  16417
NOVEL:  650
TCGA-LUAD
56  patients
lncRNA:  14343
PROTEIN CODING:  17105
NOVEL:  732
TCGA-BLCA
18  patients
lncRNA:  11269
PROTEIN CODING:  16528
NOVEL:  634
GSE102101_KIRC
         transcript_id   SRR5885319   SRR5885320   SRR5885321   SRR5885322  \
0       TCONS_00002635   828.245234   202.053655   664.505097   358.740771   
1      ENST00000387314  2470.270384   270.871146  1647.955305   311.867086   
2      ENST00000389680  4022.001922   852.009488  3003.976595   700.270370   
3       TCONS_00002636  7706.480029  1859.800055  5759.901768  1451.117488   
4      ENST00000387342  1510.392466    43.098172  1462.921174   139.680377   
...                ...          ...          ...          ...          ...   


In [12]:
## create a merged table of counts per ctype and a big patients file
## CAN THEY BE CONSIDERED AS ONE?
for ctype in cancertypes:
    print(ctype)
    
    # Lists to store DataFrames for counts and patients
    fc_list = []
    patients_list = []
    
    for proj in all_projects:
        if proj in tcga_projects and ctype in proj:
            print(proj)
            # Read the CSV files for TCGA projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq", proj, "results/QC_patients1.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in other_projects and ctype in proj:
            print(proj)
            # Read the CSV files for other projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq", proj, "results/patients.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in manuscript_projects and ctype in proj:
            print(proj)
            # Read the CSV files for manuscript projects
            if proj == "LIHC_TCGA_LIHC":
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))
                patients['normal'] = patients['patient'] + "_normal"
                patients['tumor'] = patients['patient'] + "_tumor"

            else:
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))

            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
    
    # Merge all fc DataFrames by 'transcript_id' and 'Length'
    if fc_list:
        merged_fc = reduce(lambda left, right: pd.merge(left, right, on=["transcript_id", "Length"], how='outer'), fc_list)
        # Save the merged fc DataFrame
        merged_fc.to_csv(os.path.join(cancer_dir, f"merged_fc_{ctype}.csv"), index=False)
    
    # Combine all patient DataFrames into one big DataFrame
    if patients_list:
        merged_patients = pd.concat(patients_list, ignore_index=True)
        # Save the merged patients DataFrame
        merged_patients.to_csv(os.path.join(cancer_dir, f"merged_patients_{ctype}.csv"), index=False)


BRCA
TCGA-BRCA
BLCA
TCGA-BLCA
GSE133624_BLCA
LUAD
TCGA-LUAD
GSE229705_LUAD
KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
PRAD
TCGA-PRAD
GSE22260_PRAD
PRJEB2449_PRAD
LUSC
TCGA-LUSC
LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC
COAD
SRP107326_COAD
TCGA_COAD


Here to run Q3.1 - TPM distribution to know if we can treat them as one per cancer type or if there is bias according to the dataset

Instead of selecting the tumor-specific, let's look first at the log2ratio between means


In [13]:
testisRestr = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Multimap_altORFs/Q2_TestisRestricted/human/testisRestricted_GTEx_translatedONLYtestis.noProteome.csv")
testisRestr

Unnamed: 0.1,Unnamed: 0,gene_id,gene_name,orfID,transcript_id,gene_type,orfType,length,geneORFtype,length_aa,start_codon,ORFpep,TranslatedLiver,TranslatedBrain
0,0,ENSG00000039600,SOX30,ENST00000265007.11:5:-|9|3283:233:491|ouORF|CTG,ENST00000265007,protein_coding,ouORF,258,protein_coding_ouORF,86,CTG,MRFEPRRVLGSKGLARLLTERGRGKVANKRLSSQSWPGRPPPPWRE...,no,no
1,1,ENSG00000039600,SOX30,ENST00000265007.11:5:-|14|3283:361:2623|canoni...,ENST00000265007,protein_coding,canonical,2262,protein_coding_canonical,754,ATG,MERARPEPPPQPRPLRPAPPPLPVEGTSFWAAAMEPPPSSPTLSAA...,no,no
2,2,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|1|1994:91:148|uORF|CTG,ENST00000247452,protein_coding,uORF,57,protein_coding_uORF,19,CTG,MASPQGEGPEEELRDLPP*,no,no
3,3,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|15|1994:349:1471|canonic...,ENST00000247452,protein_coding,canonical,1122,protein_coding_canonical,374,ATG,MPPVPGVPFRNVDNDSPTSVELEDWVDAQHPTDEEEEEASSASSTL...,no,no
4,4,ENSG00000046774,MAGEC2,ENST00000247452.4:X:-|8|1994:285:339|uORF|CTG,ENST00000247452,protein_coding,uORF,54,protein_coding_uORF,18,CTG,MLDLIIHIPVDTFTCCS*,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038,1078,XLOC_002365,XLOC_002365,TCONS_00002556:9:-|534|10655:9766:9832|noncodi...,TCONS_00002556,novel,noncoding,66,novel_noncoding,22,TTG,MPISTEGQKQGNEPPSGYRKE*,no,no
1039,1079,XLOC_002365,XLOC_002365,TCONS_00002556:9:-|561|10655:10374:10479|nonco...,TCONS_00002556,novel,noncoding,105,novel_noncoding,35,ATG,MFSNENTWTQGGERQRVGNVTCQSLSGAAGLGEG*,no,no
1040,1080,XLOC_002365,XLOC_002365,TCONS_00002556:9:-|130|10655:3631:3685|noncodi...,TCONS_00002556,novel,noncoding,54,novel_noncoding,18,ATG,MNCFPNRIWIFHHHLTL*,no,no
1041,1081,XLOC_002366,XLOC_002366,TCONS_00002557:9:-|14|437:244:337|noncoding|CTG,TCONS_00002557,novel,noncoding,93,novel_noncoding,31,CTG,MKPIFRLEFLLLFFFLSYCVSPSPESTGVM*,no,no


In [14]:
## log2ratio3x & 1 TPM
log2ratio3x_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])
log2ratio3x_1TPM_general = pd.DataFrame(columns=["gene_name","gene_id","gene_type","ctype"])
log2ratio3x_1TPM_general_5percent = pd.DataFrame(columns=["gene_name","gene_id","gene_type"])

for ctype in cancertypes:

    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = patients.melt(id_vars=['patient', 'project'], value_vars=['normal', 'tumor'], 
                    var_name='normal_tumor', value_name='sample')
    patients_long['normal_tumor'] = patients_long['normal_tumor'].replace({'normal': 'normal', 'tumor': 'tumor'})
    
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    means = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].mean().reset_index()
    means_pivot = means.pivot_table(index=['gene_name', 'gene_id', 'gene_type'], 
                          columns='normal_tumor', 
                          values='TPM').reset_index()

    # Avoid division by zero by adding a small constant (epsilon)
    epsilon = 1e-6  # A small value to prevent division by zero
    means_pivot['tumor'] = pd.to_numeric(means_pivot['tumor'], errors='coerce')
    means_pivot['normal'] = pd.to_numeric(means_pivot['normal'], errors='coerce')
    means_pivot['log2ratio'] = np.log2((means_pivot['tumor'] + epsilon) / (means_pivot['normal'] + epsilon))

    log2ratio3x = means_pivot[means_pivot['log2ratio'] >= np.log2(3)]
    log2ratio3x_testisRestr = log2ratio3x[log2ratio3x['gene_id'].isin(testisRestr.gene_id.values.tolist())]

    # try:
    #     os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype))
    # except:
    #     continue
    log2ratio3x_testisRestr.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x.csv"), index=None)
    log2ratio3x_general = pd.concat([log2ratio3x_general, log2ratio3x_testisRestr[["gene_name","gene_id","gene_type"]]])

    ### > 1 TPM
    max = fc_info_long.groupby(["gene_name","gene_id","gene_type","normal_tumor"])['TPM'].max().reset_index()
    ## tumor samples max has to be greater than 1TPM
    max = max[max['normal_tumor'] == "tumor"]
    max.rename(columns={'TPM':'max_TPM'}, inplace=True)

    TPM1 = max[max['max_TPM'] > 1]
    log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TPM1.gene_id.values.tolist())]
    log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"), index=None)
    log2ratio3x_1TPM['ctype'] = ctype
    log2ratio3x_1TPM_general = pd.concat([log2ratio3x_1TPM_general, log2ratio3x_1TPM[["gene_name","gene_id","gene_type","ctype"]]])

    ### > 1 TPM - 5%
    # TAA_1TPM5percent = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/cancertypes/tumor_1FPKM_n5percent_pancancer.csv"))
    
    # log2ratio3x_1TPM = log2ratio3x_testisRestr[log2ratio3x_testisRestr['gene_id'].isin(TAA_1TPM5percent.gene_id.values.tolist())]
    # log2ratio3x_1TPM.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM5percent.csv"), index=None)
    # log2ratio3x_1TPM_general_5percent = pd.concat([log2ratio3x_1TPM_general_5percent, log2ratio3x_1TPM[["gene_name","gene_id","gene_type"]]])

log2ratio3x_1TPM_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_genes.csv"), index=None)

BRCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


BLCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


LUAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


KIRC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


PRAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


LUSC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


LIHC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


COAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log2ratio3x_1TPM['ctype'] = ctype


In [15]:
log2ratio3x_1TPM_general.drop_duplicates(inplace=True)
log2ratio3x_1TPM_general[['gene_name','gene_type']].groupby("gene_type").count()
# log2ratio3x_1TPM_general

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,180
novel,91
processed_pseudogene,15
protein_coding,551


In [16]:
## ORF level   
log2ratio3x_1TPM_general = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_genes.csv"))
tumorReact_ORFs = log2ratio3x_1TPM_general.merge(testisRestr, on=["gene_name","gene_id","gene_type"])
tumorReact_ORFs = tumorReact_ORFs[['gene_name','gene_id','transcript_id','gene_type','length','ctype','orfType','geneORFtype','length_aa','start_codon','ORFpep']]
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/log2ratio3x_1TPM_ORFs.csv"), index=None)
tumorReact_ORFs.groupby("geneORFtype").count()
tumorReact_ORFs[['gene_name','length_aa','geneORFtype']].drop_duplicates().groupby("geneORFtype").count()
tumorReact_ORFs[['gene_name','length_aa','geneORFtype','ctype']].drop_duplicates().groupby(["ctype","geneORFtype"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name,length_aa
ctype,geneORFtype,Unnamed: 2_level_1,Unnamed: 3_level_1
BLCA,lncRNA_noncoding,31,31
BLCA,novel_noncoding,14,14
BLCA,processed_pseudogene_noncoding,1,1
BLCA,protein_coding_canonical,79,79
BLCA,protein_coding_odORF,1,1
BLCA,protein_coding_ouORF,9,9
BLCA,protein_coding_uORF,15,15
BRCA,lncRNA_noncoding,15,15
BRCA,novel_noncoding,7,7
BRCA,protein_coding_canonical,70,70


In [17]:
## OVEREXPRESSED TUMOR 3X NORMAL - per cancer type - 5%
tumorspecific_candidates_All = pd.DataFrame()
patients_overexpressed_5percent_general = pd.DataFrame(columns=["gene_name","gene_id"])
TSTR_all_5percent = pd.DataFrame(columns=["gene_name","gene_id"])
for ctype in cancertypes:
# for ctype in ["LIHC"]:
    print(ctype)

    tumorspecific_candidates_ctype = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = pd.melt(patients, id_vars=["patient","project"], ignore_index=False).reset_index()
    patients_long.drop("index", axis=1, inplace=True)
    patients_long.rename(columns={"variable":"normal_tumor","value":"sample"}, inplace=True)

    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    log2ratio_1TPM = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"))
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]
    # print("log2ratio 1TPM: ", len(fc_info))

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    ## create full file with TPM per patient and ctype
    fc_info_long['ctype'] = ctype

    for index,patient in patients.iterrows():
        log2ratio_1TPM_TOv3x = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['normal']]] 
        log2ratio_1TPM_TOv3x = log2ratio_1TPM_TOv3x[log2ratio_1TPM_TOv3x['gene_id'].isin(log2ratio_1TPM.gene_id.values.tolist())]
        # Selecting the rows where the value of tumor is double that of normal sample
        ## overexpressed
        ## in that patient > 1 TPM
        log2ratio_1TPM_TOv3x = log2ratio_1TPM_TOv3x[log2ratio_1TPM_TOv3x.iloc[:, 5] > 1]
        ## and 3 times more than normal
        tumor1TPM_overexpressed = log2ratio_1TPM_TOv3x[
    (log2ratio_1TPM_TOv3x.iloc[:, 5] >= 3 * log2ratio_1TPM_TOv3x.iloc[:, 6]) & 
    ~((log2ratio_1TPM_TOv3x.iloc[:, 5] == 0) & (log2ratio_1TPM_TOv3x.iloc[:, 6] == 0))]
                # print("log2ratio 1TPM Overexpressed 3x: ", len(tumor1TPM_overexpressed))
        try:
            os.makedirs(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient']))
        except:
            pass

        tumor1TPM_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,patient['patient'],"tumor_3xnormal_TPMs.csv"), index=None)
        tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
        tumor1TPM_overexpressed['ctype'] = ctype
        tumorspecific_candidates_ctype = pd.concat([tumorspecific_candidates_ctype,tumor1TPM_overexpressed[["transcript_id","gene_id","gene_name","gene_type","Length","patient_overexpr","ctype"]]])
    
        tumorspecific_candidates_All = pd.concat([tumorspecific_candidates_All,tumor1TPM_overexpressed[["transcript_id","gene_id","gene_name","gene_type","Length","patient_overexpr","ctype"]]])

    # in how many patients of a specific ctype is the gene overexpressed?
    patients_overexpressed = tumorspecific_candidates_ctype[['gene_id','transcript_id','gene_name','gene_type','Length','ctype']].groupby(['gene_id','transcript_id','gene_name','gene_type','Length','ctype']).size().reset_index(name='num_patients_overexpr')
    patients_overexpressed['percentage_num_patients_overexpr'] = round(patients_overexpressed['num_patients_overexpr']/len(set(patients.patient.values.tolist()))*100,2)
    # print(patients_overexpressed['percentage_num_patients_overexpr'].max())
    patients_overexpressed.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs.csv"), index=None)
    
    patients_overexpressed_5percent = patients_overexpressed[patients_overexpressed['percentage_num_patients_overexpr'] >= 5]
    patients_overexpressed_5percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"tumor_3xnormal_TPMs_5percent.csv"), index=None)
    print(len(patients_overexpressed_5percent)," genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients")
    patients_overexpressed_5percent_general = pd.concat([patients_overexpressed_5percent_general,patients_overexpressed_5percent])

    # fc_info_long_5percent = fc_info_long[fc_info_long['gene_id'].isin(tumor1TPM_overexpressed.gene_id.values.tolist())]

    # TSTR_all_5percent = pd.concat([TSTR_all_5percent, fc_info_long_5percent])

patients_overexpressed_5percent_general.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))
tumorspecific_candidates_5percent = tumorspecific_candidates_All.merge(patients_overexpressed_5percent_general[["ctype","gene_id","percentage_num_patients_overexpr"]], on=["ctype","gene_id"])
tumorspecific_candidates_5percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes_patients.csv"))

# TSTR_all_5percent.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TSTR_Expression.csv"))

BRCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

22  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
BLCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

53  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
LUAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

19  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
KIRC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

11  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
PRAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

10  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
LUSC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

66  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
LIHC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

65  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients
COAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['ctype'] = ctype
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumor1TPM_overexpressed['patient_overexpr'] = patient['patient']
A value is trying to be set on a c

16  genes expressed 3x in tumor than in normal (>1TPM) in at least 5% of the patients


In [18]:
## get table of counts of candidates
candidates = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes_patients.csv"))
candidates

full_table_of_counts = pd.DataFrame()
for ctype in cancertypes:
    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    patients_long = pd.melt(patients, id_vars=["patient","project"], ignore_index=False).reset_index()
    patients_long.drop("index", axis=1, inplace=True)
    patients_long.rename(columns={"variable":"normal_tumor","value":"sample"}, inplace=True)

    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    log2ratio_1TPM = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes",ctype,"log2ratio3x_1TPM.csv"))
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]
    # print("log2ratio 1TPM: ", len(fc_info))

    fc_info_long = pd.melt(fc_info, id_vars=["transcript_id","Length", "gene_id", "gene_type", "gene_name"], ignore_index=False).reset_index()
    fc_info_long.rename(columns={"variable":"sample","value":"TPM"}, inplace=True)
    fc_info_long = fc_info_long.merge(patients_long, on=["sample"])

    ## create full file with TPM per patient and ctype
    fc_info_long['ctype'] = ctype

    full_table_of_counts = pd.concat([full_table_of_counts, fc_info_long])

# full_table_of_counts.to_csv(os.path.join(cancer_dir,"full_table_of_counts.csv"))


In [19]:
merged = pd.merge(
    full_table_of_counts,
    candidates[['gene_id', 'patient_overexpr']],  # Only keep relevant columns for the merge
    left_on=['gene_id', 'patient'],
    right_on=['gene_id', 'patient_overexpr'],
    how='left'
)
merged = merged[merged['gene_id'].isin(candidates.gene_id.values.tolist())]
merged.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TSTR_Expression.csv"), index=False)

# Create the new column 'forced_TPM' using numpy.where
merged['forced_TPM'] = np.where(merged['patient_overexpr'].notna(), merged['TPM'], 0)
merged_tumor = merged[merged['normal_tumor'] == "tumor"]
merged_tumor.drop(columns=['patient_overexpr','index','Length','sample'], inplace=True)
merged_tumor.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TSTR_Expression_tumor.csv"), index=False)

merged_normal = merged[merged['normal_tumor'] == "normal"]
merged_normal.drop(columns=['patient_overexpr','index','Length','sample'], inplace=True)
merged_normal.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TSTR_Expression_normal.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_tumor.drop(columns=['patient_overexpr','index','Length','sample'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_normal.drop(columns=['patient_overexpr','index','Length','sample'], inplace=True)


In [20]:
patients_overexpressed_5percent_general[['gene_name','gene_type']].drop_duplicates().groupby("gene_type").count()

Unnamed: 0_level_0,gene_name
gene_type,Unnamed: 1_level_1
lncRNA,25
novel,22
processed_pseudogene,1
protein_coding,92


In [21]:
patients_overexpressed_5percent_general[['gene_name','gene_type','ctype']].drop_duplicates().groupby(["gene_type","ctype"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gene_name
gene_type,ctype,Unnamed: 2_level_1
lncRNA,BLCA,9
lncRNA,BRCA,1
lncRNA,COAD,1
lncRNA,KIRC,3
lncRNA,LIHC,11
lncRNA,LUAD,5
lncRNA,LUSC,10
lncRNA,PRAD,2
novel,BLCA,6
novel,BRCA,4


In [22]:
## ORF level   
patients_overexpressed_5percent_general = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))
tumorReact_ORFs = patients_overexpressed_5percent_general.merge(testisRestr, on=["gene_name","gene_id","gene_type","transcript_id"])
tumorReact_ORFs = tumorReact_ORFs[['gene_name','gene_id','transcript_id','gene_type','orfID','length','num_patients_overexpr','percentage_num_patients_overexpr','ctype','orfType','geneORFtype','length_aa','start_codon','ORFpep']]
tumorReact_ORFs.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEAN.csv"), index=None)
tumorReact_ORFs.groupby("gene_type").count()

tumorReact_ORFs

Unnamed: 0,gene_name,gene_id,transcript_id,gene_type,orfID,length,num_patients_overexpr,percentage_num_patients_overexpr,ctype,orfType,geneORFtype,length_aa,start_codon,ORFpep
0,MAGEC2,ENSG00000046774,ENST00000247452,protein_coding,ENST00000247452.4:X:-|1|1994:91:148|uORF|CTG,57,8.0,7.34,BRCA,uORF,protein_coding_uORF,19,CTG,MASPQGEGPEEELRDLPP*
1,MAGEC2,ENSG00000046774,ENST00000247452,protein_coding,ENST00000247452.4:X:-|15|1994:349:1471|canonic...,1122,8.0,7.34,BRCA,canonical,protein_coding_canonical,374,ATG,MPPVPGVPFRNVDNDSPTSVELEDWVDAQHPTDEEEEEASSASSTL...
2,MAGEC2,ENSG00000046774,ENST00000247452,protein_coding,ENST00000247452.4:X:-|8|1994:285:339|uORF|CTG,54,8.0,7.34,BRCA,uORF,protein_coding_uORF,18,CTG,MLDLIIHIPVDTFTCCS*
3,MAGEC2,ENSG00000046774,ENST00000247452,protein_coding,ENST00000247452.4:X:-|2|1994:165:240|uORF|CTG,75,8.0,7.34,BRCA,uORF,protein_coding_uORF,25,CTG,MYCAAVRLVLQEPGGDELGVRHTA*
4,MAGEC2,ENSG00000046774,ENST00000247452,protein_coding,ENST00000247452.4:X:-|1|1994:91:148|uORF|CTG,57,3.0,7.89,BLCA,uORF,protein_coding_uORF,19,CTG,MASPQGEGPEEELRDLPP*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366,XLOC_001654,XLOC_001654,TCONS_00001755,novel,TCONS_00001755:3:-|33|2041:1080:1203|noncoding...,123,62.0,34.07,LIHC,noncoding,novel_noncoding,41,ATG,MRPSLALFRLYSELRKYCMFTGNMGKCRNTLTAAYIPSSE*
367,PRSS54,ENSG00000103023,ENST00000219301,protein_coding,ENST00000219301.8:16:-|4|1810:52:115|uORF|CTG,63,31.0,21.53,COAD,uORF,protein_coding_uORF,21,CTG,MCQASLSSFLLEQEQFMDEL*
368,PRSS54,ENSG00000103023,ENST00000219301,protein_coding,ENST00000219301.8:16:-|109|1810:1568:1616|odOR...,48,31.0,21.53,COAD,odORF,protein_coding_odORF,16,TTG,MQQYLVQELPHQTEE*
369,PRSS54,ENSG00000103023,ENST00000219301,protein_coding,ENST00000219301.8:16:-|26|1810:396:1584|canoni...,1188,31.0,21.53,COAD,canonical,protein_coding_canonical,396,ATG,MVSAAGLSGDGKMRGVLLVLLGLLYSSTSCGVQKASVFYGPDPKEG...


In [None]:
tumorReact_ORFs_temp = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEAN.csv"))
tumorReact_ORFs_temp_ORFs = tumorReact_ORFs_temp[['orfID','geneORFtype']]
tumorReact_ORFs_temp_ORFs.drop_duplicates(inplace=True)
tumorReact_ORFs_temp_ORFs.groupby("geneORFtype").count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_ORFs_temp_ORFs.drop_duplicates(inplace=True)


Unnamed: 0_level_0,orfID
geneORFtype,Unnamed: 1_level_1
lncRNA_noncoding,35
novel_noncoding,30
processed_pseudogene_noncoding,1
protein_coding_canonical,90
protein_coding_dORF,4
protein_coding_odORF,1
protein_coding_ouORF,10
protein_coding_uORF,18


In [None]:
tumorReact_ORFs_temp_genes = tumorReact_ORFs_temp[['gene_name','geneORFtype']]
tumorReact_ORFs_temp_genes.drop_duplicates(inplace=True)
tumorReact_ORFs_temp_genes.groupby("geneORFtype").count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_ORFs_temp_genes.drop_duplicates(inplace=True)


Unnamed: 0_level_0,gene_name
geneORFtype,Unnamed: 1_level_1
lncRNA_noncoding,25
novel_noncoding,22
processed_pseudogene_noncoding,1
protein_coding_canonical,90
protein_coding_dORF,4
protein_coding_odORF,1
protein_coding_ouORF,10
protein_coding_uORF,14


In [11]:
tumorReact_ORFs_temp_genes_ctype = tumorReact_ORFs_temp[['orfID','geneORFtype','ctype']]
tumorReact_ORFs_temp_genes_ctype.drop_duplicates(inplace=True)
tumorReact_ORFs_temp_genes_ctype.groupby(["geneORFtype","ctype"]).count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_ORFs_temp_genes_ctype.drop_duplicates(inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,orfID
geneORFtype,ctype,Unnamed: 2_level_1
lncRNA_noncoding,BLCA,10
lncRNA_noncoding,BRCA,1
lncRNA_noncoding,COAD,1
lncRNA_noncoding,KIRC,4
lncRNA_noncoding,LIHC,12
lncRNA_noncoding,LUAD,6
lncRNA_noncoding,LUSC,18
lncRNA_noncoding,PRAD,3
novel_noncoding,BLCA,12
novel_noncoding,BRCA,4


In [23]:
tumorReact_ORFs[['gene_name','length_aa','geneORFtype']].drop_duplicates().groupby("geneORFtype").count()
# tumorReact_ORFs[['gene_name','length_aa','geneORFtype','ctype']].drop_duplicates().groupby(["ctype","geneORFtype"]).count()


Unnamed: 0_level_0,gene_name,length_aa
geneORFtype,Unnamed: 1_level_1,Unnamed: 2_level_1
lncRNA_noncoding,35,35
novel_noncoding,30,30
processed_pseudogene_noncoding,1,1
protein_coding_canonical,90,90
protein_coding_dORF,4,4
protein_coding_odORF,1,1
protein_coding_ouORF,10,10
protein_coding_uORF,18,18


In [24]:
set(tumorReact_ORFs[tumorReact_ORFs['geneORFtype'] == "protein_coding_uORF"].gene_name.values.tolist())

{'COX7B2',
 'DCAF8L2',
 'DYDC1',
 'MAGEA1',
 'MAGEB1',
 'MAGEC1',
 'MAGEC2',
 'PLSCR2',
 'PRSS54',
 'RFPL4B',
 'SMIM47',
 'SPANXB1',
 'TEX13C',
 'TPTE'}

Generate bed/gtf file for candidates (TSTR) to look for chromatin acessibility (Cova)

In [25]:
TSTR = pd.read_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv"))
TSTR
print(len(set(TSTR.transcript_id.values.tolist())))

annotation_gtf = pd.read_csv(annotation, sep="\t", comment="#", header=None)
annotation_gtf = annotation_gtf[annotation_gtf[2] == "transcript"]
annotation_gtf


140


  annotation_gtf = pd.read_csv(annotation, sep="\t", comment="#", header=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8
1,M,StringTie,transcript,1,577,.,-,.,"transcript_id ""TCONS_00002635""; gene_id ""XLOC_..."
2,M,ENSEMBL,transcript,577,647,.,+,.,"gene_id ""ENSG00000210049.1""; transcript_id ""EN..."
5,M,ENSEMBL,transcript,648,1601,.,+,.,"gene_id ""ENSG00000211459.2""; transcript_id ""EN..."
9,M,StringTie,transcript,683,1035,.,-,.,"transcript_id ""TCONS_00002636""; gene_id ""XLOC_..."
11,M,ENSEMBL,transcript,1602,1670,.,+,.,"gene_id ""ENSG00000210077.1""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
785670,KI270751.1,HAVANA,transcript,34175,73722,.,-,.,"gene_id ""ENSG00000303867.1""; transcript_id ""EN..."
785672,KI270751.1,HAVANA,transcript,44958,53274,.,+,.,"gene_id ""ENSG00000303902.1""; transcript_id ""EN..."
785678,KI270751.1,HAVANA,transcript,133801,139253,.,-,.,"gene_id ""ENSG00000306528.1""; transcript_id ""EN..."
785682,KI270753.1,HAVANA,transcript,43135,44491,.,+,.,"gene_id ""ENSG00000297844.1""; transcript_id ""EN..."


In [26]:
# Convert the list of values into a regex pattern
pattern = '|'.join(TSTR.transcript_id.values.tolist())

# Filter the DataFrame
filtered_annotation = annotation_gtf[annotation_gtf[8].str.contains(pattern)]

filtered_annotation.to_csv(os.path.join(cancer_dir,"log2ratio3x/cancertypes/TSTR_annotation.gtf"), sep="\t", index=None, header=None, quoting=csv.QUOTE_NONE)
