In [137]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
specie = "human"
GENOMEDIR = "/genomics/users/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
tcga_projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-KIRP","TCGA-LUAD","TCGA-BLCA"]#,"TCGA-LIHC"]
other_projects=["GSE102101_KIRC","GSE133624_BLCA","GSE22260_PRAD","GSE89223_PRAD","PRJEB2449_PRAD","SRP238334_KIRC","GSE103001_BRCA","GSE214846_LIHC","GSE229705_LUAD","TCGA_COAD"]#,"SRP107326_COAD"]
manuscript_projects = ["liver_adjacent_totalRNA_LIHC","hcc_normal_totalRNA_LIHC","GSE193567_LIHC","LIHC_TCGA_LIHC"]
all_projects = tcga_projects + other_projects + manuscript_projects

cancertypes = ["BRCA","BLCA","LUAD","KIRC","KIRP","PRAD","LUSC","LIHC"]#,"COAD"]
## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

def count_greater_than_one(row):
    return (row > 1).sum()

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [58]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        proj = file.split("featureCounts_")[-1]
        cancer_type = proj[:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            if proj in tcga_projects:
                new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

TCGA-PRAD
TCGA-LUSC
GSE89223_PRAD
TCGA-BLCA
GSE102101_KIRC
TCGA_COAD_SE
GSE103001_BRCA
TCGA-BRCA
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
GSE193567_LIHC
TCGA_COAD_PE
TCGA-KIRP
GSE22260_PRAD
GSE229705_LUAD
GSE133624_BLCA
PRJEB2449_PRAD
TCGA-LUAD
hcc_normal_totalRNA_LIHC
TCGA-KIRC
LIHC_TCGA_LIHC
SRP238334_KIRC


In [3]:
## merge single-end and paired-end TCGA COAD
SE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_SE.csv"))
PE_COAD = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD_PE.csv"))

COAD = SE_COAD.merge(PE_COAD, on=["transcript_id","Length"])
COAD.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_TCGA_COAD.csv"), index=None)

In [134]:
## filter patients we are interested in, from those not TCGA
for proj in all_projects:
    print(proj)
    if proj in tcga_projects:
        cancer_type = proj[5:]

        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
        # patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
        # patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    elif proj in other_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()
        patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))

        samples_to_keep = patients.tumor.values.tolist() + patients.normal.values.tolist() + ['Length','transcript_id']
        fc_selected = fc[fc.columns.intersection(samples_to_keep)]

    elif proj in manuscript_projects:
        fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
        tumor_transcripts = list()

        if "liver_adjacent_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/liver_adjacent_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "hcc_normal_totalRNA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/hcc_normal_totalRNA/results/clean_patients.csv"))
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "GSE193567" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/GSE193567/results/clean_patients.csv"))        
            # patients.rename(columns={'adjacent':'normal'}, inplace=True)
        elif "LIHC_TCGA" in proj:
            patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/LIHC_TCGA/results/clean_patients.csv"))    
            patients['normal'] = patients['patient'] + "_normal"
            patients['tumor'] = patients['patient'] + "_tumor"

    for index,patient in patients.iterrows():
        if proj in tcga_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())
        if proj in other_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())       
        if proj in manuscript_projects:
            patient_fc = fc[["transcript_id","Length",patient['tumor']]]

            tumor_patient_fc = patient_fc[patient_fc[patient['tumor']] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())                        

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]
    # print(tumor1FPKM)
    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")
    merged = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    ## length limitations
    novel = novel[novel['Length'].astype(int) < 91667]
    novel = novel[novel['Length'].astype(int) > 300]

    print("NOVEL: ",len(novel))

    # # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    # merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    ## keep only tumor samples
    if proj in tcga_projects:
        tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    else:
        tumorONLY_merged = merged.drop(columns=patients.normal.values.tolist())

    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_id', 'gene_type', 'gene_name','Length', *patients.tumor.values.tolist()]]
    tumorONLY_merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+proj+".csv",index=False)

    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')
    
    # Create a new column with the count of int columns per row whose value is > 1
    # tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv",index=False)

TCGA-BRCA
lncRNA:  10685
PROTEIN CODING:  16230
NOVEL:  478
TCGA-LUSC
lncRNA:  9922
PROTEIN CODING:  16280
NOVEL:  456
TCGA-PRAD
lncRNA:  7393
PROTEIN CODING:  15071
NOVEL:  385
TCGA-KIRC
lncRNA:  7978
PROTEIN CODING:  15463
NOVEL:  391
TCGA-KIRP
lncRNA:  7911
PROTEIN CODING:  15436
NOVEL:  390
TCGA-LUAD
lncRNA:  10065
PROTEIN CODING:  16087
NOVEL:  431
TCGA-BLCA
lncRNA:  7942
PROTEIN CODING:  15521
NOVEL:  373
GSE102101_KIRC
lncRNA:  6587
PROTEIN CODING:  14516
NOVEL:  307
GSE133624_BLCA
lncRNA:  8586
PROTEIN CODING:  15619
NOVEL:  425
GSE22260_PRAD
lncRNA:  10545
PROTEIN CODING:  14884
NOVEL:  482
GSE89223_PRAD
lncRNA:  8199
PROTEIN CODING:  7429
NOVEL:  302
PRJEB2449_PRAD
lncRNA:  6886
PROTEIN CODING:  13721
NOVEL:  335
SRP238334_KIRC
lncRNA:  8972
PROTEIN CODING:  15239
NOVEL:  447
GSE103001_BRCA
lncRNA:  11381
PROTEIN CODING:  15184
NOVEL:  559
GSE214846_LIHC
lncRNA:  10739
PROTEIN CODING:  15963
NOVEL:  515
GSE229705_LUAD
lncRNA:  17050
PROTEIN CODING:  16255
NOVEL:  690
TCGA_COA

In [149]:
## create a merged table of counts per ctype and a big patients file
## CAN THEY BE CONSIDERED AS ONE?
for ctype in cancertypes:
    print(ctype)
    
    # Lists to store DataFrames for counts and patients
    fc_list = []
    patients_list = []
    
    for proj in all_projects:
        if proj in tcga_projects and ctype in proj:
            print(proj)
            # Read the CSV files for TCGA projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq", proj, "results/QC_patients1.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in other_projects and ctype in proj:
            print(proj)
            # Read the CSV files for other projects
            patients = pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq", proj, "results/patients.csv"))
            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
        
        elif proj in manuscript_projects and ctype in proj:
            print(proj)
            # Read the CSV files for manuscript projects
            if proj == "LIHC_TCGA_LIHC":
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))
                patients['normal'] = patients['patient'] + "_normal"
                patients['tumor'] = patients['patient'] + "_tumor"

            else:
                patients = pd.read_csv(os.path.join("/projects_eg/projects/marta", proj[:-5], "results/clean_patients.csv"))

            patients['project'] = proj  # Add the project column
            fc = pd.read_csv(os.path.join(cancer_dir, "featureCounts", f"table_of_counts_TPMs_{proj}.csv"))
            fc_list.append(fc)
            patients_list.append(patients)
    
    # Merge all fc DataFrames by 'transcript_id' and 'Length'
    if fc_list:
        merged_fc = reduce(lambda left, right: pd.merge(left, right, on=["transcript_id", "Length"], how='outer'), fc_list)
        # Save the merged fc DataFrame
        merged_fc.to_csv(os.path.join(cancer_dir, f"merged_fc_{ctype}.csv"), index=False)
    
    # Combine all patient DataFrames into one big DataFrame
    if patients_list:
        merged_patients = pd.concat(patients_list, ignore_index=True)
        # Save the merged patients DataFrame
        merged_patients.to_csv(os.path.join(cancer_dir, f"merged_patients_{ctype}.csv"), index=False)


BRCA
TCGA-BRCA
GSE103001_BRCA
BLCA
TCGA-BLCA
GSE133624_BLCA
LUAD
TCGA-LUAD
GSE229705_LUAD
KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
KIRP
TCGA-KIRP
PRAD
TCGA-PRAD
GSE22260_PRAD
GSE89223_PRAD
PRJEB2449_PRAD
LUSC
TCGA-LUSC
LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC


In [150]:
## Expressed in > 10% of the patients
df_pancancer = pd.DataFrame(columns = ['transcript_id','gene_id','gene_name','gene_type','Length','n'])

## group by cancertype
for ctype in cancertypes:
    print(ctype)
    ctype_df = pd.DataFrame(columns = ['transcript_id','gene_id','gene_name','gene_type','Length','n'])

    num_patients_ctype = 0
    for proj in all_projects:
        if proj in tcga_projects:
            if ctype in proj:
                print(proj)
                fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
                ## count how many patients per cancer type (not dataset)
                num_patients_ctype = num_patients_ctype + len(patients)
                input = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv")
                ctype_df = pd.concat([ctype_df, input])

        elif proj in other_projects:
            if ctype in proj:
                print(proj)
                fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                tumor_transcripts = list()
                patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))
                ## count how many patients per cancer type (not dataset)
                num_patients_ctype = num_patients_ctype + len(patients)

                input = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv")
                ctype_df = pd.concat([ctype_df, input])
        elif proj in manuscript_projects:
            if ctype in proj:
                print(proj)                
                fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                tumor_transcripts = list()
                patients=pd.read_csv(os.path.join("/projects_eg/projects/marta",proj[:-5],"results/clean_patients.csv"))
                ## count how many patients per cancer type (not dataset)
                num_patients_ctype = num_patients_ctype + len(patients)

                input = pd.read_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+proj+".csv")
                ctype_df = pd.concat([ctype_df, input])


    print(num_patients_ctype)
    ctype_df_summed = ctype_df
    ctype_df_summed = ctype_df_summed.groupby(["transcript_id","gene_id","gene_name","gene_type","Length"])["n"].sum().reset_index()
    ctype_df_summed.drop_duplicates(inplace=True)
    # print(ctype_df_summed)

    percent10 = ctype_df_summed[ctype_df_summed['n'] > (num_patients_ctype*0.1)]
    percent10.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/cancertypes/tumor_1FPKM_n10percent_")+ctype+".csv",index=False)

    percent10['ctype'] = ctype
    df_pancancer = pd.concat([df_pancancer, percent10])

    df_pancancer.to_csv(os.path.join(cancer_dir,"tumorexpressed/cancertypes/tumor_1FPKM_n10percent_pancancer.csv"),index=False)

BRCA
TCGA-BRCA
GSE103001_BRCA
131


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


BLCA
TCGA-BLCA
GSE133624_BLCA
38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


LUAD
TCGA-LUAD
GSE229705_LUAD
179


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
142


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


KIRP
TCGA-KIRP
32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


PRAD
TCGA-PRAD
GSE22260_PRAD
GSE89223_PRAD
PRJEB2449_PRAD
88


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


LUSC
TCGA-LUSC
49


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC
182


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percent10['ctype'] = ctype


In [24]:
ctype_df_summed
ctype_df[ctype_df['gene_name'] == "M6PR"]

Unnamed: 0,transcript_id,gene_id,gene_name,gene_type,Length,n
1205,ENST00000000412,ENSG00000003056,M6PR,protein_coding,2450,118
1058,ENST00000000412,ENSG00000003056,M6PR,protein_coding,2450,22


In [151]:
#Select normal samples and create a file with min max expression in normal samples per cancer type
tolerance_dictionary = dict()

## group by cancertype
for ctype in cancertypes:
    print(ctype)
    num_patients_ctype = 0
    for proj in all_projects:
        if proj in tcga_projects:
            if ctype in proj:
                print(proj)

                # fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))
                num_patients_ctype = num_patients_ctype + len(patients)

        elif proj in other_projects:
            if ctype in proj:
                print(proj)
                # fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))
                num_patients_ctype = num_patients_ctype + len(patients)

        elif proj in manuscript_projects:
            if ctype in proj:
                print(proj)
                # fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
                patients=pd.read_csv(os.path.join("/projects_eg/projects/marta",proj[:-5],"results/clean_patients.csv"))
                num_patients_ctype = num_patients_ctype + len(patients)

    # print(num_patients_ctype)
    print(len(patients))

    tolerance_dictionary[ctype] = num_patients_ctype*0.05
    # tolerance_dictionary[proj] = len(patients)*0.05

BRCA
TCGA-BRCA
GSE103001_BRCA
22
BLCA
TCGA-BLCA
GSE133624_BLCA
20
LUAD
TCGA-LUAD
GSE229705_LUAD
123
KIRC
TCGA-KIRC
GSE102101_KIRC
SRP238334_KIRC
61
KIRP
TCGA-KIRP
32
PRAD
TCGA-PRAD
GSE22260_PRAD
GSE89223_PRAD
PRJEB2449_PRAD
13
LUSC
TCGA-LUSC
49
LIHC
GSE214846_LIHC
liver_adjacent_totalRNA_LIHC
hcc_normal_totalRNA_LIHC
GSE193567_LIHC
LIHC_TCGA_LIHC
47


In [139]:
tolerance_dictionary

{'BRCA': 6.550000000000001,
 'BLCA': 1.9000000000000001,
 'LUAD': 8.950000000000001,
 'KIRC': 7.1000000000000005,
 'KIRP': 1.6,
 'PRAD': 4.4,
 'LUSC': 2.45,
 'LIHC': 9.1}

In [113]:
# Function to count values above 0.1 in a row
def count_values_above_01(row):
    return sum(row > 0.1)

In [153]:
## TUMOR-SPECIFIC - per cancer type
for ctype in cancertypes:
    print(ctype)
    tumorspecific_candidates = pd.DataFrame()

    patients = pd.read_csv(cancer_dir+"/merged_patients_"+ctype+".csv")
    fc=pd.read_csv(cancer_dir+"/merged_fc_"+ctype+".csv")

    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    ## select only normal samples
    fc_normals = fc_info.drop(columns=patients.tumor.values.tolist())
    
    # Select only the columns with float data types
    float_columns = fc_normals.select_dtypes(include=['float64'])

    # Apply the function only to the float columns and store the result in a new column
    fc_normals['counts_01'] = float_columns.apply(count_values_above_01, axis=1)
    # fc_normals_info = pd.concat([fc_normals, fc_info[['gene_id','transcript_id','gene_name','gene_type']]], axis=1)
    fc_normals_info = fc_normals[['gene_id','transcript_id','gene_name','gene_type','counts_01']]


    for index,patient in patients.iterrows():
        patient_fc = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['normal']]] 
        tumor1TPM = patient_fc[patient_fc[patient['tumor']] > 1 ]
        tumor1TPM_normal01TPM = tumor1TPM[tumor1TPM[patient['normal']] < 0.1 ]

        try:
            os.makedirs(os.path.join(cancer_dir,"tumorspecific/cancertypes",ctype,patient['patient']))
        except:
            pass
        tumor1TPM_normal01TPM.to_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes",ctype,patient['patient'],"tumorspecific_TPMs.csv"), index=None)
        tumorspecific_candidates = pd.concat([tumorspecific_candidates,tumor1TPM_normal01TPM[["transcript_id","gene_id","gene_name","gene_type","Length"]]])


    summary = tumorspecific_candidates.groupby(["transcript_id","gene_id","gene_name","gene_type","Length"]).size().reset_index(name='n')
    summary.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes")+"/tumorspecific_n_"+ctype+".csv", index=False)
    ### tolerance 5% only
    summary_tolerance = fc_normals_info.merge(summary, on=["gene_id","transcript_id","gene_name","gene_type"], how="right")
    # ## Expression > 0.1 in less than 5%
    summary_tolerance_accepted = summary_tolerance[summary_tolerance['counts_01'] < tolerance_dictionary[ctype]]
    summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)
    summary_tolerance_accepted.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT.csv", index=False)
    print("Done!")

BRCA


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
BLCA


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
LUAD


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
KIRC


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
KIRP


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
PRAD


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
LUSC


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


Done!
LIHC
Done!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


In [None]:
# ## TUMOR-SPECIFIC - per dataset

# for proj in all_projects:
#     tumorspecific_candidates = pd.DataFrame()

#     if proj in tcga_projects:
#         print(proj)
#         fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
#         patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients1.csv"))

#     elif proj in other_projects:
#         print(proj)
#         fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")
#         patients=pd.read_csv(os.path.join("/users/genomics/marta/cancers_RNASeq",proj,"results/patients.csv"))

#     elif proj in manuscript_projects:
#         print(proj)
#         fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+proj+".csv")

#         if "LIHC_TCGA" in proj:
#             patients=pd.read_csv(os.path.join("/projects_eg/projects/marta/LIHC_TCGA/results/clean_patients.csv"))    
#             patients['normal'] = patients['patient'] + "_normal"
#             patients['tumor'] = patients['patient'] + "_tumor"
        
#             normals = [col for col in fc_info.columns if 'normal' in col]

#     fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
#     fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

#     ## keep only normal samples
#     if proj in tcga_projects:
#         fc_normals = fc_info[fc_info.columns.drop(list(fc_info.filter(regex='tumor')))]
#     else:
#         fc_normals = fc_info.drop(columns=patients.tumor.values.tolist())
    
#     # Select only the columns with float data types
#     float_columns = fc_normals.select_dtypes(include=['float64'])

#     # Apply the function only to the float columns and store the result in a new column
#     fc_normals['counts_01'] = float_columns.apply(count_values_above_01, axis=1)

#     fc_normals_info = pd.concat([fc_normals, fc_info[['gene_id','transcript_id','gene_name','gene_type']]], axis=1)
#     fc_normals_info = fc_normals_info[['gene_id','transcript_id','gene_name','gene_type','counts_01']]


#     for index,patient in patients.iterrows():
#         if proj in tcga_projects:
#             patient_fc = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['normal']]] 
#             tumor1TPM = patient_fc[patient_fc[patient['tumor']] > 1 ]
#             tumor1TPM_normal01TPM = tumor1TPM[tumor1TPM[patient['normal']] < 0.1 ]
#         else:
#             patient_fc = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient['tumor'],patient['adjacent']]] 
#             tumor1TPM = patient_fc[patient_fc[patient['tumor']] > 1 ]
#             tumor1TPM_normal01TPM = tumor1TPM[tumor1TPM[patient['normal']] < 0.1 ]

#         try:
#             os.makedirs(os.path.join(cancer_dir,"tumorspecific",proj,patient['patient']))
#         except:
#             print("Directory exists")
#         tumor1TPM_normal01TPM.to_csv(os.path.join(cancer_dir,"tumorspecific",proj,patient['patient'],"tumorspecific_TPMs.csv"), index=None)
#         tumorspecific_candidates = pd.concat([tumorspecific_candidates,tumor1TPM_normal01TPM[["transcript_id","gene_id","gene_name","gene_type","Length"]]])


# summary = tumorspecific_candidates.groupby(["transcript_id","gene_id","gene_name","gene_type","Length"]).size().reset_index(name='n')
# summary.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific")+"/tumorspecific_n_"+proj+".csv", index=False)
# print(len(summary))
# ### tolerance 5% only
# summary_tolerance = fc_normals_info.merge(summary, on=["gene_id","transcript_id"], how="right")
# # ## Expression > 0.1 in less than 5%
# summary_tolerance_accepted = summary_tolerance[summary_tolerance['counts_01'] < tolerance_dictionary[proj]]
# summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)
# summary_tolerance_accepted.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific")+"/tumorspecific_n_"+proj+"_TOLERANCE5PERCENT.csv", index=False)
# print(len(summary_tolerance_accepted))
# print("Done!")

## GTEx

In [162]:
GTEx=pd.read_csv(os.path.join(GENOMEDIR,"GTEx/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct"), sep="\t", skiprows=2)
GTEx.drop('Description', inplace=True, axis=1)
GTEx['Name'] = GTEx['Name'].str[:-2]
for ctype in cancertypes:
    tumorspecific = pd.read_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT.csv")
    ## outfiles
    output_full = os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_GTExfull.csv")
    output_shared = os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_inGTEx.csv")
    output_absent = os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_noGTEx.csv")

    full = GTEx[GTEx['Name'].isin(tumorspecific.gene_id.values.tolist())]
    full.to_csv(output_full, index=False)

    absent = tumorspecific[~tumorspecific['gene_id'].isin(GTEx.Name.values.tolist())]
    absent.to_csv(output_absent, index=False)

    shared = tumorspecific[tumorspecific['gene_id'].isin(GTEx.Name.values.tolist())]
    shared.to_csv(output_shared, index=False)

In [165]:
##  MEDIAN TPM = 0.5
for ctype in cancertypes:
    print(ctype)
    ### GTEx output
    fullGTEx = pd.read_csv(os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_GTExfull.csv"))
    ### present in GTEx
    inGTEx = pd.read_csv(os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_inGTEx.csv"))
    ### absent in GTEx
    noGTEx = pd.read_csv(os.path.join(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_"+ctype+"_TOLERANCE5PERCENT_noGTEx.csv"))

    to_save = fullGTEx[['Name','Testis','Ovary']]
    fullGTEx.drop(['Testis','Ovary','Name'], axis=1, inplace=True)

    # True False greater than 0
    inGTEx_05 = fullGTEx.gt(0.5)

    # count True columns (greater than 0)
    fullGTEx['gt'] = inGTEx_05[(inGTEx_05 == True)].count(axis='columns')

    # get rows with no grater than 0
    selected_in_GTEx = fullGTEx.loc[fullGTEx['gt'] == 0]
    selected_in_GTEx.drop('gt', axis=1, inplace=True)

    selected_in_GTEx = pd.merge(to_save, selected_in_GTEx , left_index=True, right_index=True)
    selected_in_GTEx.rename(columns={'Name':'gene_id'}, inplace=True)
    selected_in_GTEx.to_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/GTExvalues_"+ctype+"_TOLERANCE5PERCENT.csv", index=False)

    afterGTEx = pd.concat([inGTEx[inGTEx['gene_id'].isin(selected_in_GTEx.gene_id.values.tolist())], noGTEx])
    afterGTEx.to_csv(os.path.join(cancer_dir,"tumorspecific/cancertypes/GTEx")+"/tumorspecific_n_GTEx_"+ctype+"_TOLERANCE5PERCENT.csv", index=False)


BRCA
BLCA
LUAD


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)


KIRC
KIRP
PRAD
LUSC


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)


LIHC


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_in_GTEx.drop('gt', axis=1, inplace=True)
