In [33]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
specie = "human"
GENOMEDIR = "/genomics/users/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-KIRP","TCGA-LUAD","TCGA-BLCA","TCGA-LIHC"]

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [21]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        cancer_type = file[-8:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

PRAD
KIRP
LUAD


In [43]:
for proj in ['TCGA-PRAD']:
    cancer_type = proj[5:]
    print(cancer_type)

    fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv")
    tumor_transcripts = list()
    patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients.csv"))
    patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
    patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    for index,patient in patients.iterrows():
        patient_fc = fc[["transcript_id","Length",patient.iloc[2]]]

        tumor_patient_fc = patient_fc[patient_fc[patient.iloc[2]] >= 1 ]
        tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]

    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    novel = novel[novel['Length'].astype(int) < 91667]
    print("NOVEL: ",len(novel))

    # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+cancer_type+".csv",index=False)
    merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    print(tumorONLY_merged)

    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')

    # Create a new column with the count of int columns per row whose value is > 1
    tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+cancer_type+".csv",index=False)

PRAD
lncRNA:  7393
PROTEIN CODING:  15071
NOVEL:  608
         transcript_id  PRAD-CH-5761_tumor  PRAD-CH-5767_tumor  \
0       TCONS_00002128          144.470157          200.289666   
1      ENST00000387314           95.324684          178.278104   
2      ENST00000389680         1150.468400         4236.862487   
3       TCONS_00002129         2458.473628         7450.921126   
4      ENST00000387342          427.346993          293.274179   
...                ...                 ...                 ...   
26675  ENST00000600468            0.644878            0.147132   
26676  ENST00000425340            0.382016            0.193857   
26677  ENST00000318083            0.713887            0.635445   
26678  ENST00000222145            5.804824            2.531997   
26679  ENST00000645652            4.301946            6.207849   

       PRAD-CH-5768_tumor  PRAD-CH-5769_tumor  PRAD-EJ-7115_tumor  \
0              226.420020          328.384368          510.315150   
1              

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)


In [41]:
## TUMOR-SPECIFIC

Unnamed: 0,transcript_id,gene_id,gene_name,gene_type,Length,n
0,TCONS_00002128,XLOC_001837,XLOC_001837,novel,577,1
1,ENST00000387314,ENSG00000210049,MT-TF,Mt_tRNA,71,1
2,ENST00000389680,ENSG00000211459,MT-RNR1,Mt_rRNA,954,1
3,TCONS_00002129,XLOC_001838,XLOC_001838,novel,353,1
4,ENST00000387342,ENSG00000210077,MT-TV,Mt_tRNA,69,1
...,...,...,...,...,...,...
26675,ENST00000600468,ENSG00000142233,NTN5,protein_coding,2426,1
26676,ENST00000425340,ENSG00000176920,FUT2,protein_coding,3116,1
26677,ENST00000318083,ENSG00000176909,MAMSTR,protein_coding,1858,1
26678,ENST00000222145,ENSG00000105538,RASIP1,protein_coding,3199,1
