In [19]:
import os,re,glob
import pandas as pd
import numpy as np
from collections import Counter
from rna_seq_normalization import Normalization as Norm

users_dir = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction"
specie = "human"
GENOMEDIR = "/genomics/users/marta/genomes"

### cancer data
cancer_dir = users_dir + "/cancers"
raw_cancer_dir="/users/genomics/marta/TCGA_RNASeq"
projects=["TCGA-BRCA","TCGA-LUSC","TCGA-PRAD","TCGA-KIRC","TCGA-KIRP","TCGA-LUAD"]#,"TCGA-BLCA","TCGA-LIHC"]

## annotation file
annotation="/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf"
transcript_gene=pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")

def count_greater_than_one(row):
    return (row > 1).sum()

## Quantify with TCGAData and new reference
`featureCounts_newRef_cancer.sh`

output in: `/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/cancers/featureCounts`

## TPM

In [2]:
for file in os.listdir(os.path.join(cancer_dir,"featureCounts")):
    if file.endswith(".txt"):

        cancer_type = file[-8:-4]
        print(cancer_type)
        toc = pd.read_csv(os.path.join(cancer_dir,"featureCounts",file), sep="\t", comment="#")
        toc['Geneid']=toc['Geneid'].str.split('.').str[0]
        toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

        filter_col = [col for col in toc if col.startswith('/')]
        for col in filter_col:
            new_col=col.split("Aligned")[0]
            new_col=new_col.split("/")[-1]
            new_col=cancer_type+new_col[4:]
            toc.rename(columns={col:new_col}, inplace=True)

        length = toc['Length']
        genes = toc['transcript_id']
        # we are only interested in the columns with counts
        counts = toc
        counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
        # calculate TPMs
        tpm_df = Norm.tpm(counts, length)
        # add transcript_id and length again
        tpms = pd.concat([genes,tpm_df, length], axis=1)
        tpms.to_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv", index=None)
 

PRAD
LUSC
BRCA
KIRP
LUAD
KIRC


In [20]:
for proj in projects:
    cancer_type = proj[5:]
    print(cancer_type)

    fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv")
    tumor_transcripts = list()
    patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients.csv"))
    patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
    patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    for index,patient in patients.iterrows():
        if "_" not in patient.iloc[0]: ## if there are two samples per patient, keep only one
            patient_fc = fc[["transcript_id","Length",patient.iloc[2]]]

            tumor_patient_fc = patient_fc[patient_fc[patient.iloc[2]] >= 1 ]
            tumor_transcripts.extend(tumor_patient_fc.transcript_id.values.tolist())

    tumor1FPKM = fc[fc['transcript_id'].isin(tumor_transcripts)]

    known = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('ENST')]
    novel = tumor1FPKM[tumor1FPKM['transcript_id'].str.contains('TCONS')]
    merged = tumor1FPKM.merge(transcript_gene, on=['transcript_id'], how="inner")
    merged = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    lncRNA = merged[merged['gene_type'].isin(['lncRNA','processed_pseudogene'])]
    print("lncRNA: ",len(lncRNA))

    cds = merged[merged['gene_type'] == "protein_coding"]
    print("PROTEIN CODING: ",len(cds))

    novel = merged[merged['gene_type'] == "novel"]
    novel = novel[novel['Length'].astype(int) < 91667]
    novel = novel[novel['Length'].astype(int) > 300]

    print("NOVEL: ",len(novel))

    # # tumor1FPKM = pd.concat([novel, lncRNA, cds], ignore_index = True)
    # merged.to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_table_of_counts_")+cancer_type+".csv",index=False)
    # merged[['transcript_id','gene_id','gene_name','gene_type','Length']].to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_")+cancer_type+".csv",index=False)

    tumorONLY_merged = merged[merged.columns.drop(list(merged.filter(regex='normal')))]
    ###### GET TUMOR-EXPRESSED > 1 TPM
    # Filter only integer columns
    int_columns = tumorONLY_merged.select_dtypes(include='int64')
    
    # Create a new column with the count of int columns per row whose value is > 1
    # tumorONLY_merged['n'] = int_columns.apply(lambda row: (row > 1).sum(), axis=1)
    tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)
    print(tumorONLY_merged)
    tumorONLY_merged = tumorONLY_merged[['transcript_id','gene_id','gene_name','gene_type','Length','n']]
    tumorONLY_merged.sort_values(by=['n'], ascending=False).to_csv(os.path.join(cancer_dir,"tumorexpressed/tumor_1FPKM_n_")+cancer_type+".csv",index=False)

BRCA
lncRNA:  10685
PROTEIN CODING:  16230
NOVEL:  478


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  BRCA-A7-A0CE_tumor  BRCA-A7-A0CH_tumor  \
0       TCONS_00002128          267.709487          282.620818   
3       TCONS_00002129         1892.387570         3628.566131   
7      ENST00000361390         4922.762555         6200.533782   
11     ENST00000361453         4632.671860         5120.310994   
17     ENST00000361624        12520.123574        12805.411471   
...                ...                 ...                 ...   
33013  ENST00000600468            0.050594            0.058366   
33014  ENST00000425340            6.479699           10.709122   
33015  ENST00000318083            2.510292            1.574990   
33016  ENST00000222145            4.556231            4.455789   
33017  ENST00000645652            6.365758            1.211338   

       BRCA-A7-A0D9_tumor  BRCA-A7-A0DB_2_tumor  BRCA-A7-A0DB_3_tumor  \
0              361.264756            185.775806             11.623270   
3             4115.244777           3399.586815            96

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  LUSC-22-4593_tumor  LUSC-22-4609_tumor  \
0       TCONS_00002128           51.003975          134.036441   
3       TCONS_00002129         1071.286059          908.034969   
7      ENST00000361390         2250.151348         2307.340567   
11     ENST00000361453         2767.517768         1845.464335   
17     ENST00000361624         8874.354968         5926.298466   
...                ...                 ...                 ...   
31495  ENST00000425340            2.286746           11.556526   
31496  ENST00000318083           17.205136            2.935649   
31497  ENST00000222145            4.531111           12.698763   
31498  ENST00000597553            0.055010            0.207989   
31499  ENST00000645652            7.776261           10.438391   

       LUSC-22-5471_tumor  LUSC-22-5472_tumor  LUSC-22-5478_tumor  \
0               37.784509          106.868616           61.738106   
3             1294.795172         1241.888811          838.418900   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  PRAD-CH-5761_tumor  PRAD-CH-5767_tumor  \
0       TCONS_00002128          144.470157          200.289666   
3       TCONS_00002129         2458.473628         7450.921126   
7      ENST00000361390         6678.010599        10488.199868   
11     ENST00000361453         9911.395469         8530.082248   
17     ENST00000361624        19463.648535        20967.183057   
...                ...                 ...                 ...   
26675  ENST00000600468            0.644878            0.147132   
26676  ENST00000425340            0.382016            0.193857   
26677  ENST00000318083            0.713887            0.635445   
26678  ENST00000222145            5.804824            2.531997   
26679  ENST00000645652            4.301946            6.207849   

       PRAD-CH-5768_tumor  PRAD-CH-5769_tumor  PRAD-EJ-7115_tumor  \
0              226.420020          328.384368          510.315150   
3             1961.811550         3546.985440         3589.945905   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  KIRC-A3-3358_tumor  KIRC-A3-3387_tumor  \
0       TCONS_00002128          304.946790          192.179407   
3       TCONS_00002129         4986.046589         1906.270954   
7      ENST00000361390        10920.358956         4938.040304   
11     ENST00000361453        10844.425826         4016.360207   
17     ENST00000361624        18364.112545        10536.273892   
...                ...                 ...                 ...   
27694  ENST00000594723            4.237029            2.839989   
27695  ENST00000084798            5.050094            8.333380   
27697  ENST00000318083            0.678024            1.242293   
27698  ENST00000222145            7.716036           13.531194   
27699  ENST00000645652            2.201132            2.337848   

       KIRC-B0-4700_tumor  KIRC-B0-4712_tumor  KIRC-B0-5402_tumor  \
0              149.322490          212.805185          165.340737   
3             5599.814533         5658.823521         5616.135322   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  KIRP-A4-A4ZT_tumor  KIRP-A4-A57E_tumor  \
0       TCONS_00002128          294.209363          161.241017   
3       TCONS_00002129         7291.239248        15292.071033   
7      ENST00000361390        11544.965302         4320.608463   
11     ENST00000361453         9045.966882         5539.460963   
17     ENST00000361624        28101.293761        36216.373746   
...                ...                 ...                 ...   
27451  ENST00000084798            8.700903            1.310541   
27453  ENST00000425340            0.249967            0.371580   
27454  ENST00000318083            0.463341            1.008063   
27455  ENST00000222145            2.114447           10.953976   
27456  ENST00000645652            0.263027            0.208092   

       KIRP-B9-4115_tumor  KIRP-BQ-5875_tumor  KIRP-BQ-5877_tumor  \
0              740.683401          521.762270         1138.027864   
3             3068.542507        17463.736697          563.632531   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorONLY_merged['n'] = tumorONLY_merged.drop(columns=['transcript_id','gene_id','gene_name','gene_type','Length']).apply(count_greater_than_one, axis=1)


         transcript_id  LUAD-38-4625_tumor  LUAD-38-4626_tumor  \
0       TCONS_00002128           33.081218          167.607586   
3       TCONS_00002129          559.679343         1482.097508   
7      ENST00000361390         1205.134477         5597.900024   
11     ENST00000361453         2538.791386         6117.085762   
17     ENST00000361624         3116.938737        12994.530576   
...                ...                 ...                 ...   
31910  ENST00000084798           25.689259            7.925141   
31912  ENST00000425340            4.456336            5.032938   
31913  ENST00000318083            5.428071            1.052470   
31914  ENST00000222145            2.926995           13.684228   
31915  ENST00000645652            6.753087            5.483625   

       LUAD-38-4627_tumor  LUAD-38-4632_tumor  LUAD-44-2655_tumor  \
0              112.342463           46.514476          176.747297   
3             1980.756407         1469.437189         2027.146515   


In [21]:
tumorONLY_merged

Unnamed: 0,transcript_id,gene_id,gene_name,gene_type,Length,n
0,TCONS_00002128,XLOC_001837,XLOC_001837,novel,577,70
3,TCONS_00002129,XLOC_001838,XLOC_001838,novel,353,70
7,ENST00000361390,ENSG00000198888,MT-ND1,protein_coding,956,70
11,ENST00000361453,ENSG00000198763,MT-ND2,protein_coding,1042,70
17,ENST00000361624,ENSG00000198804,MT-CO1,protein_coding,1542,70
...,...,...,...,...,...,...
31910,ENST00000084798,ENSG00000063180,CA11,protein_coding,1715,65
31912,ENST00000425340,ENSG00000176920,FUT2,protein_coding,3116,69
31913,ENST00000318083,ENSG00000176909,MAMSTR,protein_coding,1858,40
31914,ENST00000222145,ENSG00000105538,RASIP1,protein_coding,3199,65


In [2]:
#Select normal samples and create a file with min max expression in normal samples per cancer type
tolerance_dictionary = dict()

for proj in projects:
    cancer_type = proj[5:]
    fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv")
    normals = [col for col in fc.columns if 'normal' in col]
    tolerance_dictionary[cancer_type] = len(normals)*0.05

In [3]:
tolerance_dictionary

{'BRCA': 5.6000000000000005,
 'LUSC': 2.45,
 'PRAD': 2.6,
 'KIRC': 3.5500000000000003,
 'KIRP': 1.6,
 'LUAD': 2.9000000000000004}

In [14]:
## TUMOR-SPECIFIC
for proj in projects:
    tumorspecific_candidates = pd.DataFrame()
    cancer_type = proj[5:]
    print(cancer_type)

    ## import table of counts
    fc = pd.read_csv(os.path.join(cancer_dir,"featureCounts/table_of_counts_TPMs_")+cancer_type+".csv")
    fc_info = fc.merge(transcript_gene, on=['transcript_id'], how="inner")
    fc_info = fc_info[fc_info['gene_type'].isin(['lncRNA','processed_pseudogene','novel','protein_coding'])]

    ## table of counts normal samples
    normals = [col for col in fc_info.columns if 'normal' in col]
    fc_normals = fc_info[normals]
    # Function to count values above 0.1 in a row
    def count_values_above_01(row):
        return sum(row > 0.1)
    fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)
    fc_normals_info = pd.concat([fc_normals, fc_info[['gene_id','transcript_id','gene_name','gene_type']]], axis=1)
    fc_normals_info = fc_normals_info[['gene_id','transcript_id','gene_name','gene_type','counts_01']]

    ## import patients data
    patients=pd.read_csv(os.path.join("/users/genomics/marta/TCGA_RNASeq",proj,"results/QC_patients.csv"))
    patients['normal'] = patients['normal'].str.replace('TCGA', cancer_type)
    patients['tumor'] = patients['tumor'].str.replace('TCGA', cancer_type)

    for index,patient in patients.iterrows():
        if "_" not in patient.iloc[0]: ## if there are two samples per patient, keep only one
            patient_fc = fc_info[["transcript_id","gene_id","gene_name","gene_type","Length",patient.iloc[1],patient.iloc[2]]] ## patient - normal - tumor
            tumor1TPM = patient_fc[patient_fc[patient.iloc[2]] > 1 ]
            tumor1TPM_normal01TPM = tumor1TPM[tumor1TPM[patient.iloc[1]] < 0.1 ]
            try:
                os.makedirs(os.path.join(cancer_dir,"tumorspecific",cancer_type,patient.iloc[0]))
            except:
                pass
            tumor1TPM_normal01TPM.to_csv(os.path.join(cancer_dir,"tumorspecific",cancer_type,patient.iloc[0],"tumorspecific_TPMs.csv"), index=None)
            tumorspecific_candidates = pd.concat([tumorspecific_candidates,tumor1TPM_normal01TPM[["transcript_id","gene_id","gene_name","gene_type","Length"]]])
summary = tumorspecific_candidates.groupby(["transcript_id","gene_id","gene_name","gene_type","Length"]).size().reset_index(name='n')
summary.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific")+"/tumorspecific_n_"+cancer_type+".csv", index=False)
print(len(summary))
### tolerance 5% only
summary_tolerance = fc_normals_info.merge(summary, on=["gene_id","transcript_id"], how="right")
# ## Expression > 0.1 in less than 5%
summary_tolerance_accepted = summary_tolerance[summary_tolerance['counts_01'] < tolerance_dictionary[cancer_type]]
summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)
summary_tolerance_accepted.sort_values(by="n", ascending=False).to_csv(os.path.join(cancer_dir,"tumorspecific")+"/tumorspecific_n_"+cancer_type+"_TOLERANCE5PERCENT.csv", index=False)
print(len(summary_tolerance_accepted))
print("Done!")

BRCA


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


LUSC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


PRAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


KIRC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


KIRP


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


LUAD


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fc_normals['counts_01'] = fc_normals.apply(count_values_above_01, axis=1)


5956


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_tolerance_accepted.drop(["counts_01"], axis=1, inplace=True)


1225
Done!
