In [5]:
import os,re
import pandas as pd
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

annot = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/1transcript_1gene.reconstructed.csv")


## Quantify in thymus

In [1]:
%%bash

###PREPARING NEEDED DATA
thymus_dir=/projects_eg/projects/marta/thymus
outdir=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/quantification_thymus

mkdir -p $outdir
AnnotGTF=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/newReference_Resconstructed/gencode.v38.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf

module load Subread/2.0.3
########################

# countReadPairs may need to be removed in case of single-end reads
featureCounts -T 10 -p -s 2 -g transcript_id -O --countReadPairs -a $AnnotGTF -o ${outdir}/gffcompare_stranded_featureCounts.txt $thymus_dir/GSE*/analysis/05_STAR/uniquely_mapped_2pass_BAM_files/*Aligned.sortedByCoord.out.bam 




        =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
          =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
            ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
              ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
	  v2.0.3

||                                                                            ||
||             Input files : 30 BAM files                                     ||
||                                                                            ||
||                           SRR8668611Aligned.sortedByCoord.out.bam          ||
||                           SRR8668612Aligned.sortedByCoord.out.bam          ||
||                           SRR8668613Aligned.sortedByCoord.out.bam          ||
||                           SRR8668614Aligned.sortedByCoord.out.bam          ||
||                           SRR8668615Aligned.sortedByCoord.out.bam          ||
||                           SRR8668616Aligned.s

In [6]:
### modify headers
file = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/quantification_thymus/gffcompare_stranded_featureCounts.txt"

toc = pd.read_csv(file, sep="\t", comment="#")
toc = toc[toc['Geneid'].str.contains('PAR_')==False]
toc['Geneid']=toc['Geneid'].str.split('.').str[0]
toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

filter_col = [col for col in toc if col.startswith('/')]
for col in filter_col:
    new_col=col.split("Aligned")[0]
    new_col=new_col.split("/")[-1]
    toc.rename(columns={col:new_col}, inplace=True)

length = toc['Length']
genes = toc['transcript_id']
# we are only interested in the columns with counts
counts = toc
counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
# calculate TPMs
tpm_df = Norm.tpm(counts, length)
# add transcript_id and length again
tpms = pd.concat([genes,tpm_df, length], axis=1)
tpms.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/quantification_thymus/table_of_counts_TPMs_thymus.csv", index=None)
 

## Candidates

In [9]:
tumorReact = pd.read_csv("/projects_eg/projects/marta/TestisRestricted_Microproteins_TSA/with_TranscriptomeReconstruction/Q4_TestisRestricted_TumorSpecific/human/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEAN.csv")
tumorReact_genes = tumorReact[['transcript_id','gene_id','gene_name']]
tumorReact_genes.drop_duplicates(inplace=True)
tumorReact_genes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_genes.drop_duplicates(inplace=True)


Unnamed: 0,transcript_id,gene_id,gene_name
0,ENST00000265007,ENSG00000039600,SOX30
4,ENST00000247452,ENSG00000046774,MAGEC2
11,ENST00000376150,ENSG00000068985,PAGE1
14,ENST00000158009,ENSG00000073598,FNDC8
22,ENST00000378988,ENSG00000099399,MAGEB2
...,...,...,...
392,TCONS_00001462,XLOC_001220,XLOC_001220
395,TCONS_00001466,XLOC_001230,XLOC_001230
404,TCONS_00001617,XLOC_001342,XLOC_001342
409,TCONS_00001992,XLOC_001676,XLOC_001676


In [18]:
tpms_candidates = tpms[tpms['transcript_id'].isin(tumorReact_genes.transcript_id.values.tolist())]
tpms_candidates.drop(["Length"], axis=1, inplace=True)
tpms_candidates.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/quantification_thymus/table_of_counts_TPMs_thymus_TOv3xlog2ratio3xmean.csv")

## compute mean, median, max
tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
tpms_candidates['max'] = tpms_candidates.iloc[:, 1:].max(axis=1)

tpms_candidates_thymus = tpms_candidates[['transcript_id','mean','median','max']]
tpms_candidates_thymus = tpms_candidates_thymus.merge(annot, on="transcript_id")
tpms_candidates_thymus.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/human/quantification_thymus/thymus_summary_TOv3xlog2ratio3xmean.csv")
tpms_candidates_thymus

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates.drop(["Length"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Unnamed: 0,transcript_id,mean,median,max,gene_id,gene_type,gene_name
0,ENST00000381059,0.161225,0.000000,0.969202,ENSG00000182583,protein_coding,VCX
1,ENST00000317103,0.072923,0.000000,0.691067,ENSG00000177504,protein_coding,VCX2
2,ENST00000543214,0.056680,0.000000,0.331745,ENSG00000183304,protein_coding,FAM9A
3,ENST00000327968,0.000875,0.000000,0.013950,ENSG00000184735,protein_coding,DDX53
4,ENST00000412172,0.000499,0.000000,0.014967,ENSG00000224960,protein_coding,PPP4R3C
...,...,...,...,...,...,...,...
131,ENST00000269881,0.332265,0.111484,2.686197,ENSG00000269058,protein_coding,CALR3
132,ENST00000601693,0.006701,0.000000,0.081271,ENSG00000196350,protein_coding,ZNF729
133,ENST00000596209,0.014091,0.011938,0.103742,ENSG00000213973,protein_coding,ZNF99
134,ENST00000600766,0.017706,0.000000,0.076942,ENSG00000268696,protein_coding,ZNF723
