In [2]:
import os,re
import pandas as pd
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

annot = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")
Ref_DIR = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed"

In [2]:
%%bash -s "$Ref_DIR"


sed 's/""/"/g' $1/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf > test.gtf
sed 's/\(transcript_id "[^"]*\).*/\1"/' test.gtf > $1/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.fixed.gtf
rm test.gtf

## Quantify in thymus

In [3]:
%%bash

###PREPARING NEEDED DATA
thymus_dir=/projects_eg/projects/marta/thymus
outdir=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus

mkdir -p $outdir
AnnotGTF=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.fixed.gtf

module load Subread/2.0.3
########################

# countReadPairs may need to be removed in case of single-end reads
featureCounts -T 10 -p -s 2 -g transcript_id -O --countReadPairs -a $AnnotGTF -o ${outdir}/gffcompare_stranded_featureCounts.txt $thymus_dir/GSE*/analysis/05_STAR/uniquely_mapped_2pass_BAM_files/*Aligned.sortedByCoord.out.bam 




        =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
          =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
            ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
              ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
	  v2.0.3

||                                                                            ||
||             Input files : 30 BAM files                                     ||
||                                                                            ||
||                           SRR8668611Aligned.sortedByCoord.out.bam          ||
||                           SRR8668612Aligned.sortedByCoord.out.bam          ||
||                           SRR8668613Aligned.sortedByCoord.out.bam          ||
||                           SRR8668614Aligned.sortedByCoord.out.bam          ||
||                           SRR8668615Aligned.sortedByCoord.out.bam          ||
||                           SRR8668616Aligned.s

In [3]:
### modify headers
file = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/gffcompare_stranded_featureCounts.txt"

toc = pd.read_csv(file, sep="\t", comment="#")
toc = toc[toc['Geneid'].str.contains('PAR_')==False]
toc['Geneid']=toc['Geneid'].str.split('.').str[0]
toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

filter_col = [col for col in toc if col.startswith('/')]
for col in filter_col:
    new_col=col.split("Aligned")[0]
    new_col=new_col.split("/")[-1]
    toc.rename(columns={col:new_col}, inplace=True)

length = toc['Length']
genes = toc['transcript_id']
# we are only interested in the columns with counts
counts = toc
counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
# calculate TPMs
tpm_df = Norm.tpm(counts, length)
# add transcript_id and length again
tpms = pd.concat([genes,tpm_df, length], axis=1)
tpms.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/table_of_counts_TPMs_thymus.csv", index=None)
 

## Candidates

In [7]:
tumorReact = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv")
tumorReact_genes = tumorReact[['transcript_id','gene_id','gene_name']]
tumorReact_genes.drop_duplicates(inplace=True)
tumorReact_genes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_genes.drop_duplicates(inplace=True)


Unnamed: 0,transcript_id,gene_id,gene_name
0,ENST00000247452,ENSG00000046774,MAGEC2
1,ENST00000376979,ENSG00000101435,CST9L
2,ENST00000375406,ENSG00000117148,ACTL8
3,ENST00000376919,ENSG00000126752,SSX1
4,ENST00000326279,ENSG00000131914,LIN28A
...,...,...,...
236,ENST00000600766,ENSG00000268696,ZNF723
239,ENST00000622113,ENSG00000274391,TPTE
240,ENST00000632600,ENSG00000282815,TEX13C
242,ENST00000749288,ENSG00000287164,ENSG00000287164


In [9]:
tpms_candidates = tpms[tpms['transcript_id'].isin(tumorReact_genes.transcript_id.values.tolist())]
tpms_candidates.drop(["Length"], axis=1, inplace=True)
tpms_candidates.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/table_of_counts_TPMs_thymus_TOv3xlog2ratio3xmean.csv")

## compute mean, median, max
tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
tpms_candidates['max'] = tpms_candidates.iloc[:, 1:].max(axis=1)

tpms_candidates_thymus = tpms_candidates[['transcript_id','mean','median','max']]
tpms_candidates_thymus = tpms_candidates_thymus.merge(annot, on="transcript_id")
tpms_candidates_thymus.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/thymus_summary_TOv3xlog2ratio3xmean.csv")
tpms_candidates_thymus

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates.drop(["Length"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Unnamed: 0,transcript_id,mean,median,max,chr,gene_id,gene_name,transcript_type,gene_type
0,ENST00000381089,0.160864,0.106866,0.858360,X,ENSG00000169059,VCX3A,protein_coding,protein_coding
1,ENST00000381059,0.140549,0.000000,0.837284,X,ENSG00000182583,VCX,protein_coding,protein_coding
2,ENST00000317103,0.063568,0.000000,0.602536,X,ENSG00000177504,VCX2,protein_coding,protein_coding
3,ENST00000543214,0.049121,0.000000,0.287587,X,ENSG00000183304,FAM9A,protein_coding,protein_coding
4,ENST00000327968,0.000761,0.000000,0.012223,X,ENSG00000184735,DDX53,protein_coding,protein_coding
...,...,...,...,...,...,...,...,...,...
131,ENST00000622113,0.009812,0.000000,0.084195,21,ENSG00000274391,TPTE,protein_coding,protein_coding
132,ENST00000400424,0.857918,0.707291,2.370339,21,ENSG00000177398,UMODL1,protein_coding,protein_coding
133,TCONS_00000781,0.111709,0.055255,0.688044,21,XLOC_000680,XLOC_000680,novel,novel
134,ENST00000332271,0.286586,0.069442,1.007379,22,ENSG00000184571,PIWIL3,protein_coding,protein_coding


In [10]:
tpms_candidates_thymus = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/thymus_summary_TOv3xlog2ratio3xmean.csv")
tpms_candidates_thymus[tpms_candidates_thymus['mean'] > 1]

Unnamed: 0.1,Unnamed: 0,transcript_id,mean,median,max,chr,gene_id,gene_name,transcript_type,gene_type
54,54,ENST00000295453,1.151701,0.664429,7.568078,2,ENSG00000163286,ALPG,protein_coding,protein_coding
102,102,ENST00000377208,11.901478,10.289783,46.226755,13,ENSG00000152192,POU4F1,protein_coding,protein_coding
111,111,ENST00000357424,1.166155,0.130044,9.213451,17,ENSG00000159224,GIP,protein_coding,protein_coding
114,114,ENST00000663592,1.318409,1.195574,3.689594,18,ENSG00000267374,MIR924HG,lncRNA,lncRNA
122,122,ENST00000636757,1.219491,0.960492,6.377228,19,ENSG00000261341,SMIM47,protein_coding,protein_coding
123,123,ENST00000424985,68.006441,47.925253,143.738921,19,ENSG00000180043,GARIN5B,protein_coding,protein_coding
