In [5]:
import os,re
import pandas as pd
from collections import Counter
from rna_seq_normalization import Normalization as Norm
from functools import reduce

annot = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/transID_geneID_isoforms_selected.1to1.csv")
Ref_DIR = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed"

In [6]:
%%bash -s "$Ref_DIR"


sed 's/""/"/g' $1/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.gtf > test.gtf
sed 's/\(transcript_id "[^"]*\).*/\1"/' test.gtf > $1/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.fixed.gtf
rm test.gtf

## Quantify in thymus

In [7]:
%%bash

###PREPARING NEEDED DATA
thymus_dir=/projects_eg/projects/marta/thymus
outdir=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus

mkdir -p $outdir
AnnotGTF=/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/newReference_Resconstructed/gencode.v47.gffcompare.TestisLiverBrain.annotation.sorted.1transcript.sorted.NOchr.fixed.gtf

module load Subread/2.0.3
########################

# countReadPairs may need to be removed in case of single-end reads
featureCounts -T 10 -p -s 2 -g transcript_id -O --countReadPairs -a $AnnotGTF -o ${outdir}/gffcompare_stranded_featureCounts.txt $thymus_dir/GSE*/analysis/05_STAR/uniquely_mapped_2pass_BAM_files/*Aligned.sortedByCoord.out.bam 




        =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
          =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
            ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
              ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
	  v2.0.3

||                                                                            ||
||             Input files : 30 BAM files                                     ||
||                                                                            ||
||                           SRR8668611Aligned.sortedByCoord.out.bam          ||
||                           SRR8668612Aligned.sortedByCoord.out.bam          ||
||                           SRR8668613Aligned.sortedByCoord.out.bam          ||
||                           SRR8668614Aligned.sortedByCoord.out.bam          ||
||                           SRR8668615Aligned.sortedByCoord.out.bam          ||
||                           SRR8668616Aligned.s

In [8]:
### modify headers
file = "/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/gffcompare_stranded_featureCounts.txt"

toc = pd.read_csv(file, sep="\t", comment="#")
toc = toc[toc['Geneid'].str.contains('PAR_')==False]
toc['Geneid']=toc['Geneid'].str.split('.').str[0]
toc.rename(columns={'Geneid':'transcript_id'}, inplace=True)

filter_col = [col for col in toc if col.startswith('/')]
for col in filter_col:
    new_col=col.split("Aligned")[0]
    new_col=new_col.split("/")[-1]
    toc.rename(columns={col:new_col}, inplace=True)

length = toc['Length']
genes = toc['transcript_id']
# we are only interested in the columns with counts
counts = toc
counts.drop(["Chr","Start","End","Strand","Length","transcript_id"],axis=1, inplace=True)
# calculate TPMs
tpm_df = Norm.tpm(counts, length)
# add transcript_id and length again
tpms = pd.concat([genes,tpm_df, length], axis=1)
tpms.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/table_of_counts_TPMs_thymus.csv", index=None)
 

## Candidates

In [9]:
tumorReact = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/cancers/log2ratio3x/cancertypes/TOv3x_5percent_TestisRestrictedGTEx_Translated_Ctypes_log2ratio3xMEANgenes.csv")
tumorReact_genes = tumorReact[['transcript_id','gene_id','gene_name']]
tumorReact_genes.drop_duplicates(inplace=True)
tumorReact_genes

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tumorReact_genes.drop_duplicates(inplace=True)


Unnamed: 0,transcript_id,gene_id,gene_name
0,ENST00000247452,ENSG00000046774,MAGEC2
1,ENST00000376979,ENSG00000101435,CST9L
2,ENST00000375406,ENSG00000117148,ACTL8
3,ENST00000376919,ENSG00000126752,SSX1
4,ENST00000326279,ENSG00000131914,LIN28A
...,...,...,...
239,ENST00000749288,ENSG00000287164,ENSG00000287164
240,TCONS_00000085,XLOC_000211,XLOC_000211
243,TCONS_00001755,XLOC_001654,XLOC_001654
246,ENST00000219301,ENSG00000103023,PRSS54


In [10]:
tpms_candidates = tpms[tpms['transcript_id'].isin(tumorReact_genes.transcript_id.values.tolist())]
tpms_candidates.drop(["Length"], axis=1, inplace=True)
tpms_candidates.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/table_of_counts_TPMs_thymus_TOv3xlog2ratio3xmean.csv")

## compute mean, median, max
tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
tpms_candidates['max'] = tpms_candidates.iloc[:, 1:].max(axis=1)

tpms_candidates_thymus = tpms_candidates[['transcript_id','mean','median','max']]
tpms_candidates_thymus = tpms_candidates_thymus.merge(annot, on="transcript_id")
tpms_candidates_thymus.to_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/thymus_summary_TOv3xlog2ratio3xmean.csv")
tpms_candidates_thymus

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates.drop(["Length"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['mean'] = tpms_candidates.iloc[:, 1:].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpms_candidates['median'] = tpms_candidates.iloc[:, 1:].median(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Tr

Unnamed: 0,transcript_id,mean,median,max,chr,gene_id,gene_name,transcript_type,gene_type
0,ENST00000381089,0.160622,0.106713,0.856997,X,ENSG00000169059,VCX3A,protein_coding,protein_coding
1,ENST00000381059,0.140349,0.000000,0.836275,X,ENSG00000182583,VCX,protein_coding,protein_coding
2,ENST00000543214,0.049054,0.000000,0.287198,X,ENSG00000183304,FAM9A,protein_coding,protein_coding
3,ENST00000327968,0.000760,0.000000,0.012205,X,ENSG00000184735,DDX53,protein_coding,protein_coding
4,ENST00000412172,0.000430,0.000000,0.012894,X,ENSG00000224960,PPP4R3C,protein_coding,protein_coding
...,...,...,...,...,...,...,...,...,...
135,ENST00000622113,0.009798,0.000000,0.084074,21,ENSG00000274391,TPTE,protein_coding,protein_coding
136,ENST00000400424,0.856670,0.706154,2.367336,21,ENSG00000177398,UMODL1,protein_coding,protein_coding
137,TCONS_00001672,0.111527,0.055174,0.686810,21,XLOC_001518,XLOC_001518,novel,novel
138,ENST00000332271,0.286123,0.069327,1.005759,22,ENSG00000184571,PIWIL3,protein_coding,protein_coding


In [11]:
tpms_candidates_thymus = pd.read_csv("/users/genomics/marta/TestisProject_SaraRazquin/with_TranscriptomeReconstruction/v47/human/quantification_thymus/thymus_summary_TOv3xlog2ratio3xmean.csv")
tpms_candidates_thymus[tpms_candidates_thymus['mean'] > 1]

Unnamed: 0.1,Unnamed: 0,transcript_id,mean,median,max,chr,gene_id,gene_name,transcript_type,gene_type
48,48,TCONS_00001486,1.899517,0.355686,13.296815,2,XLOC_001350,XLOC_001350,novel,novel
51,51,ENST00000295453,1.150093,0.66323,7.558909,2,ENSG00000163286,ALPG,protein_coding,protein_coding
53,53,TCONS_00001755,1.495611,1.413604,3.625657,3,XLOC_001654,XLOC_001654,novel,novel
104,104,ENST00000377208,11.883755,10.270708,46.168181,13,ENSG00000152192,POU4F1,protein_coding,protein_coding
114,114,ENST00000357424,1.164594,0.129854,9.202288,17,ENSG00000159224,GIP,protein_coding,protein_coding
118,118,ENST00000663592,1.316451,1.193847,3.683874,18,ENSG00000267374,MIR924HG,lncRNA,lncRNA
126,126,ENST00000636757,1.21759,0.958908,6.367341,19,ENSG00000261341,SMIM47,protein_coding,protein_coding
127,127,ENST00000424985,67.902782,47.84621,143.544888,19,ENSG00000180043,GARIN5B,protein_coding,protein_coding
