# Tximport
Aggregate transcript-level counts into gene-level count matrices and normalizing offsets.

In [1]:
#Imports required libraries
import os
import re

import pandas as pd

import tools.utilities as utils

## Pre-process quantification results for downstream analysis
* Keep only the transcript ID in each quant file and trim the other pipe-delimited identifiers:

In [2]:
def trim_ids(file):
    """Write a copy of `file` with pipe-delimited identifiers stripped from each line.

    The output path is the input path with its final extension replaced by
    ``.txt`` (e.g. ``.../quant.sf`` -> ``.../quant.txt``). Only the extension of
    the last path component is replaced, so dots in directory names (or a
    leading ``./``) do not truncate the path.

    On every line, the span from the first ``|`` to the last ``|`` (inclusive)
    is removed; lines containing fewer than two ``|`` are copied unchanged.

    If the output file already exists, a message is printed and nothing is
    written.

    Args:
        file: Path of the text file to process.

    Returns:
        None.
    """
    fileout = os.path.splitext(file)[0] + '.txt'
    if os.path.exists(fileout):
        print(fileout,'already exists')
        return
    # Mode 'x' refuses to overwrite a file created between the check and the open.
    with open(file, 'r') as f, open(fileout, 'x') as t:
        for line in f:
            # Greedy match: everything between the first and last '|' is dropped.
            t.write(re.sub(r'\|.*\|','', line))

In [3]:
# Quantification root and the folder holding one sub-directory per sample
QUANTDIR = 'quant/'
OUTDIR = os.path.join(QUANTDIR,'salmon_output')

# Process the quant.sf file of every entry whose name starts with 'SRR'
srr_samples = filter(lambda name: name.startswith('SRR'), os.listdir(OUTDIR))
for sample in srr_samples:
    trim_ids(os.path.join(OUTDIR, sample, 'quant.sf'))

quant/salmon_output/SRR6231087/quant.txt already exists
quant/salmon_output/SRR6231080/quant.txt already exists
quant/salmon_output/SRR6231089/quant.txt already exists
quant/salmon_output/SRR6231088/quant.txt already exists
quant/salmon_output/SRR6231081/quant.txt already exists
quant/salmon_output/SRR6231086/quant.txt already exists
quant/salmon_output/SRR6231083/quant.txt already exists
quant/salmon_output/SRR6231077/quant.txt already exists
quant/salmon_output/SRR6231084/quant.txt already exists
quant/salmon_output/SRR6231079/quant.txt already exists
quant/salmon_output/SRR6231078/quant.txt already exists
quant/salmon_output/SRR6231085/quant.txt already exists
quant/salmon_output/SRR6231076/quant.txt already exists
quant/salmon_output/SRR6231082/quant.txt already exists


## Aggregate transcript counts using tximport
* This step is performed in the R script 'tx2gene_salmon.R'

In [4]:
# Fetch the GENCODE v30 annotation (GTF) into the quantification folder.
# NOTE(review): the recorded output suggests download_ftp skips files that are
# already present - confirm in tools.utilities.
ftp_host = 'ftp.ebi.ac.uk'
ftp_path = '/pub/databases/gencode/Gencode_human/release_30/gencode.v30.annotation.gtf.gz'
gtf_file = os.path.join(QUANTDIR,'gencode.v30.annotation.gtf.gz')
utils.download_ftp(ftp_host, ftp_path, gtf_file)

quant/gencode.v30.annotation.gtf.gz already downloaded


In [5]:
# Launch the R script with three positional arguments: data/ <gtf_file> quant/
tximport_cmd = f'Rscript tx2gene_salmon.R data/ {gtf_file} quant/'
utils.run_command(tximport_cmd)

Setting WORKDIR to: /Users/Jb_Macbook/Documents/GitHub/RNAseq/pipelines/data/ 
Le chargement a nécessité le package : lmtest
Le chargement a nécessité le package : zoo

Attachement du package : ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Parsed with column specification:
cols(
  TXNAME = col_character(),
  GENEID = col_character()
)
# A tibble: 6 x 2
  TXNAME            GENEID           
  <chr>             <chr>            
1 ENST00000456328.2 ENSG00000223972.5
2 ENST00000450305.2 ENSG00000223972.5
3 ENST00000473358.1 ENSG00000243485.5
4 ENST00000469289.1 ENSG00000243485.5
5 ENST00000607096.1 ENSG00000284332.1
6 ENST00000606857.1 ENSG00000268020.3
reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 
summarizing abundance
summarizing counts
summarizing length
reading in files with read_tsv
1 2 3 4 5 6 7 

Check the results and export TPM, gene counts, and gene lengths as separate matrices.

In [6]:
# Read data/txi.csv (relative to the working directory); its first column is the row index
txi_path = os.path.join(os.getcwd(),'data', 'txi.csv')
txi = pd.read_csv(txi_path, index_col=0)
print(txi.shape)
txi.head(3)

(58434, 43)


Unnamed: 0,abundance.SRR6231076,abundance.SRR6231077,abundance.SRR6231078,abundance.SRR6231079,abundance.SRR6231080,abundance.SRR6231081,abundance.SRR6231082,abundance.SRR6231083,abundance.SRR6231084,abundance.SRR6231085,...,length.SRR6231081,length.SRR6231082,length.SRR6231083,length.SRR6231084,length.SRR6231085,length.SRR6231086,length.SRR6231087,length.SRR6231088,length.SRR6231089,countsFromAbundance
ENSG00000000003.14,0.904746,1.728894,2.916374,1.672743,1.783191,0.839223,2.240574,0.740899,2.093251,0.184064,...,2267.289697,1631.755246,2116.088202,2106.547661,3547.0,2008.672473,3547.0,3547.0,1375.863773,no
ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,no
ENSG00000000419.12,49.583837,42.782856,70.891017,45.358546,61.096438,49.674101,42.417625,49.32638,43.877557,53.120515,...,740.182891,659.593372,689.528983,752.280805,704.728915,675.620847,657.309817,634.535057,655.377461,no


Here we add a column with trimmed ENSEMBL IDs (the version suffix `.XX` is removed) and check for possible duplicates.

In [7]:
# Keep only the part of each row label that precedes the first '.'
txi['ENSEMBL'] = [gene_id.partition('.')[0] for gene_id in txi.index]
# keep=False flags every occurrence of a repeated ID, not just the later ones
is_duplicated = txi.duplicated(subset='ENSEMBL', keep=False)
txi[is_duplicated].head(6)

Unnamed: 0,abundance.SRR6231076,abundance.SRR6231077,abundance.SRR6231078,abundance.SRR6231079,abundance.SRR6231080,abundance.SRR6231081,abundance.SRR6231082,abundance.SRR6231083,abundance.SRR6231084,abundance.SRR6231085,...,length.SRR6231082,length.SRR6231083,length.SRR6231084,length.SRR6231085,length.SRR6231086,length.SRR6231087,length.SRR6231088,length.SRR6231089,countsFromAbundance,ENSEMBL


No duplicates were found.

In [8]:
#Drop duplicates (left disabled: the check above found no duplicated ENSEMBL IDs)
#txi = txi.drop_duplicates(subset='ENSEMBL', keep='first').reset_index(drop=True).set_index('ENSEMBL')
#print(txi.shape)
#txi.head(3)

In [9]:
# TPM values are the "abundance." columns calculated by salmon/tximport;
# each column is relabelled with the text between its first and second '.'
TPM = (
    txi
    .filter(regex=r'^abundance\.')
    .rename(columns=lambda label: label.split('.')[1])
)
print(TPM.shape)
TPM.to_csv(os.path.join(os.getcwd(),'data', 'TPM.csv'))
TPM.head(3)

(58434, 14)


Unnamed: 0,SRR6231076,SRR6231077,SRR6231078,SRR6231079,SRR6231080,SRR6231081,SRR6231082,SRR6231083,SRR6231084,SRR6231085,SRR6231086,SRR6231087,SRR6231088,SRR6231089
ENSG00000000003.14,0.904746,1.728894,2.916374,1.672743,1.783191,0.839223,2.240574,0.740899,2.093251,0.184064,1.386037,0.135561,0.098742,2.971087
ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000419.12,49.583837,42.782856,70.891017,45.358546,61.096438,49.674101,42.417625,49.32638,43.877557,53.120515,50.326623,34.554971,50.109034,69.002702


In [10]:
# Select the "counts." columns and relabel each one with the text
# between its first and second '.'
counts = (
    txi
    .filter(regex=r'^counts\.')
    .rename(columns=lambda label: label.split('.')[1])
)
print(counts.shape)
counts.to_csv(os.path.join(os.getcwd(),'data', 'counts.csv'))
counts.head(3)

(58434, 14)


Unnamed: 0,SRR6231076,SRR6231077,SRR6231078,SRR6231079,SRR6231080,SRR6231081,SRR6231082,SRR6231083,SRR6231084,SRR6231085,SRR6231086,SRR6231087,SRR6231088,SRR6231089
ENSG00000000003.14,31.812,60.686,121.274,34.846,69.902,25.15,66.644,22.725,48.893,10.447,49.374,9.59,7.226,80.811
ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000419.12,574.999,565.0,1041.999,516.999,789.0,486.0,510.001,493.0,366.001,598.999,603.0,453.0,656.0,894.0


In [11]:
# Select the "length." columns and relabel each one with the text
# between its first and second '.'
lengths = (
    txi
    .filter(regex=r'^length\.')
    .rename(columns=lambda label: label.split('.')[1])
)
print(lengths.shape)
lengths.to_csv(os.path.join(os.getcwd(),'data', 'lengths.csv'))
lengths.head(3)

(58434, 14)


Unnamed: 0,SRR6231076,SRR6231077,SRR6231078,SRR6231079,SRR6231080,SRR6231081,SRR6231082,SRR6231083,SRR6231084,SRR6231085,SRR6231086,SRR6231087,SRR6231088,SRR6231089
ENSG00000000003.14,2076.907355,1807.147185,2014.507984,1320.162089,2139.434518,2267.289697,1631.755246,2116.088202,2106.547661,3547.0,2008.672473,3547.0,3547.0,1375.863773
ENSG00000000005.6,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5
ENSG00000000419.12,684.978195,679.919879,712.065909,722.332476,704.799894,740.182891,659.593372,689.528983,752.280805,704.728915,675.620847,657.309817,634.535057,655.377461


For use with limma-voom, export the counts calculated with the lengthScaledTPM method (from the [tximport vignette](https://bioc.ism.ac.jp/packages/3.4/bioc/vignettes/tximport/inst/doc/tximport.html): "limma-voom does not use the offset matrix stored in y$offset, so we recommend using the scaled counts generated from abundances, either 'scaledTPM' or 'lengthScaledTPM'").

In [12]:
# Read data/txi_lengthScaledTPM.csv (relative to the working directory);
# its first column is the row index
lengthScaledTPM_path = os.path.join(os.getcwd(),'data', 'txi_lengthScaledTPM.csv')
txi_lengthScaledTPM = pd.read_csv(lengthScaledTPM_path, index_col=0)
print(txi_lengthScaledTPM.shape)

(58434, 43)


In [13]:
# Attach the row label up to its first '.' as ENSEMBL, keep the first row per ID,
# then use ENSEMBL as the index
txi_lengthScaledTPM = (
    txi_lengthScaledTPM
    .assign(ENSEMBL=[gene_id.partition('.')[0] for gene_id in txi_lengthScaledTPM.index])
    .drop_duplicates(subset='ENSEMBL', keep='first')
    .reset_index(drop=True)
    .set_index('ENSEMBL')
)
print(txi_lengthScaledTPM.shape)
txi_lengthScaledTPM.head(3)

(58434, 43)


Unnamed: 0_level_0,abundance.SRR6231076,abundance.SRR6231077,abundance.SRR6231078,abundance.SRR6231079,abundance.SRR6231080,abundance.SRR6231081,abundance.SRR6231082,abundance.SRR6231083,abundance.SRR6231084,abundance.SRR6231085,...,length.SRR6231081,length.SRR6231082,length.SRR6231083,length.SRR6231084,length.SRR6231085,length.SRR6231086,length.SRR6231087,length.SRR6231088,length.SRR6231089,countsFromAbundance
ENSEMBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,0.904746,1.728894,2.916374,1.672743,1.783191,0.839223,2.240574,0.740899,2.093251,0.184064,...,2267.289697,1631.755246,2116.088202,2106.547661,3547.0,2008.672473,3547.0,3547.0,1375.863773,lengthScaledTPM
ENSG00000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,624.5,lengthScaledTPM
ENSG00000000419,49.583837,42.782856,70.891017,45.358546,61.096438,49.674101,42.417625,49.32638,43.877557,53.120515,...,740.182891,659.593372,689.528983,752.280805,704.728915,675.620847,657.309817,634.535057,655.377461,lengthScaledTPM


In [14]:
# Select the "counts." columns of the lengthScaledTPM table and relabel each one
# with the text between its first and second '.'
counts_lengthScaledTPM = (
    txi_lengthScaledTPM
    .filter(regex=r'^counts\.')
    .rename(columns=lambda label: label.split('.')[1])
)
print(counts_lengthScaledTPM.shape)
counts_lengthScaledTPM.to_csv(os.path.join(os.getcwd(),'data', 'counts_lengthScaledTPM.csv'))
counts_lengthScaledTPM.head(3)

(58434, 14)


Unnamed: 0_level_0,SRR6231076,SRR6231077,SRR6231078,SRR6231079,SRR6231080,SRR6231081,SRR6231082,SRR6231083,SRR6231084,SRR6231085,SRR6231086,SRR6231087,SRR6231088,SRR6231089
ENSEMBL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSG00000000003,33.784907,73.529544,130.314754,57.585567,72.725508,24.40879,89.253819,23.650357,50.903977,6.355984,52.17118,5.709173,4.373763,128.361576
ENSG00000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000000419,568.491699,558.664686,972.589376,479.436939,765.055138,443.595188,518.802202,483.443683,327.612748,563.20192,581.623002,446.824455,681.486577,915.321605


After you have successfully completed the above steps, you can start analyzing the processed gene expression matrix.

## References
---

1. Soneson C., Love M.I., Robinson M.D. (2015): **Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences.** _F1000Research_ http://dx.doi.org/10.12688/f1000research.7563.1