In [1]:
import polars as pl
import polars.selectors as cs
import time
from datetime import timedelta

**Metagenomic occurrences**

Abundance of genes in metagenomic reads.

(from https://www.genoscope.cns.fr/tara/localdata/data/Geneset-v1/metagenomic_occurrences.tsv.gz)

In [2]:
t0=time.time()
meta = pl.read_csv('/srv/storage/oceania@storage1.grenoble.grid5000.fr/TARA/Eukaryote_Gene_Catalog_MATOU/metagenomic_occurrences.tsv.gz',
                        separator='\t')
t1=time.time()
print(f"Read time = {str(timedelta(seconds=round(t1-t0)))}")
print(f"Dataframe dimensions: {meta.shape}")
meta.head(4)

Read time = 0:02:14
Dataframe dimensions: (1288554892, 3)


unigeneID,sampleCode,Occurrence
i64,str,f64
83028116,"""128SUR1QQSS11""",1.683e-10
85649520,"""128SUR1QQSS11""",5.1929e-10
67525,"""128SUR1QQSS11""",2.2329e-10
103429863,"""128SUR1QQSS11""",2.6594e-10


**Molecular function information** 

The protein families database (Pfam).

(from https://www.genoscope.cns.fr/tara/localdata/data/Geneset-v1/pfam.tsv.gz)

In [3]:
t0=time.time()
pfam = pl.read_csv('/srv/storage/oceania@storage1.grenoble.grid5000.fr/TARA/Eukaryote_Gene_Catalog_MATOU/pfam.tsv.gz',
                        separator='\t')
t1=time.time()
print(f"Read time = {str(timedelta(seconds=round(t1-t0)))}")
print(f"Dataframe dimensions: {pfam.shape}")

pfam = pfam.rename({"geneID":"unigeneID"})

pfam.head(4)

Read time = 0:00:05
Dataframe dimensions: (50913598, 3)


unigeneID,pfamAcc,domainBitScore
i64,str,f64
2629491,"""PF00001""",248.9
505662,"""PF00001""",245.0
544990,"""PF00001""",231.5
2748908,"""PF00001""",210.0


**Unigene sequences dataset**

See file format

In [22]:
!zcat ~/Ocean_IA/group_storage/TARA/Eukaryote_Gene_Catalog_MATOU/MATOU-v1.fna.gz | head -n 20

>MATOU-v1_1
GACAGCTCGAGGCGAACCTGTGGCTGAACGAGGGCTCAGGTTCAGCCTCTTGACGTCCGC
AACTACCGTGGTGGCACGGCTTTCCAATGCAACTCTGTCTTCCTGCTCCTACTCGGGCAG
CGGCTGCTCCAGCTCCATTTCACCGCTTGGCTGACGGCGGCGGCGGTAAGAAACTGTTGC
CACACAGGCTGCAAAAACTGCGGCGCTAGCAGCAAAGACCGGCATGACTGCCGATGGTGT
CTCAAAGCGTGGGATGTTGTCAGCATAAAGGCGCGAAGGGACAGCATCTGTGGCACATTT
CTTCAGCCCGTCGTAGCTGTCCTCATTTTCGACACAGTCCTTCAACTTCTGCTTGAGAAC
AGCCACGTCGCTTACATCCTTCACGCAGCACTCGGACGCGGTGCTCCCGCTTCTTGTGCT
GCTAACTTTTTCGCCTGATTTGCAAGCACTGCTGCTCAGCACATCGCTGCAATACGCCCT
GCACACCGCTTTTTGCTCCCTTGTATTGTGAAATCCACACTTCCCATTGTTGTCACACCA
AGTAAAATGGGGCATCTTCCCCTCCCGGTCGTTGAAGTACGCGCAGTATGTGAAAGTGTT
GTCATCGGCCTTCTCCGTGCCCTCGCCAGCGACTGCGAATCCAC
>MATOU-v1_2
GTAACGTTTCCTGTACAAATCTGATTTCAGATGTATTCGGAAACACTCATCGCTGTAATA
GGCACAGACTTCACAAGCCTGTGGCTGAACGAGGGCTCAGGTCACGGACTCCTGACGCGC
CCCTCAGGCCCTACAGTCGTCCGCACTTTCCGTGGTGGCACTTCTTTTCAATGCCGACTC
TGTCTTCCTGCCCCTACTCGGCCAGCGGCAGCTCCACTTCACCACTTGGCTGGCGGCGGC
GGCGGTAAGAAACTGCTGCCACACAAGCTGCAAAAACTGCAGCGCCAGCAGCGAAGACCG
GCATGACTGCCGAAGG

Number of unigenes

In [23]:
!zcat ~/Ocean_IA/group_storage/TARA/Eukaryote_Gene_Catalog_MATOU/MATOU-v1.fna.gz | grep -c "^>"

116849350


In [16]:
ID = pl.read_csv('~/Ocean_IA/group_storage/TARA/Eukaryote_Gene_Catalog_MATOU/clusters.tsv.gz',
                        separator='\t')
print(f"Dataframe dimensions: {ID.shape}")
ID.head(8)

Dataframe dimensions: (6225695, 5)


Proteic-GroupID,ClusterID,NumberUnigenes,Class,TaxonomicAssignation
i64,i64,i64,str,str
0,1,132813,"""ftGF""","""O/U Eukaryota"""
39,2,115340,"""ftGF""","""root"""
35,3,102348,"""ftGF""","""O/U Eukaryota"""
38,4,73872,"""ftGF""","""O/U Eukaryota"""
24677,5,25070,"""ftGF""","""Haptophyceae"""
20,6,23873,"""ftGF""","""O/U Eukaryota"""
33,7,22324,"""tGF""","""Copepoda"""
17923,8,20580,"""ftGF""","""O/U Deuterosto…"


**Join metagenomic dataframe with pfam**

In [4]:
t0=time.time()

metaG = meta.join(pfam, on="unigeneID", how="left" )

t1=time.time()
print(f"Join time = {str(timedelta(seconds=round(t1-t0)))}")
print(f"Dataframe dimensions: {metaG.shape}")
metaG.head(4)

Join time = 0:00:46
Dataframe dimensions: (1736232225, 5)


unigeneID,sampleCode,Occurrence,pfamAcc,domainBitScore
i64,str,f64,str,f64
83028116,"""128SUR1QQSS11""",1.683e-10,,
85649520,"""128SUR1QQSS11""",5.1929e-10,,
67525,"""128SUR1QQSS11""",2.2329e-10,"""PF01694""",81.6
103429863,"""128SUR1QQSS11""",2.6594e-10,,


**Filter dataset based on COI ID**

Pfam ID of Cytochrome c oxidase sub-unit I (COI) --> PF00115

In [12]:
t0 = time.time()

COI = metaG.filter(pl.col("pfamAcc") == "PF00115")

t1 = time.time()
print(f"Time = {str(timedelta(seconds=round(t1-t0)))}")
print(f"Dataframe dimensions: {COI.shape}")
print(f"n° of unique genes: {len(COI['unigeneID'].unique())}")
print(f"n° of diferent samples: {len(COI['sampleCode'].unique())}")
COI.head(4)

Time = 0:00:10
Dataframe dimensions: (22959, 5)
n° of unique genes: 1272
n° of diferent samples: 440


unigeneID,sampleCode,Occurrence,pfamAcc,domainBitScore
i64,str,f64,str,f64
69763190,"""128SUR1QQSS11""",1.7281e-10,"""PF00115""",24.9
14050984,"""128SUR1QQSS11""",1.1164e-10,"""PF00115""",27.6
68688293,"""128SUR1QQSS11""",1.3836e-10,"""PF00115""",34.7
35343029,"""128SUR1QQSS11""",1.1164e-10,"""PF00115""",24.0


**Basic graphical representation**

Desorden

In [15]:
ID = pl.read_csv('~/Ocean_IA/group_storage/TARA/Eukaryote_Gene_Catalog_MATOU/unigeneID_clusterID.tsv.gz',
                        separator='\t')
print(f"Dataframe dimensions: {ID.shape}")
ID.head(8)

Dataframe dimensions: (68756633, 2)


1747,1
i64,i64
1786,1
2183,1
2676,1
3286,1
4386,1
4387,1
4388,1
5023,1
