# Web of Life 2 (WoL2) genome selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Notebook-wide matplotlib defaults (layout, fonts, box-plot styling).
plt.rcParams.update({
    'figure.autolayout': True,  # apply tight_layout automatically to each figure
    'savefig.bbox': 'tight',  # crop saved figures to their content
    'svg.fonttype': 'none',  # keep SVG text as text instead of converting glyphs to paths
    'font.sans-serif': 'Arial',
    'font.size': 12,
    'boxplot.medianprops.linewidth': 2,
    'boxplot.flierprops.markeredgecolor': 'none',  # outlier markers drawn without an edge colour
    'boxplot.flierprops.markersize': 5})

Load metadata of WoL2 genomes. Available at [WoL2 FTP server](http://ftp.microbio.me/pub/wol2/genomes/)

In [3]:
# Assembly metadata table; the first column of the file becomes the row index.
df = pd.read_csv('./input_data/assembly.tsv', sep='\t', index_col=0)
df.shape

(15953, 22)

Load CheckM annotations. Available at [WoL2 FTP server](http://ftp.microbio.me/pub/wol2/genomes/)

In [4]:
# CheckM annotation table; the first column of the file becomes the row index.
checkm = pd.read_csv('./input_data/checkm.tsv', sep='\t', index_col=0)
checkm.shape

(15953, 29)

Load GTDB taxonomy. Available at [WoL2 FTP server](http://ftp.microbio.me/pub/wol2/taxonomy/gtdb/)

In [5]:
# Taxonomic ranks used as column names for the lineage table, in file order.
levels = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']
# Fields are split on a tab or on any ';' (a regex separator, which requires the
# python engine). Presumably each line is "<genome id>\t<rank>;<rank>;..." --
# TODO confirm against the input file.
taxonomy = pd.read_csv('./input_data/lineages_gtdb.txt', index_col = 0,
                       sep = r'\t|;', engine = 'python', header = None,
                       names = levels)
# Trim surrounding whitespace from every label. Non-string cells (e.g. NaN where
# a lineage has fewer fields than ranks) are passed through unchanged instead of
# raising AttributeError on `.strip()`.
taxonomy = taxonomy.map(lambda x: x.strip() if isinstance(x, str) else x)

In [6]:
# Shape check for the parsed taxonomy table: (rows, rank columns).
taxonomy.shape

(15953, 7)

# Selection

## 1. Exclude incomplete genomes

In [7]:
# Keep only assemblies whose level is "Complete Genome" or "Chromosome".
is_complete = df['assembly_level'].isin(['Complete Genome', 'Chromosome'])
df_quality = df.loc[is_complete]
df_quality.shape

(2733, 22)

## 2. Exclude taxa with `num` or fewer representative genomes

In [9]:
# Restrict the taxonomy table to the rows whose index labels appear in df_quality.
# `.loc` raises KeyError if any of those labels is missing from `taxonomy`.
taxonomy_quality = taxonomy.loc[df_quality.index]
taxonomy_quality.shape

(2733, 7)

In [10]:
# Size threshold: a taxon is recorded only when it has strictly more than
# `num` genomes in the quality-filtered set.
num = 10
taxon_records = []
# levels[:-1] leaves out the last rank (species).
for level in levels[:-1]:
    # DataFrame.value_counts() drops missing values by default, so no explicit
    # notna() filter is needed. The result is indexed by 1-tuples.
    counts = taxonomy_quality[[level]].value_counts()
    for (taxon,), n in counts.items():
        # Avoid taxa such as g__ or f__ (rank prefix with no name). Labels that
        # are empty or lack the '__' separator are skipped as well, rather than
        # raising IndexError.
        parts = taxon.split('__')
        if len(parts) < 2 or not parts[1]:
            continue
        if n > num:
            taxon_records.append([level, taxon, n])
# One row per retained taxon: its rank, its label and its genome count.
df_taxa = pd.DataFrame(taxon_records, columns = ['rank', 'taxon', 'counts'])

In [11]:
# Rank at which genomes are selected.
level = 'family'
# Labels of the taxa recorded in df_taxa for that rank.
rank_mask = df_taxa['rank'] == level
taxa = df_taxa.loc[rank_mask, 'taxon'].values

In [12]:
# Keep the genomes whose label at the chosen rank is one of the retained taxa.
in_selected_taxa = taxonomy_quality[level].isin(taxa)
sampled = taxonomy_quality.loc[in_selected_taxa]
sampled.shape

(1510, 7)

In [15]:
# Row labels of the selected subset; these are written to the output file below.
genomes_sampled = sampled.index

Save selected genomes

In [16]:
# Create the output directory first so this cell also succeeds on a fresh
# checkout where ./output_data does not exist yet.
out_dir = './output_data'
os.makedirs(out_dir, exist_ok=True)
# The file name records the rank (`level`) and threshold (`num`) used for the
# selection, so it stays accurate if either is changed. One genome ID per line.
with open(f'{out_dir}/genomes_sampled_{level}_{num}.txt', 'w') as f:
    f.writelines(f'{genome}\n' for genome in genomes_sampled)