# Check if sufficient genomes are present per species

## Preparation

In [125]:
import pandas as pd
from pathlib import Path

In [126]:
# Parameters
# Minimum number of genomes a species must have to count as sufficiently sampled.
threshold: int = 10

In [127]:
# Treat the parent of the (resolved) current working directory as the project root.
working_dir = Path().resolve()
project_path = working_dir.parent

## Lactobacillus

In [138]:
# CSV with the Lactobacillus genomes, located in the project's "data" directory.
path_Lactobacillus = project_path.joinpath("data", "genomes_Lactobacillus.csv")

In [139]:
# Load the Lactobacillus table and label its two columns in a single chained step.
# NOTE(review): read_csv treats the first line of the file as the header by default,
# so if this CSV has no header row its first record would be consumed as column
# labels and then overwritten here -- confirm the file really starts with a header.
genomes_Lactobacillus = pd.read_csv(path_Lactobacillus).set_axis(
    ["genome", "gtdb_species"], axis="columns"
)

In [140]:
# Show the loaded table as the cell output (pandas abbreviates long frames when rendering).
genomes_Lactobacillus

Unnamed: 0,genome,gtdb_species
0,GCA_000219475.3,Lactobacillus johnsonii
1,GCA_000442765.1,Lactobacillus amylovorus
2,GCA_000980505.1,Lactobacillus amylovorus
3,GCA_001311275.1,Lactobacillus delbrueckii
4,GCA_001311335.1,Lactobacillus kefiranofaciens
...,...,...
273,GCA_006740305.1,Lactobacillus delbrueckii
274,GCA_006982025.1,Lactobacillus paragasseri
275,GCA_900112665.1,Lactobacillus bombicola
276,GCA_900196735.1,Lactobacillus delbrueckii


In [141]:
# Genomes per species: one row per `gtdb_species` value, with the number of
# non-null `genome` entries for that species in the `counts` column.
counts = (
    genomes_Lactobacillus
    .groupby("gtdb_species")["genome"]
    .count()
    .reset_index(name="counts")
)
counts

Unnamed: 0,gtdb_species,counts
0,Lactobacillus acetotolerans,6
1,Lactobacillus acidophilus,2
2,Lactobacillus amylolyticus,4
3,Lactobacillus amylovorus,13
4,Lactobacillus apis,4
5,Lactobacillus bombicola,8
6,Lactobacillus crispatus,48
7,Lactobacillus delbrueckii,37
8,Lactobacillus delbrueckii_A,1
9,Lactobacillus equicursoris,2


In [142]:
# Number of species in `counts` that have at least `threshold` genomes.
has_enough_genomes = counts["counts"] >= threshold
counts.loc[has_enough_genomes, "gtdb_species"].count()

8

## Lactobacillales

In [143]:
# CSV with the Lactobacillales genome metadata, located in the project's "data" directory.
path_Lactobacillales = project_path.joinpath("data", "genomes_metadata.csv")

In [144]:
# Load the Lactobacillales metadata and keep only the two columns this check needs.
# The columns are selected by name rather than with the label slice
# 'genome':'gtdb_species': a slice also pulls in any columns that sit between the two
# and returns an empty frame if their order in the CSV is reversed, whereas the
# per-species count further down expects exactly these two columns.
genomes_Lactobacillales = pd.read_csv(path_Lactobacillales)[["genome", "gtdb_species"]]
genomes_Lactobacillales

Unnamed: 0,genome,gtdb_species
0,GCA_000143435.1,Ligilactobacillus salivarius
1,GCA_000167775.1,Streptococcus agalactiae
2,GCA_000167795.1,Streptococcus agalactiae
3,GCA_000179475.1,Ligilactobacillus salivarius
4,GCA_000195355.2,Lactobacillus helveticus
...,...,...
4366,GCA_902165155.1,Enterococcus_B faecium_B
4367,GCA_902165245.1,Enterococcus_B faecium
4368,GCA_902165765.1,Enterococcus_B faecium_B
4369,GCA_902165865.1,Enterococcus_B faecium


In [145]:
# Genomes per species for the Lactobacillales dataset: one row per `gtdb_species`
# value, with the number of non-null `genome` entries in the `counts` column.
# NOTE: this rebinds `counts`, which previously held the Lactobacillus per-species table.
# The `genome` column is counted explicitly and the result is named via reset_index,
# instead of overwriting the column labels by position, which raises a ValueError as
# soon as the input frame carries more than two columns.
counts = (
    genomes_Lactobacillales
    .groupby("gtdb_species")["genome"]
    .count()
    .reset_index(name="counts")
)
counts

Unnamed: 0,gtdb_species,counts
0,26KH-42 sp004358325,1
1,Abiotrophia defectiva,1
2,Abiotrophia sp001815865,1
3,Abiotrophia sp900604935,1
4,Aerococcus christensenii,3
...,...,...
811,Weissella paramesenteroides,3
812,Weissella soli,1
813,Weissella thailandensis,2
814,Weissella viridescens,2


In [146]:
# Number of species in `counts` that have at least `threshold` genomes.
has_enough_genomes = counts["counts"] >= threshold
counts.loc[has_enough_genomes, "gtdb_species"].count()

82

Conclusion: only 8 species of Lactobacillus have enough genomes in the dataset (at least 10, the threshold set above) to distinguish between core and accessory genome. Within the order Lactobacillales, 82 species have a sufficient number of genomes. Therefore, we continue with the Lactobacillales order.