In [1]:
import pandas as pd
from pathlib import Path

## Input dataset

In [2]:
# Input tables: the strain table (Excel) and the per-genome metadata (CSV).
# Paths are relative to the notebook's working directory.
strain_table_path = Path("../data/external/103_table_v5.xlsx")
genome_meta_path = Path("../data/external/df_genomes_meta.csv")

df = pd.read_excel(strain_table_path)
df_genomes_meta = pd.read_csv(genome_meta_path)

## Statistics

In [3]:
# Number of genomes per GTDB genus. The bare "g__" prefix is the placeholder
# carried by genomes without a genus assignment, so it is excluded from the
# genus total explicitly rather than by subtracting a hard-coded 1 (which
# would undercount if no genome were unassigned).
genus_counts = df_genomes_meta.genus_gtdb.value_counts()
n_assigned_genera = int((genus_counts.index != "g__").sum())
print("Total genera:", n_assigned_genera)
genus_counts

Total genera: 27


genus_gtdb
g__Streptomyces         885
g__Micromonospora        36
g__Kitasatospora         27
g__Nocardia              21
g__Kribbella             11
g__Amycolatopsis          8
g__Streptosporangium      7
g__Microbispora           5
g__Nonomuraea             4
g__Kocuria                4
g__Nocardioides           3
g__Spirillospora          3
g__Embleya                2
g__Rhodococcus_C          2
g__Micromonospora_G       2
g__                       2
g__Mycobacterium          1
g__Aldersonia             1
g__Kineococcus            1
g__Micrococcus            1
g__Williamsia_A           1
g__Dactylosporangium      1
g__Actinoplanes           1
g__Lentzea                1
g__Sphaerisporangium      1
g__Micromonospora_E       1
g__Actinoallomurus        1
g__Actinomycetospora      1
Name: count, dtype: int64

There are two genomes with an unassigned genus:

In [4]:
# Show the metadata rows whose GTDB genus is only the bare "g__" prefix.
df_genomes_meta.loc[df_genomes_meta["genus_gtdb"].eq("g__")]

Unnamed: 0,genome_id,quality,organism,genus,species,strain,isolation_source,country,location,lat_lon,collection_date,genus_gtdb,species_gtdb,Mash_species
695,NBC_01635,HQ,Streptomyces sp. NBC 01635,Streptomyces,sp.,NBC 01635,soil,Denmark,"Kolding, ved Kolding Hus I rabarber bed",55.4914 N 9.4758 E,2021-05,g__,s__,
775,NBC_01309,HQ,Streptomycetaceae,Streptomycetaceae,,NBC_01309,,Denmark,"Moensted, Moensted open limestone mine",56.4519 N 9.1717 E,2020-07-21,g__,s__,


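Based on GTDB-Tk, there are a total of 27 assigned genera plus 2 genomes with an unassigned genus, making ~28–29 genera in total in the dataset (depending on whether the two unassigned genomes belong to the same genus).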

In [5]:
# Keep only genomes whose GTDB genus is not the bare "g__" placeholder, then
# count genomes per GTDB species among those whose species is not the bare
# "s__" placeholder.
has_gtdb_genus = df_genomes_meta["genus_gtdb"].ne("g__")
df_g1032 = df_genomes_meta.loc[has_gtdb_genus]

has_gtdb_species = df_g1032["species_gtdb"].ne("s__")
species_gtdb_identified = df_g1032.loc[has_gtdb_species, "species_gtdb"].value_counts()

In [6]:
# Count genomes per MASH species, using only rows where a MASH species is present.
species_mash_identified = df_g1032["Mash_species"].dropna().value_counts()

In [7]:
# One-sentence summary of the species tallies; the cell displays the string.
n_gtdb_species = len(species_gtdb_identified)
n_mash_species = len(species_mash_identified)
n_unassigned_genomes = len(df_genomes_meta[df_genomes_meta.genus_gtdb == 'g__'])
(
    f"Total species that can be identified: {n_gtdb_species + n_mash_species}. "
    f"Identifided with GTDBtk: {n_gtdb_species}. "
    f"Identifided with MASH: {n_mash_species}. "
    f"There are {n_unassigned_genomes} genomes that cannot be assigned."
)

'Total species that can be identified: 389. Identifided with GTDBtk: 145. Identifided with MASH: 244. There are 2 genomes that cannot be assigned.'

## Cleanup

In [8]:
# Build the per-genome taxonomy table that is merged into the supplementary
# table: cleaned GTDB genus, "<genus> <species>" GTDB name, the MASH species,
# and a combined "assigned_species" label of the form
# "<genus> <species> (<MASH species>)".
taxonomy_columns = ["genome_id", "genus_gtdb", "species_gtdb", "Mash_species"]
# .copy() so the column assignments below never act on a view of df_genomes_meta.
species_assignment = df_genomes_meta.loc[:, taxonomy_columns].copy()

# Genomes whose GTDB genus is only the bare "g__" prefix; captured before the
# prefix is stripped so they can be labelled "N/A" at the end.
genus_unassigned = df_genomes_meta["genus_gtdb"] == "g__"

# regex=False everywhere: these are literal substitutions. It matters for
# "()", which as a regular expression is an empty group and would leave the
# literal parentheses in place (pandas < 2.0 defaulted to regex=True).
species_assignment["genus_gtdb"] = species_assignment["genus_gtdb"].str.replace("g__", "", regex=False)
# NOTE(review): this replaces every "s__" occurrence with "sp.", not only a
# bare "s__" placeholder - confirm assigned species values carry no "s__" prefix.
species_assignment["species_gtdb"] = species_assignment["species_gtdb"].str.replace("s__", "sp.", regex=False)

# MASH species wrapped in parentheses; a missing value becomes "()" and is
# then removed so no empty parentheses end up in the label.
mash_label = "(" + species_assignment["Mash_species"].fillna("") + ")"
species_assignment["assigned_species"] = mash_label.str.replace("()", "", regex=False)

species_assignment["species_gtdb"] = species_assignment["genus_gtdb"] + " " + species_assignment["species_gtdb"]
species_assignment["assigned_species"] = (
    species_assignment["species_gtdb"] + " " + species_assignment["assigned_species"]
).str.strip()

# Genomes without a GTDB genus get "N/A" in all derived taxonomy columns.
species_assignment.loc[genus_unassigned, ["assigned_species", "species_gtdb", "genus_gtdb"]] = "N/A"

df_species_assignment = species_assignment.rename(columns={"Mash_species": "MASH_species"}).set_index("genome_id")

In [9]:
# Write the supplementary table: the strain table joined with the per-genome
# species assignment, matching strain_name against the genome_id index.
outfile = Path("../tables/Supplementary.xlsx")
outfile.parent.mkdir(parents=True, exist_ok=True)

# how="inner" is merge's default, written out here: strains with no matching
# genome_id in df_species_assignment are left out of the exported table.
df_final = df.set_index("strain_name").merge(
    df_species_assignment,
    how="inner",
    left_index=True,
    right_index=True,
)
df_final.to_excel(outfile)