# Merge GTDB classifications 

In [4]:
import json
import pandas as pd 
from pprint import pprint

Due to RAM constraints, GTDBtk was run on separate batches of genomes. This code reads the $3$ resulting GTDBtk results files.

In [None]:
def table_load(filenum):
    """Load one GTDBtk summary batch, keeping only genome and classification.

    Reads the tab-separated file ``gtdbtk.bac120.summary_<filenum>.tsv``
    from the current working directory.

    Args:
        filenum: Batch identifier interpolated into the file name.

    Returns:
        A DataFrame with the ``user_genome`` and ``classification`` columns,
        in that order.
    """
    summary_path = f'gtdbtk.bac120.summary_{filenum}.tsv'
    wanted_columns = ["user_genome", "classification"]
    return pd.read_csv(summary_path, sep='\t')[wanted_columns]

In [51]:
# Open already annotated classifications.
# The loaded object is a dict whose values are themselves dicts (one per entry).
with open('GTDB_classification.json', 'r') as handle:
    phylojson = json.load(handle)

# Delete "release" info from phylojson.
# pop() with a default tolerates entries that carry no "release" key, where
# `del` would abort the whole cell with a KeyError.
for entry in phylojson.values():
    entry.pop("release", None)

In [None]:
# Load the GTDBtk classification batches.
# This run produced 3 batches, numbered 1 to 3; each is read with table_load.
sset1, sset2, sset3 = (table_load(batch) for batch in (1, 2, 3))

# Collected in batch order for the merge step.
sset_list = [sset1, sset2, sset3]

In [55]:
# Merge classifications
# Every classification string is split on ';' and stored in phylojson under its
# genome name as a dict keyed by the first character of each taxon
# (presumably the rank letter, e.g. "d" for "d__Bacteria" - confirm against the data).
# A genome already present in phylojson is overwritten by the batch entry.
for sset in sset_list:
    # Read both columns by label: positional `info[0]` / `info[1]` on a
    # label-indexed row Series is deprecated in recent pandas releases.
    for genome, cl in zip(sset["user_genome"], sset["classification"]):
        # Empty tokens (e.g. after a trailing ';') have no first character to
        # use as a key, so they are skipped instead of raising IndexError.
        cl_dict = {taxon[0]: taxon for taxon in cl.split(';') if taxon}

        phylojson[genome] = cl_dict

    # Running number of entries after each batch has been merged
    print(len(phylojson))

46


In [56]:
# Save the merged classifications to a new JSON file (4-space indentation).
with open('GTDB_full_classification.json', 'w') as out_handle:
    out_handle.write(json.dumps(phylojson, indent=4))