In [1]:
#  kraken2 report to OTU table and Tax table script
import pandas as pd
import numpy as np
import os
from pandas import read_csv, DataFrame
from ete3 import NCBITaxa
import click
#  one-time download, later just updates silently
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()

NCBI database not present yet (first time used?)
Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
2577896 names loaded.
327591 synonyms loaded.
Loading nodes...
2577896 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /home/fedorov_de/.etetoolkit/taxa.sqlite ...
 2577000 generating entries... 

Inserting synonyms:      15000 


Uploading to /home/fedorov_de/.etetoolkit/taxa.sqlite



Inserting synonyms:      325000 




Inserting taxids:       15000     




Inserting taxids:       2575000            




Local taxdump.tar.gz seems up-to-date


Loading node names...
2577896 names loaded.
327591 synonyms loaded.
Loading nodes...
2577896 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /home/fedorov_de/.etetoolkit/taxa.sqlite ...
 2577000 generating entries... 
Uploading to /home/fedorov_de/.etetoolkit/taxa.sqlite


Inserting synonyms:      15000 




Inserting synonyms:      325000 




Inserting taxids:       20000   




Inserting taxids:       2575000     




In [None]:
#  tryin' click for the 1st time
@click.command()
@click.option('--folder_path', default='./', help='Path to Kraken2 reports folder.')
@click.option('--outdir',default ='./', help='Folder to store resulting OTU table and Tax table.')
def read_path_to_data(folder_path, outdir):
    """Read path to Kraken2 reports folder"""
        click.echo(f'Received input argument: {folder_path}')
        

if __name__ == '__main__':
    read_path_to_data()

In [47]:
folder_path = './reports'  # Replace with the actual path to your folder, hardcoded now
file_extension = '.report'

COUNT_COL_IDX = 2
SCIENTIFIC_NAME_COL_IDX = 5
TAXID_COL_IDX = 4


# create an empty DataFrame to store the data
merged_df = pd.DataFrame()
merged_df_taxid = pd.DataFrame()
# iterate over the files in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith(file_extension):
        file_path = os.path.join(folder_path, file_name)
        
        
        #  extract the desired columns from each file. For Tax table it is 5th col, for OTU table it is 3rd and 6th columns
        temp_df = pd.read_csv(file_path, delimiter='\t', skiprows=1, usecols=[COUNT_COL_IDX, SCIENTIFIC_NAME_COL_IDX, TAXID_COL_IDX], header=None)
        temp_df_taxid = pd.read_csv(file_path, delimiter='\t', skiprows=1, usecols=[TAXID_COL_IDX], header=None)
        
        #  extract the sample number from the file name
        sample_number = os.path.splitext(file_name)[0]
        
        #  add the sample number as a new column in the DataFrame
        temp_df['sample_number'] = sample_number
        temp_df_taxid['sample_number'] = sample_number
        
        # concatenate the data into one DataFrame
        merged_df = pd.concat([merged_df, temp_df])
        merged_df_taxid = pd.concat([merged_df_taxid, temp_df_taxid])
#  renaming columns for the pre-OTU table (here in long format)
merged_df.columns=["Count", "Taxid", "Taxon",   "Sample"]
merged_df= merged_df.reset_index(drop=True)
#  Removing leading spaces of taxonomic labels with regex
merged_df = merged_df.replace(r"^ +| +$", r"", regex=True)
#  The same renaming for Tax table
merged_df_taxid.columns =['TaxID', 'Sample']
merged_df_taxid = merged_df_taxid.reset_index(drop=True)

In [45]:
otu_table = merged_df[['Count', 'Taxid', 'Sample']].pivot(index='Sample', columns='Taxid', values='Count')

In [52]:
#  creating an OTU table
tax_table = []
desired_ranks = ['root', 'domain', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain']

def fill_list_with_nulls_to_desired_length(my_list, target_length):
    while len(my_list) < target_length:
        my_list.append(None)
    return my_list


for row in merged_df.to_dict(orient='records')[0:10]:
    # TODO Обработать момент когда такс id = 2637697 
    lineage = ncbi.get_lineage(row['Taxid'])

    names = ncbi.get_taxid_translator(lineage)
    names = [names[taxid] for taxid in lineage]
    
    names_filled = fill_list_with_nulls_to_desired_length(names, len(desired_ranks))
    names_filled = [row['Taxid']] + names_filled

    tax_table.append(names_filled)
    
tax_table = pd.DataFrame(tax_table)
tax_table.set_index(0, inplace=True)

tax_table.columns = desired_ranks

In [53]:
tax_table

Unnamed: 0_level_0,root,domain,kingdom,phylum,class,order,family,genus,species,strain
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,root,,,,,,,,,
131567,root,cellular organisms,,,,,,,,
2,root,cellular organisms,Bacteria,,,,,,,
1224,root,cellular organisms,Bacteria,Pseudomonadota,,,,,,
1236,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,,,,,
91347,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,,,,
543,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,,,
561,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,,
562,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,
83334,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,Escherichia coli O157:H7


In [54]:
tax_table

Unnamed: 0_level_0,root,domain,kingdom,phylum,class,order,family,genus,species,strain
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,root,,,,,,,,,
131567,root,cellular organisms,,,,,,,,
2,root,cellular organisms,Bacteria,,,,,,,
1224,root,cellular organisms,Bacteria,Pseudomonadota,,,,,,
1236,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,,,,,
91347,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,,,,
543,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,,,
561,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,,
562,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,
83334,root,cellular organisms,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,Escherichia coli O157:H7


In [41]:


for sample in merged_df['Sample'].unique():
    
    sample_subset = merged_df[merged_df['Sample'] == sample]
    otu_table[sample] = {sample_subset['Taxid'][idx] : sample_subset['Count'][idx] for idx in sample_subset.index}
otu_table = DataFrame(otu_table).fillna(0).T
zero_cols = otu_table.columns[(otu_table <=0).all()]
otu_table.drop(labels=zero_cols, axis=1, inplace=True)
otu_table.head(10)

TypeError: list indices must be integers or slices, not str

In [64]:
#  smol df just for testing
taxid_test = merged_df_taxid.sample(n=3)
taxid_test

Unnamed: 0,TaxID,Sample
75984,79604,23NVShed4_a
111240,110505,23NVShed1_o
114016,1301283,23NVShed1_o


In [67]:
taxo_names_df = pd.DataFrame()
for _, row in taxid_test.iterrows():
    taxid = int(row['TaxID'])
    lineage = ncbi.get_lineage(taxid)
    #print(lineage)
    names = ncbi.get_taxid_translator(lineage)
    print([names[taxid] for taxid in lineage])
#  later add filtration to save only necessary ranks
#  this might help i guess: 
#  https://stackoverflow.com/questions/36503042/how-to-get-taxonomic-specific-ids-for-kingdom-phylum-class-order-family-gen/36517712#36517712?newreg=1131bb17c142425887049bc55eff0c9f


['root', 'cellular organisms', 'Bacteria', 'Terrabacteria group', 'Actinomycetota', 'Coriobacteriia', 'Eggerthellales', 'Eggerthellaceae', 'Denitrobacterium', 'Denitrobacterium detoxificans']
['root', 'cellular organisms', 'Bacteria', 'Terrabacteria group', 'Actinomycetota', 'Actinomycetes', 'Mycobacteriales', 'Mycobacteriaceae', 'Mycobacterium', 'Mycobacterium heckeshornense']
['root', 'cellular organisms', 'Bacteria', 'Terrabacteria group', 'Cyanobacteriota/Melainabacteria group', 'Cyanobacteriota', 'Cyanophyceae', 'Oscillatoriophycideae']
