In [1]:
import pandas as pd 
from pathlib import Path
import numpy as np

def read_uniprot(uniprot_metadata):
    """Load the UniProt metadata table and discard viral entries.

    Parameters
    ----------
    uniprot_metadata : path or file-like
        Tab-separated file with a header row; handed directly to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table without the "Proteome_ID", "OSCODE", "#(1)", "#(2)" and
        "#(3)" columns, with "Species Name" renamed to "Species_Name", and
        limited to rows whose "SUPERREGNUM" differs from "viruses".
        The original row index is kept.
    """
    unused_columns = ["Proteome_ID", "OSCODE", "#(1)", "#(2)", "#(3)"]

    table = pd.read_csv(uniprot_metadata, sep='\t')
    table = table.drop(columns=unused_columns)
    table = table.rename(columns={"Species Name": "Species_Name"})

    not_viral = table["SUPERREGNUM"] != "viruses"
    return table[not_viral]

def read_goa(
    goa_metadata,
    processed_uniprot_metadata
):
    """Load the GOA metadata table, keeping only taxa known to UniProt.

    Parameters
    ----------
    goa_metadata : path or file-like
        Tab-separated file WITHOUT a header row; its three columns are
        named "Species_Name", "Tax_ID" and "GOA_file" on load.
    processed_uniprot_metadata : pandas.DataFrame
        Frame providing a "Tax_ID" column used as the allow-list.

    Returns
    -------
    pandas.DataFrame
        The GOA rows whose "Tax_ID" occurs in
        ``processed_uniprot_metadata['Tax_ID']``; the original row index
        is kept.
    """
    goa_columns = ["Species_Name", "Tax_ID", "GOA_file"]
    goa_table = pd.read_csv(
        goa_metadata, sep='\t', header=None, names=goa_columns
    )

    allowed_tax_ids = processed_uniprot_metadata['Tax_ID']
    is_allowed = goa_table['Tax_ID'].isin(allowed_tax_ids)
    return goa_table[is_allowed]


In [None]:
# Load the UniProt table (viral rows removed by read_uniprot), then the GOA
# table restricted to the Tax_IDs that survived in the UniProt table.
uniprot_metadata = read_uniprot("UniProt_metadata.txt")
goa_metadata = read_goa("GOA_metadata.txt", uniprot_metadata)

# Attach the SUPERREGNUM of each GOA entry via its Tax_ID. A left join keeps
# every GOA row; the GOA file name is not needed afterwards.
superregnum_by_tax_id = uniprot_metadata[['Tax_ID', "SUPERREGNUM"]]
metadata = (
    goa_metadata
    .merge(superregnum_by_tax_id, how='left', on='Tax_ID')
    .drop(columns=['GOA_file'])
)


In [3]:
# Column names given, in file order, to the fields of rankedlineage.dmp when
# it is read without a header row.
# NOTE(review): the read cell later renames 'Realm' to 'Superkingdom' and
# drops 'Domain' and 'Species' — confirm these labels match the field layout
# of the rankedlineage.dmp release in use.
lineage_names = [
    'Tax_ID', 'Name', 'Species',
    'Genus', 'Family', 'Order', 'Class', 'Phylum', 'Kingdom',
    'Realm', 'Domain',
]

In [4]:
# Read the ranked-lineage dump. Every field is loaded as text, and
# keep_default_na=False prevents pandas from converting any field to NaN
# while parsing; missing values are introduced explicitly further down.
lineage = pd.read_csv(
    "rankedlineage.dmp",
    sep='|',
    header=None,
    names=lineage_names,
    dtype=str,
    keep_default_na=False,
)

# Parse the IDs first so the table can be cut down to the taxa present in
# `metadata` before the remaining columns are cleaned; this avoids stripping
# every text column of the complete dump. Unparseable IDs become NaN.
tax_ids = pd.to_numeric(lineage['Tax_ID'].str.strip(), errors='coerce')
in_metadata = tax_ids.isin(metadata['Tax_ID'])

lineage = (
    lineage.loc[in_metadata]
    .drop(columns=['Domain', 'Species'])
    .rename(columns={"Realm": "Superkingdom"})
    .copy()  # independent frame, so the column assignments below are safe
)
lineage['Tax_ID'] = tax_ids[in_metadata]

# Strip surrounding whitespace from the text columns and mark empty fields as
# missing. Series.where keeps each column's dtype, whereas
# DataFrame.replace('', np.nan) relies on pandas' deprecated silent
# downcasting and emits a FutureWarning.
for col in lineage.columns.drop('Tax_ID'):
    stripped = lineage[col].str.strip()
    lineage[col] = stripped.where(stripped != '', np.nan)


  lineage = lineage.replace('', np.nan)


In [5]:
# Distinct Tax_IDs on each side, as plain Python values.
lineage_set = {*lineage['Tax_ID'].unique().tolist()}
metadata_set = {*metadata['Tax_ID'].unique().tolist()}

# Tax_IDs present in the metadata but absent from the filtered lineage table
# (displayed as the cell output).
metadata_set - lineage_set

{59451, 85948, 436010, 1914757}

Add the lineages of the four Tax_IDs listed above, which are missing from the filtered lineage table, using values taken from the NCBI Taxonomy Browser.

In [6]:
# Hand-entered lineages for the Tax_IDs found in `metadata` but not in
# `lineage`. One row per taxon, values in the order of `missing_data_columns`;
# np.nan marks a rank for which no value was entered.
missing_data = [
    [59451, 'Cnephaeus nilssonii', 'Cnephaeus', 'Vespertilionidae', 'Chiroptera', 'Mammalia', 'Chordata', 'Metazoa', 'Eukaryota'],
    [85948, 'Lojkania enalia', 'Lojkania', np.nan, 'Pleosporales', 'Dothideomycetes', 'Ascomycota', 'Fungi', 'Eukaryota'],
    [436010, 'Athelia psychrophila', 'Athelia', 'Atheliaceae', 'Atheliales', 'Agaricomycetes', 'Basidiomycota', 'Fungi', 'Eukaryota'],
    [1914757, 'Sphingobacterium cellulitidis', 'Sphingobacterium', "Sphingobacteriaceae", 'Sphingobacteriales', 'Sphingobacteriia', 'Bacteroidota', 'Pseudomonadati', 'Bacteria']
]

# Column layout of the manual rows. It is deliberately NOT stored in
# `lineage_names`: that name holds the 11 field names used to parse
# rankedlineage.dmp, and rebinding it here would make the read cell fail if it
# were re-run after this cell.
missing_data_columns = [
    'Tax_ID',
    'Name',
    'Genus',
    'Family',
    'Order',
    'Class',
    'Phylum',
    'Kingdom',
    'Superkingdom'
]

missing_data_df = pd.DataFrame(missing_data, columns=missing_data_columns)

# Append the manual rows and renumber the index 0..n-1.
full_lineage = pd.concat([lineage, missing_data_df]).reset_index(drop=True)

In [7]:
# Rank columns in the order they are scanned when looking for a fallback:
# for a given column, candidates are the columns to its right.
taxonomic_levels = ['Genus', 'Family', 'Order', 'Class', 'Phylum', 'Kingdom', 'Superkingdom']

# Row-wise backfill: each cell holds its own value if present, otherwise the
# first non-missing value found in a later column of `taxonomic_levels`.
# Computed once, before any filling, so inserted placeholders are never used
# as anchors themselves.
full_lineage_anchors = full_lineage[taxonomic_levels].bfill(axis=1)

for col in taxonomic_levels:
    anchor = full_lineage_anchors[col]

    # Fill only cells that are missing AND for which an anchor was found;
    # cells with no anchor at all stay missing.
    fillable = full_lineage[col].isna() & anchor.notna()
    if not fillable.any():
        continue

    full_lineage.loc[fillable, col] = "Unclassified_" + anchor[fillable].astype(str)

# The taxon name column is referred to as "Species" from here on.
full_lineage = full_lineage.rename(columns={"Name": "Species"})

In [8]:
def capitalize_species_name(
    species_name
):
    """Remove quote/bracket characters and upper-case the first character.

    Parameters
    ----------
    species_name : str
        Name to clean. Every "'", "[" and "]" character is removed.

    Returns
    -------
    str
        The cleaned name with its first character upper-cased and the rest
        left untouched. If nothing remains after cleaning (for example the
        input is "" or "[]"), an empty string is returned.
    """
    cleaned = species_name.replace("'","").replace("[","").replace("]","")
    # Slicing rather than indexing: an empty name yields "" instead of raising
    # IndexError.
    return cleaned[:1].upper() + cleaned[1:]

# Clean up and capitalise every species name.
full_lineage['Species'] = full_lineage['Species'].map(capitalize_species_name)

# Convert every column to text and replace spaces with underscores.
# NOTE(review): astype(str) also renders any remaining NaN as the literal
# string "nan" — confirm that is the intended representation in the output.
full_lineage = full_lineage.astype(str).apply(
    lambda column: column.str.replace(' ', '_', regex=False)
)

# Order the rows by Superkingdom and write a tab-separated file without the
# index column.
final_lineage = full_lineage.sort_values("Superkingdom")
final_lineage.to_csv("lineage.txt", sep='\t', index=False)