In [1]:
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
from ete3 import NCBITaxa
from Bio import Entrez
from xml.etree import ElementTree

# REQUIRED: replace with your email
Entrez.email = "your_email@example.com"


# Define paths

In [4]:
# Root directory holding the per-database result folders read below
# (GenBank_phage, GenBank_viral, IMG_VR).
# NOTE(review): absolute, cluster-specific path — the notebook cannot run on
# another machine without editing this value.
Data_path = "/nfs/research/rdf/kam/projects/EVADES_final-2025-10-09/analyses/ADP_homologs/"

In [6]:
ncbi = NCBITaxa()

def get_lineage_from_taxid_local(taxid):
    """Look up the taxonomic lineage of `taxid` through the module-level `ncbi` object.

    Returns:
        A list of ``(rank, name)`` tuples, one per taxon in the lineage, in the
        order `ncbi.get_lineage` reports them.
        If anything in the lookup raises, the exception is NOT propagated:
        a human-readable error *string* is returned instead, so callers must
        be prepared for a `str` in place of the list.
    """
    try:
        lineage_ids = ncbi.get_lineage(taxid)
        name_by_id = ncbi.get_taxid_translator(lineage_ids)
        rank_by_id = ncbi.get_rank(lineage_ids)

        pairs = []
        for tid in lineage_ids:
            pairs.append((rank_by_id[tid], name_by_id[tid]))
        return pairs
    except Exception as e:
        # Best-effort lookup: report the failure as a value rather than raising.
        return f"❌ Error fetching TaxID {taxid}: {e}"

In [7]:
# Convert lineage column to separate taxonomy columns
def parse_lineage(lineage_list):
    """Convert a lineage of ``(rank, name)`` pairs into a ``{rank: name}`` dict.

    Args:
        lineage_list: sequence of ``(rank, name)`` tuples as produced by
            `get_lineage_from_taxid_local`. The first entry is skipped.

    Returns:
        dict mapping rank -> name. When a rank occurs more than once in the
        lineage (e.g. several ``'no rank'`` entries) the last occurrence wins.
        An empty dict is returned when the input is not a lineage at all
        (None, NaN, or the error string `get_lineage_from_taxid_local`
        returns on failure).
    """
    # A failed lookup yields an error string and a missing value yields None/NaN.
    # Slicing and unpacking those would raise an opaque ValueError/TypeError and
    # abort the whole DataFrame.apply, so map them to "no taxonomy" instead.
    if lineage_list is None or isinstance(lineage_list, (str, bytes, float)):
        return {}
    # skip the first tuple
    return {rank: name for rank, name in lineage_list[1:]}

## GenBank_phage

In [8]:
ls /nfs/research/rdf/kam/projects/EVADES_final-2025-10-09/analyses/ADP_homologs/GenBank_phage

GenBank_phage.faa
GenBank_phage.txt
GenBank_phage_unique.txt
blast_filtered.tsv
blast_filtered_with_sequences.tsv
blast_filtered_with_sequences_and_lineage.tsv
blast_raw.tsv
fetch_sequences.sh
filter-55686223.out
protein_ids_accessions.txt


In [9]:
GenBank_phage_matches = pd.read_csv(Path(Data_path) / "GenBank_phage" / "blast_filtered_with_sequences.tsv", 
                                    sep='\t')

In [10]:
GenBank_phage_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence
0,pnk,CAA26792.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...
1,pnk,CAM1377250.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...
2,pnk,CAM1376987.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...
3,pnk,QPI17447.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...
4,pnk,AIT75404.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...


In [11]:
# ';'-separated metadata file; column names are assigned here via `names=`
# rather than read from the file.
# NOTE(review): the fifth column is called "lineage" but is later passed to
# get_lineage_from_taxid_local as a taxid (the GenBank_viral section names the
# same field "taxid") — consider renaming for consistency.
GenBank_phage_accessions = pd.read_csv(Path(Data_path) / "GenBank_phage" / "protein_ids_accessions.txt", 
                                       sep=';', names=["sseqid", "protein name", "species", "accession_and_coordinates", "lineage"])

In [12]:
# Keep only the accession: the text before the first ':' in
# 'accession_and_coordinates' (everything after it is discarded).
GenBank_phage_accessions['accession'] = GenBank_phage_accessions['accession_and_coordinates'].apply(
    lambda field: field.partition(":")[0]
)

In [13]:
# Remove the leading '>' marker from the protein IDs (presumably present as in
# the GenBank_viral table) so they match the BLAST `sseqid` values used as the
# merge key. The marker is stripped only when it is actually there: the previous
# unconditional x[1:] removed a real character from every ID whenever this cell
# was executed a second time, after which the merge silently lost rows.
GenBank_phage_accessions['sseqid'] = GenBank_phage_accessions['sseqid'].apply(
    lambda x: x[1:] if x.startswith(">") else x
)

In [14]:
GenBank_phage_accessions = GenBank_phage_accessions[['sseqid', 'accession', 'lineage']]

In [15]:
GenBank_phage_matches.shape

(18585, 11)

In [16]:
# Attach 'accession' and the taxid (column 'lineage') to each BLAST hit, keyed on sseqid.
# NOTE(review): merge() defaults to an inner join, so hits whose sseqid is missing
# from the accession table are dropped without warning. The frame had 18585 rows
# before this cell, while the lineage table displayed further down ends at index
# 18583 — confirm that the change in row count is expected.
GenBank_phage_matches = GenBank_phage_matches.merge(GenBank_phage_accessions, on='sseqid')

In [17]:
get_lineage_from_taxid_local('10665')

[('no rank', 'root'),
 ('acellular root', 'Viruses'),
 ('realm', 'Duplodnaviria'),
 ('kingdom', 'Heunggongvirae'),
 ('phylum', 'Uroviricota'),
 ('class', 'Caudoviricetes'),
 ('family', 'Straboviridae'),
 ('subfamily', 'Tevenvirinae'),
 ('genus', 'Tequatrovirus'),
 ('species', 'Tequatrovirus T4')]

In [18]:
# Replace every taxid in 'lineage' with the value returned by
# get_lineage_from_taxid_local (a list of (rank, name) pairs, or an error string).
# NOTE: the column is overwritten in place, so this cell must not be re-run on its own.
GenBank_phage_matches['lineage'] = GenBank_phage_matches['lineage'].apply(get_lineage_from_taxid_local)

In [19]:
GenBank_phage_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence,accession,lineage
0,pnk,CAA26792.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,X03007.1,"[(no rank, root), (acellular root, Viruses), (..."
1,pnk,CAM1377250.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,OZ209057.1,"[(no rank, root), (acellular root, Viruses), (..."
2,pnk,CAM1376987.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,OZ209023.1,"[(no rank, root), (acellular root, Viruses), (..."
3,pnk,QPI17447.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,MT984581.1,"[(no rank, root), (acellular root, Viruses), (..."
4,pnk,AIT75404.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,KM607003.1,"[(no rank, root), (acellular root, Viruses), (..."


In [20]:
# Apply transformation: parse_lineage turns each row's lineage into a
# {rank: name} dict, then .apply(pd.Series) spreads those dicts into one column
# per rank. Ranks absent from a given row are left empty in that row.
lineage_expanded = GenBank_phage_matches['lineage'].apply(parse_lineage).apply(pd.Series)

In [21]:
GenBank_phage_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence,accession,lineage
0,pnk,CAA26792.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,X03007.1,"[(no rank, root), (acellular root, Viruses), (..."
1,pnk,CAM1377250.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,OZ209057.1,"[(no rank, root), (acellular root, Viruses), (..."
2,pnk,CAM1376987.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,OZ209023.1,"[(no rank, root), (acellular root, Viruses), (..."
3,pnk,QPI17447.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,MT984581.1,"[(no rank, root), (acellular root, Viruses), (..."
4,pnk,AIT75404.1,0.0,100.0,301,301,1,301,1,301,MKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHEE...,KM607003.1,"[(no rank, root), (acellular root, Viruses), (..."


In [22]:
lineage_expanded

Unnamed: 0,acellular root,realm,kingdom,phylum,class,family,subfamily,genus,species,no rank,order,isolate
0,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Tequatrovirus,Tequatrovirus T4,,,
1,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Tequatrovirus,Tequatrovirus T4,Escherichia phage T4,,
2,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Tequatrovirus,Tequatrovirus T4,Escherichia phage T4,,
3,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Tequatrovirus,Tequatrovirus T4,Escherichia phage T4,,
4,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Tequatrovirus,Tequatrovirus T4,Enterobacteria phage RB59,,
...,...,...,...,...,...,...,...,...,...,...,...,...
18580,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Mosigvirus,Escherichia phage vB-Eco-KMB38,unclassified Mosigvirus,,
18581,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Mosigvirus,Mosigvirus phapec2,Escherichia phage AlbertHofmann,,
18582,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,Tevenvirinae,Mosigvirus,Escherichia phage PTK,unclassified Mosigvirus,,
18583,Viruses,Duplodnaviria,Heunggongvirae,Uroviricota,Caudoviricetes,Straboviridae,,Carettavirus,Carettavirus e142,Escherichia phage phiE142,,


In [23]:
GenBank_phage_matches_with_lineage = pd.concat([GenBank_phage_matches.drop(columns=['lineage']), lineage_expanded], axis=1)


In [24]:
GenBank_phage_matches_with_lineage['source'] = 'GenBank Phage'

In [25]:
# Write the annotated phage hits as a TSV in the GenBank_phage folder.
# NOTE(review): the DataFrame index is written as well (no index=False), and the
# frame still carries the 'Unnamed: 0' column that came in with the input file,
# so the output holds two row-number columns.
GenBank_phage_matches_with_lineage.to_csv(Path(Data_path) / "GenBank_phage" / "blast_filtered_with_sequences_and_lineage.tsv", 
                                    sep='\t')

## GenBank_viral

In [26]:
GenBank_viral_matches = pd.read_csv(Path(Data_path) / "GenBank_viral" / "blast_filtered_with_sequences.tsv", 
                                    sep='\t')

In [27]:
GenBank_viral_accessions = pd.read_csv(Path(Data_path) / "GenBank_viral" / "protein_ids_accessions.txt", 
                                       sep=';', names=["sseqid", "protein name", "species", "accession_and_coordinates", "taxid"])

In [28]:
GenBank_viral_accessions

Unnamed: 0,sseqid,protein name,species,accession_and_coordinates,taxid
0,>BFL61626.1,hypothetical protein,Moumouvirus lavasanguinem,LC813553.1:820597..821103,3138182
1,>QKU33483.1,hypothetical protein,Tupanvirus deep ocean,MF405918.2:66142..66651,2126984
2,>AVG46464.1,DUF1768 domain-containing,Acanthamoeba polyphaga mimivirus,MG602507.1:847845..848345,212035
3,>AVG47577.1,hypothetical protein,Acanthamoeba polyphaga mimivirus,MG602508.1:855389..855889,212035
4,>AUV58690.1,hypothetical protein,Bandra megavirus,MG779364.1:4137..4637,2071566
...,...,...,...,...,...
110,>APC25795.1,ATP-dependent DNA ligase,Only Syngen Nebraska virus 5,KX857749.1:254843..255739,1917232
111,>AQN68550.1,hypothetical protein,Saudi moumouvirus,KY110734.1:780068..780589,1956188
112,>QKU34715.1,hypothetical protein,Tupanvirus soda lake,KY523104.2:67394..67903,2126985
113,>BBO53960.1,YbiA homolog protein,Abalone asfa-like virus,LC506465.1:6221..6709,2839893


In [29]:
# Keep only the accession: the text before the first ':' in
# 'accession_and_coordinates' (e.g. "LC813553.1:820597..821103" -> "LC813553.1").
GenBank_viral_accessions['accession'] = GenBank_viral_accessions['accession_and_coordinates'].apply(
    lambda field: field.partition(":")[0]
)

In [30]:
# Remove the leading '>' marker from the protein IDs (">BFL61626.1" -> "BFL61626.1")
# so they match the BLAST `sseqid` values used as the merge key. The marker is
# stripped only when it is actually there: the previous unconditional x[1:]
# removed a real character from every ID whenever this cell was executed a
# second time, after which the merge silently lost rows.
GenBank_viral_accessions['sseqid'] = GenBank_viral_accessions['sseqid'].apply(
    lambda x: x[1:] if x.startswith(">") else x
)

In [31]:
GenBank_viral_accessions = GenBank_viral_accessions[['sseqid', 'accession', 'taxid']]

In [32]:
GenBank_viral_accessions.head()

Unnamed: 0,sseqid,accession,taxid
0,BFL61626.1,LC813553.1,3138182
1,QKU33483.1,MF405918.2,2126984
2,AVG46464.1,MG602507.1,212035
3,AVG47577.1,MG602508.1,212035
4,AUV58690.1,MG779364.1,2071566


In [33]:
GenBank_viral_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence
0,pnk,CCV02387.1,1.4e-38,33.333,301,676,4,291,384,663,MEVIKYLKSKSLVDLQLEFSIKSKSYTEGLFVLNYDQIASPKNSIA...
1,pnk,UIB20764.1,1.67e-37,32.423,301,695,2,291,401,682,MKVIEYLKLNGLNDLQSNYNIKVKKYKEEGLIVLNYDQVFSPKNIL...
2,pnk,QEA08271.1,1.6799999999999999e-37,32.423,301,699,2,291,405,686,MKVIEYLKLNGLNDLQSNYNIKVKKYKEEGLIVLNYDQVFSPKNIL...
3,aris,URG13417.1,7.59e-50,40.385,259,250,1,253,1,246,MQALQTKSNIGEMFNIQEKENGEIAISGRELHQALEVKTPYKKWFE...
4,atd1,CAK7596717.1,1.27e-14,46.341,86,266,4,85,185,266,MSYFTNINHSIYYYQSNGYTYVESPWIVDEEISNITKPSDRTNFYV...


In [34]:
GenBank_viral_matches = GenBank_viral_matches.merge(GenBank_viral_accessions, on='sseqid')

In [35]:
GenBank_viral_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence,accession,taxid
0,pnk,CCV02387.1,1.4e-38,33.333,301,676,4,291,384,663,MEVIKYLKSKSLVDLQLEFSIKSKSYTEGLFVLNYDQIASPKNSIA...,HF920637.1,72201
1,pnk,UIB20764.1,1.67e-37,32.423,301,695,2,291,401,682,MKVIEYLKLNGLNDLQSNYNIKVKKYKEEGLIVLNYDQVFSPKNIL...,OK181107.1,2905229
2,pnk,QEA08271.1,1.6799999999999999e-37,32.423,301,699,2,291,405,686,MKVIEYLKLNGLNDLQSNYNIKVKKYKEEGLIVLNYDQVFSPKNIL...,MN081869.1,2594309
3,aris,URG13417.1,7.59e-50,40.385,259,250,1,253,1,246,MQALQTKSNIGEMFNIQEKENGEIAISGRELHQALEVKTPYKKWFE...,ON229621.1,2939125
4,atd1,CAK7596717.1,1.27e-14,46.341,86,266,4,85,185,266,MSYFTNINHSIYYYQSNGYTYVESPWIVDEEISNITKPSDRTNFYV...,OZ003748.1,3072188


In [36]:
# Look up each hit's taxid and store the value returned by
# get_lineage_from_taxid_local (a list of (rank, name) pairs, or an error string)
# in a new 'lineage' column; the 'taxid' column itself is left untouched.
GenBank_viral_matches['lineage'] = GenBank_viral_matches['taxid'].apply(get_lineage_from_taxid_local)

In [37]:
lineage_expanded = GenBank_viral_matches['lineage'].apply(parse_lineage).apply(pd.Series)

In [38]:
lineage_expanded

Unnamed: 0,acellular root,realm,kingdom,phylum,class,order,no rank,family,subfamily,genus,species
0,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,Armadillidium vulgare iridescent virus,Iridoviridae,Betairidovirinae,Iridovirus,Invertebrate iridescent virus 31
1,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,unclassified Iridoviridae,Iridoviridae,,,Cricket iridovirus
2,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,unclassified Iridoviridae,Iridoviridae,,,Iridovirus Liz-CrIV
3,Viruses,Duplodnaviria,Heunggongvirae,,,,unclassified Heunggongvirae,,,,Staphylococcus phage M13
4,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Imitervirales,unclassified Klosneuvirinae,Mimiviridae,Klosneuvirinae,Catovirus,Catovirus sp. 'naegleriensis'
...,...,...,...,...,...,...,...,...,...,...,...
110,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Algavirales,unclassified Chlorovirus,Phycodnaviridae,,Chlorovirus,Chlorovirus sp.
111,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Algavirales,unclassified Chlorovirus,Phycodnaviridae,,Chlorovirus,Paramecium bursaria Chlorella virus NY2B
112,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Algavirales,unclassified Chlorovirus,Phycodnaviridae,,Chlorovirus,Paramecium bursaria Chlorella virus IL-5-2s1
113,Viruses,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Algavirales,unclassified Chlorovirus,Phycodnaviridae,,Chlorovirus,Chlorovirus sp.


In [39]:
GenBank_viral_matches_with_lineage = pd.concat([GenBank_viral_matches.drop(columns=['lineage']), lineage_expanded], axis=1)


In [40]:
GenBank_viral_matches_with_lineage.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,...,realm,kingdom,phylum,class,order,no rank,family,subfamily,genus,species
0,pnk,CCV02387.1,1.4e-38,33.333,301,676,4,291,384,663,...,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,Armadillidium vulgare iridescent virus,Iridoviridae,Betairidovirinae,Iridovirus,Invertebrate iridescent virus 31
1,pnk,UIB20764.1,1.67e-37,32.423,301,695,2,291,401,682,...,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,unclassified Iridoviridae,Iridoviridae,,,Cricket iridovirus
2,pnk,QEA08271.1,1.6799999999999999e-37,32.423,301,699,2,291,405,686,...,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Pimascovirales,unclassified Iridoviridae,Iridoviridae,,,Iridovirus Liz-CrIV
3,aris,URG13417.1,7.59e-50,40.385,259,250,1,253,1,246,...,Duplodnaviria,Heunggongvirae,,,,unclassified Heunggongvirae,,,,Staphylococcus phage M13
4,atd1,CAK7596717.1,1.27e-14,46.341,86,266,4,85,185,266,...,Varidnaviria,Bamfordvirae,Nucleocytoviricota,Megaviricetes,Imitervirales,unclassified Klosneuvirinae,Mimiviridae,Klosneuvirinae,Catovirus,Catovirus sp. 'naegleriensis'


In [41]:
GenBank_viral_matches_with_lineage['source'] = 'GenBank Virus'

In [42]:
GenBank_viral_matches_with_lineage.to_csv(Path(Data_path) / "GenBank_viral" / "blast_filtered_with_sequences_and_lineage.tsv", 
                                    sep='\t')

## IMG_VR

In [43]:
IMG_VR_matches = pd.read_csv(Path(Data_path) / "IMG_VR" / "blast_filtered_with_sequences.tsv", 
                                    sep='\t')

In [44]:
# The UViG identifier is the first '|'-delimited field of the BLAST subject ID;
# it is the key used to join against the IMG/VR taxonomy table.
IMG_VR_matches['UVIG'] = IMG_VR_matches['sseqid'].apply(
    lambda subject_id: subject_id.partition('|')[0]
)

In [45]:
IMG_VR_matches.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence,UVIG
0,pnk,IMGVR_UViG_2974509353_000001|2974509353|297450...,0.0,99.336,301,302,1,301,2,302,MMKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHE...,IMGVR_UViG_2974509353_000001
1,pnk,IMGVR_UViG_2974728631_000001|2974728631|297472...,0.0,98.671,301,302,1,301,2,302,MMKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHE...,IMGVR_UViG_2974728631_000001
2,pnk,IMGVR_UViG_2974740604_000001|2974740604|297474...,0.0,98.339,301,302,1,301,2,302,MMKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHE...,IMGVR_UViG_2974740604_000001
3,pnk,IMGVR_UViG_2974746430_000001|2974746430|297474...,0.0,98.339,301,302,1,301,2,302,MMKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMGHE...,IMGVR_UViG_2974746430_000001
4,pnk,IMGVR_UViG_2706795554_000001|2706795554|270866...,0.0,97.674,301,302,1,301,2,302,MMKKIILTVGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMGHE...,IMGVR_UViG_2706795554_000001


In [46]:
IMG_VR_accessions = pd.read_csv(Path(Data_path) / "IMG_VR" / "taxonomy.txt", 
                                       sep='\t')

In [47]:
IMG_VR_accessions = IMG_VR_accessions[['UVIG', 'Taxonomic classification', 'Host taxonomy prediction']]

In [48]:
IMG_VR_matches_with_taxonomy = IMG_VR_matches.merge(IMG_VR_accessions, on='UVIG')

In [49]:
IMG_VR_matches_with_taxonomy.head()

Unnamed: 0,qseqid,sseqid,evalue,pident,qlen,slen,qstart,qend,sstart,send,sequence,UVIG,Taxonomic classification,Host taxonomy prediction
0,pnk,IMGVR_UViG_2974509353_000001|2974509353|297450...,0.0,99.336,301,302,1,301,2,302,MMKKIILTIGCPGSGKSTWAREFIAKNPGFYNINRDDYRQSIMAHE...,IMGVR_UViG_2974509353_000001,r__Duplodnaviria;k__Heunggongvirae;p__Uroviric...,;;;;;;
1,rna_ligase,IMGVR_UViG_2974509353_000001|2974509353|297450...,0.0,98.396,374,374,1,374,1,374,MQELFNNLMELCKDSQRKFFYSDDVSASGRTYRIFSYNYASYSDWL...,IMGVR_UViG_2974509353_000001,r__Duplodnaviria;k__Heunggongvirae;p__Uroviric...,;;;;;;
2,ipi_,IMGVR_UViG_2974509353_000001|2974509353|297450...,5.15e-61,100.0,95,95,1,95,1,95,MKTFKEFTSTTTPVSTITEATLTSEVIKANKGREGKPMISLVDGEE...,IMGVR_UViG_2974509353_000001,r__Duplodnaviria;k__Heunggongvirae;p__Uroviric...,;;;;;;
3,arn,IMGVR_UViG_2974509353_000001|2974509353|297450...,2.5299999999999998e-57,97.826,92,92,1,92,1,92,MIIDSQSVVQYTIKIDILEKLYKFLPNLYHSIVNELVEELHLENND...,IMGVR_UViG_2974509353_000001,r__Duplodnaviria;k__Heunggongvirae;p__Uroviric...,;;;;;;
4,adfa,IMGVR_UViG_2974509353_000001|2974509353|297450...,2.34e-92,67.327,212,208,11,212,13,208,MLYQMHKCKDTYKYKGAQCYIINRENAGPGHSHQSRFVFVKNNEII...,IMGVR_UViG_2974509353_000001,r__Duplodnaviria;k__Heunggongvirae;p__Uroviric...,;;;;;;


In [50]:
IMG_VR_matches_with_taxonomy.to_csv(Path(Data_path) / "IMG_VR" / "blast_filtered_with_sequences_and_taxonomy.tsv", 
                                    sep='\t')

In [51]:
# Rank columns produced for every IMG/VR hit, in output order.
ranks = [
    "realm",
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species"]

# One-letter prefix used in the taxonomy strings (e.g. "r__Duplodnaviria") -> rank name.
# Built once here instead of on every loop iteration inside parse_taxonomy.
_PREFIX_TO_RANK = {
    'r': 'realm',
    'k': 'kingdom',
    'p': 'phylum',
    'c': 'class',
    'o': 'order',
    'f': 'family',
    'g': 'genus',
    's': 'species'
}

# Split taxonomy strings into a dictionary per row
def parse_taxonomy(tax_str):
    """Parse a ';'-separated, prefix-tagged taxonomy string into a rank dict.

    Args:
        tax_str: string such as ``"r__Duplodnaviria;k__Heunggongvirae;..."``.
            Each field is ``<prefix>__<value>``; fields without ``__`` or with
            an unknown prefix are ignored.

    Returns:
        dict with one key per entry of `ranks` (always all of them, in that
        order). The value is the taxon name, or None when the rank is absent
        or its value is empty. A non-string input (None, NaN from a missing
        classification) yields a dict of all None instead of raising.
    """
    rank_map = {rank: None for rank in ranks}
    # Missing classifications arrive as NaN/None; calling .split on them would raise.
    if not isinstance(tax_str, str):
        return rank_map
    for part in tax_str.split(';'):
        if '__' in part:
            prefix, value = part.split('__', 1)
            # Tolerate stray whitespace around the prefix (e.g. after a ';').
            rank_key = _PREFIX_TO_RANK.get(prefix.strip())
            if rank_key:
                rank_map[rank_key] = value if value else None
    return rank_map

# Apply and expand into columns: parse every 'Taxonomic classification' string
# into a rank -> value dict, then spread those dicts into one column per rank.
tax_df = IMG_VR_matches_with_taxonomy['Taxonomic classification'].apply(parse_taxonomy).apply(pd.Series)

# Combine with original dataframe (column-wise; both frames share the same index).
# NOTE(review): the result is bound back to IMG_VR_matches_with_taxonomy, so
# re-running this cell appends a second copy of the rank columns.
IMG_VR_matches_with_taxonomy = pd.concat([IMG_VR_matches_with_taxonomy, tax_df], axis=1)
# Tag every row with the database it came from (used when the sources are merged).
IMG_VR_matches_with_taxonomy['source'] = 'IMG_VR'

## Merge results

In [52]:
IMG_list = ['qseqid', 'sseqid', 'evalue', 'pident', 'qlen', 'slen', 'qstart',
            'qend', 'sstart', 'send', 'sequence', 'realm', 'kingdom', 'phylum', 
            'class', 'order', 'family', 'genus', 'species', 'source']

In [53]:
IMG_VR_matches_with_taxonomy = IMG_VR_matches_with_taxonomy[IMG_list]

In [54]:
GenBank_viral_matches_with_lineage = GenBank_viral_matches_with_lineage[IMG_list]

In [55]:
GenBank_phage_matches_with_lineage = GenBank_phage_matches_with_lineage[IMG_list]

In [57]:
# Stack the three per-database tables, which were all reduced to the IMG_list columns.
# ignore_index=True gives the combined table a fresh 0..n-1 index. Without it
# each source keeps its own 0-based labels, so the index — and the first column
# of the TSV written from this frame — contains the same label several times.
all_ADP_homologs = pd.concat(
    [IMG_VR_matches_with_taxonomy, GenBank_viral_matches_with_lineage, GenBank_phage_matches_with_lineage],
    ignore_index=True,
)

In [58]:
all_ADP_homologs.to_csv(Path(Data_path) / "all_ADP_homologs.tsv", 
                        sep="\t")

In [59]:
Data_path

'/nfs/research/rdf/kam/projects/EVADES_final-2025-10-09/analyses/ADP_homologs/'