In [2]:
import pandas as pd
import bte
import numpy as np
from collections import defaultdict, Counter

**This notebook traverses the full UShER tree to identify molnupiravir-associated (MOV) mutational events. MOV clusters are defined by their root: the earliest node with class LLRs > 3 and sum_contexts > 2 whose parent does not exhibit a MOV-like signature. Cluster sizes are computed with a postorder traversal, while isolated MOV-positive leaves are classified as singletons.**

**Identified MOV clusters are then merged with Metafitch metadata to assign inferred country and year to both internal nodes and descendant leaves. The resulting MOV annotation table is then saved.**

In [3]:
# Load the full UShER tree from its protobuf file (the recorded run took ~83 s).
tree = bte.MATree(pb_file="/Users/reem/final_tree.pb")

Finished 'from_pb' in 82.7371 seconds


In [4]:
# Node table: bte nodes with sum_llrs, sum_contexts and Metafitch metadata.
df = pd.read_csv(
    "/Users/reem/merged_bte_metafitch.tsv",
    sep="\t",
)
df.head()

Unnamed: 0,node_id,parent_id,children,mutations,num_mutations,level,branch_length,subs,Counts,LLR,...,AtoG_counts,A>G_llr,CtoT_counts,C>T_llr,TtoC_counts,T>C_llr,sum_llrs,sum_contexts,country,Year
0,CHN/YN-0306-466/2020|MT396241.1|2020-03-06,node_1,[],'G15910T',1,2,1.0,G>T,{'G>T': 1},-1.50973,...,Counter(),0.0,Counter(),0.0,Counter(),0.0,-1.50973,0.0,China,2020
1,DP0803|LC571037.1|2020-02-17,node_1,[],'G4162T',1,2,1.0,G>T,{'G>T': 1},-1.50973,...,Counter(),0.0,Counter(),0.0,Counter(),0.0,-1.50973,0.0,Unknown,2020
2,node_2,node_1,['England/LEED-2A8B52/2020|OA971832.1|2020-04-...,'T13090C',1,2,1.0,T>C,{'T>C': 1},-0.528901,...,Counter(),0.0,Counter(),0.0,Counter({'A[T>C]G': 1}),-0.310864,-0.839765,-0.310864,England,2020
3,England/LEED-2A8B52/2020|OA971832.1|2020-04-04,node_2,[],"'C1191T', 'C11674T'",2,3,2.0,"C>T,C>T",{'C>T': 2},0.87236,...,Counter(),0.0,"Counter({'C[C>T]A': 1, 'A[C>T]T': 1})",-0.267798,Counter(),0.0,0.604561,-0.267798,England,2020
4,England/SHEF-C06CE/2020|OY362267.1|2020-03-25,node_2,[],'T15821C',1,3,1.0,T>C,{'T>C': 1},-0.528901,...,Counter(),0.0,Counter(),0.0,Counter({'A[T>C]G': 1}),-0.310864,-0.839765,-0.310864,England,2020


In [5]:
# Number of rows in the node table.
df.shape[0]

8884126

In [5]:
# Table of MOV candidate nodes; the cells below read its node_id, LLR and
# sum_contexts columns.
# NOTE(review): the file name suggests it is pre-filtered to LLR > 3 and
# sum_contexts > 2 — confirm against the script that produced it.
df_mov = pd.read_csv(
    "/Users/reem/mov_llr_context_3_2.tsv",
    sep="\t",
)
df_mov.shape[0]

1024

In [7]:
# Map every MOV candidate node id to its (LLR, sum_contexts) pair. Despite the
# name, mov_set is a dict; is_mov_event is the bare id set used for membership
# tests during the two traversals below.
mov_set = dict(zip(df_mov["node_id"], zip(df_mov["LLR"], df_mov["sum_contexts"])))
is_mov_event = set(mov_set)

mov_count = {}
leaf_count = {}
mov_cluster_roots = []  # never populated in this cell; kept so the name stays defined

# Pass 1: per-node counts. Children must be visited before their parent, which
# is what reverse=True is relied on for here (presumably a postorder traversal —
# confirm against the bte documentation).
#   leaf_count[n]: 1 for a leaf, otherwise the sum of its children's leaf_count.
#   mov_count[n]:  for a MOV event, its own leaf_count (every descendant leaf is
#                  counted, on the assumption that a MOV event likely leads to
#                  MOV descendants); for any other internal node, the sum of its
#                  children's mov_count; for a non-MOV leaf, 0.
# TODO: the original author flagged this counting as "not working properly";
# the concern is not described further and remains unresolved.
for node in tree.depth_first_expansion(reverse=True):
    nid = node.id
    if node.is_leaf():
        leaf_count[nid] = 1
        mov_count[nid] = 1 if nid in is_mov_event else 0
        continue

    children = node.children
    leaf_count[nid] = sum(leaf_count[child.id] for child in children)
    if nid in is_mov_event:
        mov_count[nid] = leaf_count[nid]
    else:
        mov_count[nid] = sum(mov_count[child.id] for child in children)


# Pass 2: collect cluster roots and singletons as (node, size) pairs.
# A node qualifies when it is a MOV event and its parent is not one (or it has
# no parent), which keeps a MOV child directly below a MOV parent from being
# reported as a separate cluster.
# NOTE(review): only the direct parent is checked. A MOV node whose parent is
# non-MOV but whose more distant ancestor is MOV is still reported as its own
# root, and its leaves are also included in that ancestor's size — confirm this
# is intended.
mov_clusters = []

for node in tree.depth_first_expansion():
    nid = node.id

    if nid not in is_mov_event:
        continue

    parent = node.parent
    if parent is not None and parent.id in is_mov_event:
        continue

    # Because nid is a MOV event, mov_count[nid] equals leaf_count[nid] here.
    mov_clusters.append((node, mov_count[nid]))

mov_clusters

[(id: Tanzania/NIMR-R40128/2021|EPI_ISL_19837217|2021-11-04
  level: 26
  parent: node_24833
  children: []
  mutations: ['C1758T', 'C7600T', 'G8984A', 'C11200T', 'C16667T', 'C17074T', 'C17251T', 'C23679T', 'C26250T', 'C27874T', 'C29555T', 'T29581C', 'C29708T']
  annotations: ['', '']
  branch length: 13.0,
  1),
 (id: India/KA-042197_2_S61_R1_001/2020|EPI_ISL_1708373|2020-09-28
  level: 13
  parent: node_243021
  children: []
  mutations: ['C8299T', 'C10369T', 'C10714T', 'C12528T', 'C14576T', 'C14649T', 'C25539T', 'C26028T', 'C28969T']
  annotations: ['', '']
  branch length: 9.0,
  1),
 (id: Mexico/CMX-INER-IBT-0454/2020|EPI_ISL_3758018|2020-09-22
  level: 14
  parent: node_243348
  children: []
  mutations: ['C676T', 'C1387T', 'C5986T', 'G8990A', 'C9487T', 'C10790T', 'T15801C', 'G20788A', 'G22352A', 'G23821A']
  annotations: ['', '']
  branch length: 10.0,
  1),
 (id: USA/MA-CDC-LC0049944/2021|MZ140843.1|2021-04-19
  level: 24
  parent: node_254503
  children: []
  mutations: ['C211

In [8]:
# Ten largest clusters: sort ascending by size, then reverse. Reversing the
# ascending sort (rather than sorting with reverse=True) keeps the original
# ordering among clusters of equal size.
list(reversed(sorted(mov_clusters, key=lambda entry: entry[1])))[:10]

[(id: node_1793301
  level: 56
  parent: node_1793300
  children: ['node_1793302', 'Australia/NSW-SAVID-9682/2022|EPI_ISL_15081448|2022-08-22']
  mutations: ['C2595T', 'T3607C', 'C4464T', 'C6525T', 'G9092A', 'G11272A', 'G12067A', 'C12784T', 'G14430A', 'C14605T', 'G18589A', 'G18712A', 'G20839A', 'A22124G', 'C23170T', 'T25735C', 'A26169G', 'C28344T', 'C28697T', 'C29098T']
  annotations: ['', '']
  branch length: 20.0,
  20),
 (id: node_2172435
  level: 62
  parent: node_2172434
  children: ['node_2172436', 'Australia/ACT10023/2022|EPI_ISL_16359724|2022-12-15']
  mutations: ['G509A', 'G611A', 'A1802G', 'G1868A', 'C2881T', 'G3179A', 'G3287A', 'T5980C', 'T7120C', 'G8785A', 'G13897A', 'C15633T', 'G17027A', 'T17955C', 'G20247A', 'G21830A', 'A23973G', 'A24900T', 'A25980G', 'C27573T', 'C27812T', 'G28202A', 'C29065T', 'C29119T']
  annotations: ['', '']
  branch length: 24.0,
  14),
 (id: node_2176874
  level: 62
  parent: node_2176869
  children: ['NewZealand/23YA0642/2023|EPI_ISL_16953828|2023-

In [21]:
# Create dataframe of clusters: one row per (node, size) pair in mov_clusters,
# ordered from the largest cluster to the smallest.
# mov_set[node.id] is the (LLR, sum_contexts) pair; sum_llrs is their sum.
mov_clusters_df = pd.DataFrame(
    [
        {
            "node_id": node.id,
            "Children": [child.id for child in node.children],
            "Mutations": node.mutations,
            "Cluster size": count,
            "LLR": mov_set[node.id][0],
            "sum_contexts": mov_set[node.id][1],
            "sum_llrs": mov_set[node.id][0] + mov_set[node.id][1],
        }
        for node, count in mov_clusters
    ],
    # Explicit column list fixes the column order (and keeps the columns
    # present even when mov_clusters is empty).
    columns=[
        "node_id",
        "Children",
        "Mutations",
        "Cluster size",
        "LLR",
        "sum_contexts",
        "sum_llrs",
    ],
).sort_values(by="Cluster size", ascending=False)
mov_clusters_df.head()

Unnamed: 0,node_id,Children,Mutations,Cluster size,LLR,sum_contexts,sum_llrs
615,node_1793301,"[node_1793302, Australia/NSW-SAVID-9682/2022|E...","[C2595T, T3607C, C4464T, C6525T, G9092A, G1127...",20,5.98313,7.843964,13.827095
802,node_2172435,"[node_2172436, Australia/ACT10023/2022|EPI_ISL...","[G509A, G611A, A1802G, G1868A, C2881T, G3179A,...",14,4.446203,2.826779,7.272982
804,node_2176874,[NewZealand/23YA0642/2023|EPI_ISL_16953828|202...,"[G1212A, C3634T, G5326A, C5893T, G10523A, G118...",13,3.625232,5.083141,8.708374
411,node_1472129,"[node_1472130, Australia/ACT8719/2022|EPI_ISL_...","[G816A, A895G, G1325A, C1454T, C1549T, G4124A,...",9,13.332206,3.851261,17.183468
932,node_2487450,"[node_2487451, node_2487454]","[C884T, G1462A, G1823A, G5180A, G5612A, C7318T...",9,6.060567,3.175023,9.23559


In [22]:
# Metafitch output: one row per strain/node with its country and Year.
metafitch_out = pd.read_csv(
    "/Users/reem/full_metafitch_output.tsv",
    sep="\t",
)
metafitch_out.head()

Unnamed: 0,strain,country,Year
0,node_1,"Philippines, Cote d'Ivoire, Viet Nam, Namibia,...",0
1,Guangzhou/GZ8H0013/2020|CNA0013706|2020-01-28,Unknown,2020
2,Guangzhou/GZ8H0014/2020|CNA0013707|2020-01-28,Unknown,2020
3,FRA/22022100202/2022|ON333670.1|2022-01-11,France,2022
4,USA/TX-CDC-ASC210033573/2021|MZ196053.1|2021-0...,USA,2021


In [23]:
# Rename the "strain" key column to "node_id" so that it matches the key used
# by mov_clusters_df and the two tables can be merged on it.
metafitch_out = metafitch_out.rename(columns={"strain": "node_id"})
metafitch_out.head()

Unnamed: 0,node_id,country,Year
0,node_1,"Philippines, Cote d'Ivoire, Viet Nam, Namibia,...",0
1,Guangzhou/GZ8H0013/2020|CNA0013706|2020-01-28,Unknown,2020
2,Guangzhou/GZ8H0014/2020|CNA0013707|2020-01-28,Unknown,2020
3,FRA/22022100202/2022|ON333670.1|2022-01-11,France,2022
4,USA/TX-CDC-ASC210033573/2021|MZ196053.1|2021-0...,USA,2021


In [24]:
# Left-join country and Year onto the cluster table by node_id; clusters with
# no Metafitch row keep missing values in those two columns.
# NOTE(review): this cell rebinds mov_clusters_df to the merged result, so
# running it a second time merges again and would produce suffixed duplicate
# country/Year columns — re-run from the cell that builds mov_clusters_df.
mov_clusters_df = mov_clusters_df.merge(
    metafitch_out[["node_id", "country", "Year"]],
    how="left",
    on="node_id",
)
mov_clusters_df.head()

Unnamed: 0,node_id,Children,Mutations,Cluster size,LLR,sum_contexts,sum_llrs,country,Year
0,node_1793301,"[node_1793302, Australia/NSW-SAVID-9682/2022|E...","[C2595T, T3607C, C4464T, C6525T, G9092A, G1127...",20,5.98313,7.843964,13.827095,Australia,2022
1,node_2172435,"[node_2172436, Australia/ACT10023/2022|EPI_ISL...","[G509A, G611A, A1802G, G1868A, C2881T, G3179A,...",14,4.446203,2.826779,7.272982,Australia,2022
2,node_2176874,[NewZealand/23YA0642/2023|EPI_ISL_16953828|202...,"[G1212A, C3634T, G5326A, C5893T, G10523A, G118...",13,3.625232,5.083141,8.708374,New Zealand,2023
3,node_1472129,"[node_1472130, Australia/ACT8719/2022|EPI_ISL_...","[G816A, A895G, G1325A, C1454T, C1549T, G4124A,...",9,13.332206,3.851261,17.183468,Australia,2022
4,node_2487450,"[node_2487451, node_2487454]","[C884T, G1462A, G1823A, G5180A, G5612A, C7318T...",9,6.060567,3.175023,9.23559,USA,2025


In [25]:
# Number of rows whose cluster size is exactly 1 (singletons).
int((mov_clusters_df["Cluster size"] == 1).sum())

852

In [None]:
# Fix year column: replace missing years with 0, then cast the column to int.
# NOTE(review): this cell has no execution count and its recorded output lacks
# the LLR / sum_contexts columns, so the displayed table appears to predate the
# current code — re-run to refresh it.
year_without_gaps = mov_clusters_df["Year"].fillna(0)
mov_clusters_df["Year"] = year_without_gaps.astype(int)
mov_clusters_df.head()

Unnamed: 0,node_id,Children,Mutations,Cluster size,sum_llrs,country,Year
0,node_1793301,[id: node_1793302\nlevel: 57\nparent: node_179...,"[C2595T, T3607C, C4464T, C6525T, G9092A, G1127...",20,13.827095,Australia,2022
1,node_2172435,[id: node_2172436\nlevel: 63\nparent: node_217...,"[G509A, G611A, A1802G, G1868A, C2881T, G3179A,...",14,7.272982,Australia,2022
2,node_2176874,[id: NewZealand/23YA0642/2023|EPI_ISL_16953828...,"[G1212A, C3634T, G5326A, C5893T, G10523A, G118...",13,8.708374,New Zealand,2023
3,node_2987982,[id: node_2987983\nlevel: 36\nparent: node_298...,"[C222T, C2295T, C5392T, C7125T, C7765T, G10870...",10,6.849888,India,2021
4,node_2487450,[id: node_2487451\nlevel: 80\nparent: node_248...,"[C884T, G1462A, G1823A, G5180A, G5612A, C7318T...",9,9.23559,USA,2025


In [26]:
# Save the annotated MOV table as a tab-separated file, without the row index.
mov_clusters_df.to_csv(
    "/Users/reem/all_mov_bte_final.tsv",
    sep="\t",
    index=False,
)