# Run Variant List Optimizer

In [1]:
import sys

import ray
from ray.job_config import JobConfig

import pandas as pd

In [2]:
# Degree of parallelism handed to the optimizer calls below through `par=`.
# The value is specific to the machine this notebook was run on; adjust elsewhere.
num_cpus = 37 # usable cpus on BII nodes on rivanna

In [3]:
# Checkout of the variant-list-optimizer package; it must be importable both in
# this kernel (sys.path) and in the Ray job (code_search_path).
code_dir = "/home/pb5gj/variant-list-optimizer"

# Avoid stacking duplicate entries when this cell is re-executed.
if code_dir not in sys.path:
    sys.path.append(code_dir)

# ignore_reinit_error=True makes the cell safe to re-run in a live kernel:
# without it, a second ray.init() call raises instead of reusing the
# already-started local instance.
ray.init(
    job_config=JobConfig(code_search_path=[code_dir]),
    ignore_reinit_error=True,
)

2024-04-18 14:20:07,543	INFO worker.py:1743 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265


0,1
Python version:,3.11.8
Ray version:,2.10.0
Dashboard:,http://127.0.0.1:8265


In [4]:
# Import exactly the helpers this notebook uses instead of `import *`, so the
# origin of every name is visible and nothing unexpected enters the namespace.
# Must run after the cell that appends the package directory to sys.path.
from variant_list_optimizer import (
    get_distance_badness,
    get_time_importance,
    make_nearest_included_ancestor,
    make_variant_tree,
    optimize_beam_search,
    optimize_greedy,
    prune_tree,
)

In [5]:
# Path of the long-format, per-state variant counts CSV read in the next cell.
oi_fname = (
    "/project/nssac_covid19/COVID-19_commons/products/"
    "external_data_collection/variants/"
    "outbreak_info_variants_states_long.csv"
)

In [6]:
# Only these columns of the CSV are needed; `date` is parsed to datetimes.
usecols = ["date", "lineage", "fips", "lineage_count"]
parse_dates = ["date"]

# Read `fips` explicitly as text. The later filter compares against string
# codes ("42", "54", ...) and the column holds zero-padded and non-numeric
# values ("04", "US"); relying on dtype inference would silently produce an
# empty selection if the column were ever parsed as integers.
oi_df = pd.read_csv(
    oi_fname,
    usecols=usecols,
    parse_dates=parse_dates,
    dtype={"fips": str},
    low_memory=False,
)

# Rename to the column names used by the rest of the notebook.
oi_df = oi_df.rename(columns={
    "lineage": "variant",
    "lineage_count": "weight"
})

# Rows without a date or a variant name cannot be used; drop them.
oi_df = oi_df.dropna(subset=["date", "variant"])

In [7]:
# Preview ten random rows; the fixed seed keeps the preview identical across
# Restart & Run All instead of changing on every execution.
oi_df.sample(10, random_state=0)

Unnamed: 0,weight,date,variant,fips
478533,1,2021-04-20,B.1.351,34
352460,1,2022-12-08,BQ.1.2,24
32992,1,2024-02-07,JN.1.4.6,04
87518,1,2023-02-24,BQ.1.1.7,06
676886,3,2021-08-28,AY.3.1,47
115508,1,2023-09-06,XBB.1.16.3,06
890510,8,2020-04-10,B.1.371,US
440288,22,2020-11-18,B.1.2,30
737315,2,2020-08-10,B.1.1.1,49
927065,49,2022-10-20,BA.5.2.9,US


In [8]:
# Restrict to the five region codes of interest and keep only the columns
# passed on to the tree builder.
# (State FIPS: 42 = PA, 54 = WV, 51 = VA, 10 = DE, 24 = MD.)
region_fips = ["42", "54", "51", "10", "24"]
in_region = oi_df["fips"].isin(region_fips)
data = oi_df.loc[in_region, ["date", "variant", "weight"]]

In [9]:
# Reference date for the analysis: used below as the exclusive upper bound on
# `data.date` and passed to make_variant_tree.
ref_time = pd.to_datetime("2024-03-01")

In [10]:
# Keep only observations dated strictly before the reference date.
before_ref = data["date"] < ref_time
data = data.loc[before_ref]

In [11]:
# Build the two scoring objects used downstream: `time_importance` is passed to
# make_variant_tree, `distance_badness` to the optimizers and to the badness
# evaluation cells.
# NOTE(review): the numeric arguments are not explained in this notebook —
# presumably shape/scale parameters of the weighting curves; confirm against
# variant_list_optimizer before changing them.
time_importance = get_time_importance(1.0, 0.1)
distance_badness = get_distance_badness(1.0, 10.0)

In [12]:
# Build the variant tree from the filtered observations, the reference date and
# the time-importance object created above.
tree = make_variant_tree(data, ref_time, time_importance)

In [13]:
# Size of the full tree as (edges, nodes); a tree should have nodes - 1 edges.
tree.number_of_edges(), tree.number_of_nodes()

(1896, 1897)

In [14]:
# Pruned copy of the tree; this is what the optimizers search over, while the
# badness evaluation cells below still score against the full `tree`.
pruned_tree = prune_tree(tree)

In [15]:
# Size of the pruned tree as (edges, nodes), for comparison with the full tree.
pruned_tree.number_of_edges(), pruned_tree.number_of_nodes()

(1815, 1816)

In [16]:
%%time

# Greedy optimization on the pruned tree with a target of 20 and `num_cpus`
# passed as `par`.
# NOTE(review): the recorded run printed 19 "Selected" lines for a target of
# 20 — presumably one node (e.g. the root) is included implicitly; confirm.
greedy_opt = optimize_greedy(20, pruned_tree, distance_badness, par=num_cpus)

Selected BA.5.2.60
Selected BA.2.42
Selected BA.1.10
Selected BA.1.14.1
Selected XBB.1.44.1
Selected B.1.1
Selected BA.5.2.4
Selected BQ.1.1.10
Selected BA.2.9.1
Selected BA.5.1.4
Selected BA.2.49
Selected B.1.561
Selected B.1.349
Selected BF.7
Selected BA.2.3.20
Selected BA.2.36
Selected BF.2
Selected B.1.427
Selected XBB.1.5.7
CPU times: user 10.3 s, sys: 783 ms, total: 11.1 s
Wall time: 16.6 s


In [17]:
# Score the greedy selection: build the nearest-included-ancestor mapping on
# the full (unpruned) tree and add up the per-entry badness.
# NOTE(review): the optimizer ran on `pruned_tree` but scoring uses `tree` —
# presumably intentional so that every variant is counted; confirm.
# NOTE(review): the recorded greedy total (~1.7e6) is roughly 400x worse than
# the beam-search totals (~4e3) recorded below; worth checking optimize_greedy.
v_nia = make_nearest_included_ancestor(greedy_opt, tree, distance_badness)
greedy_badness = sum(entry.badness for entry in v_nia.values())
greedy_badness

1727142.1444598727

In [18]:
%%time

# Beam search on the pruned tree with a target of 20 and second argument 10
# (presumably the beam width — confirm against optimize_beam_search).
beam10_opt = optimize_beam_search(20, 10, pruned_tree, distance_badness, par=num_cpus)

cur_size=2
cur_size=3
cur_size=4
cur_size=5
cur_size=6
cur_size=7
cur_size=8
cur_size=9
cur_size=10
cur_size=11
cur_size=12
cur_size=13
cur_size=14
cur_size=15
cur_size=16
cur_size=17
cur_size=18
cur_size=19
cur_size=20
CPU times: user 13.5 s, sys: 1.19 s, total: 14.7 s
Wall time: 1min 11s


In [19]:
# Score the beam10 selection against the full (unpruned) tree by summing the
# badness of every nearest-included-ancestor entry.
v_nia = make_nearest_included_ancestor(beam10_opt, tree, distance_badness)
beam10_badness = sum(entry.badness for entry in v_nia.values())
beam10_badness

4308.894213959181

In [20]:
# Display the variants selected by the beam10 run.
beam10_opt

frozenset({'BA.2',
           'BQ.1',
           'CH.1.1',
           'EG.5.1',
           'FL.1.5',
           'GE.1.2.1',
           'GJ.1.2',
           'HV.1',
           'JD.1.1',
           'JN.1',
           'JN.1.11.1',
           'JN.1.4',
           'JN.1.8.1',
           'SARS-CoV-2',
           'XBB',
           'XBB.1',
           'XBB.1.16',
           'XBB.1.5.70',
           'XBB.1.9.1',
           'XBB.2.3'})

In [21]:
%%time

# Same beam search as above with the second argument raised to 20
# (presumably the beam width — confirm against optimize_beam_search).
beam20_opt = optimize_beam_search(20, 20, pruned_tree, distance_badness, par=num_cpus)

cur_size=2
cur_size=3
cur_size=4
cur_size=5
cur_size=6
cur_size=7
cur_size=8
cur_size=9
cur_size=10
cur_size=11
cur_size=12
cur_size=13
cur_size=14
cur_size=15
cur_size=16
cur_size=17
cur_size=18
cur_size=19
cur_size=20
CPU times: user 17.1 s, sys: 1.29 s, total: 18.4 s
Wall time: 2min 9s


In [22]:
# Score the beam20 selection against the full (unpruned) tree by summing the
# badness of every nearest-included-ancestor entry.
v_nia = make_nearest_included_ancestor(beam20_opt, tree, distance_badness)
beam20_badness = sum(entry.badness for entry in v_nia.values())
beam20_badness

3934.973136476179

In [23]:
# Display the variants selected by the beam20 run.
beam20_opt

frozenset({'B.1.1.529',
           'BA.2.86.1',
           'BQ.1',
           'CH.1.1',
           'EG.5.1',
           'FL.1.5',
           'GE.1.2.1',
           'GJ.1.2',
           'HV.1',
           'JD.1.1',
           'JN.1',
           'JN.1.11.1',
           'JN.1.4',
           'JN.1.8.1',
           'SARS-CoV-2',
           'XBB.1',
           'XBB.1.16',
           'XBB.1.5.70',
           'XBB.1.9.1',
           'XBB.2.3'})

In [24]:
%%time

# Same beam search as above with the second argument raised to 30
# (presumably the beam width — confirm against optimize_beam_search).
beam30_opt = optimize_beam_search(20, 30, pruned_tree, distance_badness, par=num_cpus)

cur_size=2
cur_size=3
cur_size=4
cur_size=5
cur_size=6
cur_size=7
cur_size=8
cur_size=9
cur_size=10
cur_size=11
cur_size=12
cur_size=13
cur_size=14
cur_size=15
cur_size=16
cur_size=17
cur_size=18
cur_size=19
cur_size=20
CPU times: user 21.4 s, sys: 1.66 s, total: 23.1 s
Wall time: 3min 9s


In [25]:
# Score the beam30 selection against the full (unpruned) tree by summing the
# badness of every nearest-included-ancestor entry.
# In the recorded run this total equals the beam20 total (3934.97...), i.e.
# raising the second argument from 20 to 30 did not change the result.
v_nia = make_nearest_included_ancestor(beam30_opt, tree, distance_badness)
beam30_badness = sum(entry.badness for entry in v_nia.values())
beam30_badness

3934.973136476179

In [26]:
# Display the variants selected by the beam30 run.
beam30_opt

frozenset({'B.1.1.529',
           'BA.2.86.1',
           'BQ.1',
           'CH.1.1',
           'EG.5.1',
           'FL.1.5',
           'GE.1.2.1',
           'GJ.1.2',
           'HV.1',
           'JD.1.1',
           'JN.1',
           'JN.1.11.1',
           'JN.1.4',
           'JN.1.8.1',
           'SARS-CoV-2',
           'XBB.1',
           'XBB.1.16',
           'XBB.1.5.70',
           'XBB.1.9.1',
           'XBB.2.3'})