In [None]:
import os
from Bio import SeqIO
import matplotlib.pyplot as plt
import numpy as np
from math import sqrt
from tqdm import tqdm
from multiprocessing import Pool
import seaborn as sns

# Code for generating the containment index figures
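**NOTE:** All file paths have been anonymized: the real data root is replaced by the placeholder `path_prefix`, which must be changed to point to your own data before running the notebook.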

In [None]:
# Experiment names; each one is used as a sub-directory of the data root and as the figure title.
EXPERIMENTS = ["baseline", "country", "state"]

# Method identifiers, in the order used for the heatmap rows and columns.
# Every identifier must match a "{method}_metadata.tsv" or "{method}.tsv" file per experiment.
METHODS = [
    "all", "centroid", "ggrasp", "vlq",
    "meshclust-0.95", "meshclust-0.99",
    "gclust-0.95", "gclust-0.99", "gclust-0.999",
    "vsearch-0.95", "vsearch-0.99", "vsearch-0.999",
    "single-linkage-1", "single-linkage-5", "single-linkage-10", "single-linkage-25",
    "single-linkage-50", "single-linkage-90", "single-linkage-99",
    "complete-linkage-1", "complete-linkage-5", "complete-linkage-10", "complete-linkage-25",
    "complete-linkage-50", "complete-linkage-90", "complete-linkage-99",
]

# Tick labels shown on the heatmap, keyed by method identifier (same order as METHODS).
METHOD_LABELS = {
    "all": "All", "centroid": "Centroid", "ggrasp": "GGRaSP", "vlq": "VLQ",
    # meshclust
    "meshclust-0.95": "MC-0.95", "meshclust-0.99": "MC-0.99",
    # gclust
    "gclust-0.95": "GC-0.95", "gclust-0.99": "GC-0.99", "gclust-0.999": "GC-0.999",
    # vsearch
    "vsearch-0.95": "VS-0.95", "vsearch-0.99": "VS-0.99", "vsearch-0.999": "VS-0.999",
    # single linkage
    "single-linkage-1": r"SL-$P_{1}$", "single-linkage-5": r"SL-$P_{5}$",
    "single-linkage-10": r"SL-$P_{10}$", "single-linkage-25": r"SL-$P_{25}$",
    "single-linkage-50": r"SL-$P_{50}$", "single-linkage-90": r"SL-$P_{90}$",
    "single-linkage-99": r"SL-$P_{99}$",
    # complete linkage
    "complete-linkage-1": r"CL-$P_{1}$", "complete-linkage-5": r"CL-$P_{5}$",
    "complete-linkage-10": r"CL-$P_{10}$", "complete-linkage-25": r"CL-$P_{25}$",
    "complete-linkage-50": r"CL-$P_{50}$", "complete-linkage-90": r"CL-$P_{90}$",
    "complete-linkage-99": r"CL-$P_{99}$",
}

# Significance threshold handed to BH_adjust for the adjusted p-values.
ALPHA = 0.05

In [None]:
def determine_nonsingletons(experiment):
    """
    List the lineages that occur on more than one row of an experiment's full metadata table.

    Parameters:
    -----------
    experiment: str
        Experiment name; the table is read from "path_prefix/{experiment}/all_metadata.tsv".
        The first line is treated as a header and skipped.

    Returns:
    --------
    list of str
        Sorted values of tab-separated column 11 (the lineage field) that appear at least twice.
    """
    metadata_path = f"path_prefix/{experiment}/all_metadata.tsv"
    occurrences = {}
    with open(metadata_path, "r") as handle:
        next(handle)  # skip the header row
        for row in handle:
            lineage = row.strip().split("\t")[11]
            occurrences[lineage] = occurrences.get(lineage, 0) + 1
    return sorted(lineage for lineage, seen in occurrences.items() if seen > 1)

def determine_content(experiment, method, nonsingletons):
    """
    Count, per nonsingleton lineage, the rows a method's table contains and collect their ids.

    The table is read from "path_prefix/{experiment}/{method}_metadata.tsv"; if that file does
    not exist, "path_prefix/{experiment}/{method}.tsv" is used instead. The first line is
    treated as a header and skipped.

    Parameters:
    -----------
    experiment: str
        Experiment name (sub-directory of the data root).
    method: str
        Method identifier used to build the file name.
    nonsingletons: sequence of str
        Lineages to count; rows whose lineage is not listed are ignored.

    Returns:
    --------
    counts: numpy.ndarray
        Integer vector aligned with `nonsingletons`; counts[k] is the number of rows whose
        column 11 equals nonsingletons[k].
    items: set of str
        Column-0 values (ids) of all counted rows.
    """
    filepath = f"path_prefix/{experiment}/{method}_metadata.tsv"
    if not os.path.exists(filepath):
        filepath = f"path_prefix/{experiment}/{method}.tsv"

    # Build the lineage -> position lookup once so each row costs O(1) instead of two
    # linear list scans. setdefault keeps the first position, exactly like list.index.
    position_of = {}
    for position, lineage in enumerate(nonsingletons):
        position_of.setdefault(lineage, position)

    counts = np.zeros(len(nonsingletons), dtype=int)
    items = set()

    with open(filepath, "r") as f_in:
        next(f_in)  # skip the header row
        for line in f_in:
            fields = line.strip().split("\t")
            cur_id = fields[0]
            idx = position_of.get(fields[11])
            if idx is not None:
                counts[idx] += 1
                items.add(cur_id)

    return counts, items

def calculate_CI(refset1, refset2):
    """
    Compute both containment indices between two reference sets.

    Returns a two-element list: the shared-item count divided by the size of `refset1`,
    followed by the shared-item count divided by the size of `refset2`.
    Raises ZeroDivisionError if either set is empty.
    """
    shared = len(refset1.intersection(refset2))
    relative_to_first = shared / len(refset1)
    relative_to_second = shared / len(refset2)
    return [relative_to_first, relative_to_second]

def simulate(N, n1, n2, obs_ci, num_simulations=10_000, seed=None):
    """
    Monte Carlo estimate of how likely an overlap at least as large as the observed one is
    when the second set is drawn at random with respect to the first.

    NOTE: The first set is held fixed and only the selection of the second set is randomised,
    i.e. the null hypothesis is that the n2 draws are independent of the n1 draws. The
    resulting p-value is therefore one-sided: it only tests whether the overlap is greater
    than expected by chance.

    Parameters:
    -----------
    N: numpy.ndarray
        A 1D vector with an entry for every nonsingleton species, giving the total number of
        genomes of that species that could be selected (i.e. are in the "all" reference set).
    n1: numpy.ndarray
        A 1D vector with an entry for every nonsingleton species, giving the number of genomes
        selected in the first set.
    n2: numpy.ndarray
        A 1D vector with an entry for every nonsingleton species, giving the number of genomes
        selected in the second set.
    obs_ci: float
        The observed containment index; it is compared against the simulated total overlap
        divided by the total size of the second set (sum of n2).
    num_simulations: int
        The number of Monte Carlo simulations to perform.
    seed: int, optional
        Random seed for reproducibility.

    Returns:
    --------
    p: float
        (1 + number of simulations with containment >= obs_ci) / (1 + num_simulations).
    se: float
        Binomial standard error of p, sqrt(p * (1 - p) / num_simulations).
    """
    generator = np.random.default_rng(seed)
    available = np.asarray(N, dtype=int)
    first_sizes = np.asarray(n1, dtype=int)
    second_sizes = np.asarray(n2, dtype=int)

    # Per species: the overlap with the fixed first set when drawing the second set's
    # number of genomes without replacement. Overlaps are summed across species.
    overlap_totals = np.zeros(num_simulations, dtype=int)
    for total_g, first_g, second_g in zip(available, first_sizes, second_sizes):
        overlap_totals += generator.hypergeometric(
            ngood=int(first_g),
            nbad=int(total_g - first_g),
            nsample=int(second_g),
            size=num_simulations,
        )

    simulated_ci = overlap_totals / np.sum(second_sizes)

    # The +1 in numerator and denominator keeps the estimated p-value strictly positive.
    exceedances = np.sum(simulated_ci >= obs_ci)
    p_value = (1 + exceedances) / (1 + num_simulations)
    std_error = sqrt(p_value * (1 - p_value) / num_simulations)
    return p_value, std_error

def BH_adjust(pvalues, alpha):
    """
    Benjamini-Hochberg adjustment of the off-diagonal entries of a square p-value matrix.

    Parameters:
    -----------
    pvalues: numpy.ndarray
        Square matrix of p-values; the diagonal is excluded from the correction.
    alpha: float
        Threshold applied to the adjusted p-values.

    Returns:
    --------
    significant: numpy.ndarray
        Boolean matrix, True where the adjusted p-value is strictly below `alpha`.
    adj_pvalues: numpy.ndarray
        Matrix of adjusted p-values (capped at 1); diagonal entries are set to 1.
    """
    size = pvalues.shape[0]
    off_diagonal = ~np.eye(size, dtype=bool)
    raw = pvalues[off_diagonal]
    n_tests = len(raw)

    ascending = np.argsort(raw)
    scaled = raw[ascending] * n_tests / np.arange(1, n_tests + 1)
    # Running minimum from the largest p-value downwards keeps adjusted values monotone.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Scatter the capped values back into the original (unsorted) order.
    adjusted_flat = np.empty_like(raw)
    adjusted_flat[ascending] = np.minimum(monotone, 1.0)

    adj_pvalues = np.ones_like(pvalues)
    adj_pvalues[off_diagonal] = adjusted_flat
    return adj_pvalues < alpha, adj_pvalues

In [None]:
# For every experiment: compute the pairwise containment indices between all methods'
# reference sets, test each against random selection, correct for multiple testing and
# save an annotated heatmap as a PDF.
for experiment in EXPERIMENTS:
    ns = determine_nonsingletons(experiment)
    reference_sets = {
        method: determine_content(experiment, method, ns) for method in METHODS
    }
    # Per-species totals of the "all" set, used as the pool every method selects from.
    all_counts = reference_sets["all"][0]

    n_methods = len(METHODS)
    containment_indices = np.ones((n_methods, n_methods), dtype=np.float64)
    pvalues = np.ones((n_methods, n_methods), dtype=np.float64)

    # tqdm wraps the list itself (not the enumerate object) so it knows the total length.
    for i, method1 in enumerate(tqdm(METHODS, desc=experiment)):
        n1, r1 = reference_sets[method1]
        for j, method2 in enumerate(METHODS):
            if i == j:
                continue
            n2, r2 = reference_sets[method2]
            # Entry [j, i] is |r1 & r2| / |r2|: row = method2, column = method1.
            obs_ci = calculate_CI(r1, r2)[1]
            containment_indices[j, i] = obs_ci
            # A distinct, deterministic seed per ordered pair keeps the run reproducible.
            pvalues[j, i] = simulate(all_counts, n1, n2, obs_ci, seed=i * 10000 + j)[0]

    # Correct for multiple testing
    significant_pairs, _ = BH_adjust(pvalues, alpha=ALPHA)

    # Cell text: containment index with two decimals, starred when significant.
    annot_matrix = np.empty_like(containment_indices, dtype=object)
    for row, col in np.ndindex(*containment_indices.shape):
        star = "*" if significant_pairs[row, col] else ""
        annot_matrix[row, col] = f"{containment_indices[row, col]:.2f}{star}"

    method_names = [METHOD_LABELS[m] for m in METHODS]
    fig, ax = plt.subplots()
    # [:, 1:] drops the first column, i.e. the one where method1 is METHODS[0].
    sns.heatmap(
        containment_indices[:, 1:],
        annot=annot_matrix[:, 1:],
        cmap="Greys",
        cbar=True,
        linewidth=0.5,
        xticklabels=method_names[1:],
        yticklabels=method_names,
        fmt="",
        annot_kws={"size": 4},
        ax=ax,
    )
    ax.set_title(f"{experiment.capitalize()}", size=20)
    fig.tight_layout()
    fig.savefig(f"path_prefix/{experiment}/{experiment}_containment_indices.pdf", dpi=500, bbox_inches="tight", format="pdf")
