In [1]:
import pandas as pd
import os
import tqdm
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np
import ast


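This notebook calculates log-likelihood ratios (LLRs) for SARS-CoV-2 mutation classes using estimated mutation class distributions. Each LLR is the multinomial log-likelihood of a sequence's mutation counts under the Molnupiravir distribution minus the log-likelihood under the Normal distribution.
Mutation proportions are then extracted for each trinucleotide context and used to calculate per-context LLRs, enabling the detection of mutational biases potentially associated with Molnupiravir treatment.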


In [None]:
# Read the tab-separated per-sequence table into `df`, then preview it.
input_path = "/Users/reem/Mov/nextclade_results/GtoA_LLRs_final.tsv"
df = pd.read_csv(input_path, delimiter="\t")
df.head()


In [44]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,seqName,privateNucMutations.unlabeledSubstitutions,subs,Counts,context,spectrum,G>A_context_counts,proportions,G>A_llr
0,hCoV-19/USA/MO-WRAIR-COX5040NPS/2020|2020-08-1...,"G204A,C2445T,C4331T,C5621T,C5622T,C6633T,C1240...","G>A,C>T,C>T,C>T,C>T,C>T,C>T,G>A,C>T,A>C,A>T,G>...","{'G>A': 3, 'C>T': 8, 'A>C': 1, 'A>T': 1, 'G>T'...","CGT,ACT,TCT,ACC,CCT,GCT,TCA,AGG,TCA,GAG,CAA,TG...","C[G>A]T,A[C>T]T,T[C>T]T,A[C>T]C,C[C>T]T,G[C>T]...","{'C[G>A]T': 1, 'A[G>A]G': 1, 'T[G>A]C': 1}","{'C[G>A]T': 0.3333333333333333, 'A[G>A]G': 0.3...",3.056241
1,hCoV-19/Belgium/UGent-14493/2021|2021-12-22|20...,,,{},,,{},{},0.0
2,hCoV-19/France/IDF-HMN-21052200412/2021|2021-0...,,,{},,,{},{},0.0
3,hCoV-19/England/LSPA-37EF052/2022|2022-02-21|2...,G19999T,G>T,{'G>T': 1},AGT,A[G>T]T,{},{},0.0
4,hCoV-19/Germany/BE-RKI-I-595719/2022|2022-02-2...,"G410T,A7881G","G>T,A>G","{'G>T': 1, 'A>G': 1}","TGG,AAT","T[G>T]G,A[A>G]T",{},{},0.0


In [None]:
# Convert the string representation of dicts to actual dicts.
# Values that are already dicts are passed through untouched so that
# re-running this cell does not fail inside ast.literal_eval; anything else
# (e.g. a missing value) still raises there, exactly as before.
# `ast` is imported in the notebook's import cell.
df["Counts"] = df["Counts"].apply(
    lambda value: value if isinstance(value, dict) else ast.literal_eval(value)
)
# Prints the type of the column container (a pandas Series), not of its elements.
print(type(df["Counts"]))

<class 'pandas.core.series.Series'>


In [5]:
# Calculate LLR
# Load the estimated per-mutation-type probabilities for both conditions.
probs_df=pd.read_csv("/Users/reem/Downloads/estimated_mutation_distribution.tsv", delimiter="\t")
pM=probs_df["Molnupiravir"].to_numpy(dtype=float)
pN=probs_df["Normal"].to_numpy(dtype=float)
# The labels are later used as lookup keys into the per-row Counts dicts
# (keys such as "G>A"), so the arrow separator has to become ">".
# Both the mis-decoded byte sequence and a correctly decoded arrow (U+2192)
# are handled; otherwise a correctly decoded file would leave labels that
# match no key and every count would silently be read as 0.
mut_types=(
    probs_df["MutationType"]
    .str.replace("â†’", ">", regex=False)
    .str.replace("\u2192", ">", regex=False)
    .tolist()
)

def get_likelihood_ratio(counts,pM,pN):
    """Return the multinomial log-likelihood ratio of `counts` under pM versus pN.

    Parameters
    ----------
    counts : sequence of numbers
        Observed count per category; converted to a float array.
    pM, pN : array-like
        Category probabilities handed to ``multinomial.logpmf`` as ``p``.
        They must be in the same category order as ``counts``.

    Returns
    -------
    float
        ``logpmf(counts | pM) - logpmf(counts | pN)``, where the number of
        trials is the sum of ``counts``.
    """
    observed = np.array(counts, dtype=float)
    n_trials = np.sum(observed)
    # Evaluate the same observation under each hypothesis, pM first.
    log_lik_M, log_lik_N = (
        float(multinomial.logpmf(observed, n=n_trials, p=probs))
        for probs in (pM, pN)
    )
    return log_lik_M - log_lik_N
# Score every sequence: its Counts dict is aligned to the order of mut_types
# (mutation types absent from the dict count as 0) before computing the LLR.
llr_list = [
    get_likelihood_ratio([counts_dict.get(mt, 0) for mt in mut_types], pM, pN)
    for counts_dict in df["Counts"]
]
df["LLR"] = llr_list



In [None]:
#Get Counts per substitution context
def _count_substitution_contexts(spectrum, substitution):
    """Count the entries of `spectrum` whose substitution equals `substitution`.

    `spectrum` is a comma-separated string of entries such as "A[C>T]T".
    An entry is counted when characters 2-4 of the entry (the text right
    after the opening bracket) equal `substitution`; the full entry is the key.

    Returns an empty dict when `spectrum` is not a string (e.g. a missing
    value), otherwise a Counter mapping each matching entry to its count.
    """
    if not isinstance(spectrum, str):
        return {}
    return Counter(
        mut for mut in spectrum.split(",") if mut[2:5] == substitution
    )


def count_AtoG(spectrum):
    """Count the A>G entries of a comma-separated spectrum string, keyed by entry."""
    return _count_substitution_contexts(spectrum, 'A>G')


def count_CtoT(spectrum):
    """Count the C>T entries of a comma-separated spectrum string, keyed by entry."""
    return _count_substitution_contexts(spectrum, 'C>T')


def count_TtoC(spectrum):
    """Count the T>C entries of a comma-separated spectrum string, keyed by entry."""
    return _count_substitution_contexts(spectrum, 'T>C')


# Derive one per-context count column per substitution type from `spectrum`.
context_counters = (
    ("C>T_counts", count_CtoT),
    ("A>G_counts", count_AtoG),
    ("T>C_counts", count_TtoC),
)
for column_name, counter_fn in context_counters:
    df[column_name] = df["spectrum"].apply(counter_fn)

# Preview the new columns in the order they were created.
for column_name, _ in context_counters:
    print(df[column_name].head())


In [None]:
# Get Proportions per substitution context
def get_proportion(df):
    """Normalise a mapping of counts into proportions of the total.

    Parameters
    ----------
    df : mapping
        Despite the name, this is a dict-like object (dict or Counter)
        mapping a key to its count, not a DataFrame.

    Returns
    -------
    dict
        ``{key: count / total}`` for every key of the input. An empty dict
        is returned when the input is empty or when all counts sum to 0,
        because proportions are undefined in that case.
    """
    total = sum(df.values())
    # Guard against keys that are present with an overall count of zero,
    # which would otherwise raise ZeroDivisionError.
    if total == 0:
        return {}
    return {key: value / total for key, value in df.items()}

# Turn each "<sub>_counts" column into a "<sub>proportions" column, row by row.
# No T>G column is produced: that variant was left disabled in this cell.
for counts_col, prop_col in (
    ("A>G_counts", "A>Gproportions"),
    ("C>T_counts", "C>Tproportions"),
    ("T>C_counts", "T>Cproportions"),
):
    # `col=counts_col` binds the current column name to this iteration's lambda.
    df[prop_col] = df.apply(lambda row, col=counts_col: get_proportion(row[col]), axis=1)

# Preview the first ten rows of each new column.
for prop_col in ("C>Tproportions", "A>Gproportions", "T>Cproportions"):
    print(df[prop_col].head(10))


In [8]:
def generate_all_possible_contexts(sub):
    """Return the 16 labels "X[sub]Y" for every pair of flanking bases X, Y.

    The left base varies slowest and both bases run through A, C, G, T,
    so the list starts "A[sub]A", "A[sub]C", ... and ends "T[sub]T".
    """
    nucleotides = "ACGT"
    return [
        f"{left}[{sub}]{right}"
        for left in nucleotides
        for right in nucleotides
    ]
print(generate_all_possible_contexts("A>G"))

['A[A>G]A', 'A[A>G]C', 'A[A>G]G', 'A[A>G]T', 'C[A>G]A', 'C[A>G]C', 'C[A>G]G', 'C[A>G]T', 'G[A>G]A', 'G[A>G]C', 'G[A>G]G', 'G[A>G]T', 'T[A>G]A', 'T[A>G]C', 'T[A>G]G', 'T[A>G]T']


In [None]:
# Compute mean context probabilities
def get_mean_context_probs(df, llr_condition, prop_col, llr_col='LLR'):
    """
    Mean proportion per mutational context over the rows selected by an LLR test.

    Parameters
    ----------
    df : DataFrame containing `prop_col` and `llr_col`.
    llr_condition : callable applied to the `llr_col` Series; it must return
        a boolean mask (for example ``lambda x: x > 6``).
    prop_col : name of the column holding one {context: proportion} dict per row.
    llr_col : name of the LLR column used for the selection.

    Returns
    -------
    A Series indexed by context label with the column-wise mean. Contexts
    missing from a row count as 0, and rows whose proportions are all 0
    (for example empty dicts) are left out of the mean.
    """
    keep_mask = llr_condition(df[llr_col])
    selected_rows = df[keep_mask].copy()

    # One column per context label; a context absent from a row becomes 0.
    context_table = pd.json_normalize(selected_rows[prop_col]).fillna(0)

    # Drop rows that carry no proportions at all before averaging.
    has_any_value = (context_table != 0).any(axis=1)
    return context_table.loc[has_any_value].mean()

#print(get_mean_context_probs(df, llr_condition=lambda x: x > 1, prop_col="T>Cproportions", llr_col='LLR'))
def build_prob_table(df,prop_col,llr_high,llr_low,llr_col='LLR'):
    """
    Builds a table of mean context probabilities (Molnupiravir vs Normal)
    for a specific substitution type.

    Parameters
    ----------
    df : DataFrame containing `prop_col` and `llr_col`.
    prop_col : name of the column holding one {context: proportion} dict per
        row. Its first three characters must name the substitution, as in
        'A>Gproportions' -> 'A>G'; the 16 contexts are generated from it.
    llr_high : rows with ``llr_col > llr_high`` form the Molnupiravir group.
    llr_low : rows with ``llr_col < llr_low`` form the Normal group.
        Both comparisons are strict, so with ``llr_high == llr_low`` a row
        sitting exactly on the threshold belongs to neither group.
    llr_col : name of the LLR column used to split the rows.

    Returns
    -------
    DataFrame with columns 'Mutational_Context', 'Molnupiravir' and 'Normal',
    one row per context in the order of generate_all_possible_contexts.

    Raises
    ------
    ValueError if `prop_col` does not start with a substitution such as 'A>G'.
    """
    substitution = prop_col[:3]
    is_substitution = (
        len(substitution) == 3
        and substitution[1] == ">"
        and substitution[0] in "ACGT"
        and substitution[2] in "ACGT"
    )
    if not is_substitution:
        raise ValueError(
            f"prop_col must start with a substitution like 'A>G', got {prop_col!r}"
        )

    # Both groups are averaged over the SAME proportions column and use the
    # thresholds and LLR column passed by the caller.
    Mov_probs = get_mean_context_probs(
        df, lambda x: x > llr_high, prop_col=prop_col, llr_col=llr_col
    )
    Normal_probs = get_mean_context_probs(
        df, lambda x: x < llr_low, prop_col=prop_col, llr_col=llr_col
    )
    all_contexts = generate_all_possible_contexts(substitution)

    # Match means to all possible contexts.
    # NOTE(review): a context never observed in a group gets probability 0,
    # which makes the multinomial log-pmf -inf for any sequence carrying it.
    Mov_probs = Mov_probs.reindex(all_contexts, fill_value=0)
    Normal_probs = Normal_probs.reindex(all_contexts, fill_value=0)

    df_prob = pd.DataFrame({'Mutational_Context':all_contexts,
                           'Molnupiravir': Mov_probs.values,
                            'Normal': Normal_probs.values
                            })

    return df_prob


# Build the context table from the A>G proportions, using an LLR of 6 as the
# threshold on both sides.
df_prob = build_prob_table(
    df,
    prop_col='A>Gproportions',
    llr_high=6,
    llr_low=6,
    llr_col='LLR',
)

print(df_prob)



In [None]:
# Show the table, then the total of each probability column
# (Molnupiravir first, then Normal).
print(df_prob)
for group in ("Molnupiravir", "Normal"):
    print(df_prob[group].sum())

In [None]:
#Calculate LLRs using prob_table:

def calculate_llr(count_dict, pM, pN, contexts):
    """Multinomial log-likelihood ratio of per-context counts, pM versus pN.

    Parameters
    ----------
    count_dict : mapping of context label -> count. Only the labels listed
        in `contexts` are read; a missing label counts as 0.
    pM, pN : probabilities passed to ``multinomial.logpmf`` as ``p``, in the
        same order as `contexts`.
    contexts : ordered list of context labels.

    Returns
    -------
    float : ``logpmf(counts | pM) - logpmf(counts | pN)``, or NaN when none
    of the listed contexts has a count.
    """
    observed = np.array([count_dict.get(label, 0) for label in contexts])
    total = observed.sum()
    if total == 0:
        # Nothing to score for this row.
        return np.nan
    log_lik_M, log_lik_N = (
        multinomial.logpmf(observed, n=total, p=probs) for probs in (pM, pN)
    )
    return float(log_lik_M - log_lik_N)

# Pull the context order and both probability vectors out of the table.
contexts = df_prob["Mutational_Context"].tolist()
pM = df_prob["Molnupiravir"].tolist()
pN = df_prob["Normal"].tolist()

# Score each row's A>G context counts against the two distributions.
llr_column = "A>G_llr"
df[llr_column] = df["A>G_counts"].apply(lambda x: calculate_llr(x, pM, pN, contexts))

print(df[llr_column])




In [None]:
# Save the final table as a tab-separated file, without the index column.
output_path = "llrs_only.tsv"
df.to_csv(output_path, sep="\t", index=False)