In [2]:
import pandas as pd
import os
from Bio import Entrez, SeqIO
import tqdm
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np
import ast



In [None]:
# Read the results table in 1000-row chunks so tqdm can report progress,
# then stitch the chunks back together into one DataFrame.
iterator = pd.read_csv(
    "/Users/reem/Mov/nextclade_results/final_results.tsv",
    sep="\t",
    chunksize=1000,
)
df = pd.concat(list(tqdm.tqdm(iterator, desc='Loading data')))



In [4]:
# Named reference mutational spectra: each entry pairs a display name with the
# URL of a CSV file hosted in the theosanderson/molnupiravir GitHub repository.
# NOTE(review): no later cell visible here reads this list — confirm it is
# still needed.
spectra = [
    {"name":"BA.1",
     "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/BA.1_SBS_spectrum_Ruis.csv"
    },
    {"name":"High G-to-A",
        "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/long_phylogenetic_branches/long_branch_spectrum_rescaled.csv"    
    },
]


In [5]:
# Preview only: split each row's comma-separated substitution string into a
# list of labels. The result is displayed, not stored; rows without a value
# stay NaN.
df['privateNucMutations.unlabeledSubstitutions'].str.split(",")

0      [G204A, C2445T, C4331T, C5621T, C5622T, C6633T...
1                                                    NaN
2                                                    NaN
3                                              [G19999T]
4                                        [G410T, A7881G]
                             ...                        
994                                             [A4759G]
995                          [C11962T, C13694T, T18132C]
996    [G3692T, G6476T, T13188C, T13626C, G16528A, G1...
997    [G922A, C6428T, C8802T, G11108T, A18146G, G202...
998                   [C1441T, A1732G, G27517A, G29734T]
Name: privateNucMutations.unlabeledSubstitutions, Length: 999, dtype: object

In [2]:
# def GtoA_percent(muts):
#     mut_list=muts.split(",")
#     total=len(mut_list)
#     total_muts=0
#     G_to_A=0
#     for m in mut_list:
#         total_muts+=1
#         if m.startswith("G") and m.endswith("A"):
#             G_to_A+=1
#     if total > 0:
#         return((G_to_A/total_muts) * 100)
#df = pd.DataFrame(df)
#df["G_to_A_%"] = df["privateNucMutations.unlabeledSubstitutions"].apply(GtoA_percent)
#print(df[["seqName", "privateNucMutations.unlabeledSubstitutions", "G_to_A_%"]])


In [7]:
def fetch_reference_genome(accession='NC_045512.2'):
    """Download a nucleotide record via Entrez and return its sequence as a string.

    Parameters
    ----------
    accession : str
        Identifier passed to ``Entrez.efetch`` as ``id`` (database "nucleotide").

    Returns
    -------
    str
        The sequence of the single FASTA record returned for ``accession``.

    Notes
    -----
    Sets the module-level ``Entrez.email`` as a side effect. Any exception
    raised by ``Entrez.efetch`` or ``SeqIO.read`` propagates to the caller.
    """
    Entrez.email = "theo@theo.io"
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Close the handle even when parsing fails, so a bad response does
        # not leak the underlying connection.
        handle.close()
    return str(record.seq)
# Fetched once when this cell runs (goes through Entrez.efetch); later cells
# read `reference_genome` as a plain string.
reference_genome = fetch_reference_genome("NC_045512.2")  # SARS-CoV-2 reference genome

In [8]:
def get_mut_type(mut_string):
    """Reduce a substitution label to "<first char>><last char>".

    For example "G204A" becomes "G>A". Anything that is not a string of at
    least two characters (e.g. NaN or a single character) yields ''.
    """
    usable = isinstance(mut_string, str) and len(mut_string) >= 2
    if not usable:
        return ''
    ref_base, alt_base = mut_string[0], mut_string[-1]
    return f"{ref_base}>{alt_base}"
# One comma-separated string of substitution types per row, in the same order
# as the original labels; rows whose value is not a string become ''.
df["subs"] = df["privateNucMutations.unlabeledSubstitutions"].apply(
    lambda cell: ','.join(get_mut_type(label) for label in cell.split(','))
    if isinstance(cell, str)
    else ''
)
df["subs"].head()

0    G>A,C>T,C>T,C>T,C>T,C>T,C>T,G>A,C>T,A>C,A>T,G>...
1                                                     
2                                                     
3                                                  G>T
4                                              G>T,A>G
Name: subs, dtype: object

In [None]:
# Tally substitution types per row as a plain dict.
# ''.split(",") yields [''], so empty entries are skipped explicitly;
# otherwise rows without mutations would be recorded as {'': 1} instead of {}.
df["Counts"] = df["subs"].apply(
    lambda x: dict(Counter(sub for sub in x.split(",") if sub))
)
df["Counts"]

In [None]:
def get_context(genome_seq, mutation):
    """Return the reference trinucleotide centred on a substitution's position.

    Parameters
    ----------
    genome_seq : str
        Reference sequence; label position 1 maps to ``genome_seq[0]``.
    mutation : str
        Substitution label such as "G204A": one character, a 1-based
        position, one character.

    Returns
    -------
    str
        Three characters: the base before the position, the reference base at
        the position, and the base after it. When the position is the first or
        last base of ``genome_seq`` the missing neighbour is written as "N",
        so the result always has length 3. Returns '' when ``mutation`` is not
        a string of length >= 2 or when the position lies outside
        ``genome_seq``.

    Raises
    ------
    ValueError
        If the characters between the first and last are not an integer.
    """
    if not isinstance(mutation, str) or len(mutation) < 2:
        return ''

    pos = int(mutation[1:-1]) - 1  # -1 as Python uses 0-based indexing
    if pos < 0 or pos >= len(genome_seq):
        return ''

    # A plain slice genome_seq[pos-1:pos+2] breaks at the edges: pos == 0
    # gives a start index of -1 (an empty slice) and the last base gives only
    # two characters. Pick the neighbours explicitly and pad with "N".
    before = genome_seq[pos - 1] if pos > 0 else "N"
    after = genome_seq[pos + 1] if pos + 1 < len(genome_seq) else "N"
    return before + genome_seq[pos] + after
# One comma-separated string of reference contexts per row, aligned
# element-for-element with the substitution labels; non-string rows become ''.
df["context"] = df["privateNucMutations.unlabeledSubstitutions"].apply(
    lambda cell: ','.join(get_context(reference_genome, label) for label in cell.split(','))
    if isinstance(cell, str)
    else ''
)
df["context"]

In [None]:
# def count_all_trinucleotide_contexts(genome):
#     """Count all trinucleotide contexts in genome"""
#     counts = Counter()
#     for i in range(len(genome)-2):
#         context = genome[i:i+3]
#         counts[context] += 1
#     return counts



In [11]:
def spectrum(subs, contexts):
    """Combine substitution types and contexts into labels like "C[G>A]T".

    ``subs`` and ``contexts`` are comma-separated strings that are paired up
    position by position (pairing stops at the shorter one). Each label is the
    first character of the context, the substitution in square brackets, then
    the last character of the context. Returns '' when either argument is not
    a string or is blank.
    """
    both_text = isinstance(subs, str) and isinstance(contexts, str)
    if not both_text or not (subs.strip() and contexts.strip()):
        return ''
    pairs = zip(subs.split(','), contexts.split(','))
    return ','.join(f"{ctx[0]}[{sub}]{ctx[-1]}" for sub, ctx in pairs)
   
# Row-wise: merge each row's substitution types with their contexts.
df["spectrum"] = df.apply(
    lambda record: spectrum(record["subs"], record["context"]),
    axis=1,
)
df["spectrum"].head()



0    C[G>A]T,A[C>T]T,T[C>T]T,A[C>T]C,C[C>T]T,G[C>T]...
1                                                     
2                                                     
3                                              A[G>T]T
4                                      T[G>T]G,A[A>G]T
Name: spectrum, dtype: object

In [None]:
#Get Counts per substitution context
def _count_contexts_for(spectrum, substitution):
    """Tally the labels in ``spectrum`` whose bracketed part equals ``substitution``.

    ``spectrum`` is a comma-separated string of labels shaped like "C[G>A]T";
    characters 2..4 of each label are compared with ``substitution``. Always
    returns a ``Counter`` (empty when ``spectrum`` is not a string), so every
    caller gets the same type back.
    """
    counts = Counter()
    if not isinstance(spectrum, str):
        return counts
    for mut in spectrum.split(","):
        if mut[2:5] == substitution:
            counts[mut] += 1
    return counts


def count_GtoA(spectrum):
    """Count each distinct G>A context label in a comma-separated spectrum string."""
    return _count_contexts_for(spectrum, 'G>A')


def count_AtoG(spectrum):
    """Count each distinct A>G context label in a comma-separated spectrum string."""
    return _count_contexts_for(spectrum, 'A>G')


def count_CtoT(spectrum):
    """Count each distinct C>T context label in a comma-separated spectrum string."""
    return _count_contexts_for(spectrum, 'C>T')


def count_TtoC(spectrum):
    """Count each distinct T>C context label in a comma-separated spectrum string."""
    return _count_contexts_for(spectrum, 'T>C')

# Add one per-row context tally column for each substitution class, then
# preview the first rows of each new column.
for column, counter in (
    ("G>A_counts", count_GtoA),
    ("A>G_counts", count_AtoG),
    ("C>T_counts", count_CtoT),
    ("T>C_counts", count_TtoC),
):
    df[column] = df["spectrum"].apply(counter)

for column in ("G>A_counts", "A>G_counts", "C>T_counts", "T>C_counts"):
    print(df[column].head())

 

0      {'C[G>A]T': 1, 'A[G>A]G': 1, 'T[G>A]C': 1}
1                                              {}
2                                              {}
3                                              {}
4                                              {}
                          ...                    
994                                            {}
995                                            {}
996                                {'T[G>A]T': 1}
997                  {'T[G>A]G': 1, 'G[G>A]A': 1}
998                                {'G[G>A]G': 1}
Name: context_counts, Length: 999, dtype: object

In [None]:
# Get Proportions per substitution context
def get_proportion(df):
    """Convert a mapping of counts into a mapping of proportions.

    Parameters
    ----------
    df : mapping
        Despite the name this is a dict-like of ``key -> count`` (for example
        one cell of a ``*_counts`` column), not a DataFrame.

    Returns
    -------
    dict
        ``key -> count / total`` for every key. Returns an empty dict when the
        mapping is empty or its counts sum to zero, which avoids a division
        by zero.
    """
    total = sum(df.values())
    if total == 0:
        return {}
    return {key: value / total for key, value in df.items()}

# Turn each per-row context tally into proportions (one new column per
# substitution class), then preview the first rows of each result.
for prop_column, count_column in (
    ("G>Aproportions", "G>A_counts"),
    ("A>Gproportions", "A>G_counts"),
    ("C>Tproportions", "C>T_counts"),
    ("T>Cproportions", "T>C_counts"),
):
    df[prop_column] = df.apply(
        lambda row, source=count_column: get_proportion(row[source]),
        axis=1,
    )

for prop_column in ("G>Aproportions", "A>Gproportions", "C>Tproportions", "T>Cproportions"):
    print(df[prop_column].head())


In [None]:
# Normalise df["Counts"] to plain dicts by round-tripping through text:
# stringify each cell, strip any "Counter(" ... ")" wrapper, then parse the
# remaining dict literal. Blank / "nan" cells become {}.

df["Counts"] = (
    df["Counts"]
    .astype(str)
    # regex=False: "Counter(" has an unbalanced "(" and is not a valid regular
    # expression, so it fails on pandas versions where regex defaults to True.
    .str.replace("Counter(", "", regex=False)
    .str.rstrip(")")                           # remove trailing ")"
    .apply(lambda x: ast.literal_eval(x) if x not in ["", "nan", "Counter"] else {})
)
print(df["Counts"].head(20))
print(type(df["Counts"]))

0     {'C>T': 8, 'G>A': 3, 'A>G': 3, 'A>C': 1, 'A>T'...
1                                                    {}
2                                                    {}
3                                            {'G>T': 1}
4                                  {'G>T': 1, 'A>G': 1}
5                        {'T>C': 3, 'C>T': 2, 'A>G': 2}
6                                  {'G>T': 2, 'G>A': 1}
7                                  {'C>T': 4, 'A>G': 1}
8              {'C>T': 2, 'T>C': 1, 'G>C': 1, 'C>A': 1}
9                        {'C>T': 1, 'T>A': 1, 'T>G': 1}
10    {'C>T': 7, 'T>C': 2, 'A>G': 1, 'A>T': 1, 'G>T'...
11                       {'C>T': 4, 'G>T': 2, 'G>A': 1}
12             {'C>T': 4, 'G>T': 3, 'T>G': 2, 'G>A': 1}
13                                 {'C>T': 1, 'A>T': 1}
14                                           {'C>T': 1}
15                                                   {}
16                                 {'C>T': 4, 'A>G': 1}
17             {'C>T': 2, 'C>A': 2, 'G>T': 2, 'G

In [None]:
def get_likelihood_ratio(counts,pM,pN):
    """Multinomial log-likelihood ratio of ``counts`` under ``pM`` versus ``pN``.

    ``counts`` is converted to a float array and its sum is used as the number
    of trials. Returns ``logpmf(counts | pM) - logpmf(counts | pN)`` as a
    Python float, so positive values favour ``pM``.
    """
    observed = np.array(counts, dtype=float)
    n_trials = np.sum(observed)
    log_lik_m = float(multinomial.logpmf(observed, n=n_trials, p=pM))
    log_lik_n = float(multinomial.logpmf(observed, n=n_trials, p=pN))
    return log_lik_m - log_lik_n
# Per-mutation-type probabilities for the two models, one row per type.
probs_df = pd.read_csv(
    "/Users/reem/Downloads/estimated_mutation_distribution.tsv",
    delimiter="\t",
)

pM = probs_df["Molnupiravir"].values.tolist()
pN = probs_df["Normal"].values.tolist()
# The table writes types with an arrow; df["Counts"] keys use ">".
mutation_types = probs_df["MutationType"].str.replace("→", ">").tolist()

# For every row, lay its counts out in the table's mutation-type order
# (absent types count as 0) and score them under both models.
llr_list = [
    get_likelihood_ratio(
        [row_counts.get(mut, 0) for mut in mutation_types],
        pM,
        pN,
    )
    for row_counts in df["Counts"]
]
df["LLR"] = llr_list
df["LLR"].head()


 

0    -0.378720
1     0.000000
2     0.000000
3    -1.509730
4    -1.720178
5    -1.135238
6    -2.514288
7     1.534272
8    -2.229715
9    -2.023665
10   -1.234451
11   -0.769569
12   -4.487829
13   -1.073550
14    0.436180
15    0.000000
16    1.534272
17   -4.215102
18    1.720991
19   -3.112182
Name: LLR, dtype: float64

In [3]:
# context_cols = [col for col in df_pivot.columns if col not in ["seqName","LLR"]]
# mean_row = df_pivot[context_cols].mean()
# mean_row = df_pivot.drop(columns = ["seqName","LLR"]).mean()
# mean_row["seqName"] = "Mean"
# mean_row["LLR"] = "None"
# mean_row = mean_row[df_pivot.columns]
# df = pd.concat([df_pivot,pd.DataFrame([mean_row])],ignore_index=True)
# print(df.tail(10))


In [None]:
def generate_all_possible_contexts(sub):
    """List the 16 labels "<5' base>[<sub>]<3' base>" for one substitution.

    Both flanking bases run over A, C, G, T; the 5' base varies slowest, so
    the list starts "A[sub]A", "A[sub]C", ... and ends "T[sub]T".
    """
    nucleotides = "ACGT"
    return [
        f"{five_prime}[{sub}]{three_prime}"
        for five_prime in nucleotides
        for three_prime in nucleotides
    ]
# Quick visual check of the generated labels for the G>A substitution.
print(generate_all_possible_contexts("G>A"))

In [None]:
# Compute mean context probabilities
def get_mean_context_probs(df, llr_condition, prop_col, llr_col='LLR'):
    """Average the per-context proportions over the rows selected by an LLR test.

    Parameters
    ----------
    df : DataFrame
        Must contain ``llr_col`` and ``prop_col``.
    llr_condition : callable
        Receives the ``llr_col`` Series and returns a boolean mask of the rows
        to keep (e.g. ``lambda x: x > 6``).
    prop_col : str
        Column whose cells are dicts mapping context label -> proportion.
    llr_col : str
        Column the condition is applied to.

    Returns
    -------
    Series
        Mean proportion per context label. Contexts missing from a row count
        as 0, and rows whose proportions are all 0 (e.g. empty dicts) are left
        out of the average.
    """
    keep = llr_condition(df[llr_col])
    selected = df[keep].copy()

    # One column per context label; labels absent from a row become 0.
    wide = pd.json_normalize(selected[prop_col]).fillna(0)
    has_signal = (wide != 0).any(axis=1)

    return wide.loc[has_signal].mean()

# Ad-hoc check: mean T>C context proportions over the rows with LLR > 1.
print(get_mean_context_probs(df, llr_condition=lambda x: x > 1, prop_col="T>Cproportions", llr_col='LLR'))
def build_prob_table(df,prop_col,llr_high,llr_low,llr_col='LLR',substitution=None):
    """
    Builds a table of mean context probabilities (Molnupiravir vs Normal)
    for a specific substitution type.

    Parameters
    ----------
    df : DataFrame
        Must contain ``llr_col`` and ``prop_col``.
    prop_col : str
        Column of per-row context-proportion dicts, e.g. 'G>Aproportions'.
    llr_high : number
        Rows with ``llr_col`` strictly greater than this feed the
        'Molnupiravir' column.
    llr_low : number
        Rows with ``llr_col`` strictly less than this feed the 'Normal' column.
    llr_col : str
        Column holding the per-row log-likelihood ratio.
    substitution : str, optional
        Substitution whose 16 contexts form the rows, e.g. "G>A". When
        omitted it is taken from ``prop_col`` by dropping a trailing
        "proportions" ('G>Aproportions' -> 'G>A').

    Returns
    -------
    DataFrame
        Columns 'Mutational_Context', 'Molnupiravir', 'Normal', one row per
        context from ``generate_all_possible_contexts``; contexts never
        observed in a group get 0.

    Notes
    -----
    Prints both mean Series before returning.
    """
    if substitution is None:
        suffix = "proportions"
        substitution = prop_col[:-len(suffix)] if prop_col.endswith(suffix) else prop_col

    # No "(df != 0).any(axis=1)" pre-filter: prop_col cells are dicts, which
    # never compare equal to 0, so that filter kept every row anyway.
    Mov_probs = get_mean_context_probs(df, lambda x: x > llr_high, prop_col=prop_col, llr_col=llr_col)
    Normal_probs = get_mean_context_probs(df, lambda x: x < llr_low, prop_col=prop_col, llr_col=llr_col)
    all_contexts = generate_all_possible_contexts(substitution)

    # Match means to all possible contexts
    Mov_probs = Mov_probs.reindex(all_contexts, fill_value=0)
    Normal_probs = Normal_probs.reindex(all_contexts, fill_value=0)

    print(Mov_probs)
    print(Normal_probs)

    df_prob = pd.DataFrame({'Mutational_Context':all_contexts,
                           'Molnupiravir': Mov_probs.values,
                            'Normal': Normal_probs.values
                            })

    return df_prob


# G>A context table: the same LLR cut-off (6) is passed for both groups.
df_prob = build_prob_table(
    df,
    prop_col='G>Aproportions',
    llr_high=6,
    llr_low=6,
    llr_col='LLR',
)

print(df_prob)

In [None]:
# Show the table, then the total of each probability column.
print(df_prob)
for model_column in ("Molnupiravir", "Normal"):
    print(df_prob[model_column].sum())

In [None]:
def calculate_llr(count_dict, pM, pN, contexts):
    """Multinomial log-likelihood ratio of context counts under ``pM`` vs ``pN``.

    Counts are read from ``count_dict`` in the order given by ``contexts``
    (missing contexts count as 0; keys not in ``contexts`` are ignored).
    Returns NaN when no listed context was observed, otherwise
    ``logpmf(counts | pM) - logpmf(counts | pN)`` as a float.
    """
    observed = np.array([count_dict.get(ctx, 0) for ctx in contexts])
    n_observed = observed.sum()
    if n_observed == 0:
        return np.nan
    log_lik_m, log_lik_n = (
        multinomial.logpmf(observed, n=n_observed, p=probs) for probs in (pM, pN)
    )
    return float(log_lik_m - log_lik_n)

# Score each row's G>A context counts against the two context distributions
# from df_prob. Note that pM / pN are rebound here to the per-context values.
contexts = df_prob["Mutational_Context"].values.tolist()
pM = df_prob["Molnupiravir"].values.tolist()
pN = df_prob["Normal"].values.tolist()
df["G>A_llr"] = df["G>A_counts"].apply(
    lambda row_counts: calculate_llr(row_counts, pM, pN, contexts)
)

print(df["G>A_llr"])


In [None]:
# Number of rows in the final table (displayed only).
len(df)

In [None]:
# Write the full table, including every derived column, as a tab-separated
# file without the index column.
df.to_csv("/Users/reem/Mov/final_llrs_per_context.tsv", sep="\t", index=False) 

In [4]:
# End-of-notebook marker; the bare string is simply echoed as the cell output.
"THE END!"

'THE END!'