In [2]:
import pandas as pd
import os
from Bio import Entrez, SeqIO
import tqdm
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np
import ast



In [3]:
# Load the data
iterator = pd.read_csv("/Users/reem/Mov/final_results.tsv", sep="\t", usecols = ['seqName','privateNucMutations.unlabeledSubstitutions'], dtype={"seqName":str, 'privateNucMutations.unlabeledSubstitutions':str } ,chunksize=1000)
df = pd.concat([chunk for chunk in tqdm.tqdm(iterator, desc='Loading data')])



Loading data: 16858it [01:34, 178.11it/s]


In [4]:
spectra = [
    {"name":"BA.1",
     "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/BA.1_SBS_spectrum_Ruis.csv"
    },
    {"name":"High G-to-A",
        "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/long_phylogenetic_branches/long_branch_spectrum_rescaled.csv"    
    },
]


In [5]:
df['privateNucMutations.unlabeledSubstitutions'].str.split(",")

0                         [A2396G, T13832C, C25665T, C27673T]
1           [C4901T, C6525T, A12162G, C12781T, G16558T, C2...
2           [C4084T, T7092C, A13956G, C14585T, A20425G, C2...
3                                                   [A29188G]
4                                          [C21595T, G21777T]
                                  ...                        
16857147                          [C15540T, C25710T, G27870T]
16857148    [C44T, -11296G, C12085T, C19164A, C21459A, G27...
16857149                  [G4583A, C10376T, C26895T, C29272T]
16857150                   [C2094T, C7528T, G21255T, C27657T]
16857151                            [C609T, T21610C, C25611T]
Name: privateNucMutations.unlabeledSubstitutions, Length: 16857152, dtype: object

In [6]:
def fetch_reference_genome(accession='NC_045512.2'):
    Entrez.email = "theo@theo.io"  
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    record = SeqIO.read(handle, "fasta")
    handle.close()
    return str(record.seq)
reference_genome = fetch_reference_genome("NC_045512.2")  # SARS-CoV-2 reference genome

In [7]:
def get_mut_type(mut_string):
    if isinstance(mut_string, str) and len(mut_string) >= 2:
        return mut_string[0] + '>' + mut_string[-1]
    else:
        return ''
df["subs"]=df["privateNucMutations.unlabeledSubstitutions"].apply(lambda x: ','.join([get_mut_type(item) for item in x.split(',')])if isinstance(x, str) else '')
df["subs"].head()

0                    A>G,T>C,C>T,C>T
1    C>T,C>T,A>G,C>T,G>T,C>T,G>T,C>T
2    C>T,T>C,A>G,C>T,A>G,C>T,C>T,C>T
3                                A>G
4                            C>T,G>T
Name: subs, dtype: object

In [8]:
df["Counts"] =df["subs"].apply(lambda x: dict(Counter(x.split(","))))
df["Counts"]

0                              {'A>G': 1, 'T>C': 1, 'C>T': 2}
1                              {'C>T': 5, 'A>G': 1, 'G>T': 2}
2                              {'C>T': 5, 'T>C': 1, 'A>G': 2}
3                                                  {'A>G': 1}
4                                        {'C>T': 1, 'G>T': 1}
                                  ...                        
16857147                                 {'C>T': 2, 'G>T': 1}
16857148    {'C>T': 2, '->G': 1, 'C>A': 2, 'G>T': 1, 'A>G'...
16857149                                 {'G>A': 1, 'C>T': 3}
16857150                                 {'C>T': 3, 'G>T': 1}
16857151                                 {'C>T': 2, 'T>C': 1}
Name: Counts, Length: 16857152, dtype: object

In [9]:
def get_context(genome_seq, mutation):
    if isinstance(mutation, str) and len(mutation) >= 2:
        pos = int(mutation[1:-1]) - 1  # -1 as Python uses 0-based indexing
        context = genome_seq[pos-1:pos+2]  # get the base before and after
        return context
    else:
        return ''
df["context"]=df["privateNucMutations.unlabeledSubstitutions"].apply(lambda x: ','.join([get_context(reference_genome, item) for item in x.split(',')])if isinstance(x, str) else '')
df["context"]   

0                               CAC,TTA,ACT,TCA
1               CCT,ACA,CAG,ACA,TGC,ACA,TGG,CCA
2               ACA,GTC,TAC,GCT,TAT,TCG,TCT,ACA
3                                           CAC
4                                       TCT,GGG
                           ...                 
16857147                            TCA,TCT,TGA
16857148    TCG,TTA,CCT,ACA,TCA,TGG,AAT,AAA,AAA
16857149                        CGA,ACC,CCA,ACA
16857150                        TCG,TCG,CGT,TCA
16857151                            CCA,TTA,TCT
Name: context, Length: 16857152, dtype: object

In [11]:
df[df['context'].apply(lambda x: isinstance(x, str) and len(x) < 2)].head()

Unnamed: 0,seqName,privateNucMutations.unlabeledSubstitutions,subs,Counts,context
17,hCoV-19/Czech Republic/NAT-22-02478/2022|2022-...,,,{'': 1},
30,hCoV-19/USA/NY-NYCPHL-016290/2022|2022-12-15|2...,,,{'': 1},
35,hCoV-19/USA/CT-CDC-QDX26076006/2021|2021-06-19...,,,{'': 1},
38,hCoV-19/Japan/PG-118664/2021|2021-08-24|2021-1...,,,{'': 1},
41,hCoV-19/Canada/BC-BCCDC-683663/2023|2023-12-06...,,,{'': 1},


In [18]:
def spectrum(subs, contexts):
    if not isinstance(subs, str) or not isinstance(contexts, str):
        return ''
    if not subs.strip() or not contexts.strip():
        return ''
    subs= subs.split(',')
    contexts = contexts.split(',')
    spectra = []
    for mutation, context in zip(subs, contexts): 
        if len(context)<2:
            continue
        spectra.append(f"{context[0]}[{mutation}]{context[-1]}")
    return ','.join(spectra)
   
df["spectrum"] = df.apply(lambda row: spectrum(row["subs"], row["context"]), axis=1)
df["spectrum"].head()



0                      C[A>G]C,T[T>C]A,A[C>T]T,T[C>T]A
1    C[C>T]T,A[C>T]A,C[A>G]G,A[C>T]A,T[G>T]C,A[C>T]...
2    A[C>T]A,G[T>C]C,T[A>G]C,G[C>T]T,T[A>G]T,T[C>T]...
3                                              C[A>G]C
4                                      T[C>T]T,G[G>T]G
Name: spectrum, dtype: object

In [19]:
#Get Counts per substitution context
def count_GtoA(spectrum):
    counts = Counter()
    if not isinstance(spectrum, str):
        return {}
    muts = spectrum.split(",")
    for mut in muts:
        if mut[2:5] == 'G>A':
            counts[mut]+=1
    return counts

def count_AtoG(spectrum):
    counts = Counter()
    if not isinstance(spectrum, str):
        return {}
    muts = spectrum.split(",")
    for mut in muts:
        if mut[2:5] == 'A>G':
            counts[mut]+=1
    return counts

def count_CtoT(spectrum):
    counts = Counter()
    if not isinstance(spectrum, str):
        return {}
    muts = spectrum.split(",")
    for mut in muts:
        if mut[2:5] == 'C>T':
            counts[mut]+=1
    return counts

def count_TtoC(spectrum):
    counts = Counter()
    if not isinstance(spectrum, str):
        return {}
    muts = spectrum.split(",")
    for mut in muts:
        if mut[2:5] == 'T>C':
            counts[mut]+=1
    return counts

df["G>A_counts"] = df["spectrum"].apply(count_GtoA)
df["A>G_counts"] = df["spectrum"].apply(count_AtoG)
df["C>T_counts"] = df["spectrum"].apply(count_CtoT)
df["T>C_counts"] = df["spectrum"].apply(count_TtoC)
print(df["G>A_counts"].head())
print(df["A>G_counts"].head())
print(df["C>T_counts"].head())
print(df["T>C_counts"].head())

 

0    {}
1    {}
2    {}
3    {}
4    {}
Name: G>A_counts, dtype: object
0                  {'C[A>G]C': 1}
1                  {'C[A>G]G': 1}
2    {'T[A>G]C': 1, 'T[A>G]T': 1}
3                  {'C[A>G]C': 1}
4                              {}
Name: A>G_counts, dtype: object
0                         {'A[C>T]T': 1, 'T[C>T]A': 1}
1           {'C[C>T]T': 1, 'A[C>T]A': 3, 'C[C>T]A': 1}
2    {'A[C>T]A': 2, 'G[C>T]T': 1, 'T[C>T]G': 1, 'T[...
3                                                   {}
4                                       {'T[C>T]T': 1}
Name: C>T_counts, dtype: object
0    {'T[T>C]A': 1}
1                {}
2    {'G[T>C]C': 1}
3                {}
4                {}
Name: T>C_counts, dtype: object


In [20]:
# Get Proportions per substitution context
def get_proportion(df):
    dict = {}
    total = sum(df.values())
    for key, value in df.items():
        dict[key] = value/total
       
    return dict

df["G>Aproportions"] = df.apply(lambda row: get_proportion(row["G>A_counts"]), axis=1)
df["A>Gproportions"] = df.apply(lambda row: get_proportion(row["A>G_counts"]), axis=1)
df["C>Tproportions"] = df.apply(lambda row: get_proportion(row["C>T_counts"]), axis=1)
df["T>Cproportions"] = df.apply(lambda row: get_proportion(row["T>C_counts"]), axis=1)

print(df["G>Aproportions"].head())
print(df["A>Gproportions"].head())
print(df["C>Tproportions"].head())
print(df["T>Cproportions"].head())


0    {}
1    {}
2    {}
3    {}
4    {}
Name: G>Aproportions, dtype: object
0                    {'C[A>G]C': 1.0}
1                    {'C[A>G]G': 1.0}
2    {'T[A>G]C': 0.5, 'T[A>G]T': 0.5}
3                    {'C[A>G]C': 1.0}
4                                  {}
Name: A>Gproportions, dtype: object
0                     {'A[C>T]T': 0.5, 'T[C>T]A': 0.5}
1     {'C[C>T]T': 0.2, 'A[C>T]A': 0.6, 'C[C>T]A': 0.2}
2    {'A[C>T]A': 0.4, 'G[C>T]T': 0.2, 'T[C>T]G': 0....
3                                                   {}
4                                     {'T[C>T]T': 1.0}
Name: C>Tproportions, dtype: object
0    {'T[T>C]A': 1.0}
1                  {}
2    {'G[T>C]C': 1.0}
3                  {}
4                  {}
Name: T>Cproportions, dtype: object


In [21]:
# Because df_Counts is not read as a dict in the next step

df["Counts"] = (
    df["Counts"]
    .astype(str)
    .str.replace("Counter(", "")  # remove "Counter("
    .str.rstrip(")")                           # remove trailing ")"
    .apply(lambda x: ast.literal_eval(x) if x not in ["", "nan", "Counter"] else {})
)
print(df["Counts"].head(20))
print(type(df["Counts"]))

0               {'A>G': 1, 'T>C': 1, 'C>T': 2}
1               {'C>T': 5, 'A>G': 1, 'G>T': 2}
2               {'C>T': 5, 'T>C': 1, 'A>G': 2}
3                                   {'A>G': 1}
4                         {'C>T': 1, 'G>T': 1}
5               {'G>T': 6, 'C>T': 3, 'A>G': 1}
6     {'T>G': 1, 'T>C': 1, 'C>T': 2, 'G>A': 1}
7               {'C>T': 5, '->G': 1, 'T>C': 1}
8                         {'T>C': 1, 'G>T': 1}
9                         {'T>C': 1, 'C>T': 1}
10              {'C>T': 2, 'A>T': 1, 'T>C': 2}
11              {'G>T': 1, 'T>C': 1, 'C>T': 2}
12                        {'G>T': 1, 'G>A': 1}
13    {'G>T': 1, 'C>T': 4, 'T>C': 1, 'G>A': 1}
14              {'A>G': 2, 'C>A': 1, 'C>T': 2}
15                        {'C>T': 2, 'A>G': 1}
16              {'C>T': 3, 'T>C': 2, 'G>T': 1}
17                                     {'': 1}
18    {'G>T': 2, 'T>C': 2, 'C>T': 1, 'G>A': 1}
19              {'C>T': 4, 'G>T': 1, 'T>C': 2}
Name: Counts, dtype: object
<class 'pandas.core.series.Serie

In [22]:
def get_likelihood_ratio(counts,pM,pN):
    counts=np.array(counts,dtype=float)
    llM= float(multinomial.logpmf(counts, n=np.sum(counts), p=pM))
    llN = float(multinomial.logpmf(counts, n=np.sum(counts), p=pN))
    llr=llM-llN
    return llr
probs_df = pd.read_csv("/Users/reem/Downloads/estimated_mutation_distribution.tsv", delimiter="\t")
#probs_df

pM=probs_df["Molnupiravir"].values.tolist()
pN=probs_df["Normal"].values.tolist()
mutation_types = probs_df["MutationType"].str.replace("â†’", ">").tolist()


llr_list=[]
for i,row in df.iterrows():
    counts_dict = row["Counts"]
    counts = [counts_dict.get(mut,0) for mut in mutation_types]
    llr = get_likelihood_ratio(counts,pM,pN)
    llr_list.append(llr)
df["LLR"] = llr_list
df["LLR"].head()


0    0.133011
1   -1.049009
2    1.231103
3   -0.210447
4   -1.073550
Name: LLR, dtype: float64

In [24]:
print(df["LLR"].min())
print(df["LLR"].max())

-6208.81062121423
31.137873738090548


In [None]:
df.to_csv("/Users/reem/Mov/final_results_LLRs.tsv",sep="\t")

: 