In [2]:
import pandas as pd
import os
from Bio import Entrez, SeqIO
import tqdm
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np
import ast



In [3]:
file = "/Users/reem/Mov/llr_top_1000.tsv"
df =pd.read_csv(file, delimiter="\t")



In [4]:
spectra = [
    {"name":"BA.1",
     "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/BA.1_SBS_spectrum_Ruis.csv"
    },
    {"name":"High G-to-A",
        "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/long_phylogenetic_branches/long_branch_spectrum_rescaled.csv"    
    },
]


In [5]:
df['privateNucMutations.unlabeledSubstitutions'].str.split(",")

0      [G204A, C2445T, C4331T, C5621T, C5622T, C6633T...
1                                                    NaN
2                                                    NaN
3                                              [G19999T]
4                                        [G410T, A7881G]
                             ...                        
994                                             [A4759G]
995                          [C11962T, C13694T, T18132C]
996    [G3692T, G6476T, T13188C, T13626C, G16528A, G1...
997    [G922A, C6428T, C8802T, G11108T, A18146G, G202...
998                   [C1441T, A1732G, G27517A, G29734T]
Name: privateNucMutations.unlabeledSubstitutions, Length: 999, dtype: object

In [6]:
def GtoA_percent(muts):
    mut_list=muts.split(",")
    total=len(mut_list)
    total_muts=0
    G_to_A=0
    for m in mut_list:
        total_muts+=1
        if m.startswith("G") and m.endswith("A"):
            G_to_A+=1
    if total > 0:
        return((G_to_A/total_muts) * 100)
#df = pd.DataFrame(df)
#df["G_to_A_%"] = df["privateNucMutations.unlabeledSubstitutions"].apply(GtoA_percent)
#print(df[["seqName", "privateNucMutations.unlabeledSubstitutions", "G_to_A_%"]])


In [7]:
def fetch_reference_genome(accession='NC_045512.2'):
    Entrez.email = "theo@theo.io"  
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    record = SeqIO.read(handle, "fasta")
    handle.close()
    return str(record.seq)
reference_genome = fetch_reference_genome("NC_045512.2")  # SARS-CoV-2 reference genome

In [8]:
def get_mut_type(mut_string):
    if isinstance(mut_string, str) and len(mut_string) >= 2:
        return mut_string[0] + '>' + mut_string[-1]
    else:
        return ''
df["subs"]=df["privateNucMutations.unlabeledSubstitutions"].apply(lambda x: ','.join([get_mut_type(item) for item in x.split(',')])if isinstance(x, str) else '')
df["subs"].head()

0    G>A,C>T,C>T,C>T,C>T,C>T,C>T,G>A,C>T,A>C,A>T,G>...
1                                                     
2                                                     
3                                                  G>T
4                                              G>T,A>G
Name: subs, dtype: object

In [None]:
df["Counts"] =df["subs"].apply(lambda x: dict(Counter(x.split(","))))
df["Counts"]

In [9]:
def count_all_trinucleotide_contexts(genome):
    """Count all trinucleotide contexts in genome"""
    counts = Counter()
    for i in range(len(genome)-2):
        context = genome[i:i+3]
        counts[context] += 1
    return counts

In [10]:
def get_context(genome_seq, mutation):
    if isinstance(mutation, str) and len(mutation) >= 2:
        pos = int(mutation[1:-1]) - 1  # -1 as Python uses 0-based indexing
        context = genome_seq[pos-1:pos+2]  # get the base before and after
        return context
    else:
        return ''
df["context"]=df["privateNucMutations.unlabeledSubstitutions"].apply(lambda x: ','.join([get_context(reference_genome, item) for item in x.split(',')])if isinstance(x, str) else '')
df["context"]   

0      CGT,ACT,TCT,ACC,CCT,GCT,TCA,AGG,TCA,GAG,CAA,TG...
1                                                       
2                                                       
3                                                    AGT
4                                                TGG,AAT
                             ...                        
994                                                  AAG
995                                          TCT,ACA,CTA
996                      AGT,AGT,ATA,ATG,TGT,AGT,ATA,TCA
997                          TGG,TCC,ACT,TGC,GAA,GGA,GAA
998                                      GCT,AAA,GGG,CGA
Name: context, Length: 999, dtype: object

In [11]:
def spectrum(subs, contexts):
    if not isinstance(subs, str) or not isinstance(contexts, str):
        return ''
    if not subs.strip() or not contexts.strip():
        return ''
    subs= subs.split(',')
    contexts = contexts.split(',')
    spectra = []
    for mutation, context in zip(subs, contexts): 
        spectra.append(f"{context[0]}[{mutation}]{context[-1]}")
    return ','.join(spectra)
   
df["spectrum"] = df.apply(lambda row: spectrum(row["subs"], row["context"]), axis=1)
df["spectrum"].head()



0    C[G>A]T,A[C>T]T,T[C>T]T,A[C>T]C,C[C>T]T,G[C>T]...
1                                                     
2                                                     
3                                              A[G>T]T
4                                      T[G>T]G,A[A>G]T
Name: spectrum, dtype: object

In [12]:
def count_GtoA(spectrum):
    counts = Counter()
    muts = spectrum.split(",")
    for mut in muts:
        if mut[2:5] == 'G>A':
            counts[mut]+=1
    return counts

df["context_counts"] = df["spectrum"].apply(count_GtoA)
df["context_counts"]

            

            

 

0      {'C[G>A]T': 1, 'A[G>A]G': 1, 'T[G>A]C': 1}
1                                              {}
2                                              {}
3                                              {}
4                                              {}
                          ...                    
994                                            {}
995                                            {}
996                                {'T[G>A]T': 1}
997                  {'T[G>A]G': 1, 'G[G>A]A': 1}
998                                {'G[G>A]G': 1}
Name: context_counts, Length: 999, dtype: object

In [300]:
bases = ["A","C","G","T"]
possible_contexts = []
for a in bases:
    for b in bases:
        possible_contexts.append(f"{a}[G>A]{b}")
possible_contexts


['A[G>A]A',
 'A[G>A]C',
 'A[G>A]G',
 'A[G>A]T',
 'C[G>A]A',
 'C[G>A]C',
 'C[G>A]G',
 'C[G>A]T',
 'G[G>A]A',
 'G[G>A]C',
 'G[G>A]G',
 'G[G>A]T',
 'T[G>A]A',
 'T[G>A]C',
 'T[G>A]G',
 'T[G>A]T']

In [13]:
def get_proportion(df):
    dict = {}
    total = sum(df.values())
    for key, value in df.items():
        dict[key] = value/total
       
    return dict

df["proportions"] = df.apply(lambda row: get_proportion(row["context_counts"]), axis=1)
#df["proportions"].head()
df["Counts"].head()

    

    


0    Counter({'C>T': 8, 'G>A': 3, 'A>G': 3, 'A>C': ...
1                                            Counter()
2                                            Counter()
3                                  Counter({'G>T': 1})
4                        Counter({'G>T': 1, 'A>G': 1})
Name: Counts, dtype: object

In [292]:
df["Counts"] = (
    df["Counts"]
    .astype(str)
    .str.replace("Counter(", "")  # remove "Counter("
    .str.rstrip(")")                           # remove trailing ")"
    .apply(lambda x: ast.literal_eval(x) if x not in ["", "nan", "Counter"] else {})
)
print(df["Counts"].head(20))
print(type(df["Counts"]))

0     {'C>T': 8, 'G>A': 3, 'A>G': 3, 'A>C': 1, 'A>T'...
1                                                    {}
2                                                    {}
3                                            {'G>T': 1}
4                                  {'G>T': 1, 'A>G': 1}
5                        {'T>C': 3, 'C>T': 2, 'A>G': 2}
6                                  {'G>T': 2, 'G>A': 1}
7                                  {'C>T': 4, 'A>G': 1}
8              {'C>T': 2, 'T>C': 1, 'G>C': 1, 'C>A': 1}
9                        {'C>T': 1, 'T>A': 1, 'T>G': 1}
10    {'C>T': 7, 'T>C': 2, 'A>G': 1, 'A>T': 1, 'G>T'...
11                       {'C>T': 4, 'G>T': 2, 'G>A': 1}
12             {'C>T': 4, 'G>T': 3, 'T>G': 2, 'G>A': 1}
13                                 {'C>T': 1, 'A>T': 1}
14                                           {'C>T': 1}
15                                                   {}
16                                 {'C>T': 4, 'A>G': 1}
17             {'C>T': 2, 'C>A': 2, 'G>T': 2, 'G

In [293]:
def get_likelihood_ratio(counts,pM,pN):
    counts=np.array(counts,dtype=float)
    llM= float(multinomial.logpmf(counts, n=np.sum(counts), p=pM))
    llN = float(multinomial.logpmf(counts, n=np.sum(counts), p=pN))
    llr=llM-llN
    return llr
probs_df = pd.read_csv("/Users/reem/Downloads/estimated_mutation_distribution.tsv", delimiter="\t")
#probs_df

pM=probs_df["Molnupiravir"].values.tolist()
pN=probs_df["Normal"].values.tolist()
mutation_types = probs_df["MutationType"].str.replace("→", ">").tolist()


llr_list=[]
for i,row in df.iterrows():
    counts_dict = row["Counts"]
    counts = [counts_dict.get(mut,0) for mut in mutation_types]
    llr = get_likelihood_ratio(counts,pM,pN)
    llr_list.append(llr)
df["LLR"] = llr_list
df["LLR"].head(20)

# sample_counter = df["Counts"].iloc[0]
# print(sample_counter)
# test_row = df["Counts"].iloc[0]
# print(test_row)
# counts_dict = test_row
# counts = np.array([counts_dict.get(mt, 0) for mt in mutation_types])
# print(counts)

 

0    -0.378720
1     0.000000
2     0.000000
3    -1.509730
4    -1.720178
5    -1.135238
6    -2.514288
7     1.534272
8    -2.229715
9    -2.023665
10   -1.234451
11   -0.769569
12   -4.487829
13   -1.073550
14    0.436180
15    0.000000
16    1.534272
17   -4.215102
18    1.720991
19   -3.112182
Name: LLR, dtype: float64

In [1]:
df.head()

NameError: name 'df' is not defined

In [None]:
df_Mov = df[df["LLR"]>6]
df_Mov.head()

Unnamed: 0.1,Unnamed: 0,index,seqName,clade,clade_display,clade_who,clade_nextstrain,partiallyAliased,Nextclade_pango,qc.overallScore,...,warnings,errors,subs,Counts,LLR,Country,context,spectrum,context_counts,proportions
7,7,9,hCoV-19/Germany/NW-RKI-I-854767/2022|2022-02-0...,21K,21K (BA.1),Omicron,21K,BA.1.1,BA.1.1,23.390864,...,,,"C>T,C>T,A>G,C>T,C>T","{'C>T': 4, 'A>G': 1}",1.534272,Germany,"TCA,TCC,GAA,TCA,GCA","T[C>T]A,T[C>T]C,G[A>G]A,T[C>T]A,G[C>T]A",{},{}
16,16,17,hCoV-19/Canada/ON-KHS-22-02511-v1/2022|2022-03...,21K,21K (BA.1),Omicron,21K,BA.1.1,BA.1.1,0.0,...,,,"C>T,A>G,C>T,C>T,C>T","{'C>T': 4, 'A>G': 1}",1.534272,Canada,"TCA,TAG,ACA,TCT,TCT","T[C>T]A,T[A>G]G,A[C>T]A,T[C>T]T,T[C>T]T",{},{}
18,18,16,hCoV-19/USA/HI-H2327938/2024|2024-01-30|2024-0...,24A,24A (JN.1),Omicron,24A,BA.2.86.1.1.4,JN.1.4,0.0,...,,,"G>A,C>T,C>T,C>T,C>T,T>C","{'C>T': 4, 'G>A': 1, 'T>C': 1}",1.720991,USA,"TGG,ACA,TCC,TCT,ACA,GTT","T[G>A]G,A[C>T]A,T[C>T]C,T[C>T]T,A[C>T]A,G[T>C]T",{'T[G>A]G': 1},{'T[G>A]G': 1.0}
26,26,23,hCoV-19/England/QEUH-9E4607/2020|2020-09-23|20...,20A,20A,,20A,B.1.416.1,B.1.416.1,0.173611,...,,,"C>T,C>T,C>T,G>A,C>T","{'C>T': 4, 'G>A': 1}",2.249892,England,"ACA,ACA,ACT,GGT,ACG","A[C>T]A,A[C>T]A,A[C>T]T,G[G>A]T,A[C>T]G",{'G[G>A]T': 1},{'G[G>A]T': 1.0}
38,38,44,hCoV-19/USA/CA-CDC-FG-148485/2021|2021-10-12|2...,21J,21J (Delta),Delta,21J,B.1.617.2.103,AY.103,0.0,...,,,"C>T,C>T,G>A","{'C>T': 2, 'G>A': 1}",1.377532,USA,"GCC,CCT,AGC","G[C>T]C,C[C>T]T,A[G>A]C",{'A[G>A]C': 1},{'A[G>A]C': 1.0}


In [309]:
proportions_list = pd.json_normalize(df['proportions'])
proportions_list

Unnamed: 0,C[G>A]T,A[G>A]G,T[G>A]C,A[G>A]C,C[G>A]G,T[G>A]G,G[G>A]T,A[G>A]A,T[G>A]T,A[G>A]T,G[G>A]C,G[G>A]G,T[G>A]A,C[G>A]C,G[G>A]A,C[G>A]A
0,0.333333,0.333333,0.333333,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,,,,,,,,,,,,,,,,
995,,,,,,,,,,,,,,,,
996,,,,,,,,,1.0,,,,,,,
997,,,,,,0.5,,,,,,,,,0.5,


In [341]:
df_pivot = pd.concat([df[["seqName"]],proportions_list,df[["LLR"]]], axis=1)
df_pivot.fillna(0, inplace=True)
df_pivot.head(10)

Unnamed: 0,seqName,C[G>A]T,A[G>A]G,T[G>A]C,A[G>A]C,C[G>A]G,T[G>A]G,G[G>A]T,A[G>A]A,T[G>A]T,A[G>A]T,G[G>A]C,G[G>A]G,T[G>A]A,C[G>A]C,G[G>A]A,C[G>A]A,LLR
0,hCoV-19/USA/MO-WRAIR-COX5040NPS/2020|2020-08-1...,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.37872
1,hCoV-19/Belgium/UGent-14493/2021|2021-12-22|20...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hCoV-19/France/IDF-HMN-21052200412/2021|2021-0...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hCoV-19/England/LSPA-37EF052/2022|2022-02-21|2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.50973
4,hCoV-19/Germany/BE-RKI-I-595719/2022|2022-02-2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.720178
5,hCoV-19/Russia/TAM-RII-MH144950S/2023|2023-09-...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.135238
6,hCoV-19/Canada/ON-KHS-21-06687-v1/2021|2021-12...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.514288
7,hCoV-19/Germany/NW-RKI-I-854767/2022|2022-02-0...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.534272
8,hCoV-19/England/ALDP-1FEE90B/2021|2021-09-29|2...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.229715
9,hCoV-19/Northern Mariana Islands/MP-CDC-2-5839...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.023665


In [362]:
# context_cols = [col for col in df_pivot.columns if col not in ["seqName","LLR"]]
# mean_row = df_pivot[context_cols].mean()
mean_row = df_pivot.drop(columns = ["seqName","LLR"]).mean()
mean_row["seqName"] = "Mean"
mean_row["LLR"] = "None"
mean_row = mean_row[df_pivot.columns]
df = pd.concat([df_pivot,pd.DataFrame([mean_row])],ignore_index=True)
print(df.tail(10))


     seqName   C[G>A]T  A[G>A]G   T[G>A]C   A[G>A]C   C[G>A]G  T[G>A]G  \
1086     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1087     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1088     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1089     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1090     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1091     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1092     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1093     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1094     NaN       NaN      NaN       NaN       NaN       NaN      NaN   
1095    Mean  0.020187  0.02631  0.018635  0.017327  0.009843  0.02711   

       G[G>A]T   A[G>A]A   T[G>A]T   A[G>A]T  G[G>A]C   G[G>A]G   T[G>A]A  \
1086       NaN       NaN       NaN       NaN      NaN       NaN       NaN   
1087       NaN       NaN       