In [1]:
import pandas as pd
import os
from Bio import Entrez, SeqIO
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np



In [2]:
# Read the tab-separated Nextclade results table into a DataFrame.
file = "nextclade.tsv"
df = pd.read_csv(file, sep="\t")

In [3]:
# Named mutational-spectrum CSVs: each entry pairs a label with the file's URL.
spectra = [
    {
        "name": "BA.1",
        "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/BA.1_SBS_spectrum_Ruis.csv",
    },
    {
        "name": "High G-to-A",
        "url": "https://raw.githubusercontent.com/theosanderson/molnupiravir/main/mutational_spectra/long_phylogenetic_branches/long_branch_spectrum_rescaled.csv",
    },
]


In [4]:
# Preview: split each sample's comma-separated substitutions into a list.
df["privateNucMutations.unlabeledSubstitutions"].str.split(pat=",")

0                                     [T6272A, T6559G]
1    [C1613T, T5213C, G8366A, G10981A, G11128A, A12...
2    [C1613T, T5213C, G8366A, G10981A, G11128A, A12...
3                           [A1650G, C19512T, C20115T]
4                  [C1593T, T12913C, A15667G, C16887T]
5                  [C3382T, T19260C, A22002T, C29719T]
6    [C1613T, T5213C, G8366A, G10981A, G11128A, A12...
7                          [C25626A, C29203T, C29358T]
8    [C1684T, C22883T, G24971A, G25507A, T26732C, G...
9                   [G4006A, A5043G, A11827G, G17331T]
Name: privateNucMutations.unlabeledSubstitutions, dtype: object

In [5]:
def GtoA_percent(muts):
    """Return the percentage of substitutions in ``muts`` that are G-to-A.

    Parameters
    ----------
    muts : str
        Comma-separated substitution strings such as ``"C1613T,G8366A"``.
        A substitution counts as G-to-A when it starts with ``"G"`` and
        ends with ``"A"``.

    Returns
    -------
    float or None
        ``100 * (number of G-to-A substitutions) / (number of substitutions)``,
        or ``None`` when there is nothing to count: a non-string input
        (e.g. the NaN pandas uses for an empty cell) or a string that
        contains no substitutions.
    """
    # A missing cell is not a string, so calling .split on it would raise.
    if not isinstance(muts, str):
        return None

    # Drop empty tokens so "" (or a stray trailing comma) is not counted
    # as a mutation.
    mutations = [m.strip() for m in muts.split(",") if m.strip()]
    if not mutations:
        return None

    g_to_a = sum(1 for m in mutations if m.startswith("G") and m.endswith("A"))
    return (g_to_a / len(mutations)) * 100
# Re-wrap the table, score every sample, then print the score beside its inputs.
df = pd.DataFrame(data=df)
df["G_to_A_%"] = df["privateNucMutations.unlabeledSubstitutions"].apply(GtoA_percent)
print(
    df[
        [
            "seqName",
            "privateNucMutations.unlabeledSubstitutions",
            "G_to_A_%",
        ]
    ]
)


                                             seqName  \
0  hCoV-19/Sweden/4866925052/2021|EPI_ISL_1619277...   
1  hCoV-19/England/MILK-3935CB1/2022|EPI_ISL_1096...   
2  hCoV-19/England/LSPA-36AE7E2/2022|EPI_ISL_1010...   
3  hCoV-19/England/CLIMB-CM7YMEGX/2024|EPI_ISL_19...   
4  hCoV-19/Australia/NSW-SAVID-13821/2022|EPI_ISL...   
5  hCoV-19/England/CLIMB-CM7YFZ3C/2024|EPI_ISL_19...   
6  hCoV-19/England/MILK-392DB55/2022|EPI_ISL_1093...   
7  hCoV-19/England/CLIMB-CM7YEX44/2025|EPI_ISL_20...   
8  hCoV-19/England/CLIMB-CM7YRP1U/2024|EPI_ISL_19...   
9  hCoV-19/England/CLIMB-CM7YKQDN/2025|EPI_ISL_20...   

          privateNucMutations.unlabeledSubstitutions   G_to_A_%  
0                                      T6272A,T6559G   0.000000  
1  C1613T,T5213C,G8366A,G10981A,G11128A,A12682G,C...  42.857143  
2  C1613T,T5213C,G8366A,G10981A,G11128A,A12682G,C...  42.857143  
3                             A1650G,C19512T,C20115T   0.000000  
4                     C1593T,T12913C,A15667G,C16887T 

In [None]:
def fetch_reference_genome(accession='NC_045512.2', email="theo@theo.io"):
    """Download one nucleotide record from NCBI Entrez and return its sequence.

    Parameters
    ----------
    accession : str
        Accession passed to ``Entrez.efetch`` as ``id`` (nucleotide database).
    email : str
        Value assigned to ``Entrez.email`` before the request is made.

    Returns
    -------
    str
        The sequence of the single FASTA record in the response.

    Notes
    -----
    Requires network access. Errors from ``Entrez.efetch`` or ``SeqIO.read``
    propagate to the caller; the response handle is closed in either case.
    """
    Entrez.email = email
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Close the handle even when parsing fails, so it is not leaked.
        handle.close()
    return str(record.seq)
# SARS-CoV-2 reference genome, used by the context lookups below.
reference_genome = fetch_reference_genome(accession="NC_045512.2")

In [7]:
def get_mut_type(mut_string):
    """Collapse a substitution such as ``"C1613T"`` to its type, ``"C>T"``.

    Only the first and last characters of ``mut_string`` are used.
    """
    ref_base = mut_string[0]
    new_base = mut_string[-1]
    return f"{ref_base}>{new_base}"


# Turn each sample's substitutions into a comma-separated list of types.
df["subs"] = df["privateNucMutations.unlabeledSubstitutions"].apply(
    lambda muts: ",".join(get_mut_type(mut) for mut in muts.split(","))
)
df["subs"]

0                                              T>A,T>G
1    C>T,T>C,G>A,G>A,G>A,A>G,C>T,G>A,C>T,C>T,A>G,C>...
2    C>T,T>C,G>A,G>A,G>A,A>G,C>T,G>A,C>T,C>T,A>G,C>...
3                                          A>G,C>T,C>T
4                                      C>T,T>C,A>G,C>T
5                                      C>T,T>C,A>T,C>T
6    C>T,T>C,G>A,G>A,G>A,A>G,C>T,G>A,C>T,C>T,A>G,C>...
7                                          C>A,C>T,C>T
8                              C>T,C>T,G>A,G>A,T>C,G>T
9                                      G>A,A>G,A>G,G>T
Name: subs, dtype: object

In [8]:
def count_all_trinucleotide_contexts(genome):
    """Tally every overlapping 3-base window of ``genome``.

    Returns a ``Counter`` mapping each trinucleotide to how many times it
    occurs; a genome shorter than three bases gives an empty ``Counter``.
    """
    window_starts = range(len(genome) - 2)
    return Counter(genome[start:start + 3] for start in window_starts)

In [9]:
def get_context(genome_seq, mutation):
    """Return the trinucleotide context of a substitution in ``genome_seq``.

    Parameters
    ----------
    genome_seq : str
        Reference sequence the mutation position refers to.
    mutation : str
        Substitution such as ``"G8366A"``; the characters between the first
        and last are parsed as a 1-based position.

    Returns
    -------
    str
        Three characters: the base before the position, the base at the
        position, and the base after it. When the position is the first or
        last base of the sequence, the missing neighbour is written as ``"N"``
        so the result is always three characters long.

    Raises
    ------
    ValueError
        If the position is not an integer or lies outside ``genome_seq``.
    """
    pos = int(mutation[1:-1]) - 1  # -1 as Python uses 0-based indexing
    if not 0 <= pos < len(genome_seq):
        raise ValueError(
            f"mutation {mutation!r} is outside the genome (length {len(genome_seq)})"
        )
    # A plain slice genome_seq[pos-1:pos+2] starts at -1 for the first base,
    # which wraps around and returns an empty string, so take each neighbour
    # separately.
    before = genome_seq[pos - 1] if pos > 0 else "N"
    after = genome_seq[pos + 1] if pos + 1 < len(genome_seq) else "N"
    return before + genome_seq[pos] + after
# Look up the reference context of every substitution, sample by sample.
df["context"] = df["privateNucMutations.unlabeledSubstitutions"].apply(
    lambda muts: ",".join(get_context(reference_genome, mut) for mut in muts.split(","))
)
df["context"]

0                                              GTA,CTT
1    CCT,ATT,TGC,TGA,TGT,TAC,ACA,TGC,GCA,ACA,GAG,AC...
2    CCT,ATT,TGC,TGA,TGT,TAC,ACA,TGC,GCA,ACA,GAG,AC...
3                                          AAT,ACA,TCA
4                                      TCC,TTA,GAA,ACA
5                                      ACA,CTA,AAA,CCA
6    CCT,ATT,TGC,TGA,TGT,TAC,ACA,TGC,GCA,ACA,GAG,AC...
7                                          ACT,GCG,ACA
8                              TCG,TCT,AGT,AGC,TTT,TGT
9                                      AGT,CAG,AAG,AGA
Name: context, dtype: object

In [34]:
def spectrum(subs, contexts):
    """Combine substitution types with their contexts into ``X[A>B]Y`` labels.

    ``subs`` and ``contexts`` are comma-separated strings paired position by
    position (pairing stops at the shorter one). Each label uses the first
    and last character of the context around the bracketed substitution;
    the labels are returned joined with commas.
    """
    labels = [
        f"{ctx[0]}[{sub}]{ctx[-1]}"
        for sub, ctx in zip(subs.split(','), contexts.split(','))
    ]
    return ','.join(labels)
# Build the per-sample spectrum labels from the "subs" and "context" columns.
df["spectrum"] = df.apply(
    lambda sample: spectrum(sample["subs"], sample["context"]),
    axis="columns",
)
df["spectrum"]



0                                      G[T>A]A,C[T>G]T
1    C[C>T]T,A[T>C]T,T[G>A]C,T[G>A]A,T[G>A]T,T[A>G]...
2    C[C>T]T,A[T>C]T,T[G>A]C,T[G>A]A,T[G>A]T,T[A>G]...
3                              A[A>G]T,A[C>T]A,T[C>T]A
4                      T[C>T]C,T[T>C]A,G[A>G]A,A[C>T]A
5                      A[C>T]A,C[T>C]A,A[A>T]A,C[C>T]A
6    C[C>T]T,A[T>C]T,T[G>A]C,T[G>A]A,T[G>A]T,T[A>G]...
7                              A[C>A]T,G[C>T]G,A[C>T]A
8      T[C>T]G,T[C>T]T,A[G>A]T,A[G>A]C,T[T>C]T,T[G>T]T
9                      A[G>A]T,C[A>G]G,A[A>G]G,A[G>T]A
Name: spectrum, dtype: object

In [None]:
# os.makedirs does not expand "~": the original call created a literal "~"
# directory under the current working directory. Resolve the home directory
# first so the directory that is created is the one the CSV is written to.
output_dir = os.path.expanduser('~/Documents')
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'gtoa_contexts.csv'))
 