In [2]:
import pandas as pd
import os
from Bio import Entrez, SeqIO
import tqdm
from collections import Counter
import numpy as np
from scipy.stats import multinomial
import numpy as np
import ast


In [None]:
# Columns that are passed through ast.literal_eval while the file is read.
dict_cols = ['G>Aproportions', 'C>Tproportions', 'A>Gproportions', 'T>Cproportions']

# Read the TSV in 1000-row chunks so tqdm can show progress, then stitch the
# chunks back into a single frame.
iterator = pd.read_csv(
    "/Users/reem/Mov/final_llrs_per_context.tsv",
    sep="\t",
    converters=dict.fromkeys(dict_cols, ast.literal_eval),
    chunksize=1000,
)
df = pd.concat(list(tqdm.tqdm(iterator, desc='Loading data')))


Loading data: 16858it [05:06, 54.93it/s]


In [7]:
def generate_all_possible_contexts(sub):
    """Return every ``"<base>[<sub>]<base>"`` label for the substitution `sub`.

    Both flanking positions run over A, C, G, T, giving 16 labels. The base
    before the bracket varies slowest, so the list starts with
    ``A[sub]A, A[sub]C, ...`` and ends with ``T[sub]T``.
    """
    bases = ("A", "C", "G", "T")
    return [f"{before}[{sub}]{after}" for before in bases for after in bases]
#print(generate_all_possible_contexts("G>A"))

In [None]:
def _parse_proportions(value):
    """Turn one cell of a proportions column into a Python object (normally a dict).

    * dicts are returned unchanged, so the cell can be re-run, or run on a
      column that a ``read_csv`` converter has already parsed;
    * the strings "", "nan" and "Counter", and any non-string value such as a
      float NaN, become an empty dict;
    * every other string is parsed with ``ast.literal_eval``.
    """
    if isinstance(value, dict):
        return value
    if not isinstance(value, str) or value in ("", "nan", "Counter"):
        return {}
    return ast.literal_eval(value)


df["G>Aproportions"] = df["G>Aproportions"].apply(_parse_proportions)
print(df["G>Aproportions"].head())
print(type(df["G>Aproportions"].iloc[0]))

0    {}
1    {}
2    {}
3    {}
4    {}
Name: G>Aproportions, dtype: object
<class 'pandas.core.series.Series'>


In [36]:
# Compute mean context probabilities
def get_mean_context_probs(df, llr_condition, prop_col, llr_col='LLR'):
    """
    Mean value per context over the rows selected by an LLR condition.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `llr_col` and `prop_col`.
    llr_condition : callable
        Called with ``df[llr_col]``; must return a boolean mask used to
        select rows (e.g. ``lambda x: x > 6``).
    prop_col : str
        Column whose cells are dicts mapping a context label to a value.
        Cells that are not dicts (e.g. NaN) are treated as empty dicts.
    llr_col : str, default 'LLR'
        Column handed to `llr_condition`.

    Returns
    -------
    pandas.Series
        Indexed by context label. Each entry is the mean of that context over
        the selected rows that have at least one non-zero value; a context
        missing from a row counts as 0 for that row.
    """
    # Pull out only the needed column; filtering and copying the whole frame
    # would duplicate every other column for no benefit.
    selected = df.loc[llr_condition(df[llr_col]), prop_col]
    records = [cell if isinstance(cell, dict) else {} for cell in selected]

    pivot = pd.json_normalize(records).fillna(0)
    # Rows without any non-zero entry carry no information and would only
    # drag the means toward 0.
    pivot = pivot.loc[(pivot != 0).any(axis=1)]

    return pivot.mean()

# Quick look at the mean C>T context values for rows with LLR above 1.
print(
    get_mean_context_probs(
        df,
        llr_condition=lambda llr: llr > 1,
        prop_col="C>Tproportions",
        llr_col='LLR',
    )
)
def build_prob_table(df, prop_col, llr_high, llr_low, llr_col='LLR'):
    """
    Build a table of mean context probabilities (Molnupiravir vs Normal)
    for one substitution type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `llr_col` and `prop_col`.
    prop_col : str
        Proportions column named ``"<substitution>proportions"``
        (e.g. ``"C>Tproportions"``). The substitution label used to generate
        the 16 contexts is taken from this name.
    llr_high : number
        Rows with ``LLR > llr_high`` form the "Molnupiravir" group.
    llr_low : number
        Rows with ``LLR < llr_low`` form the "Normal" group. Rows that satisfy
        neither condition (e.g. LLR exactly equal to both thresholds) are in
        neither group.
    llr_col : str, default 'LLR'
        Column holding the LLR values.

    Returns
    -------
    pandas.DataFrame
        Columns ``Mutational_Context``, ``Molnupiravir`` and ``Normal``, one
        row per context. Contexts never observed in a group get 0.

    Both mean series are also printed.
    """
    # Derive the substitution (e.g. "C>T") from the column name instead of
    # hard-coding it, so the same function serves every proportions column.
    suffix = "proportions"
    substitution = prop_col[:-len(suffix)] if prop_col.endswith(suffix) else prop_col
    all_contexts = generate_all_possible_contexts(substitution)

    # The row mask scans the whole frame; compute it once for both groups.
    nonzero_rows = df.loc[(df != 0).any(axis=1)]

    Mov_probs = get_mean_context_probs(
        nonzero_rows, lambda x: x > llr_high, prop_col=prop_col, llr_col=llr_col
    )
    Normal_probs = get_mean_context_probs(
        nonzero_rows, lambda x: x < llr_low, prop_col=prop_col, llr_col=llr_col
    )

    # Match means to all possible contexts
    Mov_probs = Mov_probs.reindex(all_contexts, fill_value=0)
    Normal_probs = Normal_probs.reindex(all_contexts, fill_value=0)

    print(Mov_probs)
    print(Normal_probs)

    df_prob = pd.DataFrame({'Mutational_Context': all_contexts,
                            'Molnupiravir': Mov_probs.values,
                            'Normal': Normal_probs.values
                            })

    return df_prob


# Build the C>T probability table with both LLR thresholds set to 6.
df_prob = build_prob_table(
    df,
    prop_col='C>Tproportions',
    llr_high=6,
    llr_low=6,
    llr_col='LLR',
)

print(df_prob)

A[C>T]A    0.175507
G[C>T]T    0.065908
T[C>T]G    0.068107
T[C>T]T    0.111961
A[C>T]C    0.056895
A[C>T]T    0.123832
C[C>T]A    0.062370
C[C>T]T    0.045081
T[C>T]A    0.094493
T[C>T]C    0.032480
C[C>T]C    0.018144
G[C>T]C    0.021246
A[C>T]G    0.044656
G[C>T]A    0.048562
G[C>T]G    0.015044
C[C>T]G    0.015713
dtype: float64
A[C>T]A    0.238774
A[C>T]C    0.063358
A[C>T]G    0.034287
A[C>T]T    0.116155
C[C>T]A    0.053411
C[C>T]C    0.011453
C[C>T]G    0.010166
C[C>T]T    0.029820
G[C>T]A    0.094290
G[C>T]C    0.030137
G[C>T]G    0.018229
G[C>T]T    0.070275
T[C>T]A    0.104712
T[C>T]C    0.021520
T[C>T]G    0.029948
T[C>T]T    0.073465
dtype: float64
A[C>T]A    0.168822
A[C>T]C    0.056742
A[C>T]G    0.049587
A[C>T]T    0.124377
C[C>T]A    0.065197
C[C>T]C    0.019030
C[C>T]G    0.014831
C[C>T]T    0.042752
G[C>T]A    0.050420
G[C>T]C    0.021439
G[C>T]G    0.014606
G[C>T]T    0.066150
T[C>T]A    0.091213
T[C>T]C    0.030318
T[C>T]G    0.064645
T[C>T]T    0.119870
dtype: flo

In [37]:
# Show the table followed by the total of each probability column.
print(
    df_prob,
    df_prob["Molnupiravir"].sum(),
    df_prob["Normal"].sum(),
    sep="\n",
)

   Mutational_Context  Molnupiravir    Normal
0             A[C>T]A      0.238774  0.168822
1             A[C>T]C      0.063358  0.056742
2             A[C>T]G      0.034287  0.049587
3             A[C>T]T      0.116155  0.124377
4             C[C>T]A      0.053411  0.065197
5             C[C>T]C      0.011453  0.019030
6             C[C>T]G      0.010166  0.014831
7             C[C>T]T      0.029820  0.042752
8             G[C>T]A      0.094290  0.050420
9             G[C>T]C      0.030137  0.021439
10            G[C>T]G      0.018229  0.014606
11            G[C>T]T      0.070275  0.066150
12            T[C>T]A      0.104712  0.091213
13            T[C>T]C      0.021520  0.030318
14            T[C>T]G      0.029948  0.064645
15            T[C>T]T      0.073465  0.119870
1.0
1.0


In [38]:
# Write the probability table as a tab-separated file in the working
# directory; the row index is written as the first column.
df_prob.to_csv(path_or_buf="C>Tprobs.tsv", sep="\t")

In [31]:
# Convert the "T>C_counts" cells (Counter reprs, dict literals or NaN) into
# plain dicts.
df["T>C_counts"] = (
    df["T>C_counts"]
    .astype(str)
    # "Counter(" is a literal prefix, not a pattern. regex=False is spelled
    # out because pandas < 2.0 defaults to regex=True, where the unbalanced
    # "(" raises re.error.
    .str.replace("Counter(", "", regex=False)
    .str.rstrip(")")  # remove trailing ")"
    .apply(lambda x: ast.literal_eval(x) if x not in ["", "nan", "Counter"] else {})
)
print(df["T>C_counts"].head())
print(type(df["T>C_counts"]))

0    {'T[T>C]A': 1}
1                {}
2    {'G[T>C]C': 1}
3                {}
4                {}
Name: T>C_counts, dtype: object
<class 'pandas.core.series.Series'>


In [32]:
def calculate_llr(count_dict, pM, pN, contexts):
    """Log-likelihood ratio of observed context counts under two multinomials.

    Parameters
    ----------
    count_dict : dict
        Maps context label -> observed count. Labels absent from `contexts`
        are ignored; labels in `contexts` but missing here count as 0.
    pM, pN : sequence of float
        Probabilities for each entry of `contexts`, in the same order.
    contexts : sequence of str
        Context labels that define the order of the count vector.

    Returns
    -------
    float
        ``logpmf(counts | pM) - logpmf(counts | pN)``, or NaN when the counts
        for `contexts` sum to 0.
    """
    observed = np.array([count_dict.get(label, 0) for label in contexts])
    total = observed.sum()
    if total == 0:
        return np.nan

    def _log_likelihood(probs):
        # Same count vector and total for both models; only p differs.
        return multinomial.logpmf(observed, n=total, p=probs)

    return float(_log_likelihood(pM) - _log_likelihood(pN))

# Context labels and the per-group probabilities, in matching order.
contexts = df_prob["Mutational_Context"].values.tolist()
pM = df_prob["Molnupiravir"].values.tolist()
pN = df_prob["Normal"].values.tolist()

# calculate_llr looks counts up by context label. If df_prob was built for a
# different substitution, none of its labels match the keys of the
# "T>C_counts" dicts and every row silently becomes NaN. Fail loudly instead.
mismatched = [ctx for ctx in contexts if "[T>C]" not in ctx]
if mismatched:
    raise ValueError(
        "df_prob does not describe T>C contexts "
        f"(e.g. {mismatched[0]!r}); build the T>C probability table before "
        "computing T>C_llr."
    )

df["T>C_llr"] = df["T>C_counts"].apply(lambda x: calculate_llr(x, pM, pN, contexts))

print(df["T>C_llr"])


0           0.156313
1                NaN
2          -0.447496
3                NaN
4                NaN
              ...   
16857147         NaN
16857148         NaN
16857149         NaN
16857150         NaN
16857151    0.156313
Name: T>C_llr, Length: 16857152, dtype: float64


In [33]:
# Number of rows in df.
df.shape[0]

16857152

In [None]:
# Write back to the same absolute path the table is read from. The leading
# "/" matters: without it the path is resolved relative to the working
# directory and does not point at the file the read cells load.
df.to_csv("/Users/reem/Mov/final_llrs_per_context.tsv", sep="\t", index=False)

In [None]:
# Re-read the saved table in 1000-row chunks. No converters are given here,
# so every column keeps whatever dtype read_csv infers.
iterator = pd.read_csv(
    "/Users/reem/Mov/final_llrs_per_context.tsv",
    sep="\t",
    chunksize=1000,
)
df = pd.concat(list(tqdm.tqdm(iterator, desc='Loading data')))


In [41]:
# Remove the listed intermediate columns, then preview what is left.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'privateNucMutations.unlabeledSubstitutions',
        'subs',
        'Counts',
        'context',
        'spectrum',
        'G>A_counts',
        'A>G_counts',
        'C>T_counts',
        'T>C_counts',
        'G>Aproportions',
        'A>Gproportions',
        'C>Tproportions',
        'T>Cproportions',
    ]
)
df.head()

Unnamed: 0,seqName,LLR,G>A_llr,A>G_llr,C>T_llr,T>C_llr
0,hCoV-19/USA/CA-CDPH-500004296/2021|2021-07-17|...,0.133011,,0.004225,0.069622,0.156313
1,hCoV-19/Spain/CL-COV01948/2021|2021-04-08|2021...,-1.049009,,0.319879,0.480353,
2,hCoV-19/USA/OR-OHSU-213401246/2021|2021-10-27|...,1.231103,,-1.811061,-0.505227,-0.447496
3,hCoV-19/Germany/SL-RKI-I-1077947/2022|2022-12-...,-0.210447,,0.004225,,
4,hCoV-19/USA/MA-CDCBI-CRSP_HGQQM7RZS5PYBHGU/202...,-1.07355,,,-0.489601,


In [42]:
# Save the reduced table as a tab-separated file; the row index is written as
# the first column.
df.to_csv(path_or_buf="/Users/reem/Mov/llrs_only.tsv", sep="\t")
