[code](https://www.google.com/url?q=https://github.com/cancerit/NanoSeq/blob/599f6a6185356eada286af036119cf6f6eb3723d/R/nanoseq_results_plotter.R%23L195&sa=D&source=docs&ust=1642691262337302&usg=AOvVaw00BPrhhPxHyi86V7kluYBa)

В этой статье (из которой взят код по ссылке выше) сравнивают комплементарные замещения, которые в их эксперименте являются взаимоисключающими, поэтому можно применить биномиальный тест: столько-то раз монетка упала орлом, столько-то — решкой, а при нулевой гипотезе мы ожидаем равные доли.

Будем сравнивать как пары реципрокных замещений (например, C>U и G>A), так и пары направленных (например, C>U и U>C).

The `ObsToExp` column is the normalized substitution count (observed count divided by expected count, `ObsFr / ExpFr`). These normalized counts are what we compare.

~~[bootstrapping](https://www.google.com/url?q=https://www.nature.com/articles/ng.3292%23Sec2&sa=D&source=docs&ust=1642691262336584&usg=AOvVaw2I2fiwx0-1bV6dvc2hG4pq)~~

In [31]:
from typing import List, Tuple, Dict, Union, Iterable

import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.api as sm

In [2]:
# Location of the 12-component mutation-spectrum table read by this notebook.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
PATH_TO_MUTSPEC = "/home/mr/Sars_Cov_2_MutSpec/Sars_Cov_2/new_data/data_obtained/07.MutSpec12_ForFullGenome.csv"

# RNA alphabet, one single-letter string per base.
NUCLEOTIDES = ["A", "C", "G", "U"]

In [3]:
# Columns of the mutation-spectrum table that this notebook reads.
cols = ["NucSubst", "ExpFr", "ObsFr", "ObsToExp"]
df = pd.read_csv(PATH_TO_MUTSPEC, usecols=cols)

# Lookup: substitution label (e.g. "A>C") -> its ObsToExp value.
mut_num = dict(zip(df["NucSubst"], df["ObsToExp"]))

# Bare expression so the notebook renders the loaded table.
df

Unnamed: 0,NucSubst,ExpFr,ObsFr,ObsToExp
0,A>C,1590,35589,22.383019
1,A>G,2596,152472,58.733436
2,A>U,1387,28833,20.788032
3,C>A,704,34116,48.460227
4,C>G,582,12204,20.969072
5,C>U,1670,653064,391.056287
6,G>A,919,157032,170.872688
7,G>C,277,12042,43.472924
8,G>U,277,146010,527.111913
9,U>A,2421,27192,11.231722


In [4]:
# Each substitution X>Y paired with its reverse Y>X; one two-letter string
# "XY" per unordered pair of bases.
directional_pairs = [
    (f"{src}>{dst}", f"{dst}>{src}")
    for src, dst in ("AC", "AG", "AU", "CG", "CU", "GU")
]

# Each substitution paired with its reciprocal partner; a four-letter string
# "abcd" encodes the pair (a>b, c>d).
reciprocal_pairs = [
    (f"{a}>{b}", f"{c}>{d}")
    for a, b, c, d in ("ACUG", "AGUC", "AUUA", "CGGC", "CUGA", "GUCA")
]

In [41]:
def asterics_for_vector(pvals: Iterable) -> List[str]:
    """Translate p-values into significance-star labels.

    Parameters
    ----------
    pvals
        Iterable of numeric p-values (or adjusted p-values).

    Returns
    -------
    list of str
        One label per input value, in input order: ``"***"`` below 0.001,
        ``"**"`` below 0.01, ``"*"`` below 0.05 and ``""`` otherwise. A value
        that is not smaller than any cutoff (NaN included) also maps to ``""``.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    return [
        next((stars for cutoff, stars in levels if p < cutoff), "")
        for p in pvals
    ]


def binom_testing(
        pairs: List[Tuple[str]],
        mut_num: Dict[str, Union[int, float]],
        label: str = None):
    """Compare paired substitution counts with binomial tests.

    For every ``(mut1, mut2)`` pair, the two values looked up in *mut_num*
    are rounded to integers and the first count is tested against the sum of
    both under an expected success probability of 0.5. The resulting
    p-values are then adjusted with ``method="fdr_bh"``.

    Parameters
    ----------
    pairs
        Sequence of two-element ``(mut1, mut2)`` tuples of substitution
        labels.
    mut_num
        Mapping from substitution label to its (possibly fractional) count.
    label
        Optional tag; when it is not ``None`` it is written into a leading
        ``"label"`` column on every row.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``mut1``, ``mut2``, ``pval``, ``qval``
        (adjusted p-value) and ``asterics`` (significance stars derived from
        ``qval``), preceded by ``label`` when a label was given.

    Raises
    ------
    KeyError
        If a substitution label from *pairs* is missing in *mut_num*.
    """
    # An empty prefix keeps the row layout uniform whether or not a label is set.
    prefix = () if label is None else (label,)

    records = []
    for first, second in pairs:
        # binomtest needs integer counts, so fractional values are rounded.
        # NOTE(review): if both counts round to 0 the number of trials is 0 -
        # confirm how scipy.stats.binomtest handles that case.
        successes = round(mut_num[first])
        trials = successes + round(mut_num[second])
        outcome = scipy.stats.binomtest(successes, trials, p=0.5)
        records.append(prefix + (first, second, outcome.pvalue))

    header = ["mut1", "mut2", "pval"]
    if label is not None:
        header = ["label"] + header
    table = pd.DataFrame(records, columns=header)

    # multipletests returns a tuple; its second item holds the adjusted p-values.
    adjusted = sm.stats.multipletests(
        table["pval"].values, method="fdr_bh")[1]
    table["qval"] = adjusted
    table["asterics"] = asterics_for_vector(adjusted)
    return table


In [42]:
# Test every substitution against its reverse (X>Y vs Y>X) and show the table.
binom_testing(pairs=directional_pairs, mut_num=mut_num, label="directional")

Unnamed: 0,label,mut1,mut2,pval,qval,asterics
0,directional,A>C,C>A,0.002547565,0.003821347,**
1,directional,A>G,G>A,7.90668e-14,1.581336e-13,***
2,directional,A>U,U>A,0.1101842,0.1101842,
3,directional,C>G,G>C,0.008146902,0.009776283,**
4,directional,C>U,U>C,1.840321e-28,5.520963000000001e-28,***
5,directional,G>U,U>G,3.094663e-94,1.856798e-93,***


In [44]:
# Test every substitution against its reciprocal partner (e.g. C>U vs G>A)
# and show the table.
binom_testing(pairs=reciprocal_pairs, mut_num=mut_num, label="reciprocal")

Unnamed: 0,label,mut1,mut2,pval,qval,asterics
0,reciprocal,A>C,U>G,3.231713e-05,4.847569e-05,***
1,reciprocal,A>G,U>C,8.936772e-09,1.787354e-08,***
2,reciprocal,A>U,U>A,0.1101842,0.1101842,
3,reciprocal,C>G,G>C,0.008146902,0.009776283,**
4,reciprocal,C>U,G>A,7.948096e-21,2.384429e-20,***
5,reciprocal,G>U,C>A,5.543354e-103,3.326013e-102,***


In [48]:
# Run both comparisons; p-values are adjusted separately within each call.
dirp = binom_testing(pairs=directional_pairs, mut_num=mut_num, label="directional")
recp = binom_testing(pairs=reciprocal_pairs, mut_num=mut_num, label="reciprocal")

# Stack the two tables row-wise and renumber the rows from 0 for display.
pd.concat([dirp, recp]).reset_index(drop=True)

Unnamed: 0,label,mut1,mut2,pval,qval,asterics
0,directional,A>C,C>A,0.002547565,0.003821347,**
1,directional,A>G,G>A,7.90668e-14,1.581336e-13,***
2,directional,A>U,U>A,0.1101842,0.1101842,
3,directional,C>G,G>C,0.008146902,0.009776283,**
4,directional,C>U,U>C,1.840321e-28,5.520963000000001e-28,***
5,directional,G>U,U>G,3.094663e-94,1.856798e-93,***
6,reciprocal,A>C,U>G,3.231713e-05,4.847569e-05,***
7,reciprocal,A>G,U>C,8.936772e-09,1.787354e-08,***
8,reciprocal,A>U,U>A,0.1101842,0.1101842,
9,reciprocal,C>G,G>C,0.008146902,0.009776283,**


In [11]:
import pandas as pd

# Limit rendered floats to one digit of display precision.
pd.set_option("display.precision", 1)

# Load the saved comparison table.
# NOTE: this rebinds `df`, replacing the mutation-spectrum frame loaded earlier.
df = pd.read_csv("/tmp/mutspec_compare.csv")

In [13]:
# Show missing cells as blank strings; the result is only displayed, so `df`
# itself keeps its missing values.
df.fillna(value="")

Unnamed: 0,label,mut1,mut2,ratio,pval,pval_adj,asterics
0,directional,A>C,C>A,0.5,0.0025,0.0038,**
1,directional,A>G,G>A,0.3,7.9e-14,1.6e-13,***
2,directional,A>U,U>A,1.9,0.11,0.11,
3,directional,C>G,G>C,0.5,0.0081,0.0098,**
4,directional,C>U,U>C,2.8,1.8000000000000002e-28,5.500000000000001e-28,***
5,directional,G>U,U>G,8.8,3.1e-94,1.9e-93,***
6,reciprocal,A>C,U>G,0.4,3.2e-05,4.8e-05,***
7,reciprocal,A>G,U>C,0.4,8.9e-09,1.8e-08,***
8,reciprocal,A>U,U>A,1.9,0.11,0.11,
9,reciprocal,C>G,G>C,0.5,0.0081,0.0098,**
