# Generates tables for the manuscript

**Authored by:** Yicheng Zhu, Cheng Soon Ong and Gavin Huttley

This notebook produces the tables for the main manuscript and the supplementary material. It assumes that the directory containing the manuscript LaTeX source is named `mutation_classifier_manuscript` and is a sibling of the directory containing this file.

The notebook also requires several Python libraries to be installed. They are listed in the top-level imports below.

In [1]:
import os
import re
import textwrap
import numpy
from tqdm import tqdm
from cogent3.util.misc import open_
from cogent3 import LoadTable

from ms_scripts.makefig import MakeMatch
from ms_scripts.getrefs import get_ms_supp_labels
from ms_scripts.maketab import (classifier_summary_stats, format_latex_table,
                     format_positions, format_pvalue, format_direction,
                    format_group)

# Create the output directory for figures and tables; no error if it already exists.
os.makedirs('figs_n_tabs', exist_ok=True)

# Shared settings; get_summary_stats reads kwargs["order"] as the set of accepted model names.
kwargs = dict(category="name", order=["M", "M+I", "M+I+2D", "M+I+2Dp", "FS"])


def clean_latex(latex):
    r"""Re-wrap LaTeX text while leaving table rows untouched.

    Lines whose right-stripped text ends with a LaTeX row terminator (``\\``)
    are kept verbatim. Every other line is stripped and passed through
    ``textwrap.wrap`` (default width, never splitting long words and never
    breaking at hyphens). Blank lines disappear because wrapping an empty
    string yields no output lines.

    Returns the processed lines joined by newlines.
    """
    wrapped = []
    for raw in latex.splitlines():
        if raw.rstrip().endswith(r'\\'):
            # table row: preserve exactly, including leading whitespace
            wrapped.append(raw)
        else:
            wrapped += textwrap.wrap(raw.strip(), break_long_words=False,
                                     break_on_hyphens=False)
    return '\n'.join(wrapped)


def get_relative_dir(path):
    """Return the name of the directory that directly contains ``path``."""
    parent = os.path.dirname(path)
    return os.path.basename(parent)

def get_summary_stats(table, stat, k):
    """Summarise a classifier statistic for LR results with a given k.

    Parameters
    ----------
    table
        Table with at least the columns "algorithm", "k" and "name".
    stat
        Name of the statistic column handed to classifier_summary_stats.
    k
        Value the "k" column must equal for a row to be kept.

    Returns
    -------
    The result of classifier_summary_stats on the selected rows, using
    'name' and 'size' as its third argument.

    Notes
    -----
    Rows are kept when algorithm == "lr", k == ``k`` and name is one of
    kwargs["order"] (presumably MakeMatch applies predicate i to the i-th
    listed column; confirm in ms_scripts.makefig).

    Earlier versions filtered the global ``collated`` and always summarised
    'auc', silently ignoring the ``table`` and ``stat`` arguments.
    """
    matcher = MakeMatch({0: lambda x: x == "lr",
                         1: lambda x: x == k,
                         2: lambda x: x in kwargs["order"]})
    selected = table.filtered(matcher, columns=["algorithm", "k", "name"])
    return classifier_summary_stats(selected, stat, ['name', 'size'])

def is_tab(val):
    """Return True when the text before the first ':' in ``val`` ends with 'tab'."""
    prefix = val.split(':', 1)[0]
    return prefix.endswith('tab')

In [2]:
# Manuscript directory: passed to get_ms_supp_labels as texdir and used as the
# destination for the .tex files written at the end of the notebook.
outdir_ms = "../mutation_classifier_manuscript"

# Formatted LaTeX keyed by label, for the main text and the supplementary material.
all_ms_tables = {}
all_supp_tables = {}

all_floats = {}  # for storing all latex float text (figs, tables)
# Labels handled below that were not found in the label collection they were checked against.
unused_labels = []
# is_tab is supplied as the label filter -- presumably only table labels are
# returned; confirm in ms_scripts.getrefs.
ms_labels, supp_labels = get_ms_supp_labels(is_tab, texdir=outdir_ms)



Working on labels


Working on refs


## Sample sizes

In [3]:
label = "suptab:sample-sizes"
if label not in supp_labels:
    unused_labels.append(label)
else:
    # Chromosomes 1-19 followed by the 'XY' entry.
    chroms = list(range(1, 20)) + ['XY']
    enu_size_dict = {
        1: 16977, 2: 21100, 3: 11228, 4: 13973, 5: 14509,
        6: 13039, 7: 20864, 8: 11232, 9: 14010, 10: 11315,
        11: 17101, 12: 8022, 13: 9085, 14: 8395, 15: 9342,
        16: 7266, 17: 11981, 18: 6356, 19: 7529, 'XY': 853,
    }
    sp_sizes_dict = {
        1: 17848, 2: 20051, 3: 11713, 4: 16936, 5: 16028,
        6: 12097, 7: 19161, 8: 13465, 9: 15662, 10: 12641,
        11: 19626, 12: 8817, 13: 8939, 14: 8868, 15: 11079,
        16: 8117, 17: 12168, 18: 7732, 19: 8635, 'XY': 5097,
    }

    # One row per chromosome: [chromosome, ENU count, spontaneous count].
    rows = []
    for chrom in chroms:
        num_enu, num_sp = enu_size_dict[chrom], sp_sizes_dict[chrom]
        rows.append([chrom, num_enu, num_sp])

    header = ['Chromosome', 'ENU-induced', 'Spontaneous']
    # Both count columns are rendered with thousands separators.
    all_size_table = LoadTable(
        header=header,
        rows=rows,
        column_templates={column: "{:,}".format for column in header[1:]},
    )
    all_size_table.title = (
        r"By-chromosome sample sizes of genetic variants from the ENU induced and spontaneous "
        r"germline mutations."
    )
    # NOTE(review): justify has four entries for a three-column table -- confirm
    # format_latex_table tolerates the extra entry.
    all_supp_tables[label] = format_latex_table(all_size_table, justify="rrrl", label=label)
    all_size_table

# Log-linear

### For manuscript

In [4]:
label = "tab:enu_v_germline:a-g"

if label in ms_labels:
    fns = !ls loglin/results/ENU_vs_germline/autosomes/directions/AtoG/summary.txt
    fns
    tab_enu_v_sp = LoadTable(fns[0], sep="\t")
    tab_enu_v_sp = tab_enu_v_sp.with_new_column("Position(s)", format_positions, columns=["Position"])
    tab_enu_v_sp = tab_enu_v_sp.get_columns(["Position(s)", "Deviance", "df", "prob"])
    tab_enu_v_sp = tab_enu_v_sp.with_new_header("prob", "p-value")
    tab_enu_v_sp.format_column("p-value", format_pvalue)
    tab_enu_v_sp.format_column("Deviance", "%.1f")
    tab_enu_v_sp = tab_enu_v_sp.sorted(columns=["df", "Deviance"])

    tab_enu_v_sp.title = r"Log-linear analysis of mutation motif comparison between mouse germline and ENU-induced "\
    +r"A$\rightarrow$G mutations. Deviance is from the log-linear model, with df degrees-of-freedom "\
    +r"and corresponding $p$-value obtained from the $\chi^2$ distribution."


    all_ms_tables[label] = format_latex_table(tab_enu_v_sp, justify="rrrl", label=label)
    print(tab_enu_v_sp)
else:
    unused_labels.append(labels)

     Position(s)    Deviance    df                 p-value
----------------------------------------------------------
              +2        88.6     3    $4.4\times 10^{-19}$
              -2      1105.6     3                     0.0
              +1      1393.7     3                     0.0
              -1      5693.3     3                     0.0
        (-2, +2)        12.0     9                  0.2145
        (-1, +2)        50.3     9     $9.4\times 10^{-8}$
        (+1, +2)        96.1     9    $9.5\times 10^{-17}$
        (-2, +1)       123.0     9    $3.3\times 10^{-22}$
        (-2, -1)       284.1     9    $6.2\times 10^{-56}$
        (-1, +1)       353.1     9    $1.3\times 10^{-70}$
    (-2, -1, +2)        41.2    27                  0.0396
    (-1, +1, +2)        46.9    27                  0.0100
    (-2, +1, +2)        55.1    27                  0.0011
    (-2, -1, +1)        62.2    27                  0.0001
(-2, -1, +1, +2)       118.6    81                  0.00

# For supplementary

## Log-linear

In [5]:
label = "suptab:spectra:enu_spontaneous"
if label in supp_labels:
    fns = !ls loglin/results/ENU_vs_germline/autosomes/combined/spectra_summary.txt

    tab_spectra = LoadTable(fns[0], sep="\t")
    tab_spectra = tab_spectra.get_columns(["direction", "group", "ret"])
    tab_spectra = tab_spectra.with_new_header("direction", "Direction")
    tab_spectra = tab_spectra.with_new_header("group", "Class")
    tab_spectra = tab_spectra.with_new_header("ret", "RET")
    tab_spectra.format_column("Direction", format_direction)
    tab_spectra.format_column("Class", format_group)
    tab_spectra.format_column("RET", "%.3f")
    tab_spectra = tab_spectra.sorted(columns=["RET"])

    tab_spectra.title = r"Comparison of mutation spectra between Spontaneous and ENU-induced "\
    +r"germline point mutations. RET values are proportional to deviance generated from the log-linear model \citep{zhu2017statistical}, and "\
    +r"$p$-value are obtained from the $\chi^2$ distribution. All $p$-values were below the limit of detection."
    all_supp_tables[label] = format_latex_table(tab_spectra, justify="rrrl", label=label)
    tab_spectra
else:
    unused_labels.append(labels)

In [6]:
label = "suptab:a-g:enu"
if label in supp_labels:
    fns = !ls loglin/results/ENU_variants/autosomes/directions/AtoG/*.txt
    tab_enu = LoadTable(fns[0], sep="\t")
    tab_enu = tab_enu.with_new_column("Position(s)", format_positions, columns=["Position"])
    tab_enu = tab_enu.get_columns(["Position(s)", "Deviance", "df", "prob"])
    tab_enu = tab_enu.with_new_header("prob", "p-value")
    tab_enu.format_column("p-value", format_pvalue)
    tab_enu.format_column("Deviance", "%.1f")
    tab_enu = tab_enu.sorted(columns=["df", "Deviance"])

    tab_enu.title = r"Log-linear analysis of ENU-induced A$\rightarrow$G mutation. "\
    +r"Position(s) are relative to the index position. Deviance is from the log-linear model, "\
    +r"with df degrees-of-freedom and corresponding $p$-value obtained from the $\chi^2$ "\
    +r"distribution. $p$-values listed as 0.0 are below the limit of detection. "\
    +r"See \citet{zhu2017statistical} for a more detailed description of the log-linear models."


    all_supp_tables[label] = format_latex_table(tab_enu, justify="rrrl", label=label)
    tab_enu
else:
    unused_labels.append(label)

In [7]:
label = "suptab:p_sum_tab"
if label in supp_labels:
    fns = !ls loglin/results/ENU_vs_germline/autosomes/directions/*/summary.txt

    p_sum_rows = []
    for fn in fns:
        start, end = fn.split('/')[-2].split('to')
        mut_dir = start + r'$\rightarrow$' + end

        summary_tab = LoadTable(fn, sep="\t")
        p_vals = summary_tab.get_columns(["Position", "prob"]).tolist()

        first_p_vals = []
        second_p_vals = []
        third_p_vals = []
        forth_p_vals = []
        for record in p_vals:
            poses = record[0]
            order = poses.count(':') + 1
            if order == 1:
                first_p_vals.append(record[1])
            if order == 2:
                second_p_vals.append(record[1])
            if order == 3:
                third_p_vals.append(record[1])
            if order == 4:
                forth_p_vals.append(record[1])

        first_p_num = sum(p < 0.05 for p in first_p_vals)
        second_p_num = sum(p < 0.05 for p in second_p_vals)
        third_p_num = sum(p < 0.05 for p in third_p_vals)
        forth_p_num = sum(p < 0.05 for p in forth_p_vals)
        p_sum_rows.append([mut_dir, first_p_num, second_p_num, third_p_num, forth_p_num])

    p_sum_header = ["Mutation direction", "1st-order", "2nd-order", "3rd-order", "4th-order"]
    p_sum_tab = LoadTable(header=p_sum_header, rows=p_sum_rows, sep='\t')
    p_sum_tab.title = r"Number of positions showing significant differences between ENU-induced and "\
    +r"spontaneous germline point mutations from analysis of 5-mers. A $p$-value $\le 0.05$ was classified as significant. "\
    +r"$p$-values were from the log-linear analysis."


    all_supp_tables[label] = format_latex_table(p_sum_tab, justify="ccccc", label=label)
    p_sum_tab
else:
    unused_labels.append(label)

In [8]:
def convert_to_subtable(table):
    """Rewrite a LaTeX table string as a full-width subtable.

    The first and last lines of ``table`` are overwritten with the subtable
    begin/end commands; every line in between is left unchanged. The content
    of the two replaced lines is not inspected.
    """
    lines = table.splitlines()
    lines[0], lines[-1] = (r"\begin{subtable}[t]{1.0\textwidth}",
                           r"\end{subtable}")
    return "\n".join(lines)

# Supplementary table: two subtables (ENU-induced, spontaneous) of longer-range
# neighbourhood results, wrapped in a single table float.
label = "suptab:long-flank"
if label in supp_labels:
    header = ['Direction', 'RE$_{max}(1)$', 'RE Dist.', 'p-val Dist.']
    rows =[['A$\\rightarrow$C', '0.0374', '6', '10'],
     ['A$\\rightarrow$G', '0.0402', '4', '10'],
     ['A$\\rightarrow$T', '0.0638', '2', '10'],
     ['C$\\rightarrow$A', '0.0632', '2', '10'],
     ['C$\\rightarrow$T', '0.0703', '2', '10'],
     ['G$\\rightarrow$A', '0.0710', '2', '10'],
     ['G$\\rightarrow$T', '0.0624', '2', '10'],
     ['T$\\rightarrow$A', '0.0606', '2', '10'],
     ['T$\\rightarrow$C', '0.0395', '4', '10'],
     ['T$\\rightarrow$G', '0.0373', '6', '10']]
    lflank_enu = LoadTable(header=header, rows=rows, title="ENU-induced")
    lflank_enu = format_latex_table(lflank_enu, "rrrc", label="suptab:long-flank-nbrsize:enu")
    lflank_enu = convert_to_subtable(lflank_enu)

    d = r"""Direction,RE$_{max}(1)$,RE Dist.,p-val Dist.
    A$\rightarrow$C,0.0047,8,10
    A$\rightarrow$G,0.0118,3,10
    A$\rightarrow$T,0.0194,3,10
    C$\rightarrow$A,0.0332,4,10
    C$\rightarrow$T,0.0505,1,10
    G$\rightarrow$A,0.0508,1,10
    G$\rightarrow$T,0.0351,3,10
    T$\rightarrow$A,0.0117,2,10
    T$\rightarrow$C,0.0152,2,10
    T$\rightarrow$G,0.0148,2,10""".splitlines()
    header = d.pop(0).split(",")
    # The data lines carry this cell's indentation inside the triple-quoted
    # string; strip it so the Direction values match the ENU subtable.
    rows = [r.strip().split(",") for r in d]
    lflank_spontab = LoadTable(header=header, rows=rows, title="Spontaneous")
    lflank_spon = format_latex_table(lflank_spontab, "rrrc")
    lflank_spon = convert_to_subtable(lflank_spon)

    # Every caption fragment is a raw string. Previously only the first was,
    # so "\times" was emitted as a TAB followed by "imes" in the LaTeX.
    # "%%" is a literal percent sign after the %-substitution below.
    lflank_tmp = '\n'.join([
        r"\begin{table}",
        r"\centering", "", "%s", "", "%s",
        r"\caption{Longer range neighbourhood effect log-linear analyses results of (a) ENU-induced "
        r"mutations and (b) germline spontaneous mutations. For both subtables, the most distant "
        r"positions from the mutation with RE$(1)\ge10\%%$ of RE$_{max}(1)$. RE$(1)$ is the"
        r" first order RE for the position, and RE$_{max}(1)$ the largest RE from a first "
        r"order effect  for the surveyed positions. RE Dist. is the furthest position with "
        r"an RE value $\ge 0.1\times\mathrm{RE}_{max}$. p-val Dist. is the corresponding"
        r" distance based on the $p$-value$\le 0.05$. As the analysis was limited to "
        r"a flank  size of 10bp either side of the mutating base, the maximum possible distance is 10.}",
        r"\label{%s}",
        r"\end{table}"])

    # Substitution order: ENU subtable, spontaneous subtable, float label.
    all_supp_tables[label] = lflank_tmp % (lflank_enu, lflank_spon, label)
    lflank_spontab
else:
    unused_labels.append(label)

## Data properties

In [9]:
def auto_xy(x):
    """Search ``x`` for 'chrom' followed by one or two digits or 'XY', then a '.'.

    Returns the ``re`` match object, or None when the pattern is absent.
    """
    return re.search(r'chrom([0-9]{1,2}|XY)\.', x)

def get_chrom(x):
    """Return the one or two characters found between 'chrom' and a '.' in ``x``.

    Uses the first such occurrence; raises IndexError when there is none.
    """
    return re.findall(r'(?<=chrom).{1,2}(?=\.)', x)[0]

def get_chrom_paths(paths):
    """Build a {chromosome: path} mapping from file paths.

    Paths for which auto_xy finds no match are skipped. The chromosome name
    comes from get_chrom; it is used as an int key when int() accepts it and
    as the original string otherwise. If two paths yield the same key, the
    later one wins.
    """
    by_chrom = {}
    for path in paths:
        if auto_xy(path):
            name = get_chrom(path)
            try:
                key = int(name)
            except ValueError:
                # non-numeric name (e.g. 'XY') stays a string
                key = name
            by_chrom[key] = path
    return by_chrom

def get_num_records(path):
    """Load the table at ``path`` and return the first element of its shape (the row count)."""
    return LoadTable(path).shape[0]

enu_fns = !ls ../variant_data/ENU/*.tsv.gz
enu_fns = get_chrom_paths(enu_fns)

spn_fns = !ls ../variant_data/Germline/*.tsv.gz
spn_fns = get_chrom_paths(spn_fns)

rows = []
for chrom in enu_fns:
    enu_count = get_num_records(enu_fns[chrom])
    spn_count = get_num_records(spn_fns[chrom])
    rows.append([chrom, enu_count, spn_count])

rows = sorted(rows, key=lambda x: ({True: 100}.get(type(x[0]) == str, x[0]), x))
data_sizes = LoadTable(header=['Chromosome', 'ENU-induced', 'Spontaneous'], rows=rows)
data_sizes.format_column('ENU-induced', '{:,}'.format)
data_sizes.format_column('Spontaneous', '{:,}'.format)
data_sizes.title = 'By-chromosome sample sizes of genetic variants from the ENU induced and spon-taneous germline mutations.'

# Classifier

In [10]:
stat = 'auc'
columns = ['algorithm', stat, 'k', 'name', 'size', 'flank_size', 'feature_dim', 'usegc', 'proximal']

# Load the collated classifier results, derive k = 2 * flank_size + 1, then
# restrict to the columns listed above.
collated = (
    LoadTable("classifier/chrom1_train/collated/collated.tsv.gz",
              static_column_types=True)
    .with_new_column('k', lambda x: 2 * x + 1, columns='flank_size')
    .get_columns(columns)
)

In [11]:
# AUC summary of the LR rows in `collated` with k == 3.
three = get_summary_stats(collated, 'auc', 3)
three.title = "Summary of AUC scores from LR classifiers using 3-mers."

In [12]:
# AUC summary of the LR rows in `collated` with k == 5.
five = get_summary_stats(collated, 'auc', 5)
five.title = "Summary of AUC scores from LR classifiers using 5-mers."

In [13]:
# AUC summary of the LR rows in `collated` with k == 7.
seven = get_summary_stats(collated, 'auc', 7)
seven.title = "Summary of AUC scores from LR classifiers using 7-mers."

In [14]:
# AUC summary of the LR rows in `collated` with k == 59.
fifty_nine = get_summary_stats(collated, 'auc', 59)
fifty_nine.title = "Summary of AUC scores from LR classifiers using 59-mers."

In [15]:
# NOTE(review): `tables` is not used anywhere in this cell -- confirm whether it is still needed.
tables = []
# Store each k-mer AUC summary as formatted LaTeX under its supplementary label.
for t, l in [(three, 'suptab:LR_aucs_3mer'), (five, 'suptab:LR_aucs_5mer'),
             (seven, 'suptab:LR_aucs_7mer'),
             (fifty_nine, 'suptab:LR_aucs_59mer')]:
    all_supp_tables[l] = format_latex_table(t, 'rrcccc', label=l)

# Writing ms tables

In [16]:
# Write each main-text table to its own .tex file in the manuscript directory;
# labels with no generated table are reported and skipped.
for label in ms_labels:
    if label in all_ms_tables:
        # ':' in the label becomes '-' in the file name
        opath = os.path.join(outdir_ms, label.replace(':', '-') + '.tex')
        print(opath)
        with open(opath, 'w') as outfile:
            table = all_ms_tables[label]
            outfile.write(clean_latex(table) + '\n\n\n')
    else:
        print('ms label missing', label)

../mutation_classifier_manuscript/tab-enu_v_germline-a-g.tex
ms label missing tab:encoding-process


# Writing supp tables

In [17]:
# Write all supplementary tables, in label order, to a single sup_tables.tex
# in the manuscript directory.
with open(os.path.join(outdir_ms, 'sup_tables.tex'), 'w') as outfile:
    for label in supp_labels:
        if label not in all_supp_tables:
            # The message previously said 'ms label missing' (copied from the
            # main-text cell), which misreported where the gap was.
            print('supp label missing', label)
            continue

        table = all_supp_tables[label]
        # Three newlines separate consecutive tables in the output file.
        outfile.write(clean_latex(table) + '\n\n\n')