# H12 window size calibration

In [1]:
# Notebook parameters. Values here are for development only and
# will be overridden when running via snakemake and papermill.
# Cohort identifier; used as the stem of the output YAML file name.
cohort_id = 'llineup'
# Disabled parameters, kept for reference (not read by the cells visible in this notebook):
#cohorts_analysis="20230223"
#contigs = ['2L']
# Sample sets and sample query forwarded to the H12 calibration call.
sample_sets = ["1288-VO-UG-DONNELLY-VMF00168","1288-VO-UG-DONNELLY-VMF00219"]
sample_query = "aim_species == 'gambiae' & sex_call == 'F'"
# Disabled: the calibration call passes min_cohort_size=None and max_cohort_size=None instead.
#min_cohort_size = 20
#max_cohort_size = 50
# Contig on which the calibration is run.
h12_calibration_contig = '3L'
# NOTE(review): use_gcs_cache is not read by any cell visible in this notebook — confirm whether it is still needed.
use_gcs_cache = False
# Value handed to dask.config.set(scheduler=...) in the setup cell.
dask_scheduler = "threads"
# Candidate window sizes; the selection step tries them in this order and keeps the first that qualifies.
window_sizes = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)


## Setup

In [2]:
import yaml
import pandas as pd
import malariagen_data
from pyprojroot import here
import numpy as np
import os
import dask
# Apply the scheduler chosen in the parameters cell to dask's global config.
# The trailing semicolon suppresses the cell's output repr.
dask.config.set(scheduler=dask_scheduler);

  n_threads = get_num_threads()


In [3]:
# Data-access handle used by the calibration call further down.
# NOTE(review): pre=True presumably opts in to pre-release sample sets — confirm against the installed malariagen_data version.
ag3 = malariagen_data.Ag3(pre=True)

In [4]:
# Short alias for the calibration contig parameter.
# NOTE(review): the calibration call in this notebook passes h12_calibration_contig directly,
# so this alias appears unused in the visible cells — confirm before removing.
contig = h12_calibration_contig


## Run calibration

In [6]:
# Gather the calibration arguments in one place so they can be inspected
# before the (potentially slow) call is made.
h12_calibration_params = dict(
    contig=h12_calibration_contig,
    analysis='gamb_colu',
    sample_sets=sample_sets,
    sample_query=sample_query,
    # No cohort size limits are applied for this calibration.
    min_cohort_size=None,
    max_cohort_size=None,
    window_sizes=window_sizes,
)

# The result is indexed below by str(window_size), one entry per candidate size.
calibration_runs = ag3.h12_calibration(**h12_calibration_params)
calibration_runs

                                  

Load haplotypes:   0%|          | 0/1584 [00:00<?, ?it/s]

Compute H12:   0%|          | 0/8 [00:00<?, ?it/s]

{'100': array([0.98393453, 0.89225402, 0.82754851, ..., 0.97157675, 0.95399023,
        0.95921736]),
 '200': array([0.85634378, 0.69676889, 0.7920847 , ..., 0.92361519, 0.93487623,
        0.90226105]),
 '500': array([0.47538056, 0.16973315, 0.52784251, ..., 0.63910233, 0.68384363,
        0.79838887]),
 '1000': array([0.07893342, 0.17122564, 0.20940699, ..., 0.61337328, 0.43065133,
        0.53480665]),
 '2000': array([0.02851214, 0.02293833, 0.01348107, ..., 0.28032858, 0.30449195,
        0.27348428]),
 '5000': array([0.00164411, 0.0007187 , 0.00542259, ..., 0.09236622, 0.08138591,
        0.13321643]),
 '10000': array([0.00047658, 0.000849  , 0.00113374, ..., 0.01544692, 0.018376  ,
        0.02303927]),
 '20000': array([0.00045084, 0.0005538 , 0.00045285, 0.00067164, 0.00045889,
        0.00062539, 0.00045165, 0.00608659, 0.00172213, 0.0013107 ,
        0.00380502, 0.00503811, 0.00375958, 0.00587384, 0.00562489,
        0.00494721, 0.00340043, 0.00463432, 0.00171409, 0.00397917,


In [7]:
# Selection criterion: keep the first candidate window size whose
# H12_PERCENTILE-th percentile of H12 values is below H12_THRESHOLD.
H12_PERCENTILE = 95
H12_THRESHOLD = 0.1


def select_window_size(runs, sizes, percentile=H12_PERCENTILE, threshold=H12_THRESHOLD):
    """Pick the first window size whose H12 percentile is below `threshold`.

    Parameters
    ----------
    runs : mapping
        Calibration results, looked up with ``str(window_size)`` as the key;
        each value is passed to ``np.percentile``.
    sizes : iterable
        Candidate window sizes, examined in the order given.
    percentile : float
        Percentile of the H12 values to compare against `threshold`.
    threshold : float
        Strict upper bound the percentile must fall below.

    Returns
    -------
    The first qualifying element of `sizes`, or None if no candidate
    qualifies (a warning is emitted in that case).
    """
    import warnings

    for size in sizes:
        if np.percentile(runs[str(size)], percentile) < threshold:
            return size

    # Previously this case passed silently and a null window size was
    # written to the output file; keep returning None but make it visible.
    warnings.warn(
        f"No window size in {tuple(sizes)} has a {percentile}th percentile "
        f"H12 below {threshold}; selected window size is None."
    )
    return None


selected_window_size = select_window_size(calibration_runs, window_sizes)
selected_window_size

1000

## Write outputs

In [10]:
# Root of the publication repository. The default is the original
# development location; set the LLINEUP_PUBLICATION_HOME environment
# variable to run the notebook from a different checkout or machine.
home = os.environ.get(
    "LLINEUP_PUBLICATION_HOME",
    '/home/harunnn/lstm_scratch/network_scratch/llineup/llineup-genomics/llineup_publication',
)
# Directory that receives the per-cohort calibration YAML; created if missing.
outdir = os.path.join(home, "scripts_notebooks/haplotype_clustering/peak_centre")
os.makedirs(outdir, exist_ok=True)

In [11]:
# Persist the selected window size as <cohort_id>.yaml inside outdir.
output_path = os.path.join(outdir, f"{cohort_id}.yaml")
output = dict(h12_window_size=selected_window_size)
with open(output_path, mode="w") as output_file:
    yaml.safe_dump(output, output_file)
    