In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf
import tqdm 

from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Load regressions datasets

First, we load the datasets for the regression analysis that were saved in the previous step by the notebook `0_create_dataset.ipynb`.

In [2]:
# Where the result tables are written, and where the sampled datasets are read from.
output_path = ".../ukbb_recessive/data/tables/sampling/synonymous"
datasets_path = '.../450k/datasets/sampling/synonymous'

# Read all 300 sampled tables (tab-separated), keyed as "Roulette_<idx>".
datasets_dict = {}
for idx in range(300):
    datasets_dict[f"Roulette_{idx}"] = pd.read_csv(f"{datasets_path}/Roulette_sample_{idx}.csv", sep='\t')

# samples of interest: European & non-related without hom and comp_het
european_non_rel_samples = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt"
)

# The file holds one sample id per line; from here on the name refers to the
# list of ids (as strings) rather than to the file path.
with open(european_non_rel_samples, 'r') as samples_file:
    european_non_rel_samples = [line.strip() for line in samples_file]

print (f"Number of european non-related samples without hom and comp_het: {len(european_non_rel_samples)}\n")

# Restrict every dataset to the samples of interest; sizes are reported for the
# first dataset only, to keep the output short.
for dataset_name in list(datasets_dict):
    unfiltered = datasets_dict[dataset_name]
    report_sizes = dataset_name == 'Roulette_0'

    if report_sizes:
        print (f"Number of samples in {dataset_name}: {unfiltered.shape[0]}")

    # `eid` is compared as a string because the ids were read from a text file.
    is_sample_of_interest = unfiltered['eid'].astype(str).isin(european_non_rel_samples)
    datasets_dict[dataset_name] = unfiltered[is_sample_of_interest].copy()

    if report_sizes:
        print (f"Number of samples in {dataset_name} after filtration: {datasets_dict[dataset_name].shape[0]}")

Number of european non-related samples without hom and comp_het: 376608

Number of samples in Roulette_0: 378751
Number of samples in Roulette_0 after filtration: 376608


Here are all s_het-based genetic burdens, calculated based on different gene sets and variants (PLPs or singleton LOFs):

In [3]:
# Print the name of every column of the first loaded dataset that starts with "s_het".
first_dataset = list(datasets_dict.values())[0]

for col in first_dataset.columns:
    if col.startswith('s_het'):
        print (col)

s_het_recessive_AR_without_ID
s_het_recessive_Blindness
s_het_recessive_Cardiovascular
s_het_recessive_Cilia+Kidney
s_het_recessive_Deafness
s_het_recessive_Derm
s_het_recessive_Endocrine
s_het_recessive_Hematologic
s_het_recessive_ID-total
s_het_recessive_Immune_system
s_het_recessive_Metabolic
s_het_recessive_Metabolic-ID
s_het_recessive_Neuromuscular
s_het_recessive_No_panel
s_het_recessive_Overlaps
s_het_recessive_Skeletal+Craniofacial
s_het_recessive_Tumor
s_het_recessive_all


# Define analyses

We define all phenotypes (targets) that we would like to analyze.

In [4]:
# Each phenotype (regression target) paired with the family name that is passed
# alongside it under the `families` key of the analysis configs.
phenotype_to_family = {
    'childlessness': 'binomial',
    'years_of_edu': 'gaussian',
    'diagnosis_total_ICD10_cnt_log': 'gaussian',
    'ICD_infertility': 'binomial',
    'fluid_intelligence_score': 'gaussian',
    'is_blond': 'binomial',
}

# Parallel lists, in the same order as the mapping above.
phenotypes = list(phenotype_to_family.keys())
phenotype_family = list(phenotype_to_family.values())

# we looked into phenotypes of interest
phenotype_regressions = dict(
    s_hets=['s_het_recessive_all'],
    targets=phenotypes,
    families=phenotype_family,
    genders=['all'],
    n_tests_correction=10,
    filter_dataset=lambda dataset: dataset,  # identity: keep every row
)

# we looked into ID vs rest
id_vs_rest_phenotype_regressions = dict(
    s_hets=['s_het_recessive_AR_without_ID', 's_het_recessive_ID_total'],
    targets=phenotypes,
    families=phenotype_family,
    genders=['all'],
    n_tests_correction=14,
    filter_dataset=lambda dataset: dataset,  # identity: keep every row
)

# Analyses that will actually be run, keyed by the tag used in logs and in the
# output file name. The phenotype analysis is currently switched off.
analyses = {
    # 'synonymous_on_phenotype': phenotype_regressions,
    'synonymous_id_vs_rest': id_vs_rest_phenotype_regressions,
}

## All samples

We define all combinations of s-het burdens that we would like to test for association with the phenotypes, and run the regressions on all samples:

In [5]:
# Datasets whose index is at or above this value have their burden columns
# renamed from `s_het_synonymous_*`; below it, only the `ID-total` column is renamed.
# NOTE(review): presumably the column naming changed in the dataset-creation
# step from this sample onwards — confirm against `0_create_dataset.ipynb`.
FIRST_SYNONYMOUS_NAMED_IDX = 137

# Renames that bring both naming schemes to the column names used in the analysis configs.
SYNONYMOUS_COLUMN_RENAMES = {
    's_het_synonymous_all': 's_het_recessive_all',
    's_het_synonymous_AR_without_ID': 's_het_recessive_AR_without_ID',
    's_het_synonymous_ID-total': 's_het_recessive_ID_total',
}
RECESSIVE_COLUMN_RENAMES = {'s_het_recessive_ID-total': 's_het_recessive_ID_total'}

for analysis, analysis_cfg in analyses.items():

    print (f"Running {analysis} analysis\n")

    # read analysis config
    s_hets = analysis_cfg['s_hets']
    targets = analysis_cfg['targets']
    families = analysis_cfg['families']
    genders = analysis_cfg['genders']
    filter_dataset_func = analysis_cfg['filter_dataset']
    n_tests_correction = analysis_cfg['n_tests_correction']

    # regression tables collected per dataset, one entry per s-het burden
    results_per_dataset = defaultdict(list)

    # iterate over different s-het datasets
    for dataset_key in datasets_dict:
        print ("\tProcessing", dataset_key, flush=True)

        # Keys look like "<base>_<idx>"; split on the LAST underscore so that a
        # base name containing underscores is still parsed correctly.
        base_dataset_key, idx = dataset_key.rsplit('_', 1)

        if int(idx) >= FIRST_SYNONYMOUS_NAMED_IDX:
            column_renames = SYNONYMOUS_COLUMN_RENAMES
        else:
            column_renames = RECESSIVE_COLUMN_RENAMES

        # iterate over different s-het burdens
        for s_het in s_hets:

            # a burden may be given as a single column name or as a group of columns
            if not isinstance(s_het, (list, tuple)):
                s_het = [s_het]

            analysis_tag = f'{analysis} regressions on {str(s_het)}'.replace("'", '')

            print (f"\t\tRun {analysis_tag}")

            # Copy before handing over so that `run_regressions` never works on
            # the frame stored in `datasets_dict` (whether it mutates its input
            # is not visible from here).
            dataset = filter_dataset_func(datasets_dict[dataset_key]).copy()
            print (f"\t\t\tDataset size after filtration: {dataset.shape}")

            dataset = dataset.rename(columns=column_renames)

            regression_result = run_regressions(dataset=dataset,
                                                targets=targets,
                                                families=families,
                                                analysis_tag=analysis_tag,
                                                genders=genders,
                                                s_het_list=s_het,
                                                tab_offset='\t\t\t',
                                                n_tests_correction=n_tests_correction)

            results_per_dataset[dataset_key].append(regression_result)

            print()

    # Fail with a clear message instead of an opaque pandas concat error when
    # there was nothing to run (no datasets loaded or no burdens configured).
    if not results_per_dataset:
        raise ValueError(
            f"No regression results were produced for the {analysis} analysis: "
            "check that datasets are loaded and that 's_hets' is not empty"
        )

    # one table per dataset, then a single table stacking all datasets
    all_results = {key: pd.concat(results) for key, results in results_per_dataset.items()}
    flatten_results = pd.concat(list(all_results.values()), axis=0)

    # Everything is saved under the base name of the last processed dataset
    # (e.g. "Roulette" for "Roulette_<idx>" keys).
    save_table_for_paper(all_results={base_dataset_key: flatten_results},
                         path=f"{output_path}/table_{analysis}_analysis.xlsx")

    print()

Running synonymous_id_vs_rest analysis

	Processing Roulette_0


		Run synonymous_id_vs_rest regressions on [s_het_recessive_AR_without_ID]
			Dataset size after filtration: (376608, 170)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing ICD_infertility
					Processing fluid_intelligence_score
					Processing is_blond

		Run synonymous_id_vs_rest regressions on [s_het_recessive_ID_total]
			Dataset size after filtration: (376608, 170)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing ICD_infertility
					Processing fluid_intelligence_score
					Processing is_blond

	Processing Roulette_1
		Run synonymous_id_vs_rest regressions on [s_het_recessive_AR_without_ID]
			Dataset size after filtration: (376608, 170)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing ICD

In [6]:
!ls -lah $output_path

total 2.5M
drwxrwx--- 2 gelana bioinf  120 Aug 29 15:45 .
drwxrwx--- 5 gelana bioinf   76 Aug 19 16:38 ..
-rwxrwx--- 1 gelana bioinf 1.1M Aug 29 15:45 table_synonymous_id_vs_rest_analysis.xlsx
-rwxrwx--- 1 gelana bioinf 576K Aug 28 16:20 table_synonymous_on_phenotype_analysis.xlsx
