In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

from collections import defaultdict
from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Create regression datasets

In [2]:
output_path = "../../../../data/tables"
datasets_path = '.../450k/datasets'

# One tab-separated table per s-het dataset; each is later filtered by its `eid` column.
datasets_dict = {
    'Cassa': pd.read_csv(f"{datasets_path}/Cassa.csv", sep='\t'),
    'pLI': pd.read_csv(f"{datasets_path}/PLI.csv", sep='\t'),
    'Weghorn': pd.read_csv(f"{datasets_path}/Weghorn-drift.csv", sep='\t'),
    'Roulette': pd.read_csv(f"{datasets_path}/Roulette.csv", sep='\t'),
}

# samples of interest: European & non-related without hom and comp_het
# The path gets its own name so that `european_non_rel_samples` only ever
# holds the list of sample IDs (never the file path).
european_non_rel_samples_path = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt"
)

# One sample ID per line; blank lines (e.g. a trailing empty line) are skipped
# so they are not counted as samples.
with open(european_non_rel_samples_path, 'r') as f:
    european_non_rel_samples = [
        sample_id for sample_id in (line.strip() for line in f) if sample_id
    ]

print (f"Number of european non-related samples without hom and comp_het: {len(european_non_rel_samples)}\n")

for dataset_name, dataset in datasets_dict.items():
    print (f"Number of samples in {dataset_name}: {dataset.shape[0]}")
    # `eid` is compared as text because the sample IDs were read from a text file.
    datasets_dict[dataset_name] = dataset[dataset['eid'].astype(str).isin(european_non_rel_samples)].copy()
    print (f"Number of samples in {dataset_name} after filtration: {datasets_dict[dataset_name].shape[0]}")

Number of european non-related samples without hom and comp_het: 376608

Number of samples in Cassa: 378751
Number of samples in Cassa after filtration: 376608
Number of samples in pLI: 378751
Number of samples in pLI after filtration: 376608
Number of samples in Weghorn: 378751
Number of samples in Weghorn after filtration: 376608
Number of samples in Roulette: 378751
Number of samples in Roulette after filtration: 376608


In [3]:
# Names of the panels analysed in this notebook.
panels = [
    'ID_total',
    'Metabolic_ID',
    'Blindness',
    'Cilia_Kidney',
    'Deafness',
    'Derm',
    'Endocrine',
    'Hematologic',
    'Immune_system',
    'Neuromuscular',
    'Skeletal_Craniofacial',
    'Metabolic',
    'Overlaps',
]

# s-het burden variable name for every panel: fixed prefix + panel name.
panel_s_het_list = [f's_het_recessive_{panel_name}' for panel_name in panels]

print("Total amount of panels: ", len(panels), flush=True)

Total amount of panels:  13


# Define analyses

In [4]:
# Both analyses use a multiple-testing correction of 14: 13 panels + 1 for non-ID.

# Panel analysis: one entry in `s_hets` per panel-specific s-het burden.
panel_regressions = dict(
    s_hets=panel_s_het_list,
    targets=[
        'childlessness',
        'years_of_edu',
        'diagnosis_total_ICD10_cnt_log',
        'fluid_intelligence_score',
        'is_blond',
    ],
    # one family per target, listed in the same order as `targets`
    families=[
        'binomial',
        'gaussian',
        'gaussian',
        'gaussian',
        'binomial',
    ],
    genders=['all'],
    n_tests_correction=14,
    # identity filter: the dataset is used as-is
    filter_dataset=lambda df: df,
)

# ID vs non-ID analysis: the same targets, regressed on the two contrasting burdens.
id_vs_other_regressions = dict(
    s_hets=[
        's_het_recessive_AR_without_ID',
        's_het_recessive_ID_total',
    ],
    targets=[
        'childlessness',
        'years_of_edu',
        'diagnosis_total_ICD10_cnt_log',
        'fluid_intelligence_score',
        'is_blond',
    ],
    # one family per target, listed in the same order as `targets`
    families=[
        'binomial',
        'gaussian',
        'gaussian',
        'gaussian',
        'binomial',
    ],
    genders=['all'],
    n_tests_correction=14,
    # identity filter: the dataset is used as-is
    filter_dataset=lambda df: df,
)

# Analysis name -> configuration; the name is also used in the output file name.
analyses = dict(
    panel=panel_regressions,
    id_vs_rest=id_vs_other_regressions,
)

# Generate regressions

In [5]:
# Run every configured analysis on every s-het dataset and write one
# Excel table per analysis.
for analysis, analysis_cfg in analyses.items():

    # regression tables collected per s-het dataset for the current analysis
    all_results = defaultdict(list)

    print (f"Running {analysis} analysis\n")

    # unpack the analysis configuration
    s_hets, targets, families, genders = (
        analysis_cfg[cfg_key] for cfg_key in ('s_hets', 'targets', 'families', 'genders')
    )
    filter_dataset_func = analysis_cfg['filter_dataset']
    n_tests_correction = analysis_cfg['n_tests_correction']

    # outer loop: s-het datasets
    for dataset_key, full_dataset in datasets_dict.items():
        print ("\tProcessing", dataset_key, flush=True)

        # inner loop: s-het burdens
        for s_het in s_hets:
            # a bare burden name is wrapped so `s_het_list` always receives a sequence
            if not isinstance(s_het, (list, tuple)):
                s_het = [s_het]

            analysis_tag = f'{analysis} regressions on {s_het!s}'.replace("'", '')

            print (f"\t\tRun {analysis_tag}")

            # each run gets its own filtered copy of the dataset
            dataset = filter_dataset_func(full_dataset).copy()
            print (f"\t\t\tDataset size after filtration: {dataset.shape}")

            regression_result = run_regressions(
                dataset=dataset,
                targets=targets,
                families=families,
                analysis_tag=analysis_tag,
                genders=genders,
                s_het_list=s_het,
                tab_offset='\t\t\t',
                n_tests_correction=n_tests_correction,
            )

            all_results[dataset_key].append(regression_result)

            print()

    # merge the per-burden tables into a single table per s-het dataset
    all_results = {
        dataset_key: pd.concat(result_tables)
        for dataset_key, result_tables in all_results.items()
    }

    save_table_for_paper(
        all_results,
        f"{output_path}/table_{analysis}_analysis.xlsx",
        keep_effects=[],
    )

    print()

Running panel analysis

	Processing Cassa
		Run panel regressions on [s_het_recessive_ID_total]
			Dataset size after filtration: (376608, 227)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing fluid_intelligence_score
					Processing is_blond

		Run panel regressions on [s_het_recessive_Metabolic_ID]
			Dataset size after filtration: (376608, 227)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing fluid_intelligence_score
					Processing is_blond

		Run panel regressions on [s_het_recessive_Blindness]
			Dataset size after filtration: (376608, 227)
				Processing all samples
					Processing childlessness
					Processing years_of_edu
					Processing diagnosis_total_ICD10_cnt_log
					Processing fluid_intelligence_score
					Processing is_blond

		Run panel regressions on [s_het_recessive_Cilia_

# Significance test

Fit one regression per target that includes both the AR-without-ID and the ID-only s-het burdens, then test whether the two regression coefficients differ.

In [6]:
targets = id_vs_other_regressions['targets']
families = id_vs_other_regressions['families']

# `zip` below would silently drop entries if the two lists got out of sync.
assert len(targets) == len(families), "each target needs exactly one family"

# Plain dict (target -> fitted GLM): looking up a target that was never fitted
# raises KeyError instead of silently returning an empty dict.
target_results = {}

for target, family in zip(targets, families):
    print ("Processing", target, flush=True)

    # both burdens enter the same model so their coefficients can be compared afterwards
    formula = get_formula(target=target, s_het_list=['s_het_recessive_AR_without_ID', 's_het_recessive_ID_total'])

    # run regressions (only the Roulette dataset is used for this comparison)
    model = smf.glm(formula = formula, data=datasets_dict['Roulette'].copy(), family=get_target_family(family))
    fitted_model = model.fit()

    target_results[target] = fitted_model

Processing childlessness
Processing years_of_edu
Processing diagnosis_total_ICD10_cnt_log
Processing fluid_intelligence_score
Processing is_blond


In [7]:
# Linear constraint under test: the two s-het coefficients are equal.
coef_equality = 's_het_recessive_AR_without_ID = s_het_recessive_ID_total'

# Print the t-test of that constraint for every fitted target model.
for target in targets:
    print(target)

    t_test_result = target_results[target].t_test(coef_equality, use_t=True)
    print(t_test_result)

    # two empty lines between targets
    print("\n")

childlessness
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.6923      0.379     -1.825      0.068      -1.436       0.051


years_of_edu
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             2.9561      0.768      3.850      0.000       1.451       4.461


diagnosis_total_ICD10_cnt_log
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.2762      0.152     -1.812      0.070      -0.575    

In [None]:
# Linear constraint under test: the two s-het coefficients are equal.
coef_equality = 's_het_recessive_AR_without_ID = s_het_recessive_ID_total'

# Print the Wald test of that constraint for every fitted target model.
for target in targets:
    print(target)

    wald_test_result = target_results[target].wald_test(coef_equality)
    print(wald_test_result)

    # two empty lines between targets
    print("\n")