In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf

from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Load regression datasets

First, we load the datasets for the regression analysis that were saved in the previous step by the notebook `0_create_dataset.ipynb`.

In [2]:
# Directory the paper tables are written to, and directory the regression datasets are read from.
output_path = "../../../../data/tables"
datasets_path = '.../450k/datasets'

# File name of the tab-separated dataset for every s_het source.
dataset_files = {
    'Cassa': 'Cassa.csv',
    'pLI': 'PLI.csv',
    'Weghorn': 'Weghorn-drift.csv',
    'Roulette': 'Roulette.csv',
}

datasets_dict = {
    source_name: pd.read_csv(f"{datasets_path}/{file_name}", sep='\t')
    for source_name, file_name in dataset_files.items()
}

# samples of interest: European & non-related without hom and comp_het
samples_path = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt"
)

# One sample id per line; ids are kept as strings.
with open(samples_path, 'r') as samples_file:
    european_non_rel_samples = [line.strip() for line in samples_file]

print (f"Number of european non-related samples without hom and comp_het: {len(european_non_rel_samples)}\n")

for dataset_name in list(datasets_dict):
    full_dataset = datasets_dict[dataset_name]

    # size before restricting to the samples of interest
    print (f"Number of samples in {dataset_name}: {full_dataset.shape[0]}")

    # 'eid' is compared as a string because the ids read from the text file are strings
    is_selected = full_dataset['eid'].astype(str).isin(european_non_rel_samples)
    selected_dataset = full_dataset[is_selected].copy()

    # size after restricting to the samples of interest
    print (f"Number of samples in {dataset_name}: {selected_dataset.shape[0]}")

    # Natural-log versions of the diagnosis counts.
    # NOTE(review): np.log yields -inf for a count of 0 -- confirm the counts are >= 1 or missing.
    for count_column in ('diagnosis_secondary_ICD10_cnt', 'diagnosis_main_ICD10_cnt'):
        selected_dataset[f'{count_column}_log'] = np.log(selected_dataset[count_column])

    datasets_dict[dataset_name] = selected_dataset

    print()

Number of european non-related samples without hom and comp_het: 376608

Number of samples in Cassa: 378751
Number of samples in Cassa: 376608

Number of samples in pLI: 378751
Number of samples in pLI: 376608

Number of samples in Weghorn: 378751
Number of samples in Weghorn: 376608

Number of samples in Roulette: 378751
Number of samples in Roulette: 376608



Here are all the s_het-based genetic burdens, calculated for different gene sets and variant classes (PLPs or singleton LoFs):

In [3]:
# Print every column of the first loaded dataset whose name starts with 's_het'.
first_dataset = next(iter(datasets_dict.values()))

for col in first_dataset.columns:
    if col.startswith('s_het'):
        print (col)

s_het_lof_all
s_het_lof_without_AR
s_het_lof_AR
s_het_recessive_AR_without_ID
s_het_recessive_Blindness
s_het_recessive_Cardiovascular
s_het_recessive_Cilia_Kidney
s_het_recessive_Deafness
s_het_recessive_Derm
s_het_recessive_Endocrine
s_het_recessive_Hematologic
s_het_recessive_ID_total
s_het_recessive_Immune_system
s_het_recessive_Metabolic
s_het_recessive_Metabolic_ID
s_het_recessive_Neuromuscular
s_het_recessive_No_panel
s_het_recessive_Overlaps
s_het_recessive_Skeletal_Craniofacial
s_het_recessive_Tumor
s_het_recessive_all
s_het_recessive_AD
s_het_recessive_AR
s_het_recessive_ID_without_AD
s_het_recessive_sampled_AR_without_ID
s_het_recessive_sampled_ID


# Define analyses

In [5]:
def _keep_all_samples(dataset):
    """Identity filter: return the dataset unchanged."""
    return dataset


def _drop_lof_carriers(dataset):
    """Keep the rows whose 'has_mutation_lof_without_AR' is not 1 (missing values are treated as 0)."""
    return dataset[dataset['has_mutation_lof_without_AR'].fillna(0) != 1]


# Preliminary look at the effect of carrying a PLP in a recessive gene,
# without taking the selection constraint into account.
preliminary_regresions = dict(
    s_hets=['has_mutation_recessive_all', 'mutations_cnt_recessive_all', 'has_mutation_lof_without_AR'],
    targets=['childlessness'],
    families=['binomial'],
    genders=['all'],
    n_tests_correction=10,
    filter_dataset=_keep_all_samples,
)

# All combinations of s-het burdens that we want to test for association with
# the phenotypes; the regressions are run on all samples.
s_het_regressions = dict(
    s_hets=[
        's_het_recessive_all',
        's_het_lof_without_AR',
        ['s_het_recessive_all', 's_het_lof_without_AR'],
    ],
    targets=['childlessness', 'is_blond'],
    families=['binomial', 'binomial'],
    genders=['all'],
    n_tests_correction=10,
    filter_dataset=_keep_all_samples,
)

# To ensure that our results are not confounded by LoF carriers in other genes,
# we exclude all those carriers and repeat the analysis for PLPs in all recessive genes.
exclude_lof_carriers = dict(
    s_hets=['s_het_recessive_all'],
    targets=['childlessness', 'is_blond'],
    families=['binomial', 'binomial'],
    genders=['all'],
    n_tests_correction=10,
    filter_dataset=_drop_lof_carriers,
)

# Analysis name -> configuration. The names are used in the log output and in the analysis tags.
analyses = dict(
    preliminary=preliminary_regresions,
    selection=s_het_regressions,
    exlude_lof_carriers=exclude_lof_carriers,
)

# Run regressions

In [5]:
# Regression results collected per s-het dataset (one result table per run);
# they are concatenated into a single table per dataset after the loops.
results_by_dataset = defaultdict(list)

for analysis, analysis_cfg in analyses.items():

    print (f"Running {analysis} analysis\n")

    # read analysis config
    s_hets = analysis_cfg['s_hets']
    targets = analysis_cfg['targets']
    families = analysis_cfg['families']
    genders = analysis_cfg['genders']
    filter_dataset_func = analysis_cfg['filter_dataset']
    n_tests_correction = analysis_cfg['n_tests_correction']

    # iterate over different s-het datasets
    for dataset_key in datasets_dict:
        print ("\tProcessing", dataset_key, flush=True)

        # The filter depends only on the analysis and the dataset, so it is applied
        # once here instead of once per burden.
        filtered_dataset = filter_dataset_func(datasets_dict[dataset_key])

        # iterate over different s-het burdens
        for s_het in s_hets:
            # a burden is either a single column name or a list of columns fitted jointly
            if not isinstance(s_het, list):
                s_het = [s_het]

            analysis_tag = f'{analysis} regressions on {str(s_het)}'.replace("'", '')

            print (f"\t\tRun {analysis_tag}")

            # every run gets its own copy, so nothing done to the frame during a run
            # can leak into the next one
            dataset = filtered_dataset.copy()
            print (f"\t\t\tDataset size after filtration: {dataset.shape}")

            regression_result = run_regressions(dataset=dataset,
                                                targets=targets,
                                                families=families,
                                                analysis_tag=analysis_tag,
                                                genders=genders,
                                                s_het_list=s_het,
                                                tab_offset='\t\t\t',
                                                n_tests_correction=n_tests_correction)

            results_by_dataset[dataset_key].append(regression_result)

            print()
    print()

# one concatenated results table per s-het dataset
all_results = {
    dataset_key: pd.concat(dataset_results)
    for dataset_key, dataset_results in results_by_dataset.items()
}

save_table_for_paper(all_results, f"{output_path}/table_selection_analysis.xlsx", keep_effects=preliminary_regresions['s_hets'])

Running preliminary analysis

	Processing Cassa
		Run preliminary regressions on [has_mutation_recessive_all]
			Dataset size aftexr filtration: (376608, 210)
				Processing all samples
					Processing childlessness

		Run preliminary regressions on [mutations_cnt_recessive_all]
			Dataset size aftexr filtration: (376608, 210)
				Processing all samples
					Processing childlessness

		Run preliminary regressions on [has_mutation_lof_without_AR]
			Dataset size aftexr filtration: (376608, 210)
				Processing all samples
					Processing childlessness

	Processing pLI
		Run preliminary regressions on [has_mutation_recessive_all]
			Dataset size aftexr filtration: (376608, 210)
				Processing all samples
					Processing childlessness

		Run preliminary regressions on [mutations_cnt_recessive_all]
			Dataset size aftexr filtration: (376608, 210)
				Processing all samples
					Processing childlessness

		Run preliminary regressions on [has_mutation_lof_without_AR]
			Dataset size aftexr fi

In [6]:
# List the contents of the tables directory (the same path as `output_path`) with sizes and modification times.
! ls -lah ../../../../data/tables

total 1.1M
drwxrwx--- 4 gelana bioinf  324 Jul 15 11:26 .
drwxrwx--- 5 gelana bioinf   70 Jul 17 13:02 ..
-rwxrwx--- 1 gelana bioinf  452 May 22 14:30 CR_pie_data.csv
drwxrwx--- 2 gelana bioinf 1.1K Jun 18 14:05 education_split
drwxrwx--- 5 gelana bioinf   76 Jul 17 14:19 sampling
-rwxrwx--- 1 gelana bioinf  49K Jul 15 16:58 table_covariate_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  26K Jul 16 13:41 table_id_vs_rest_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  68K Jul 16 13:37 table_panel_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  21K Jul 15 16:52 table_phenotype_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  19K Jul 18 16:21 table_selection_analysis.xlsx


# Constraints test

In [6]:
# Reuse the targets and GLM families of the s-het regressions.
targets = s_het_regressions['targets']
families = s_het_regressions['families']

# Fitted GLM per target. A plain dict is used (instead of defaultdict(dict)) so that
# looking up a target that was never fitted raises KeyError rather than silently
# returning an empty dict.
target_results = {}

for target, family in zip(targets, families):
    print ("Processing", target, flush=True)

    # both burdens enter the same model so that their coefficients can be compared
    formula = get_formula(target=target, s_het_list=['s_het_recessive_all', 's_het_lof_without_AR'])

    # run regressions (only on the 'Roulette' dataset)
    model = smf.glm(formula=formula, data=datasets_dict['Roulette'].copy(), family=get_target_family(family))
    fitted_model = model.fit()

    target_results[target] = fitted_model

Processing childlessness
Processing is_blond


In [7]:
# Linear constraint under test: the two burden coefficients are equal.
equal_effects_constraint = 's_het_recessive_all = s_het_lof_without_AR'

for target in targets:
    fitted = target_results[target]

    print(target)
    print(fitted.t_test(equal_effects_constraint, use_t=True))

    # two blank lines between targets
    print()
    print()

childlessness
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0            -0.2218      0.187     -1.186      0.236      -0.589       0.145


is_blond
                             Test for Constraints                             
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c0             0.4384      0.240      1.827      0.068      -0.032       0.909




In [9]:
# Wald test of the same linear constraint as above: equal coefficients for the two burdens.
for target in targets:
    print(target)

    # `scalar=True` is an argument of wald_test; it was previously passed to print(),
    # which rejects unknown keyword arguments with a TypeError.
    print(target_results[target].wald_test(
        's_het_recessive_all = s_het_lof_without_AR', scalar=True))

    print()
    print()

childlessness
<Wald test (chi2): statistic=[[1.40546269]], p-value=0.23581107545856247, df_denom=1>


is_blond
<Wald test (chi2): statistic=[[3.33868229]], p-value=0.06766878052314841, df_denom=1>




