In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf

from itertools import product
from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Load regressions datasets

First, we load the regression datasets that were saved in the previous step by the notebook `0_create_dataset.ipynb`.

In [5]:
# Directory the result tables are written to, and directory holding the input datasets.
output_path = "../../../../data/tables"
datasets_path = '.../450k/datasets'

# One tab-separated table per s-het dataset, keyed by the label used in the results.
datasets_dict = {
    'Cassa': pd.read_csv(f"{datasets_path}/Cassa.csv", sep='\t'),
    'pLI': pd.read_csv(f"{datasets_path}/PLI.csv", sep='\t'),
    'Weghorn': pd.read_csv(f"{datasets_path}/Weghorn-drift.csv", sep='\t'),
    'Roulette': pd.read_csv(f"{datasets_path}/Roulette.csv", sep='\t'),
}

# samples of interest: European & non-related without hom and comp_het
# The path gets its own name so that `european_non_rel_samples` only ever refers
# to the list of sample IDs (it used to hold the path first, then the IDs).
european_non_rel_samples_path = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt"
)

# One sample ID per line; surrounding whitespace/newlines are stripped.
with open(european_non_rel_samples_path, 'r') as samples_file:
    european_non_rel_samples = [line.strip() for line in samples_file]

print (f"Number of european non-related samples without hom and comp_het: {len(european_non_rel_samples)}\n")

# Restrict every dataset to the samples of interest, reporting sizes before and after.
for dataset_name, dataset in datasets_dict.items():
    print (f"Number of samples in {dataset_name}: {dataset.shape[0]}")

    # `eid` is compared as a string because the IDs were read from a text file.
    is_sample_of_interest = dataset['eid'].astype(str).isin(european_non_rel_samples)
    datasets_dict[dataset_name] = dataset[is_sample_of_interest].copy()

    print (f"Number of samples in {dataset_name} after filtration: {datasets_dict[dataset_name].shape[0]}")

Number of european non-related samples without hom and comp_het: 376608

Number of samples in Cassa: 378751
Number of samples in Cassa after filtration: 376608
Number of samples in pLI: 378751
Number of samples in pLI after filtration: 376608
Number of samples in Weghorn: 378751
Number of samples in Weghorn after filtration: 376608
Number of samples in Roulette: 378751
Number of samples in Roulette after filtration: 376608


Here are all s_het-based genetic burdens, calculated based on different gene sets and variants (PLPs or singleton LOFs):

In [6]:
# Inspect the first dataset in `datasets_dict` and list every column whose
# name begins with the `s_het` prefix.
first_dataset = next(iter(datasets_dict.values()))

for col in first_dataset.columns:
    if col.startswith('s_het'):
        print (col)

s_het_lof_all
s_het_lof_without_AR
s_het_lof_AR
s_het_recessive_AR_without_ID
s_het_recessive_Blindness
s_het_recessive_Cardiovascular
s_het_recessive_Cilia_Kidney
s_het_recessive_Deafness
s_het_recessive_Derm
s_het_recessive_Endocrine
s_het_recessive_Hematologic
s_het_recessive_ID_total
s_het_recessive_Immune_system
s_het_recessive_Metabolic
s_het_recessive_Metabolic_ID
s_het_recessive_Neuromuscular
s_het_recessive_No_panel
s_het_recessive_Overlaps
s_het_recessive_Skeletal_Craniofacial
s_het_recessive_Tumor
s_het_recessive_all
s_het_recessive_AD
s_het_recessive_AR
s_het_recessive_ID_without_AD
s_het_recessive_sampled_AR_without_ID
s_het_recessive_sampled_ID


# Define analyses

In [7]:
def _keep_all_samples(dataset):
    """Return the dataset unchanged (no row filtering)."""
    return dataset


def _keep_with_any_education(dataset):
    """Keep rows whose `any_education_including_none` value is greater than zero."""
    return dataset[dataset['any_education_including_none'] > 0]


def _keep_without_any_education(dataset):
    """Keep rows whose `any_education_including_none` value is zero or less."""
    return dataset[dataset['any_education_including_none'] <= 0]


# we looked into males/females separately
sex_specific_regressions = dict(
    s_hets=['s_het_recessive_all', 's_het_lof_without_AR'],
    targets=['childlessness', 'years_of_edu', 'is_blond'],
    families=['binomial', 'gaussian', 'binomial'],
    genders=['males', 'females'],
    n_tests_correction=12,
    filter_dataset=_keep_all_samples,
)

# added with/without any education
with_any_education = dict(
    s_hets=['s_het_recessive_all', 's_het_lof_without_AR'],
    targets=['childlessness', 'is_blond'],
    families=['binomial', 'binomial'],
    genders=['males', 'females'],
    n_tests_correction=16,
    filter_dataset=_keep_with_any_education,
)

without_any_education = dict(
    s_hets=['s_het_recessive_all', 's_het_lof_without_AR'],
    targets=['childlessness', 'is_blond'],
    families=['binomial', 'binomial'],
    genders=['males', 'females'],
    n_tests_correction=16,
    filter_dataset=_keep_without_any_education,
)

# Analyses that the regression cell actually runs, keyed by the tag passed on
# as `analysis_tag`; the education-split configurations are currently disabled.
analyses = dict(
    basic_analysis=sex_specific_regressions,
    # 'with_any_education': with_any_education,
    # 'without_any_education': without_any_education
)

# Regressions

In [5]:
# Regression results are gathered per dataset label: one entry is appended
# for every (analysis, burden) combination that is run on that dataset.
all_results = defaultdict(list)

for analysis, analysis_cfg in analyses.items():

    print (f"Running {analysis} analysis\n")

    # unpack the settings of the current analysis
    s_hets, targets, families, genders = (
        analysis_cfg[key] for key in ('s_hets', 'targets', 'families', 'genders')
    )
    filter_dataset_func = analysis_cfg['filter_dataset']
    n_tests_correction = analysis_cfg['n_tests_correction']

    # go through every s-het dataset
    for dataset_key, unfiltered_dataset in datasets_dict.items():
        print ("\tProcessing", dataset_key, flush=True)

        # go through every s-het burden (or group of burdens)
        for s_het in s_hets:
            # a bare burden name is wrapped so `s_het_list` is always a list or tuple
            if not isinstance(s_het, (list, tuple)):
                s_het = [s_het]

            print (f"\t\tRun {analysis}")

            # apply the analysis-specific row filter on a fresh copy for each run
            dataset = filter_dataset_func(unfiltered_dataset).copy()
            print (f"\t\t\tDataset size after filtration: {dataset.shape}")

            all_results[dataset_key].append(
                run_regressions(
                    dataset=dataset,
                    targets=targets,
                    families=families,
                    analysis_tag=analysis,
                    genders=genders,
                    s_het_list=s_het,
                    tab_offset='\t\t\t',
                    n_tests_correction=n_tests_correction,
                )
            )

            print()

    print()

# Stack the collected results into a single object per dataset label.
all_results = {dataset_key: pd.concat(results) for dataset_key, results in all_results.items()}

save_table_for_paper(all_results, f"{output_path}/table_sex_specific_analysis.xlsx")

Running basic_analysis analysis

	Processing Cassa
		Run basic_analysis
			Dataset size after filtration: (376608, 227)
				Processing males samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond
				Processing females samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond

		Run basic_analysis
			Dataset size after filtration: (376608, 227)
				Processing males samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond
				Processing females samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond

	Processing pLI
		Run basic_analysis
			Dataset size after filtration: (376608, 227)
				Processing males samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond
				Processing females samples
					Processing childlessness
					Processing years_of_edu
					Processing is_blond

		Run basic_analysis
			Dataset size af

In [6]:
! ls -lah ../../../../data/tables

total 1.4M
drwxrwx--- 3 gelana bioinf  478 Aug 27 13:27 .
drwxrwx--- 5 gelana bioinf   70 Jul 17 13:02 ..
-rwxrwx--- 1 gelana bioinf 1.8K Aug  5 11:18 CR.csv
-rwxrwx--- 1 gelana bioinf  341 Aug  5 11:16 CR_dutch_estonian.csv
-rwxrwx--- 1 gelana bioinf 6.4K Aug  8 16:28 high_s_het_genes_roulette.xlsx
drwxrwx--- 5 gelana bioinf   76 Aug 19 16:38 sampling
-rwxrwx--- 1 gelana bioinf  57K Aug 28 12:05 table_covariate_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  25K Aug 28 12:08 table_covariate_deprivation_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  19K Aug  8 12:48 table_id_vs_rest_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  78K Aug 28 12:13 table_panel_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  22K Aug 28 11:57 table_phenotype_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  19K Jul 18 16:21 table_selection_analysis.xlsx
-rwxrwx--- 1 gelana bioinf  21K Aug 28 12:16 table_sex_specific_analysis.xlsx


# Significance test

In [12]:
# Number of samples per value of the `gender` column in the filtered Roulette dataset.
datasets_dict['Roulette']['gender'].value_counts()  

gender
0    202138
1    174470
Name: count, dtype: int64

In [8]:
targets = sex_specific_regressions['targets']
families = sex_specific_regressions['families']

# Plain dict on purpose: looking up a target that was never fitted must raise
# KeyError instead of silently handing back an empty dict.
target_results = {}

# Every target is fitted on the same rows, so the frame is copied once rather
# than once per target (each fitted model keeps a reference to its input data).
roulette_dataset = datasets_dict['Roulette'].copy()

for target, family in zip(targets, families):
    print ("Processing", target, flush=True)

    # `a*b` in the formula requests both main effects and their interaction,
    # i.e. the model tests whether the burden effect differs by gender.
    formula = get_formula(target=target, s_het_list=['s_het_recessive_all*gender'])

    # run regressions
    model = smf.glm(formula=formula, data=roulette_dataset, family=get_target_family(family))
    fitted_model = model.fit()

    # keep the fitted model so its summary can be inspected per target
    target_results[target] = fitted_model

Processing childlessness


Processing years_of_edu
Processing is_blond


In [19]:
# Display the summary of the fitted model stored for `is_blond`.
# NOTE(review): the output saved with this cell lists `s_het_lof_without_AR` terms,
# not `s_het_recessive_all`; it appears to have been rendered after a later cell
# rebound `target_results`. Re-run the notebook top to bottom to refresh it.
target_results['is_blond'].summary()

0,1,2,3
Dep. Variable:,is_blond,No. Observations:,375729.0
Model:,GLM,Df Residuals:,375683.0
Model Family:,Binomial,Df Model:,45.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-132640.0
Date:,"Thu, 29 Aug 2024",Deviance:,265290.0
Time:,16:28:24,Pearson chi2:,376000.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.00554
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.0348,0.259,-4.000,0.000,-1.542,-0.528
s_het_lof_without_AR,-0.2642,0.157,-1.679,0.093,-0.573,0.044
gender,-0.2791,0.011,-26.438,0.000,-0.300,-0.258
s_het_lof_without_AR:gender,0.2122,0.236,0.898,0.369,-0.251,0.675
age_at_recruitment,-0.0423,0.009,-4.534,0.000,-0.061,-0.024
I(age_at_recruitment ** 2),0.0004,8.4e-05,4.478,0.000,0.000,0.001
PCA_1,-0.0144,0.003,-4.265,0.000,-0.021,-0.008
PCA_2,0.0083,0.003,2.373,0.018,0.001,0.015
PCA_3,-0.0171,0.003,-5.038,0.000,-0.024,-0.010


In [15]:
targets = sex_specific_regressions['targets']
families = sex_specific_regressions['families']

# Plain dict on purpose: looking up a target that was never fitted must raise
# KeyError instead of silently handing back an empty dict.
target_results = {}

# Every target is fitted on the same rows, so the frame is copied once rather
# than once per target (each fitted model keeps a reference to its input data).
roulette_dataset = datasets_dict['Roulette'].copy()

for target, family in zip(targets, families):
    print ("Processing", target, flush=True)

    # `a*b` in the formula requests both main effects and their interaction,
    # i.e. the model tests whether the burden effect differs by gender.
    formula = get_formula(target=target, s_het_list=['s_het_lof_without_AR*gender'])

    # run regressions
    model = smf.glm(formula=formula, data=roulette_dataset, family=get_target_family(family))
    fitted_model = model.fit()

    # keep the fitted model so its summary can be inspected per target
    target_results[target] = fitted_model

Processing childlessness
Processing years_of_edu
Processing is_blond


In [20]:
# Display the summary of the fitted model stored for `childlessness`.
target_results['childlessness'].summary()

0,1,2,3
Dep. Variable:,childlessness,No. Observations:,374671.0
Model:,GLM,Df Residuals:,374625.0
Model Family:,Binomial,Df Model:,45.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-180170.0
Date:,"Thu, 29 Aug 2024",Deviance:,360340.0
Time:,16:28:35,Pearson chi2:,375000.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.02304
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4555,0.205,2.221,0.026,0.054,0.857
s_het_lof_without_AR,0.4242,0.124,3.425,0.001,0.181,0.667
gender,0.1840,0.008,21.741,0.000,0.167,0.201
s_het_lof_without_AR:gender,0.6474,0.170,3.808,0.000,0.314,0.981
age_at_recruitment,-0.0235,0.008,-3.134,0.002,-0.038,-0.009
I(age_at_recruitment ** 2),-0.0002,6.84e-05,-2.985,0.003,-0.000,-7.01e-05
PCA_1,-0.0027,0.003,-0.971,0.331,-0.008,0.003
PCA_2,-0.0007,0.003,-0.231,0.818,-0.006,0.005
PCA_3,0.0053,0.003,1.935,0.053,-6.88e-05,0.011
