In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf

from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Load regressions datasets

First, we load the datasets for the regression analysis that were saved in the previous step by the notebook `0_create_dataset.ipynb`.

In [None]:
# Folder that holds the tab-separated regression datasets.
datasets_path = '.../datasets'

# Label used throughout the notebook -> file name inside `datasets_path`.
dataset_files = {
    'Cassa': 'Cassa.csv',
    'pLI': 'PLI.csv',
    'Weghorn': 'Weghorn-drift.csv',
}

# Read every table once; later cells look the frames up by label.
datasets_dict = {}
for source_name, file_name in dataset_files.items():
    datasets_dict[source_name] = pd.read_csv(f"{datasets_path}/{file_name}", sep='\t')

Here are all s_het-based genetic burdens, calculated for different gene sets and variant classes (PLPs or singleton LoFs):

In [None]:
# Show every column of the Cassa table whose name begins with the `s_het` prefix.
for column_name in datasets_dict['Cassa'].columns:
    if column_name.startswith('s_het'):
        print (column_name)

# Generate regressions

We define all phenotypes (targets) that we would like to analyze.

In [None]:
# Each phenotype is listed next to the GLM family it is modelled with, so the
# two parallel lists derived below cannot drift out of step.
phenotype_families = [
    ('childlessness', 'binomial'),
    ('any_education_including_none', 'binomial'),
    ('is_blond', 'binomial'),
    ('fluid_intelligence_score', 'gaussian'),
    ('higher_education_including_none', 'binomial'),
]

targets = [phenotype for phenotype, family in phenotype_families]
families = [family for phenotype, family in phenotype_families]

# Gender strata passed to the regressions.
genders = ['all']

## All samples

We define all combinations of s_het burdens that we would like to test for association with the phenotypes, and run the regressions on all samples:

In [None]:
# Burden specifications: a plain string is a single burden, a nested list is a
# set of burdens that enter the same regression together.
s_hets = ['s_het_recessive_all', 's_het_lof_without_AR', 's_het_lof_AR', ['s_het_recessive_all', 's_het_lof_without_AR'], 's_het_recessive_without_high_s_het']

for burden_spec in s_hets:
    # `s_het_list` is always passed as a list, so wrap single burden names.
    burden_terms = burden_spec if isinstance(burden_spec, list) else [burden_spec]

    # The tag embeds the burden list without quotes; it also names the output file.
    tag_suffix = str(burden_terms).replace("'", '')
    analysis_tag = f'basic regressions on {tag_suffix}'

    # One regression result per s_het source, collected for a single output table.
    all_results = {}

    for source_name, source_dataset in datasets_dict.items():

        print ("Processing", source_name, flush=True)

        all_results[source_name] = run_regressions(dataset=source_dataset,
                                                   targets=targets,
                                                   families=families,
                                                   analysis_tag=analysis_tag,
                                                   genders=genders,
                                                   s_het_list=burden_terms)

    output_name = analysis_tag.replace(' ', '_')
    save_table_for_paper(all_results, f"../../../../data/tables/table_{output_name}.xlsx")

## Exclude LoF carriers

To ensure that our results are not confounded by LoF carriers in other genes, we exclude all such carriers and repeat the analysis for PLPs in all recessive genes.

In [None]:
# Only the PLP burden over all recessive genes is re-analysed here.
s_hets = ['s_het_recessive_all']

for burden_spec in s_hets:
    # `s_het_list` is always passed as a list, so wrap single burden names.
    burden_terms = burden_spec if isinstance(burden_spec, list) else [burden_spec]

    # The tag embeds the burden list without quotes; it also names the output file.
    tag_suffix = str(burden_terms).replace("'", '')
    analysis_tag = f'basic regressions on {tag_suffix} without LoF carriers'

    all_results = {}

    for source_name, full_dataset in datasets_dict.items():

        print ("Processing", source_name, flush=True)

        # Missing carrier flags are counted as 0, i.e. those rows are kept.
        lof_carrier_flag = full_dataset['has_mutation_lof_without_AR'].fillna(0)
        print ("Number of LoF carriers in non-recessive genes:", lof_carrier_flag.sum())

        # Drop every row flagged as a carrier before fitting.
        non_carriers = full_dataset[lof_carrier_flag != 1]

        all_results[source_name] = run_regressions(dataset=non_carriers,
                                                   targets=targets,
                                                   families=families,
                                                   analysis_tag=analysis_tag,
                                                   genders=genders,
                                                   s_het_list=burden_terms)

    output_name = analysis_tag.replace(' ', '_')
    save_table_for_paper(all_results, f"../../../../data/tables/table_{output_name}.xlsx")

# Correction for other covariates: ICD and infertility

To ensure that our results are not confounded by infertility or other diseases, we run the regressions with additional covariates: infertility and ICD diagnosis counts.

In [None]:
# Extra covariates that are added, one at a time, next to the PLP burden.
correction_phenotypes = ['diagnosis_main_ICD10_cnt', 'diagnosis_secondary_ICD10_cnt', 'diagnosis_total_ICD10_cnt', 'ICD_infertility']

# One joint model per covariate, followed by the burden-only model.
s_hets = [['s_het_recessive_all', covariate] for covariate in correction_phenotypes]
s_hets.append('s_het_recessive_all')
print (s_hets)

all_results = {}

# Outer loop: s_het sources (Cassa, pLI, Weghorn); inner loop: covariate sets.
for source_name, source_dataset in datasets_dict.items():

    per_model_results = []

    for burden_spec in s_hets:
        # `s_het_list` is always passed as a list, so wrap single burden names.
        burden_terms = burden_spec if isinstance(burden_spec, list) else [burden_spec]

        tag_suffix = str(burden_terms).replace("'", '')
        analysis_tag = f'basic regressions on {tag_suffix}'

        print (analysis_tag)

        # Only childlessness is analysed in this section.
        per_model_results.append(run_regressions(dataset=source_dataset,
                                                 targets=['childlessness'],
                                                 families=['binomial'],
                                                 analysis_tag=analysis_tag,
                                                 genders=['all'],
                                                 s_het_list=burden_terms))

    # Stack the per-model results into a single table for this source.
    all_results[source_name] = pd.concat(per_model_results)

save_table_for_paper(all_results, "../../../../data/tables/table_basic_regressions_with_covariates.xlsx")

# Preliminary regressions

Here we examine the effect of being a carrier of a PLP variant in a recessive gene, without taking the selection constraint into account.

In [None]:
# Carrier-status / variant-count predictors used instead of s_het burdens.
s_hets = ['has_mutation_recessive_all', 'mutations_cnt_recessive_all', 'has_mutation_lof_all']

# Cell-local phenotype settings. They are deliberately NOT stored under the
# notebook-wide names `targets` / `families` / `genders`: overwriting those
# would make any earlier section that is re-run afterwards silently analyse
# childlessness only.
preliminary_targets = ['childlessness']
preliminary_families = ['binomial']
preliminary_genders = ['all']


# dataset label -> list of per-predictor results (replaced by one table below)
all_results = defaultdict(list)

for dataset_key in datasets_dict:
    print ("Processing", dataset_key, flush=True)

    for s_het in s_hets:
        # `s_het_list` is always passed as a list, so wrap single predictor names.
        if not isinstance(s_het, list):
            s_het = [s_het]

        analysis_tag = f'preliminary regressions on {str(s_het)}'.replace("'", '')

        print (f"Run {analysis_tag}")

        regression_result = run_regressions(dataset=datasets_dict[dataset_key],
                                            targets=preliminary_targets,
                                            families=preliminary_families,
                                            analysis_tag=analysis_tag,
                                            genders=preliminary_genders,
                                            s_het_list=s_het)

        all_results[dataset_key].append(regression_result)

    # Stack the per-predictor results into a single table for this source.
    all_results[dataset_key] = pd.concat(all_results[dataset_key])

# `keep_effects` is given the predictor names analysed above.
save_table_for_paper(all_results, "../../../../data/tables/table_preliminary_regressions.xlsx", keep_effects=s_hets)

In [None]:
# Carrier-status / variant-count predictors for recessive genes.
s_hets = ['has_mutation_recessive_all', 'mutations_cnt_recessive_all']

# Cell-local phenotype settings. They are deliberately NOT stored under the
# notebook-wide names `targets` / `families` / `genders`: overwriting those
# would make any earlier section that is re-run afterwards silently analyse
# childlessness only.
preliminary_targets = ['childlessness']
preliminary_families = ['binomial']
preliminary_genders = ['all']


# dataset label -> list of per-predictor results (replaced by one table below)
all_results = defaultdict(list)

for dataset_key in datasets_dict:
    print ("Processing", dataset_key, flush=True)

    # Exclude carriers: rows flagged 1 are dropped; missing flags count as 0
    # and are therefore kept.
    dataset = datasets_dict[dataset_key]
    lof_carrier_flag = dataset['has_mutation_lof_without_AR'].fillna(0)
    dataset = dataset[lof_carrier_flag != 1]

    for s_het in s_hets:
        # `s_het_list` is always passed as a list, so wrap single predictor names.
        if not isinstance(s_het, list):
            s_het = [s_het]

        analysis_tag = f'preliminary regressions on {str(s_het)}'.replace("'", '')

        print (f"Run {analysis_tag}")

        regression_result = run_regressions(dataset=dataset,
                                            targets=preliminary_targets,
                                            families=preliminary_families,
                                            analysis_tag=analysis_tag,
                                            genders=preliminary_genders,
                                            s_het_list=s_het)

        all_results[dataset_key].append(regression_result)

    # Stack the per-predictor results into a single table for this source.
    all_results[dataset_key] = pd.concat(all_results[dataset_key])

# `keep_effects` is given the predictor names analysed above.
save_table_for_paper(all_results, "../../../../data/tables/table_preliminary_regressions_without_LoF_carriers.xlsx", keep_effects=s_hets)

## Sub-sample the dataset

We downsample the dataset and repeat the analysis `n_repeats` times for each downsampling fraction, to measure how large the dataset needs to be to capture the effect.

In [None]:
s_hets = ['s_het_recessive_all', 's_het_lof_without_AR']

sub_sample_fractions = np.arange(0.1, 1.1, 0.1).tolist()
n_repeats = 20

# Seeded generator shared by every draw below. Without it each run of this
# cell draws different sub-samples, so the saved tables cannot be reproduced.
sub_sample_seed = 42
sub_sample_rng = np.random.RandomState(sub_sample_seed)

# iterate over different burdens
for s_het in s_hets:
    # `s_het_list` is always passed as a list, so wrap single burden names.
    if not isinstance(s_het, list):
        s_het = [s_het]

    analysis_tag = f'reduced samples regressions on {str(s_het)}'.replace("'", '')

    all_results = {}

    # iterate over different s-het sources (cassa, weghorn, pli)
    for dataset_key in datasets_dict:

        fraction_results = []

        # iterate over different sample fractions
        for fraction in sub_sample_fractions:

            # np.arange accumulates floating-point error (e.g. 0.30000000000000004);
            # rounding gives clean values for the tag and for the `== 1` check.
            fraction = round(fraction, 2)

            print ("Processing", dataset_key, flush=True)
            print (fraction)

            # At fraction 1 every draw contains all rows, so a single run is made.
            repeats = 1 if fraction == 1 else n_repeats

            for repeat_idx in range(repeats):

                dataset = datasets_dict[dataset_key].sample(frac=fraction, random_state=sub_sample_rng)

                regression_result = run_regressions(dataset=dataset,
                                                    targets=['childlessness'],
                                                    families=['binomial'],
                                                    analysis_tag=analysis_tag+f"_frac={fraction}",
                                                    genders=['all'],
                                                    s_het_list=s_het)

                fraction_results.append(regression_result)

        # Stack all fractions and repeats into a single table for this source.
        all_results[dataset_key] = pd.concat(fraction_results)

    save_table_for_paper(all_results, f"../../../../data/tables/table_{analysis_tag.replace(' ', '_')}.xlsx")

# Significance test PLPs vs LoFs

In [None]:
# The two burdens whose effects are compared; both enter the same model.
s_hets = ['s_het_recessive_all', 's_het_lof_without_AR']

targets = ['childlessness']

# Exactly one GLM family per target. `zip` below stops at the shorter list, so
# a length mismatch would otherwise go unnoticed.
families = ['binomial']
assert len(targets) == len(families), "each target needs exactly one GLM family"

# target -> dataset label -> fitted GLM results
all_results = defaultdict(dict)

for target, family in zip(targets, families):
    print ("Processing", target, flush=True)

    # The formula depends only on the target and the burdens, not on the dataset.
    formula = get_formula(target=target, s_het_list=s_hets)

    for dataset_key in ['Weghorn']:
        print ("\tProcessing", dataset_key, flush=True)

        # The frame is only read while fitting, so no defensive copy is taken.
        model = smf.glm(formula=formula, data=datasets_dict[dataset_key], family=get_target_family(family))
        fitted_model = model.fit()

        all_results[target][dataset_key] = fitted_model

In [None]:
# Report the exponentiated coefficients (odds ratios) of the two burdens.
for target in targets:
    weghorn_fit = all_results[target]['Weghorn']
    odds_ratios = np.round(np.exp(weghorn_fit.params), 2)

    first_burden = s_hets[0]
    second_burden = s_hets[1]

    print (f"Target = {target}, OR {first_burden}={odds_ratios[first_burden]}, OR {second_burden}={odds_ratios[second_burden]}")
    print()

In [None]:
# Test the hypothesis that the two burden coefficients are equal.
for target in targets:
    print(target)

    equality_test = all_results[target]['Weghorn'].t_test(
        's_het_recessive_all = s_het_lof_without_AR', use_t=True)
    print(equality_test)

    # Two blank lines separate consecutive targets.
    print(end="\n\n")