In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import statsmodels.api as sm
import statsmodels.formula.api as smf

from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Load regressions datasets

First, we load the datasets for the regression analysis that were saved in the previous step by the notebook `0_create_dataset.ipynb`.

In [2]:
# Directory prefix for the result tables, and the directory holding the input datasets.
output_path = "../../../../../data/tables/sampling/dataset"
datasets_path = '.../450k/datasets/'

# Regression datasets keyed by name; the source files are tab-separated.
datasets_dict = {
    'Roulette': pd.read_csv(f"{datasets_path}/Roulette.csv", sep='\t')
}

# samples of interest: European & non-related without hom and comp_het
# The path gets its own name so that `european_non_rel_samples` always refers
# to the list of sample ids and never to a file path.
european_non_rel_samples_path = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt"
)

# The file is read as one sample id per line; surrounding whitespace is stripped.
with open(european_non_rel_samples_path, 'r') as f:
    european_non_rel_samples = [line.strip() for line in f]

print(f"Number of european non-related samples without hom and comp_het: {len(european_non_rel_samples)}\n")

for dataset_name, dataset in datasets_dict.items():
    # First count: all samples present in the loaded table.
    print(f"Number of samples in {dataset_name}: {dataset.shape[0]}")

    # Keep only the samples of interest. `eid` is cast to str because the ids
    # read from the text file are strings. `.copy()` makes the filtered frame
    # independent, so the column assignments below do not write into a view.
    filtered = dataset[dataset['eid'].astype(str).isin(european_non_rel_samples)].copy()

    # Second count: samples remaining after the filter.
    print(f"Number of samples in {dataset_name}: {filtered.shape[0]}")

    # Natural-log transforms of the ICD10 diagnosis counts.
    # NOTE(review): np.log yields -inf for a count of 0 -- confirm that these
    # counts are strictly positive or that downstream code handles -inf.
    filtered['diagnosis_secondary_ICD10_cnt_log'] = np.log(filtered['diagnosis_secondary_ICD10_cnt'])
    filtered['diagnosis_main_ICD10_cnt_log'] = np.log(filtered['diagnosis_main_ICD10_cnt'])

    # Replacing the value of an existing key is safe while iterating over items().
    datasets_dict[dataset_name] = filtered


Number of european non-related samples without hom and comp_het: 376608

Number of samples in Roulette: 378751
Number of samples in Roulette: 376608


# Sub-sample the dataset

We downsample the dataset and repeat the analysis `n_repeats` times for each downsampling fraction to measure how large the dataset must be to capture the effect.

In [3]:
# Covariates to analyse; a separate result table is written for each one.
s_hets = ['s_het_recessive_all', 's_het_recessive_ID_total', 's_het_lof_without_AR']

# Sub-sample fractions 0.1, 0.2, ..., 1.0 and the number of random draws per fraction.
sub_sample_fractions = np.arange(0.1, 1.1, 0.1).tolist()
n_repeats = 20

# Seeded generator shared by every draw below: each sub-sample is different,
# but the whole sequence is reproduced exactly when the cell is re-run.
random_seed = 42
sampling_rng = np.random.RandomState(random_seed)

# iterate over different covariates
for s_het in s_hets:
    # `s_het_list` is always a list; a bare name is wrapped into a one-element list.
    s_het_list = s_het if isinstance(s_het, list) else [s_het]

    # The tag embeds the list repr without quotes, e.g. "[s_het_recessive_all]".
    analysis_tag = f'dataset sampling regressions on {str(s_het_list)}'.replace("'", '')

    all_results = {}

    # Only the 'Roulette' dataset is analysed here.
    for dataset_key in ['Roulette']:

        dataset_results = []

        # iterate over different sample fractions
        for fraction in sub_sample_fractions:

            # Remove floating-point noise from np.arange (e.g. 0.30000000000000004).
            fraction = round(fraction, 2)

            print("Processing", dataset_key, flush=True)
            print(fraction)

            # make several repeats
            for _ in range(n_repeats):

                # Draw without replacement so that no sample is duplicated.
                dataset = datasets_dict[dataset_key].sample(
                    frac=fraction, replace=False, random_state=sampling_rng
                )

                regression_result = run_regressions(dataset=dataset,
                                                    targets=['childlessness'],
                                                    families=['binomial'],
                                                    analysis_tag=analysis_tag + f"_frac={fraction}",
                                                    genders=['all'],
                                                    s_het_list=s_het_list)

                dataset_results.append(regression_result)

                # At fraction 1 every draw contains the same rows (only their
                # order differs), so the remaining repeats are skipped.
                if fraction == 1:
                    break

        # Stack the per-run results into a single table for this dataset.
        all_results[dataset_key] = pd.concat(dataset_results)

    save_table_for_paper(all_results, f"{output_path}/table_{analysis_tag.replace(' ', '_')}.xlsx")

Processing Roulette
0.1
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Processing all samples
		Processing childlessness
	Proces

In [4]:
# Display the path prefix under which the result tables are saved.
output_path

'../../../../../data/tables/sampling/'