In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

from collections import defaultdict
from ukbb_recessive.regression.regressions import run_regressions, save_table_for_paper, get_formula, get_target_family

# Create regression datasets

In [None]:
datasets_path = '.../datasets'

# Tab-separated regression tables, keyed by the label used for each dataset downstream.
datasets_dict = {
    'Cassa': pd.read_csv(f"{datasets_path}/Cassa.csv", sep='\t'),
    'pLI': pd.read_csv(f"{datasets_path}/PLI.csv", sep='\t'),
    'Weghorn': pd.read_csv(f"{datasets_path}/Weghorn-drift.csv", sep='\t'),
}

# Replace '+' and '-' with '_' in every column whose name starts with 's_het'
# (presumably so the names can be used as terms in regression formulas -- TODO confirm).
# All renames for a dataset are collected into one mapping and applied in a single
# call. Renaming one character at a time, keyed on the original column name,
# silently skips the second replacement for a column containing both '+' and '-',
# because DataFrame.rename ignores keys that are no longer present.
for key, dataset in datasets_dict.items():
    renamed_columns = {
        col: col.replace('+', '_').replace('-', '_')
        for col in dataset.columns
        if col.startswith('s_het') and ('+' in col or '-' in col)
    }

    # Leave the frame untouched (no copy) when nothing needs renaming.
    if renamed_columns:
        datasets_dict[key] = dataset.rename(columns=renamed_columns)

In [None]:
# Panel names used for the panel-level regressions.
panels = [
    'ID_total',
    'Metabolic_ID',
    'Blindness',
    'Cilia_Kidney',
    'Deafness',
    'Derm',
    'Endocrine',
    'Hematologic',
    'Immune_system',
    'Neuromuscular',
    'Skeletal_Craniofacial',
    'Metabolic',
    'Overlaps',
]

# One regressor name per panel: the panel name prefixed with 's_het_recessive_'.
s_het_list = [f"s_het_recessive_{panel_name}" for panel_name in panels]

print("Total amount of panels: ", len(panels), flush=True)

# Generate regressions

In [None]:
# Each group of s-het regressors is written to its own output table; within a
# group, every regressor is fitted in a separate run_regressions call.
s_het_groups = [['s_het_recessive_AR_without_ID', 's_het_recessive_ID_total'], s_het_list]

# Regression targets and model families, passed to run_regressions as parallel
# lists (assumed to be paired by position -- TODO confirm against run_regressions).
targets=['childlessness', 'any_education_including_none', 'is_blond', 
         'fluid_intelligence_score','higher_education_including_none']

families=['binomial', 'binomial', 'binomial', 
          'gaussian', 'binomial']

genders=['all']

# Fail early if the two parallel lists drift apart when one of them is edited.
if len(targets) != len(families):
    raise ValueError(
        f"targets ({len(targets)}) and families ({len(families)}) must have the same length"
    )

for s_het_group in s_het_groups:
    if not isinstance(s_het_group, list):
        raise TypeError("S-het should be groups!")

    # Groups of up to two regressors are labelled with their (unquoted) list
    # representation; larger groups share the generic 's_het_panels' label.
    group_label = str(s_het_group) if len(s_het_group) <= 2 else 's_het_panels'
    analysis_tag = f"panel regressions on {group_label}".replace("'", '')

    # dataset key -> list of per-regressor result tables for this group
    results_by_dataset = defaultdict(list)

    for s_het in s_het_group:

        print ("Processing", s_het, flush=True)

        # The tag depends only on the regressor, so build it once per regressor.
        regression_tag = f"panel regressions on {s_het}"

        for dataset_key, dataset in datasets_dict.items():

            print ("\tProcessing", dataset_key, flush=True)

            regression_result = run_regressions(dataset=dataset,
                                                targets=targets,
                                                families=families,
                                                analysis_tag=regression_tag,
                                                # use the configured value instead of a duplicate literal
                                                genders=genders,
                                                s_het_list=[s_het])

            results_by_dataset[dataset_key].append(regression_result)

    # Stack the per-regressor results into one table per dataset.
    all_results = {k: pd.concat(v) for k, v in results_by_dataset.items()}

    save_table_for_paper(all_results, f"../../../../data/tables/table_{analysis_tag.replace(' ', '_')}.xlsx")