In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import glob

from ukbb_recessive.data_collection.dataset import RegressionDataset

In [None]:
# Output directory: the final cell writes one tab-separated file per key of `s_het_paths` here.
datasets_save_path = '.../datasets'

# -----------------------------------------------------------------------------
# Gene-level score tables, keyed by source name. Each path is handed to
# RegressionDataset as `s_het_file`; the 'Weghorn-drift' table is also read
# directly to derive the high s-het gene set.
s_het_paths = {
    'Weghorn-drift': ".../weghorn_drift_gencode-v34.txt",
    # NOTE(review): this prefix is "..,/" while every other path uses ".../" —
    # looks like a typo in the placeholder; confirm before running.
    'Cassa': "..,/gene_s_het_cassa_all_genes_gencode-v34.txt",
    'PLI': ".../gnomad.v2.1.1.PLI_gencode-v34.txt",
}

# -----------------------------------------------------------------------------
# Phenotype tables
age_children_path = ".../450k/phenotypes/age_children_data_participant.tsv"
pca_path = ".../450k/phenotypes/PC_participant.tsv"
other_features_path = ".../450k/phenotypes/phenotypes_of_interest_participant.tsv"

# -----------------------------------------------------------------------------
# Samples of interest (European & non-related).
# Holds the file path here; the phenotype cell later replaces it with the list of IDs.
european_non_rel_samples = ".../450k/samples/european_non_related_no_withdrawal_to_include.txt"

# -----------------------------------------------------------------------------
# Variant inputs per dataset type: the per-cohort files (resolved via glob when
# this cell runs) and the single file listing all variants.
recessive_variant_paths = dict(
    cohort_files=glob.glob(".../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/chr*"),
    all_variants_file=".../450k/plp_selection/basic/new_gene_names/new_freq/all_chr_total_presumable_plps_HFE_final_sorted.txt",
)
lof_variant_paths = dict(
    cohort_files=glob.glob(".../data_450k/sample_lofs/*.normed.csv"),
    all_variants_file=".../data_450k/annotations/all_singetones_annotated.csv",
)
variants_paths_cfg = {
    'recessive': recessive_variant_paths,
    'lof': lof_variant_paths,
}

# -----------------------------------------------------------------------------
# Table with all gene names (its `name2` column is used as the gene universe)
gencode = ".../gencode.v34.GRCh38.txt"

# Gene panel table (generated on 0_convert_data_into_gencode_v34.ipynb).
# The file has no header row, so the two columns are named here.
gene_panel = pd.read_csv(".../gene-panel-gencode-v34.txt", header=None)
gene_panel.columns = ['Gene name', 'Gene panel']

# Derived panel: every row whose panel is not 'ID-total', relabelled as 'AR_without_ID'.
is_not_id_total = gene_panel['Gene panel'] != 'ID-total'
gene_panel_AR_without_id = gene_panel.loc[is_not_id_total].assign(**{'Gene panel': 'AR_without_ID'})

# Append the derived rows below the original ones (the index is not reset).
gene_panel = pd.concat([gene_panel, gene_panel_AR_without_id])

gene_panel.tail(3)

# Collect phenotypic info

In [None]:
# `european_non_rel_samples` starts out as the path assigned in the configuration cell and
# is replaced below by the list of sample IDs read from that file. The type check keeps
# this cell re-runnable: on a second execution the name already holds the list, and
# handing a list to open() would raise a TypeError.
if isinstance(european_non_rel_samples, str):
    with open(european_non_rel_samples, 'r') as samples_file:
        # One sample ID per line; blank lines are skipped so they do not become empty IDs.
        european_non_rel_samples = [line.strip() for line in samples_file if line.strip()]

print (f"Number of european non-related samples: {len(european_non_rel_samples)}")

# Phenotypic features restricted to the samples of interest. The result is used as a
# pandas DataFrame by the following cells (column access, groupby, merge on 'eid').
features = RegressionDataset(
    age_children_path=age_children_path,
    pca_path=pca_path,
    other_features_path=other_features_path,
    samples_list=european_non_rel_samples,
).collect_phenotypic_features()

In [None]:
# Sanity checks on the children-related columns.
fathered_missing = features['number_of_children_fathered'].isnull()
births_missing = features['number_of_live_births'].isnull()

# Rows where both the fathered-children and the live-births columns are filled in.
print("Mistakes children:", (~fathered_missing & ~births_missing).sum())
# Rows where both of those columns are empty.
print("Missing children:", (fathered_missing & births_missing).sum())
# Rows where `number_of_children_MF` is empty.
# NOTE(review): same label as the previous line although a different column is counted.
print("Missing children:", features['number_of_children_MF'].isnull().sum())

childlessness = features['childlessness']
print("Childlessness:", int(childlessness.sum()))
print("Childlessness missing:", childlessness.isnull().sum())


# Histograms of the three children-count columns, side by side in one row.
children_columns = ['number_of_children_fathered', 'number_of_live_births', 'number_of_children_MF']
features[children_columns].hist(bins=8, figsize=(20, 3), layout=(1, 3))

plt.show()

In [None]:
print ("Total number of individuals:", features.shape[0])
print ("Number of individuals with university degree:", int(features['uni_1_0_including_none'].sum()))
print ("Number of individuals with higher educational degree:", int(features['higher_education_including_none'].sum()))

# One row per (eid, qualification): the '|'-separated string is split into a list and
# exploded. The `.str` accessor is used instead of calling `x.split` on every value so
# that a missing entry stays NaN rather than raising an AttributeError.
qualifications = (
    features[['eid', 'qualifications']]
    .assign(qualifications=lambda frame: frame['qualifications'].str.split('|'))
    .explode('qualifications')
)

# Count each (eid, qualification) pair once and plot the number of eids per qualification.
qualifications.drop_duplicates().groupby('qualifications').count().plot.barh()

plt.show()

In [None]:
def _count_flagged(column):
    """Return the sum of the given `features` column as an int."""
    return int(features[column].sum())


print ("Number of individuals with partner:", _count_flagged('living_with_a_partner'))
print()
print ("Number of individuals with mental health ICD diagnosis:", _count_flagged('ICD_mental_health_yes_no'))
print ("Number of individuals with mental health questionnarie:", _count_flagged('mental_health_Q'))
print()
print ("Number of individuals with email:", _count_flagged('email'))
print()
print ("Number of ever had sex:", _count_flagged('ever_had_sex'))
# Complement of the flag; missing values are skipped by the sum.
print ("Number of never had sex:", int((1 - features['ever_had_sex']).sum()))
print()
# Label fixed: it previously ended in "sex", copied over from the lines above.
print ("Number has gp record:", _count_flagged('has_gp_record'))
print()
print ("Number left-handed:", _count_flagged('is_left_handed'))
print()
print ("Number blond:", _count_flagged('is_blond'))
print()
print ("Number infertility:", _count_flagged('ICD_infertility'))

In [None]:
# Number of non-null `eid` entries per gender.
features.groupby('gender')[['eid']].count()

In [None]:
# Per-gender means of selected features.
# The columns are selected BEFORE aggregating: `features` also holds the string column
# `qualifications`, and from pandas 2.0 on `GroupBy.mean()` raises a TypeError on
# non-numeric columns instead of silently dropping them. Selecting first also avoids
# averaging columns that are thrown away afterwards.
mean_columns = [
    'age_at_recruitment', 'is_left_handed', 'is_blond', 'ICD_infertility',
    'email', 'living_with_a_partner', 'uni_1_0_including_none', 'childlessness',
]
features.groupby('gender')[mean_columns].mean()

# Collect s-het info

## Prepare gene panels

In [None]:
# Build the gene panels used for the recessive and the LoF datasets.
def _distinct(values):
    """Return the distinct entries of a pandas Series as a set."""
    return set(values.drop_duplicates().values.tolist())


# Recessive panels: panel label -> list of gene names; 'all' maps to None.
recessive_panels_dict = gene_panel.groupby('Gene panel')['Gene name'].agg(list).to_dict()
recessive_panels_dict['all'] = None

# LoF panels: all genes of the gencode table (`name2` column), split by whether the
# gene appears anywhere in the gene panel table.
all_genes = _distinct(pd.read_csv(gencode, sep='\t', low_memory=False)['name2'])
AR_genes = _distinct(gene_panel['Gene name'])
non_AR_genes = all_genes - AR_genes
lof_panels_dict = {'all': None, 'without_AR': non_AR_genes, 'AR': AR_genes}

print("All genes number:", len(all_genes))
print("Non-AR genes number:", len(non_AR_genes))

# High s-het genes: rows of the Weghorn-drift table with s_het at or above the threshold.
high_s_het_threshold = 0.15
weghorn_s_het = pd.read_csv(s_het_paths['Weghorn-drift'], sep='\t')
high_s_het_genes = _distinct(
    weghorn_s_het.loc[weghorn_s_het['s_het'] >= high_s_het_threshold, 'gene_symbol']
)

print()

print("All high s-het genes number:", len(high_s_het_genes & all_genes))
print("AR high s-het genes number:", len(high_s_het_genes & AR_genes))
print("Non-AR high s-het genes number:", len(high_s_het_genes & non_AR_genes))
print()

# Extra panels with the high s-het genes removed (these are sets, unlike the
# list-valued panels that come from the gene panel table).
recessive_panels_dict['without_high_s_het'] = AR_genes - high_s_het_genes
lof_panels_dict['without_AR_and_high_s_het'] = non_AR_genes - high_s_het_genes

print("Added new gene panels:")
print(f"recessive_without_high_s_het: {len(recessive_panels_dict['without_high_s_het'])} genes")
print(f"lof_without_AR_and_high_s_het: {len(lof_panels_dict['without_AR_and_high_s_het'])} genes")

In [None]:
# Panels to evaluate per dataset type: dataset type -> {panel name -> genes or None}.
s_het_cfg = {'lof': lof_panels_dict, 'recessive': recessive_panels_dict}

# Overview of the configured panels; a panel without an explicit gene list is shown as 'NA'.
for dataset_type, panels in s_het_cfg.items():
    print(f"Dataset = {dataset_type}")
    for panel, gene_list in panels.items():
        n_genes = 'NA' if gene_list is None else len(gene_list)
        print(f"\tPanel={panel}, number of genes={n_genes}")
    print()

## Collect s-het data

In [None]:
# For every s-het source, collect the variant features of each (dataset type, panel)
# combination and join them into one wide table per source, keyed by `eid`.
result = {}

for s_het_key, s_het_path in s_het_paths.items():
    print(f"Collecting data for {s_het_key}\n")

    for dataset_type, panels in s_het_cfg.items():
        print(f"Dataset = {dataset_type}")
        variant_paths = variants_paths_cfg[dataset_type]

        for panel, gene_list in panels.items():
            n_genes = 'NA' if gene_list is None else len(gene_list)
            print(f"\tPanel={panel}, number of genes={n_genes}")
            print("_____________________")

            s_het_features = RegressionDataset(
                het_occurrence_threshold=20.,
                all_plps_file=variant_paths['all_variants_file'],
                s_het_file=s_het_path,
                cohort_plps_files=variant_paths['cohort_files'],
                genes_list=gene_list,
                dataset=dataset_type,
            ).collect_variant_features()

            # Tag every feature column with its dataset type and panel; `eid` stays
            # untouched because it is the join key.
            suffix = f"_{dataset_type}_{panel}"
            s_het_features = s_het_features.rename(
                columns={col: f"{col}{suffix}" for col in s_het_features.columns if col != 'eid'}
            )

            # The first table of a source starts the result; later ones are outer-joined
            # so that an eid present in any panel is kept.
            if s_het_key in result:
                result[s_het_key] = result[s_het_key].merge(s_het_features, on="eid", how='outer')
            else:
                result[s_het_key] = s_het_features
            print("_____________________")


In [None]:
# Report (rows, columns) of the collected variant-feature table of every s-het source.
for s_het_key in s_het_paths:
    variant_features_shape = result[s_het_key].shape
    print(f"Shape for variants features of {s_het_key}:", variant_features_shape)

# Merge and fill in NAs

In [None]:
 # fill NA for s_het with PLP but no information about s_het
fill_na_shet = True

# Base names of the per-panel variant feature columns.
variant_feature_names = ['s_het', 'mutations_cnt', 'has_mutation']

# NOTE: this cell overwrites `result[s_het_key]` with the merged table, so it is meant to
# run once after the collection cell; re-running it merges `features` a second time.
for s_het_key in s_het_paths:
    print(f"Merging data for {s_het_key}")

    # Left join: keep every row of `features`, attach the variant features where present.
    merged = result[s_het_key] = features.merge(result[s_het_key], on='eid', how='left')

    for dataset_type, panels in s_het_cfg.items():
        print(f"\tDataset = {dataset_type}")

        for panel in panels:
            suffix = f"{dataset_type}_{panel}"
            columns = [f"{name}_{suffix}" for name in variant_feature_names]

            # Rows without a match in the variant table get 0 in all three columns.
            merged[columns] = merged[columns].fillna(0.)

            if fill_na_shet:
                # A positive mutation count together with s_het == 0 is treated as
                # "s_het not available": the value is set back to missing.
                s_het_column = f"s_het_{suffix}"
                carrier_without_s_het = (merged[f"mutations_cnt_{suffix}"] > 0) & (merged[s_het_column] == 0)
                merged.loc[carrier_without_s_het, s_het_column] = None

In [None]:
# Report (rows, columns) of the merged table of every s-het source.
for s_het_key in s_het_paths:
    merged_shape = result[s_het_key].shape
    print(f"Shape for all features of {s_het_key}:", merged_shape)

In [None]:
# Write one tab-separated file per s-het source, named after the source key, without the index.
for s_het_key in s_het_paths:
    print(f"Saving {s_het_key}", flush=True)
    output_path = f'{datasets_save_path}/{s_het_key}.csv'
    result[s_het_key].to_csv(output_path, sep='\t', index=False)