In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import glob
import tqdm

from ukbb_recessive.data_collection.dataset import RegressionDataset

In [13]:
# Directory the final per-s_het-source datasets are written to.
datasets_save_path = '.../450k/datasets/sampling/genes'

#_____________________________________________________________________________________________________________________________
# s_het tables, keyed by the name of the s_het source
s_het_paths = {
    "Roulette": ".../450k/selection_roulette/s_het_roulette_gencode-v34.csv",
}

#_____________________________________________________________________________________________________________________________
# phenotypic paths
age_children_path = ".../450k/phenotypes/age_children_data_participant.tsv" 
pca_path = ".../450k/phenotypes/PC_participant.tsv"
other_features_path = (
    ".../450k/phenotypes/phenotypes_of_interest_participant.tsv"
)

#_____________________________________________________________________________________________________________________________
# samples of interest: European & non-related
# NOTE: this name holds the *path* to the sample list here; a later cell rebinds it
# to the loaded list of sample IDs.
european_non_rel_samples = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.txt"
)

#_____________________________________________________________________________________________________________________________
# plps paths
variants_paths_cfg = {
    'recessive' : {
        # glob.glob() returns matches in a filesystem-dependent order; sort them so the
        # list of per-chromosome files is the same on every run and every machine.
        'cohort_files': sorted(
            glob.glob(".../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/new_relatedness/chr*")
        ),
        'all_variants_file': ".../450k/plp_selection/basic/new_gene_names/new_freq/new_relatedness/all_chr_total_presumable_plps_HFE_final_sorted.txt"
    },
}

#_____________________________________________________________________________________________________________________________
# gene info
gencode = ".../genCode/GRCh38/v34/gencode.v34.GRCh38.txt"

# gene panel
# sampled gene panel of ID genes and similarly distributed non-ID genes
gene_panel = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.sampled.txt")
# gene_panel.columns = ['Gene name', 'Gene panel']

# Display the distinct panel labels as the cell output.
gene_panel['Gene panel'].unique()

array(['ID_sampled', 'AR_without_ID_0_sampled', 'AR_without_ID_1_sampled',
       'AR_without_ID_2_sampled', 'AR_without_ID_3_sampled',
       'AR_without_ID_4_sampled', 'AR_without_ID_5_sampled',
       'AR_without_ID_6_sampled', 'AR_without_ID_7_sampled',
       'AR_without_ID_8_sampled', 'AR_without_ID_9_sampled',
       'AR_without_ID_10_sampled', 'AR_without_ID_11_sampled',
       'AR_without_ID_12_sampled', 'AR_without_ID_13_sampled',
       'AR_without_ID_14_sampled', 'AR_without_ID_15_sampled',
       'AR_without_ID_16_sampled', 'AR_without_ID_17_sampled',
       'AR_without_ID_18_sampled', 'AR_without_ID_19_sampled'],
      dtype=object)

# Collect phenotypic info

In [14]:
# The config cell binds `european_non_rel_samples` to the *path* of the sample list,
# and this cell rebinds the same name to the loaded list of IDs. Remember the path
# under its own name while the variable is still a string, so that re-running this
# cell does not try to open() a list.
if isinstance(european_non_rel_samples, str):
    european_non_rel_samples_path = european_non_rel_samples

with open(european_non_rel_samples_path, 'r') as f:
    # One sample ID per line; blank lines are skipped so they do not turn into
    # empty-string IDs.
    european_non_rel_samples = [line.strip() for line in f if line.strip()]

print (f"Number of european non-related samples: {len(european_non_rel_samples)}")

# Build the phenotype table from the three phenotype files.
# NOTE(review): `samples_list` presumably restricts the result to the listed samples;
# confirm against RegressionDataset.
features = RegressionDataset(
    age_children_path = age_children_path, 
    pca_path = pca_path, 
    other_features_path = other_features_path,
    samples_list = european_non_rel_samples, 
).collect_phenotypic_features()

Number of european non-related samples: 378751

Entering `collect_phenotype_features` function...
Current columns names list: ['eid', 'number_of_children_fathered', 'number_of_live_births', 'birth_cohort', 'age_at_recruitment', 'gender', 'people_number_in_household', 'people_related_in_household', 'qualifications', 'fluid_intelligence_score', 'email', 'age_first_sex', 'handedness', 'hair_color', 'gp_record', 'mental_health_problems', 'diagnosis_main_ICD10', 'diagnosis_secondary_ICD10', 'height', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4', 'PCA_5', 'PCA_6', 'PCA_7', 'PCA_8', 'PCA_9', 'PCA_10', 'PCA_11', 'PCA_12', 'PCA_13', 'PCA_14', 'PCA_15', 'PCA_16', 'PCA_17', 'PCA_18', 'PCA_19', 'PCA_20', 'PCA_21', 'PCA_22', 'PCA_23', 'PCA_24', 'PCA_25', 'PCA_26', 'PCA_27', 'PCA_28', 'PCA_29', 'PCA_30', 'PCA_31', 'PCA_32', 'PCA_33', 'PCA_34', 'PCA_35', 'PCA_36', 'PCA_37', 'PCA_38', 'PCA_39', 'PCA_40']

Number of samlples with features: 502394
Function `collect_phenotype_features` finished.

Phenotypic featur

# Collect s_het info

## Prepare gene panels

In [17]:
# Recessive gene panels: map every panel label to the list of its gene names.
recessive_panels_dict = (
    gene_panel
    .groupby('Gene panel')['Gene name']
    .agg(list)
    .to_dict()
)
# Extra 'all' entry with no gene list attached.
# NOTE(review): None is presumably read downstream as "no gene filter" - confirm.
recessive_panels_dict['all'] = None

# Panels per dataset type; only the recessive dataset is configured here.
s_het_cfg = {
    'recessive' : recessive_panels_dict
}

# Print an overview of the configured panels and their sizes.
for dataset_type, panels in s_het_cfg.items():
    print (f"Dataset = {dataset_type}")
    for panel, gene_list in panels.items():
        n_genes = 'NA' if gene_list is None else len(gene_list)
        print (f"\tPanel={panel}, number of genes={n_genes}")
    print()

Dataset = recessive
	Panel=AR_without_ID_0_sampled, number of genes=355
	Panel=AR_without_ID_10_sampled, number of genes=355
	Panel=AR_without_ID_11_sampled, number of genes=355
	Panel=AR_without_ID_12_sampled, number of genes=355
	Panel=AR_without_ID_13_sampled, number of genes=355
	Panel=AR_without_ID_14_sampled, number of genes=355
	Panel=AR_without_ID_15_sampled, number of genes=355
	Panel=AR_without_ID_16_sampled, number of genes=355
	Panel=AR_without_ID_17_sampled, number of genes=355
	Panel=AR_without_ID_18_sampled, number of genes=355
	Panel=AR_without_ID_19_sampled, number of genes=355
	Panel=AR_without_ID_1_sampled, number of genes=355
	Panel=AR_without_ID_2_sampled, number of genes=355
	Panel=AR_without_ID_3_sampled, number of genes=355
	Panel=AR_without_ID_4_sampled, number of genes=355
	Panel=AR_without_ID_5_sampled, number of genes=355
	Panel=AR_without_ID_6_sampled, number of genes=355
	Panel=AR_without_ID_7_sampled, number of genes=355
	Panel=AR_without_ID_8_sampled, nu

## Collect s_het data

In [18]:
# One wide table of variant features per s_het source, keyed by source name.
result = {}

for s_het_key, s_het_path in s_het_paths.items():
    print (f"Collecting data for {s_het_key}\n")

    for dataset_type, panels in s_het_cfg.items():
        print (f"Dataset = {dataset_type}")
        variant_paths = variants_paths_cfg[dataset_type]

        for panel, gene_list in panels.items():
            n_genes = 'NA' if gene_list is None else len(gene_list)
            print (f"\tPanel={panel}, number of genes={n_genes}")
            print ("_____________________")

            # Variant features for this (s_het source, dataset, panel) combination.
            s_het_features = RegressionDataset(
                het_occurrence_threshold=20.,
                all_plps_file=variant_paths['all_variants_file'],
                s_het_file=s_het_path,
                cohort_plps_files=variant_paths['cohort_files'],
                genes_list=gene_list,
                dataset=dataset_type
            ).collect_variant_features()

            # Suffix every feature column with dataset and panel so the panels can
            # live side by side in one table; 'eid' stays as the merge key.
            suffix = f"_{dataset_type}_{panel}"
            s_het_features = s_het_features.rename(
                columns={
                    col: f"{col}{suffix}"
                    for col in s_het_features.columns.tolist()
                    if col != 'eid'
                }
            )

            # The first panel starts the table; later panels are outer-joined on
            # 'eid' so samples present in any panel are kept.
            if s_het_key in result:
                result[s_het_key] = result[s_het_key].merge(s_het_features, on="eid", how='outer')
            else:
                result[s_het_key] = s_het_features
            print ("_____________________")

Collecting data for Roulette

Dataset = recessive
	Panel=AR_without_ID_0_sampled, number of genes=355
_____________________
Dataset type: recessive

Entering `calculate_s_het_per_sample` function...

Entering `collect_rare_plps` function...
Initial total numbers of PLPs: 54758
Total numbers rare PLPs using <treshold=20.0>: 50568
Total numbers rare PLPs in specified gene list: 8301
Function `collect_rare_plps` finished.

All PLPs in the cohort: 764216
Heterozygous PLPs in the cohort: 761211
Total numbers of PLP variants in cohort: 761211
Total numbers of rare PLP variants in cohort: 28579
Total numbers of rare PLP  variants in cohort, one per gene: 28553
Function `calculate_s_het_per_sample` finished.

_____________________
	Panel=AR_without_ID_10_sampled, number of genes=355
_____________________
Dataset type: recessive

Entering `calculate_s_het_per_sample` function...

Entering `collect_rare_plps` function...
Initial total numbers of PLPs: 54758
Total numbers rare PLPs using <treshol

In [19]:
# Sanity check: report (rows, columns) of the collected variant-feature table
# for every s_het source.
for s_het_key in s_het_paths:
    variants_shape = result[s_het_key].shape
    print (f"Shape for variants features of {s_het_key}:", variants_shape)

Shape for variants features of Roulette: (139265, 111)


# Merge and fill in NAs

In [20]:
 # fill NA for s_het with PLP but no information about s_het
# When True, s_het is set back to missing for samples that carry at least one
# mutation but ended up with an s_het of 0.
fill_na_shet = True

# NOTE: result[s_het_key] is overwritten with the merged table, so re-running this
# cell without re-running the collection cell merges `features` into an already
# merged frame.
for s_het_key in s_het_paths:
    print (f"Merging data for {s_het_key}")

    # Left join: keep every row of the phenotype table and attach variant features.
    merged = features.merge(result[s_het_key], on='eid', how='left')
    result[s_het_key] = merged

    for dataset_type, panels in s_het_cfg.items():
        print (f"\tDataset = {dataset_type}")

        for panel in panels:
            suffix = f"{dataset_type}_{panel}"
            s_het_col = f"s_het_{suffix}"
            cnt_col = f"mutations_cnt_{suffix}"
            columns = [s_het_col, cnt_col, f"has_mutation_{suffix}"]

            # Rows without a match in the variant table have NA here; treat as 0.
            merged[columns] = merged[columns].fillna(0.)

            # Carriers (mutation count > 0) whose s_het is 0 get a missing s_het
            # instead of 0.
            if fill_na_shet:
                carrier_without_s_het = (merged[cnt_col] > 0) & (merged[s_het_col] == 0)
                merged.loc[carrier_without_s_het, s_het_col] = None

Merging data for Roulette
	Dataset = recessive


In [21]:
# Sanity check: report (rows, columns) of the merged phenotype + variant table
# for every s_het source.
for s_het_key in s_het_paths:
    merged_shape = result[s_het_key].shape
    print (f"Shape for all features of {s_het_key}:", merged_shape)

# Reference value noted in the notebook (presumably from an earlier run with another s_het source):
# Shape for all features of Weghorn-drift: (376872, 231)

Shape for all features of Roulette: (378751, 190)


In [22]:
# Write one dataset file per s_het source into `datasets_save_path`.
# NOTE: the files are tab-separated even though they carry a .csv extension.
for s_het_key in s_het_paths:
    print (f"Saving {s_het_key}", flush=True)
    output_path = f'{datasets_save_path}/{s_het_key}.csv'
    result[s_het_key].to_csv(output_path, sep='\t', index=False)

Saving Roulette
