In [23]:
import glob

s_het_paths = {
    'Weghorn-drift': ".../450k/selection_weghorn/weghorn_drift_gencode-v34.txt",
    'Cassa': ".../450k/selection_cassa/cassa_supp_table_1_gencode-v34.txt",
    'PLI': ".../450k/selection_pli/gnomad.v2.1.1.PLI_gencode-v34.txt", 
    "Roulette": ".../450k/selection_roulette/s_het_roulette_gencode-v34.csv",
}

#_____________________________________________________________________________________________________________________________
# plps paths
variants_paths_cfg = {
    'recessive' : {
        'cohort_files': glob.glob(".../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/new_relatedness/chr*"), 
        'all_variants_file': ".../450k/plp_selection/basic/new_gene_names/new_freq/new_relatedness/all_chr_total_presumable_plps_HFE_final_sorted.txt"
    },
    'lof': {
        'cohort_files': glob.glob(f".../data_450k/sample_lofs/*.normed.csv"),
        'all_variants_file': ".../data_450k/annotations/all_singetones_annotated.csv"
    },
}

# samples of interest: European & non-related
european_non_rel_samples = (
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.txt"
)


In [3]:
from ukbb_recessive.data_collection.variants import VariantFeatures

variant_features = VariantFeatures()

In [4]:
# select rare PLPs
rare_plps = variant_features.collect_rare_plps(het_occurence_threshold=100000,
                                               hom_occurence_threshold=100000,
                                                all_plps_file=variants_paths_cfg['recessive']['all_variants_file'],
                                                s_het_file=s_het_paths['Roulette'],
                                                genes_list=None)


Entering `collect_rare_plps` function...
Initial total numbers of PLPs: 54758
Total numbers rare PLPs using <treshold=100000>: 54758
Function `collect_rare_plps` finished.



In [5]:
# read cohort PLPs
cohort_plps = variant_features.read_sample_plps(variants_paths_cfg['recessive']['cohort_files'], filter_homozygous=False)
print(f"Total numbers of PLP variants in cohort: {cohort_plps.shape[0]}")

All PLPs in the cohort: 764216
Total numbers of PLP variants in cohort: 764216


In [6]:
# filter rare PLPs in cohort
plps = cohort_plps.merge(rare_plps).drop_duplicates()
print(f"Total numbers of rare PLP variants in cohort: {plps.shape[0]}")

Total numbers of rare PLP variants in cohort: 764216


In [18]:
hom_samples = plps[plps['GT'] == '1/1']['s'].unique().tolist()

print ("Number of samples with hom variant:", len(hom_samples))

Number of samples with hom variant: 3001


In [19]:
plps_gene = plps[plps['GT'] != '1/1'].groupby(['s', 'gene'])[['pos']].agg('count').reset_index()

comp_het_samples = plps_gene[plps_gene['pos'] > 1]['s'].astype(str).unique().tolist()

print (f"Number of samples with more than 1 PLP variant: {len(comp_het_samples)}")

Number of samples with more than 1 PLP variant: 2287


In [22]:
total_samples = list(set(hom_samples + comp_het_samples))

print ("Number of samples to delete:", len(total_samples))

Number of samples to delete: 5288


In [24]:
with open(european_non_rel_samples, 'r') as f:
    european_non_rel_samples = [l.strip() for l in f.readlines()]

print (f"Number of european non-related samples: {len(european_non_rel_samples)}")

european_non_rel_samples = [s for s in european_non_rel_samples if s not in total_samples]
print (f"Number of european non-related samples without comp-het/hom cases: {len(european_non_rel_samples)}")

Number of european non-related samples: 378751
Number of european non-related samples without comp-het/hom cases: 376608


In [26]:
with open(".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt", 'w') as f:
    f.write('\n'.join(european_non_rel_samples) + '\n')

In [27]:
! wc -l .../450k/samples/european_non_related_no_withdrawal_to_include_450k.txt

378751 .../450k/samples/european_non_related_no_withdrawal_to_include_450k.txt


In [28]:
! wc -l .../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt

376608 .../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt
