This notebook should be re-run every time the PLP selection process changes.

Here we filter the samples' variants so that only PLPs are left. This makes it easier to collect the dataset for the regression analysis.

In [2]:
from ukbb_recessive.data_collection.variants import VariantFeatures
import glob
import os
import datetime
import multiprocessing as mp
import pandas as pd

# Original PLPs

In [3]:
# Original per-chromosome files with sample variants generated in RAP.
# `glob.glob` returns matches in arbitrary order, so sort them to make the
# processing order deterministic between runs.
rap_files = sorted(glob.glob(".../450k/RAP_output_per_chr/*.all_parts_final.csv"))

# File that contains all PLPs found in the cohort.
all_plps_file = (
    ".../450k/plp_selection/basic/new_gene_names/new_freq/new_relatedness/"
    "all_chr_total_presumable_plps_HFE_final_sorted.txt"
)

# Output folder for the filtered PLPs.
# Bugfix with gnomAD frequency -- now in %.
output_folder = '.../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/new_relatedness'

# Create the output folder if it does not exist yet. `exist_ok=True` replaces
# the separate `os.path.exists` check, which could race with another process
# creating the same folder between the check and the `makedirs` call.
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Keep only PLP variants in every per-chromosome sample file; the filtered
# files are written to `output_folder`.
VariantFeatures().filter_plps_in_samples(
    rap_files=rap_files,
    output_folder=output_folder,
    all_plps_file=all_plps_file,
)


Entering `filter_plps_in_samples` function...
Number of total PLPs: 54758


100%|██████████| 22/22 [39:46<00:00, 108.48s/it]

Function `filter_plps_in_samples` finished, result written in `.../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/new_relatedness`.






In [4]:
print ("Filtered PLPs files:\n")

! ls -lah $output_folder

Filtered PLPs files:

total 73M
drwxrwx--- 2 gelana bioinf  937 Mar  5 11:25 .
drwxrwx--- 3 gelana bioinf 1.3K Mar  5 10:46 ..
-rwxrwx--- 1 gelana bioinf 1.4M Mar  5 11:23 chr10.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 3.1M Mar  5 11:25 chr11.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 2.0M Mar  5 11:27 chr12.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 1.8M Mar  5 11:11 chr13.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 2.4M Mar  5 11:07 chr14.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 2.3M Mar  5 11:02 chr15.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 3.0M Mar  5 10:57 chr16.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 3.0M Mar  5 10:49 chr17.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 807K Mar  5 11:19 chr18.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 2.0M Mar  5 11:04 chr19.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 5.2M Mar  5 11:18 chr1.all_parts_final.csv
-rwxrwx--- 1 gelana bioinf 559K Mar  5 10:52 chr20.all_parts_final.csv
-rwxrwx--- 1 gelana bi

# Numbers check for the paper

In [13]:
# Passed straight through to `read_sample_plps`.
# NOTE(review): presumably drops homozygous calls while reading -- confirm in
# `VariantFeatures.read_sample_plps`.
filter_homozygous = True

# Filtered per-chromosome PLP files from the output folder.
cohort_plp_files = glob.glob(f"{output_folder}/chr*")

# Read the sample PLPs and drop the raw/allele helper columns.
cohort_plps = (
    VariantFeatures()
    .read_sample_plps(cohort_plp_files, filter_homozygous=filter_homozygous)
    .drop(columns=['raw pos', 'raw ref', 'raw alt', 'alleles', 'GT.alleles'])
)

All PLPs in the cohort: 764216


In [5]:
# Row count of the cohort PLP table, then the number of distinct
# (chrom, pos, ref, alt) combinations in it.
print("Number of PLPs in the cohort:", len(cohort_plps))
print(
    "Number of unique PLPs:",
    len(cohort_plps[['chrom', 'pos', 'ref', 'alt']].drop_duplicates()),
)

Number of PLPs in the cohort: 761211
Number of unique PLPs: 54755


In [6]:
# Rows whose genotype column `GT` equals '1/1'.
homozygous = cohort_plps.loc[cohort_plps['GT'].eq('1/1')]

# Row count of the homozygous subset, then its number of distinct
# (chrom, pos, ref, alt) combinations.
print("Number of hom PLPs in the cohort:", len(homozygous))
print(
    "Number of unique hom PLPs:",
    len(homozygous[['chrom', 'pos', 'ref', 'alt']].drop_duplicates()),
)

Number of hom PLPs in the cohort: 0
Number of unique hom PLPs: 0


In [7]:
# Load the tab-separated table with all PLPs found in the cohort.
all_plps = pd.read_csv(
    all_plps_file,
    sep="\t",
)

# Display the table (pandas truncates the rendering of long frames).
all_plps

Unnamed: 0,chr,position,ref,alt,gene,region,synonymous,Hgvsc,Hgvsp,variant_type,...,CADD_score,MOI-Pred_score,decipher,vkgl,hgmd-DM,clinvar,clinvar_stars,intervar,hets,homs
0,chr1,1014051,C,A,ISG15,EXON_REGION,False,c.47C>A,p.Ser16Ter,Substitution,...,29.400,,,,,,,Likely_pathogenic,1,0
1,chr1,1014143,C,T,ISG15,EXON_REGION,False,c.139C>T,p.Gln47Ter,Substitution,...,34.000,,,,Y,Pathogenic,0.0,Pathogenic,2,0
2,chr1,1014332,C,T,ISG15,EXON_REGION,False,c.328C>T,p.Gln110Ter,Substitution,...,35.000,,,,,Uncertain_significance,1.0,Pathogenic,66,0
3,chr1,1014355,CT,C,ISG15,EXON_REGION,False,c.353del,p.Phe118SerfsTer20,Deletion,...,27.100,,,,,,,Likely_pathogenic,1,0
4,chr1,1014359,G,T,ISG15,EXON_REGION,False,c.355G>T,p.Glu119Ter,Substitution,...,35.000,,,,Y,Likely_pathogenic,1.0,Pathogenic,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54753,chr22,50627390,C,CG,ARSA,EXON_REGION,False,c.240dup,p.Gly81ArgfsTer53,Insertion,...,30.000,,,,Y,Pathogenic/Likely_pathogenic,2.0,Pathogenic,42,0
54754,chr22,50627584,CG,C,ARSA,EXON_REGION,False,c.195del,p.Tyr65Ter,Deletion,...,33.000,,,,Y,Pathogenic/Likely_pathogenic,2.0,Pathogenic,1,0
54755,chr22,50627718,CG,C,ARSA,EXON_REGION,False,c.61del,p.Arg21ValfsTer7,Deletion,...,13.500,,,,,,,Likely_pathogenic,2,0
54756,chr22,50627739,G,GC,ARSA,EXON_REGION,False,c.40dup,p.Ala14GlyfsTer62,Insertion,...,12.040,,,,,,,Likely_pathogenic,1,0
