This notebook contains steps taken for the cohort data processing and variant selection. 

In [None]:
import ast
import csv
import glob
import os

import hail as hl
import numpy as np
import pandas as pd

Steps for processing the raw data from the RAP platform (all paths are relative to a work folder, `WORKFOLDER`):

1. Data from RAP were copied into ---> `data_450k/vcfs/chrXX_lof.vcf.bgz`

2. These VCFs were re-saved as matrix tables ---> `data_450k/matrix_tables/chrXX_lof.mt`

3. Normalize genotypes into 0/1 format and represent variants in minimal form. Output 'chrom', 'pos', 'ref', 'alt', 'updated GT', 'locus', 'alleles', 'GT', 's' fields ---> `data_450k/annotations/chrXX_lof.normed.csv`

4. Select unique variants, leave 'chrom', 'pos', 'ref', 'alt', 'cohort_cnt' fields  ---> `data_450k/annotations/chrXX_lof.normed.unique.csv`. This is done to speed up VEP annotation step.

5. VEP annotation:

    a. Transform data to VEP input format (VCF) ---> `data_450k/annotations/chrXX_lof.normed.unique.vcf`

    b. Annotate VCF with VEP ---> `data_450k/annotations/chrXX_lof.vep.annotated.csv`

6. Select LoF singletons ---> `data_450k/annotations/all_singetones_annotated.csv`

7. Create files with individual's LoF singletons ---> `data_450k/sample_lofs/*`


# 1. Save VCFs as MatrixTables

We run this code for each VCF file, saving them as MatrixTables to speed up calculations. 

In [None]:
import hail as hl
import glob

WORKFOLDER = '...'  # placeholder: must be set to the work folder before running

# every bgzipped VCF found in the vcfs folder is converted
vcf_pattern = f"{WORKFOLDER}/data_450k/vcfs/*.vcf.bgz"

for vcf_path in glob.glob(vcf_pattern):

    # the matrix table path mirrors the VCF path:
    # 'vcfs' -> 'matrix_tables' and '.vcf.bgz' -> '.mt'
    mt_path = vcf_path.replace('vcfs', 'matrix_tables').replace('.vcf.bgz', '.mt')

    # read the VCF file
    matrix_table = hl.methods.import_vcf(
        vcf_path,
        force_bgz=True,
        reference_genome='GRCh38',
        array_elements_required=False,
        block_size=64,
    )
    print(f"Processing {vcf_path}...", flush=True)

    # save the VCF content as a matrix table
    matrix_table.write(mt_path)

# 2. Normalize variant calls 

Since the variants came from a multi-sample VCF file, they were not in their "minimal" form. Therefore, it was necessary to convert them to their minimal representation. 

In [None]:
WORKFOLDER = '...' # placeholder: set this to the work folder that contains the data_450k/ directory

In [None]:
# minimal representation of the variant

def get_minimal_representation(pos, ref, alt): 
    """
    Reduce a variant to its minimal representation.

    Shared trailing bases are removed first, then shared leading bases
    (moving the position forward by one for each removed leading base).
    At least one base is always kept in each allele.

    Algorithm taken from:
        http://www.cureffi.org/2014/04/24/converting-genetic-variants-to-their-minimal-representation/

    Parameters:
        pos: position of the first base of `ref`
        ref: reference allele string
        alt: alternative allele string

    Returns:
        tuple (pos, ref, alt) describing the same variant in minimal form
    """
    # a single-base substitution is already minimal
    if len(ref) == 1 and len(alt) == 1:
        return pos, ref, alt

    # trim the common suffix
    while ref[-1] == alt[-1] and min(len(ref), len(alt)) > 1:
        ref, alt = ref[:-1], alt[:-1]

    # trim the common prefix; each trimmed base shifts the start position
    while ref[0] == alt[0] and min(len(ref), len(alt)) > 1:
        ref, alt = ref[1:], alt[1:]
        pos += 1

    return pos, ref, alt
    
def get_new_genotype(old_genotype, allele):
    """
        Re-encodes a two-allele genotype relative to a single allele.

        Every position of `old_genotype` that equals `allele` becomes 1,
        every other position becomes 0, e.g. [1, 2] with allele 2 -> "0/1".

        Parameters:
            old_genotype: the two allele indices of the call, e.g. [0, 2]
            allele: the allele index to report as 1

        Returns:
            string "0/1", "1/0", "1/1" (or "0/0" if the allele is absent)
    """
    carries_allele = np.array(old_genotype) == allele

    biallelic = np.zeros(2, dtype=int)
    biallelic[carries_allele] = 1

    return "/".join(str(call) for call in biallelic.tolist())

# alleles parsing (includes application of the minimal representation)
def parse_alleles_from_hail(row):
    """
        Splits one exported genotype row into biallelic, minimally represented rows.

        A heterozygous non-reference call (e.g. 1/2) yields one row per
        alternative allele; reference alleles are skipped.

        Parameters:
            row: dict-like record with string fields
                 'locus'   - "contig:position"
                 'alleles' - text of a list of alleles, e.g. ['A', 'C', 'AT']
                 'GT'      - allele indices separated by '/', e.g. "0/1"

        Returns:
            list of copies of `row`, each extended with 'chrom', 'pos', 'ref',
            'alt' (minimal representation) and 'updated GT' (0/1 encoding
            relative to that row's alternative allele)
    """

    # split on the last ':' only, so contig names that contain ':' stay intact
    contig, position = row['locus'].rsplit(':', 1)
    position = int(position)

    # literal_eval parses the list text without executing arbitrary code
    alleles = ast.literal_eval(row['alleles'])

    genotype = [int(x) for x in row['GT'].split('/')]  # [0, 1] for 0/1 etc.

    result_rows = []

    # sorted() makes the output order deterministic (lowest allele index first)
    for gt in sorted(set(genotype)):

        # skip reference
        if gt == 0:
            continue

        result_row = row.copy()

        # genotype relative to this alternative allele: 0/1 for [0, 2] and [1, 2]
        result_row['updated GT'] = get_new_genotype(genotype, gt)

        # reference allele is always first; `gt` indexes the alternative allele
        result_row['pos'], result_row['ref'], result_row['alt'] = get_minimal_representation(
            position,
            alleles[0],
            alleles[gt])

        result_row['chrom'] = contig

        result_rows.append(result_row)

    return result_rows

# processing of a file
def process_and_save_vcf(input_path, output_path=None):
    """
        Applies the genotype and variant format transformation to every row
        of the input file and saves the result as a plain (uncompressed) csv.

        Parameters:
            input_path: tab-separated file with a header row; every row is
                        passed to `parse_alleles_from_hail`
            output_path: comma-separated output file; when None, it is
                         `input_path` with '.csv' replaced by '.norm.csv'

        The output columns are 'chrom', 'pos', 'ref', 'alt', 'updated GT',
        'locus', 'alleles', 'GT', 's'.
    """

    if output_path is None:
        output_path = input_path.replace('.csv', '.norm.csv')

    # output csv file field names
    fieldnames = ['chrom', 'pos', 'ref', 'alt', 'updated GT', 'locus', 'alleles', 'GT', 's']

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise '\r\n' becomes '\r\r\n' on Windows
    with open(input_path, 'r', newline='') as in_csvfile, \
            open(output_path, 'w', newline='') as out_csvfile:

        data = csv.DictReader(in_csvfile, delimiter='\t')

        processed_data = csv.DictWriter(out_csvfile, fieldnames=fieldnames)
        processed_data.writeheader()

        # one input row can produce several output rows (e.g. a 1/2 genotype)
        for row in data:
            processed_data.writerows(parse_alleles_from_hail(row))

First, we extract all genotypes in the cohort that are not homozygous reference and save them as `.raw.csv` files (these files are tab-separated despite the extension):

In [None]:
# save data as .csv

mt_paths = glob.glob(f"{WORKFOLDER}/data_450k/matrix_tables/*.mt")

for mt_path in mt_paths:
    # the export goes to the annotations folder, '.mt' -> '.raw.csv'
    raw_csv_path = mt_path.replace('matrix_tables', 'annotations').replace('.mt', '.raw.csv')

    print('Processing', mt_path)

    # read matrix table
    mt = hl.read_matrix_table(mt_path)

    # keep only entries that are not homozygous reference,
    # then write every remaining genotype as a separate row
    non_ref_entries = mt.filter_entries(~mt.GT.is_hom_ref()).entries()
    non_ref_entries.GT.export(raw_csv_path)

Normalize variants representation

In [None]:
raw_csv_paths = glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.raw.csv")

for raw_csv_path in raw_csv_paths:
    print('Processing', raw_csv_path, flush=True)

    # '.raw.csv' -> '.normed.csv' in the same folder
    normed_csv_path = raw_csv_path.replace('.raw.csv', '.normed.csv')
    process_and_save_vcf(input_path=raw_csv_path, output_path=normed_csv_path)

    print('Writing to', normed_csv_path, flush=True)
    print()

# 3. Select unique variants with cohort counts

In [None]:
variant_columns = ['chrom', 'pos', 'ref', 'alt']

for normed_path in glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.normed.csv"):

    print('Processing', normed_path, flush=True)

    unique_path = normed_path.replace('.normed.csv', '.normed.unique.csv')

    # one row per distinct variant; cohort_cnt = number of rows
    # (non-missing sample ids) that carry it
    genotype_rows = pd.read_csv(normed_path)
    unique_variants = (
        genotype_rows
        .groupby(variant_columns)
        .agg(cohort_cnt=('s', 'count'))
        .reset_index()
    )

    unique_variants.to_csv(unique_path, index=False)

    print('Writing to', unique_path)

# 4. VEP

## a. Converting to VCF header format, necessary for subsequent VEP annotation. 

In [None]:
# create VCF format from data frame
def make_vcf_format(df2):
    """
    Build the eight fixed VCF columns from a variant table.

    Parameters:
        df2: DataFrame with 'chrom', 'pos', 'ref' and 'alt' columns

    Returns:
        new DataFrame with columns '#CHROM', 'POS', 'ID', 'REF', 'ALT',
        'QUAL', 'FILTER', 'INFO'; ID, QUAL, FILTER and INFO are set to '.'.
        The input frame is not modified.
    """
    vcf_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

    renamed = df2.rename(columns={'chrom': '#CHROM', 'pos': 'POS', 'ref': 'REF', 'alt': 'ALT'})

    # columns that carry no information for these variants
    placeholders = {column: '.' for column in ('ID', 'QUAL', 'FILTER', 'INFO')}

    return renamed.assign(**placeholders)[vcf_columns]

In [None]:
unique_csv_paths = glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.normed.unique.csv")

for unique_csv_path in unique_csv_paths:
    print('Processing', unique_csv_path, flush=True)

    vcf_path = unique_csv_path.replace('.normed.unique.csv', '.normed.unique.vcf')

    # tab-separated table with the VCF column header as its first line
    vcf_table = make_vcf_format(pd.read_csv(unique_csv_path))
    vcf_table.to_csv(vcf_path, sep='\t', index=False)

    print('Writing to', vcf_path, flush=True)
    print()

## b. Create and run VEP annotation commands

In [None]:
def create_vep(in_file, annotated_file):
    """
    Build the text of a VEP command for one input file.

    Parameters:
        in_file: path passed to VEP with -i
        annotated_file: path passed to VEP with -o

    Returns:
        the command as a string. The '...' values (fasta, gff, plugin paths)
        are placeholders that have to be filled in before the job is run.

    Note: the template is a regular (non-raw) f-string, so every
    backslash-newline pair below is dropped by Python and the returned
    command is a single long line wrapped in a leading and trailing newline.
    """
    job = f"""
vep --no_stats \
	--fasta ... \
    --format vcf \
    --tab \
	-i {in_file} \
	-o {annotated_file} \
	--gff ... \
	--assembly GRCh38 \
	--symbol --numbers --hgvs --hgvsg \
	--plugin LoF,loftee_path:...,check_complete_cds:1,max_scan_distance:30 \
	--plugin REVEL,...\
	--dir_plugins ... --fork 6
"""
    return job

In [None]:
for filename in glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.normed.unique.vcf"):
    out_filename = filename.replace('.normed.unique.vcf', '.vep.annotated.csv')

    # job file: <WORKFOLDER>/jobs_450k/2_vep_<name>.job
    jobs_filename = (
        filename
        .replace('data_450k/annotations/', 'jobs_450k/2_vep_')
        .replace('.normed.unique.vcf', '.job')
    )

    vep_command = create_vep(in_file=filename, annotated_file=out_filename)

    # nothing else in the notebook creates the jobs folder, so make sure it exists
    jobs_folder = os.path.dirname(jobs_filename)
    if jobs_folder:
        os.makedirs(jobs_folder, exist_ok=True)

    # the command is a single string, so write() is the right call
    # (writelines() would iterate over it character by character)
    with open(jobs_filename, mode='w') as f:
        f.write(vep_command)

These jobs should be executed for every chromosome. 

# 5. Variant selection: LoF singletons

In [None]:
def is_lof(consequence_terms):
    """
        Flags entries whose consequence string mentions a loss-of-function term.

        Parameters:
            consequence_terms: pandas Series of consequence strings

        Returns:
            Series that is True where the string contains splice_acceptor_variant,
            splice_donor_variant, stop_gained or frameshift_variant
    """
    lof_terms = (
        "splice_acceptor_variant",
        "splice_donor_variant",
        "stop_gained",
        "frameshift_variant",
    )

    # OR the per-term matches together, in the order listed above
    mask = consequence_terms.str.contains(lof_terms[0])
    for term in lof_terms[1:]:
        mask = mask | consequence_terms.str.contains(term)

    return mask


def get_variant_class(row):
    """
    Classify a variant by comparing the lengths of its alleles.

    Parameters:
    row: DataFrame row (or any mapping) with 'ref' and 'alt' strings
    Returns:
    'Substitution' when both alleles have the same length,
    'Deletion' when the reference allele is longer,
    'Insertion' when the alternative allele is longer
    """
    ref_length, alt_length = len(row['ref']), len(row['alt'])

    if ref_length == alt_length:
        return 'Substitution'

    return 'Deletion' if ref_length > alt_length else 'Insertion'

Get original variants from RAP

In [None]:
# read all variants found in the cohort
# read all variants found in the cohort, one frame per file
unique_variant_frames = [
    pd.read_csv(filename)
    for filename in glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.normed.unique.csv")
]

# ignore_index=True gives the combined frame a unique 0..n-1 index;
# without it every file contributes its own 0..k labels, and the
# duplicated labels make later label-based selection ambiguous
original_variants = pd.concat(unique_variant_frames, ignore_index=True)

# annotate with variant type
original_variants['variant_type'] = original_variants.apply(get_variant_class, axis=1)

print ("Total normed unique variants:", original_variants.shape)

original_variants.head(3)

Update InDels representation to match VEP: 

In [None]:
# Keep representation the same for substitutions
# Keep representation the same for substitutions
indels = original_variants['variant_type'] != 'Substitution'

# NOTE(review): VEP is expected to drop the leading base of an unbalanced
# variant only when REF and ALT share it (AT>A --> T>-), and to keep
# variants such as AT>G unchanged. Trimming those as well would produce a
# key that never matches the VEP identifier. Confirm against the VEP
# documentation for VCF input.
shares_first_base = original_variants['ref'].str[0] == original_variants['alt'].str[0]
trim_first_base = indels & shares_first_base

# representation used only for the key; the original columns stay untouched
key_pos = original_variants['pos'] + trim_first_base.astype(int)
key_ref = original_variants['ref'].mask(trim_first_base, original_variants['ref'].str[1:])
key_alt = original_variants['alt'].mask(trim_first_base, original_variants['alt'].str[1:])

# Create Key for matching with VEP annotations
original_variants['key'] = (
    original_variants['chrom'] + '_' +
    key_pos.astype(str) + '_' +
    key_ref + '_' +
    key_alt
)

Read VEP annotations:

In [None]:
# Read VEP annotations
def _count_vep_meta_lines(path):
    """Return the number of leading lines of `path` that start with '##'."""
    meta_lines = 0
    with open(path) as handle:
        for line in handle:
            if not line.startswith('##'):
                break
            meta_lines += 1
    return meta_lines


# Read VEP annotations
annotation_frames = []

for filename in glob.glob(f"{WORKFOLDER}/data_450k/annotations/*.vep.annotated.csv"):
    # skip exactly the '##' metadata block instead of a fixed 41 lines, so the
    # '#Uploaded_variation ...' line is used as the header whatever its position
    df = pd.read_csv(filename, sep='\t', skiprows=_count_vep_meta_lines(filename)).rename(
        columns={'#Uploaded_variation': 'Uploaded_variation'})
    annotation_frames.append(df)

# ignore_index=True avoids repeated index labels coming from the separate files
annotations = pd.concat(annotation_frames, ignore_index=True)

print ("Total annotated variants, all transcripts:", annotations.shape)
print ("Total annotated unique variants:", annotations['Uploaded_variation'].drop_duplicates().shape)


annotations.head(3)

Select high-quality LoF variants:

In [None]:
# select LoF
# keep rows whose consequence contains a loss-of-function term
lof_rows = is_lof(annotations['Consequence'])
annotations = annotations.loc[lof_rows]

# of those, keep rows whose 'LoF' column equals 'HC'
high_confidence_rows = annotations['LoF'].eq('HC')
annotations = annotations.loc[high_confidence_rows]


print("Total high quality LoF variants, all transcripts:", annotations.shape)
print("Total high quality LoF  unique variants:", annotations['Uploaded_variation'].drop_duplicates().shape)

Creating the key for merging with cohort variants:

In [None]:
# Parse variant from VEP output
def _parse_vep_identifier(identifier):
    """Split a 'chrom_pos_ref/alt' identifier into (chrom, pos, ref, alt)."""
    # split from the right so contig names that contain '_' stay intact
    chrom, pos, allele_string = identifier.rsplit('_', 2)
    ref, alt = allele_string.split('/')[:2]

    # '-' marks an empty allele; store it as an empty string
    ref = '' if ref == '-' else ref
    alt = '' if alt == '-' else alt

    return chrom, int(pos), ref, alt


# Parse variant from VEP output (each identifier is parsed once)
parsed_variants = pd.DataFrame(
    [_parse_vep_identifier(identifier) for identifier in annotations['Uploaded_variation']],
    columns=['chrom', 'pos', 'ref', 'alt'],
)

# assign() returns a new frame, so the filtered `annotations` slice is never
# written to in place; plain arrays are used to avoid any index alignment
annotations = annotations.assign(
    **{column: parsed_variants[column].to_numpy() for column in parsed_variants.columns}
)

# remove redundant information
annotations = annotations[['chrom', 'pos', 'ref', 'alt'] + ['SYMBOL']].drop_duplicates()

print ("Total high quality LoF variants, all genes:", annotations.shape)
print ("Total high quality LoF unique variants", annotations[['chrom', 'pos', 'ref', 'alt']].drop_duplicates().shape)

# add information about variant type
annotations['variant_type'] = annotations.apply(get_variant_class, axis=1)

# create key
annotations['key'] = (
    annotations['chrom'] + '_' + 
    annotations['pos'].astype(str) + '_' + 
    annotations['ref'] + '_' + 
    annotations['alt']
)

Filter original variants to contain high-quality LoF variants; 

Create new key for merged data based on original variant representation (for merging with individual's variant data):

In [None]:
# merge high-quality LoF with original data
# inner join: keep only cohort variants whose key appears among the LoF annotations
lof_keys = annotations[['SYMBOL', 'key']]
merged = lof_keys.merge(original_variants, how='inner', on='key')

# replace the matching key with one built from the original
# chrom / pos / ref / alt columns of the cohort data
position_text = merged['pos'].astype(str)
merged['key'] = merged['chrom'] + '_' + position_text + '_' + merged['ref'] + '_' + merged['alt']

print("Total high quality LoF variants all genes merged with cohort", merged.shape)

Leave only singletons:

In [None]:
# a singleton is a variant counted exactly once in the cohort
seen_once = merged['cohort_cnt'].eq(1)
singletones = merged.loc[seen_once]

variant_columns = ['chrom', 'pos', 'ref', 'alt']

print("Number of singletones all genes:", singletones.shape[0])
print("Number of unique singletones:", singletones[variant_columns].drop_duplicates().shape)


singletones_path = f"{WORKFOLDER}/data_450k/annotations/all_singetones_annotated.csv"
singletones.to_csv(singletones_path, sep='\t', index=False)

# 6. Filter individual files for singletons

Filter each individual's files so that they contain only singleton LoFs. 

In [None]:
from ukbb_recessive.data_collection.variants import VariantFeaturesLoF

In [None]:
# per-chromosome normalized genotype files
normed_pattern = f"{WORKFOLDER}/data_450k/annotations/*.normed.csv"
rap_files = glob.glob(normed_pattern)

print(len(rap_files))

In [None]:
# inputs: the normalized genotype files and the singleton LoF table;
# output: the sample_lofs folder
sample_lofs_folder = f"{WORKFOLDER}/data_450k/sample_lofs"
singleton_lof_file = f"{WORKFOLDER}/data_450k/annotations/all_singetones_annotated.csv"

variant_features = VariantFeaturesLoF()

variant_features.filter_lofs_in_samples(
    rap_files=rap_files,
    output_folder=sample_lofs_folder,
    all_lof_file=singleton_lof_file,
)