# Count targetable sites

In [1]:
import os
import sys
import glob
import shutil
import numpy as np
import subprocess as sbp
import pandas as pd
from Bio import Seq, SeqIO
import logomaker
import itertools
import matplotlib.pyplot as plt
from collections import Counter
from Bio import SearchIO
import math
from scipy import stats
from matplotlib import cm
from matplotlib import colors
import matplotlib.patches as mpatches
import multiprocessing as mp
from Bio import Phylo
import matplotlib.image as image
from Bio import Phylo
%matplotlib notebook

## ClinVar data

In [2]:
# Load the ClinVar variant summary as a tab-separated table. dtype=str reads
# every column as text, so later cells compare 'Start' against the string '-1'
# and convert coordinates with int() explicitly.
clinvar_data = pd.read_csv('variant_summary.txt', sep='\t', dtype=str)

In [3]:
# ClinicalSignificance values that are treated as pathogenic when filtering
# the ClinVar table (exact string matches).
pathogenic_tags = [
    'Pathogenic',
    'Likely pathogenic',
    'Pathogenic/Likely pathogenic',
    'Likely pathogenic, risk factor',
    'Pathogenic, risk factor',
    'Likely pathogenic, Affects',
    'Pathogenic/Likely pathogenic, risk factor',
    'Pathogenic, Affects',
    'Pathogenic/Likely pathogenic, drug response',
    'Pathogenic, drug response',
    'Pathogenic, association',
    'Pathogenic, other',
    'Likely pathogenic, other',
    'Pathogenic, protective',
    'Pathogenic, drug response, other',
    'Pathogenic/Likely pathogenic, other',
    'Pathogenic, association, protective',
    'Likely pathogenic, drug response',
    'Pathogenic, confers sensitivity',
]

In [4]:
# Keep only the rows whose ClinicalSignificance is one of the pathogenic tags.
clinvar_pathogenic = clinvar_data[
    clinvar_data['ClinicalSignificance'].isin(pathogenic_tags)]

In [5]:
# Drop rows whose 'Start' is the string '-1' (columns were read as text).
clinvar_pathogenic_known_pos = clinvar_pathogenic.loc[
    clinvar_pathogenic['Start'].ne('-1')]

In [6]:
# Add a 'Length' column computed as Stop - Start; both columns hold strings,
# so each is converted with int() first.
clinvar_pathogenic_known_pos = clinvar_pathogenic_known_pos.assign(
    Length=lambda table: table['Stop'].apply(int) - table['Start'].apply(int))

In [7]:
# Variant 'Type' values retained for the analysis (exact string matches).
type_tags = [
    'Indel',
    'Deletion',
    'single nucleotide variant',
    'Insertion',
]

In [8]:
# Keep only the variants whose 'Type' is listed in type_tags.
clinvar_selected_type = clinvar_pathogenic_known_pos[
    clinvar_pathogenic_known_pos['Type'].isin(type_tags)]

In [9]:
# Discard rows annotated on the NCBI36 assembly; only GRCh37 and GRCh38
# genomes are loaded further down.
clinvar_selected_type = clinvar_selected_type.loc[
    clinvar_selected_type['Assembly'].ne('NCBI36')]

In [10]:
# Show (rows, columns) remaining after the significance, position, type and
# assembly filters above.
clinvar_selected_type.shape

(277788, 35)

## Read genomes

In [11]:
# Path of the FASTA file that is parsed as the GRCh37 assembly.
GRCh37_file = 'Homo_sapiens_assembly19.fasta'

In [12]:
# Parse the whole GRCh37 FASTA into memory, keyed by record id.
GRCh37 = dict((record.id, record) for record in SeqIO.parse(GRCh37_file, 'fasta'))

In [13]:
# Path of the FASTA file that is parsed as the GRCh38 assembly.
GRCh38_file = 'hg38.fa'

In [14]:
# Parse the whole GRCh38 FASTA into memory, keyed by record id.
GRCh38 = dict((record.id, record) for record in SeqIO.parse(GRCh38_file, 'fasta'))

In [15]:
# Map the value found in the table's 'Assembly' column to the matching
# in-memory genome dictionary.
assemblies = dict(GRCh37=GRCh37, GRCh38=GRCh38)

## Get mutation flanking sequences

In [22]:
# Number of reference bases extracted on each side of a variant.
flank_len = 35

In [23]:
# Collect the reference sequence flanking every variant: up to `flank_len`
# bases ending just before the variant and up to `flank_len` bases starting
# just after it. The two lists are parallel to the rows of
# clinvar_selected_type and hold Bio.Seq objects.
upstream, downstream = [], []
variant_fields = zip(clinvar_selected_type['Type'],
                     clinvar_selected_type['Chromosome'],
                     clinvar_selected_type['Assembly'],
                     clinvar_selected_type['Start'],
                     clinvar_selected_type['Stop'])
for mut_type, chrm, assembly, start_field, stop_field in variant_fields:
    if assembly == 'GRCh38':
        # GRCh38 records are looked up by 'chr'-prefixed ids, with 'chrM' for 'MT'
        chrm = 'chr' + (chrm if chrm != 'MT' else 'M')
    start = int(start_field) - 1  # data is 1-indexed
    if mut_type == 'Deletion':
        start -= 1  # deletions are indexed weirdly, must subtract 2 from start in table
    # A negative bound would make Python slice from the END of the chromosome,
    # so clamp both the variant start and the flank start at position 0.
    # Variants closer than flank_len to the sequence start get a shorter
    # upstream flank instead of an empty or unrelated one.
    start = max(start, 0)
    end = int(stop_field)
    # Slice the underlying Seq directly; this yields the same bases as slicing
    # the SeqRecord and taking .seq, without copying record annotations.
    chrom_seq = assemblies[assembly][chrm].seq
    up = chrom_seq[max(start - flank_len, 0):start]
    down = chrom_seq[end:end + flank_len]
    upstream.append(up)
    downstream.append(down)

In [24]:
# Attach the flanking sequences as plain-string columns ('Downstream' first,
# then 'Upstream').
clinvar_selected_type = clinvar_selected_type.assign(
    Downstream=list(map(str, downstream)),
    Upstream=list(map(str, upstream)),
)

## Remove redundant info

In [25]:
# Walk the table in row order and remember the index label of the first row
# seen for each '#AlleleID'; later rows with the same id are skipped.
index_to_keep = []
already_in = set()
for row_label, allele in zip(clinvar_selected_type.index,
                             clinvar_selected_type['#AlleleID']):
    if allele in already_in:
        continue
    already_in.add(allele)
    index_to_keep.append(row_label)

In [26]:
# Keep only the rows whose index labels were collected in index_to_keep
# (the first row seen for each '#AlleleID').
clinvar_selected_type_unique = clinvar_selected_type.loc[index_to_keep]

In [27]:
# Save the de-duplicated table, including the flanking-sequence columns, as a
# tab-separated file.
clinvar_selected_type_unique.to_csv('filtered_variant_summary_35_nt.tsv', sep='\t')