# Count targetable sites

In [1]:
import os
import sys
import glob
import shutil
import numpy as np
import subprocess as sbp
import pandas as pd
from Bio import Seq, SeqIO
import logomaker
import itertools
import matplotlib.pyplot as plt
from collections import Counter
from Bio import SearchIO
import math
from scipy import stats
from matplotlib import cm
from matplotlib import colors
import matplotlib.patches as mpatches
import multiprocessing as mp
from Bio import Phylo
import matplotlib.image as image
from Bio import Align
%matplotlib notebook

## Make PAM predictions

In [2]:
# Dataset identifiers to process: each ID is substituted into the input file
# names and keys every per-dataset dictionary built in this notebook.
# NOTE(review): 98 presumably denotes a clustering identity threshold — confirm.
IDs = [98]

In [3]:
# Directory containing the flanking-region tables loaded into `all_oriented_flanks`.
blastdir = 'Spacers_alignment'

In [4]:
# Load one flanking-region table per dataset ID; the file's second column
# becomes the row index.
all_oriented_flanks = {}
for ID in IDs:
    all_oriented_flanks[ID] = pd.read_csv(
        os.path.join(
            blastdir,
            'realigned_oriented_flanking_regions_{}_clustering_new.tsv'.format(ID)),
        sep='\t', index_col=1)

In [5]:
# Tidy each flanking-region table: discard the unnamed leftover column and
# label the two remaining columns by flank side.
for ID in IDs:
    # errors='ignore' keeps this cell re-runnable: on a second execution the
    # column has already been removed and a plain drop would raise KeyError.
    all_oriented_flanks[ID] = all_oriented_flanks[ID].drop(
        columns='Unnamed: 0', errors='ignore')
    all_oriented_flanks[ID].columns = ['Upstream', 'Downstream']

In [6]:
# Directory containing the Cas9 tables loaded into `cas9_datasets`.
datadir = 'Cas9_tables'

In [7]:
# Load one Cas9 table per dataset ID; the file's first column becomes the row index.
cas9_datasets = {}
for ID in IDs:
    cas9_datasets[ID] = pd.read_csv(
        os.path.join(
            datadir,
            'all_new_working_Cas9_clust_{}_oriented.tsv'.format(ID)),
        sep='\t', index_col=0)

In [8]:
# Number of flank positions profiled per side (rows of each information matrix).
pam_len = 30
# Size of the window scanned when calling a PAM side: the first `relevant_range`
# positions of a 'Downstream' profile, the last `relevant_range` of an 'Upstream' one.
relevant_range = 10

In [9]:
# Per-dataset containers, one empty dict per flank side; filled with
# information matrices keyed by row label further down.
pam_predictions = {
    ID: {side: {} for side in ('Upstream', 'Downstream')}
    for ID in IDs}

In [10]:
def make_info_df(seqs, pam_len, pos):
    """Build a per-position information matrix from a set of flanking sequences.

    Parameters
    ----------
    seqs : iterable of str
        Flanking sequences. 'N' characters are skipped when counting.
    pam_len : int
        Number of positions (rows) in the returned matrix.
    pos : str
        'Downstream' aligns the sequences on their first character (row i
        counts character i). Any other value aligns them on their last
        character (the last row counts the last character of each sequence).

    Returns
    -------
    pandas.DataFrame
        Result of ``logomaker.transform_matrix`` converting the per-position
        frequencies from 'probability' to 'information'.

    Raises
    ------
    ValueError
        If the per-position frequencies do not sum to 1.
    """
    counters = [Counter() for _ in range(pam_len)]
    if pos == 'Downstream':
        for i in range(len(counters)):
            for seq in seqs:
                if i < len(seq) and seq[i] != 'N':
                    counters[i][seq[i]] += 1
    else:
        # Right-aligned: negative indices walk back from the end of each sequence.
        for i in range(1, len(counters) + 1):
            for seq in seqs:
                if i <= len(seq) and seq[-i] != 'N':
                    counters[-i][seq[-i]] += 1
    freqs = pd.DataFrame(counters).fillna(0)
    # Make sure all four nucleotides have a column even if one was never
    # observed. Without this the matrix width depends on the data, the uniform
    # fill below uses the wrong alphabet size (and divides by zero when nothing
    # was counted at all), and reverse_pam() fails with KeyError when it
    # selects the A/T/C/G columns.
    for base in 'ACGT':
        if base not in freqs.columns:
            freqs[base] = 0.0
    # Positions without any counted base (0/0 -> NaN) fall back to a uniform
    # distribution over the columns.
    freqs = freqs.divide(freqs.sum(axis=1), axis=0).fillna(1.0/freqs.shape[1])
    if not all(freqs.sum(axis=1).apply(lambda x: math.isclose(x, 1.0))):
        raise ValueError('per-position frequencies do not sum to 1')
    info = logomaker.transform_matrix(freqs, from_type='probability',
        to_type='information')
    return info

In [11]:
# Build an information matrix for every row label and flank side that has
# at least one flanking sequence recorded.
for ID in IDs:
    flank_table = all_oriented_flanks[ID]
    for i in flank_table.index:
        for side in ('Upstream', 'Downstream'):
            joined = flank_table.loc[i, side]
            if pd.notna(joined):
                # Sequences are stored as one comma-separated string per cell.
                pam_predictions[ID][side][i] = make_info_df(
                    joined.split(','), pam_len, side)

In [12]:
# Per dataset: number of distinct 'Cluster ID' values, and number of rows in
# the flanking-region table.
n_clusters = {
    ID: len(pd.unique(cas9_datasets[ID]['Cluster ID']))
    for ID in IDs}
n_clusters_w_matches = {
    ID: len(all_oriented_flanks[ID])
    for ID in IDs}

In [13]:
# Minimum number of flanking sequences a row/side needs for its information
# profile to be kept in the downstream PAM analysis.
min_n_flanking_seqs = 10

In [14]:
# Number of comma-separated flanking sequences per row on each side
# (0 where the cell is missing).
n_upstream_flanks = {
    ID: all_oriented_flanks[ID]['Upstream'].apply(
        lambda joined: 0 if pd.isna(joined) else joined.count(',') + 1)
    for ID in IDs}
n_downstream_flanks = {
    ID: all_oriented_flanks[ID]['Downstream'].apply(
        lambda joined: 0 if pd.isna(joined) else joined.count(',') + 1)
    for ID in IDs}

In [15]:
# Boolean mask per dataset and side: True where the row has at least
# `min_n_flanking_seqs` flanking sequences.
more_flanks_than_min = {
    ID: {side: counts[ID].ge(min_n_flanking_seqs)
         for side, counts in (('Upstream', n_upstream_flanks),
                              ('Downstream', n_downstream_flanks))}
    for ID in IDs}

In [16]:
# Collapse every information matrix to one value per position by summing
# across its columns.
all_info_all_PAMs = {
    ID: {pos: {i: info.sum(axis=1).values for i, info in by_row.items()}
         for pos, by_row in pam_predictions[ID].items()}
    for ID in IDs}

In [17]:
# Keep only the profiles whose row/side passes the minimum flank-count mask.
all_info_all_PAMs = {
    ID: {pos: {i: totals for i, totals in by_row.items()
               if more_flanks_than_min[ID][pos].loc[i]}
         for pos, by_row in all_info_all_PAMs[ID].items()}
    for ID in IDs}

In [18]:
# Sorted, de-duplicated row labels that kept a profile on at least one side.
all_ids = {
    ID: np.unique([k for by_row in all_info_all_PAMs[ID].values() for k in by_row])
    for ID in IDs}

In [19]:
def _clamped_upper_fence(values):
    """Return Q3 + 1.5*IQR of `values`, bounded below by 1 and above by 2."""
    return min(2, max(1, pd.Series(values).quantile(0.75)+1.5*stats.iqr(values)))

# One information threshold per row label, computed over the pooled
# per-position values of every side that has a profile for that label.
thresholds = {
    ID: {k: _clamped_upper_fence(np.concatenate(
             [by_row[k] for by_row in all_info_all_PAMs[ID].values() if k in by_row]))
         for k in all_ids[ID]}
    for ID in IDs}

In [20]:
def _window_reaches_threshold(per_position_info, pos, threshold):
    """True if any value in the scanned window is >= `threshold`.

    The window is the first `relevant_range` entries for 'Downstream' and the
    last `relevant_range` entries for any other side.
    """
    if pos == 'Downstream':
        window = per_position_info[0:relevant_range]
    else:
        window = per_position_info[-relevant_range:]
    return any(window >= threshold)

# One row per label, one boolean column per side: does that side's window
# contain a position reaching the label's threshold? Sides without a profile
# are False.
pam_predicted_positions = {
    ID: pd.DataFrame({
        k: {pos: (_window_reaches_threshold(by_row[k], pos, thresholds[ID][k])
                  if k in by_row else False)
            for pos, by_row in all_info_all_PAMs[ID].items()}
        for k in all_ids[ID]}).transpose()
    for ID in IDs}

In [21]:
# Keep only the rows where exactly one side (XOR) reached the threshold.
# The columns are addressed by label instead of x[0]/x[1]: integer keys on a
# label-indexed row rely on pandas' deprecated positional fallback and on the
# column order staying Upstream, Downstream.
has_prediction = {ID:pam_predicted_positions[ID][
    pam_predicted_positions[ID].apply(
        lambda x: x['Upstream']^x['Downstream'], axis=1)] for ID in IDs}

In [22]:
# Name the single side that reached the threshold for every retained row.
# The 'Upstream' column is read by label instead of x[0]: integer keys on a
# label-indexed row rely on pandas' deprecated positional fallback and on the
# column order staying Upstream, Downstream.
high_confidence_PAM_positions = {ID:has_prediction[ID].apply(
    lambda x: 'Upstream' if x['Upstream'] else 'Downstream', axis=1) for ID in IDs}

In [23]:
# Number of rows in dataset 98 with a PAM call on exactly one side.
len(high_confidence_PAM_positions[98])

2546

## Turn PAMs into sequences

In [24]:
def reverse_pam(pam):
    """Return the reverse complement of a position-by-base matrix.

    The rows are reversed, each base column is relabelled with its
    complement (resulting column order: T, A, G, C) and the original index
    labels are re-applied in their original order. `pam` must contain the
    columns 'A', 'T', 'C' and 'G'; it is not modified.
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    flipped = pam.loc[:, list(complement)].iloc[::-1].rename(columns=complement)
    # Rows were reversed, labels are not: position 0 is the former last row.
    flipped.index = pam.index
    return flipped

In [25]:
def logo2seq(ID, centroid):
    """Turn a row's information matrix into a list of allowed bases per position.

    Reads the called side from `high_confidence_PAM_positions`, the matrix from
    `pam_predictions` and the cut-off from `thresholds`. A 'Downstream' matrix
    is used as is; any other side is passed through `reverse_pam` first. Only
    the first 10 positions are considered.

    Returns a list with one entry per position: the bases whose value is
    >= the threshold, or ['N'] when no base qualifies.
    """
    pos = high_confidence_PAM_positions[ID].loc[centroid]
    matrix = pam_predictions[ID][pos][centroid]
    if pos != 'Downstream':
        matrix = reverse_pam(matrix)
    info = matrix[0:10]
    threshold = thresholds[ID][centroid]
    res = []
    for i in info.index:
        passing = [base for base in info.columns if info.loc[i, base] >= threshold]
        res.append(passing if passing else ['N'])
    return res

In [26]:
# Per-position allowed bases for every row of dataset 98 that has a one-sided PAM call.
PAM_seqs = {centroid:logo2seq(98, centroid) for centroid in high_confidence_PAM_positions[98].index}

In [27]:
# expand the per-position choices, building suffixes from the last position backwards
def make_PAM_seq(PAM):
    """Expand per-position base choices into every concrete sequence.

    `PAM` is a non-empty sequence whose items are the alternatives for each
    position. The result lists all combinations, with the first position
    varying slowest. For a single position the alternatives are returned
    unchanged.
    """
    suffixes = PAM[-1]
    for choices in reversed(PAM[:-1]):
        suffixes = [b + x for b in choices for x in suffixes]
    return suffixes

In [28]:
# Expand each row's per-position choices into the full list of PAM strings.
PAM_seqs_full = {
    centroid: make_PAM_seq(choices)
    for centroid, choices in PAM_seqs.items()}

In [29]:
# Flatten the per-row PAM lists into a single list (duplicates kept).
PAM_seqs_all_fw = [
    s
    for expanded in PAM_seqs_full.values()
    for s in expanded]

In [30]:
# Reverse complement of every forward PAM string, in the same order.
PAM_seqs_all_rev = [str(Seq.Seq(s).reverse_complement()) for s in PAM_seqs_all_fw]

In [31]:
# Sorted array of the distinct PAM strings across both orientations.
PAM_seqs_all = np.unique(PAM_seqs_all_fw + PAM_seqs_all_rev)

In [44]:
# Number of specified (non-'N') positions in each PAM string, in the same order
# as PAM_seqs_all. Derived from the actual string length rather than a
# hard-coded 10, so the count stays correct if a PAM is not 10 characters long.
PAM_non_N_count = [len(s)-s.count('N') for s in PAM_seqs_all]

## Read ClinVar dataset

In [34]:
# Load the pre-filtered ClinVar variant table; the first column becomes the index.
# NOTE(review): read_csv's default NA parsing converts cells such as "NA"/"na"
# to NaN — this may be the origin of the NaN flanks skipped later; confirm.
clinvar_data = pd.read_csv('filtered_variant_summary.tsv', sep='\t', index_col=0)

## Find PAMs in mutated alleles

In [36]:
# Restrict the analysis to variants whose 'Length' value is below 10.
clinvar_data_short = clinvar_data[clinvar_data['Length']<10]

In [38]:
# generate sequence containing the wt and mutated allele
# take 6 up + allele + 6 down
# check if pam on mut and not on wt
# Both targets share the same flanks and differ only in the allele placed
# between them; the lists are filled in the row order of clinvar_data_short.
# NOTE(review): the per-label .loc lookups assume unique index labels (a
# duplicated label would return a Series instead of a string) — confirm.
ref_target = []
alt_target = []
for i in clinvar_data_short.index:
    # last 6 characters of the upstream context, first 6 of the downstream one
    up = clinvar_data_short.loc[i, 'Upstream'][-6:]
    down = clinvar_data_short.loc[i, 'Downstream'][0:6]
    ref = clinvar_data_short.loc[i, 'ReferenceAlleleVCF']
    alt = clinvar_data_short.loc[i, 'AlternateAlleleVCF']
    ref_target.append(up + ref + down)
    alt_target.append(up + alt + down)

In [39]:
# Attach the reference and alternate target sequences as new columns.
clinvar_data_flank = clinvar_data_short.assign(
    RefAlleleFlank=ref_target,
    AltAlleleFlank=alt_target)

In [45]:
# Local pairwise aligner used to look for PAM strings inside the allele flanks.
# Opening or extending a gap costs 10, so gapped alignments are heavily penalised.
# NOTE(review): match/mismatch scores are left at the library defaults; the
# score comparison against the non-N count later on presumably relies on
# match=1 / mismatch=0 — confirm for the installed Biopython version.
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.open_gap_score = -10
aligner.extend_gap_score = -10

In [46]:
# For every locus, record whether at least one PAM string reaches its full
# score (its number of non-N positions) on the ALT flank while scoring lower
# on the REF flank. One boolean per row, in the row order of clinvar_data_flank.
has_match_only_alt = []
for m,i in enumerate(clinvar_data_flank.index):
    has_match = False
    # progress message every 10000 loci
    if m%10000==0 and m!=0: print('{} loci processed...'.format(m))
    ref = clinvar_data_flank.loc[i, 'RefAlleleFlank']
    alt = clinvar_data_flank.loc[i, 'AltAlleleFlank']
    if type(ref)==str and type(alt)==str: # there's a couple of NaNs, ignore them
        for n,PAM in enumerate(PAM_seqs_all):
            alt_aln_score = aligner.align(alt, PAM).score
            # if match on alt, check if no match on ref
            if alt_aln_score == PAM_non_N_count[n]:
                # REF is only aligned when ALT already matched
                ref_aln_score = aligner.align(ref, PAM).score
                if ref_aln_score < PAM_non_N_count[n]:
                    has_match = True
                    # one ALT-specific PAM is enough; stop scanning the rest
                    break
    # loci skipped by the type check above stay False
    has_match_only_alt.append(has_match)

10000 loci processed...
20000 loci processed...
30000 loci processed...
40000 loci processed...
50000 loci processed...
60000 loci processed...
70000 loci processed...
80000 loci processed...
90000 loci processed...
100000 loci processed...
110000 loci processed...
120000 loci processed...


In [47]:
# Store the per-locus result as a boolean column; the dict form is needed
# because the column name contains spaces.
clinvar_data_flank = clinvar_data_flank.assign(**{'Target on ALT allele':has_match_only_alt})

In [48]:
# fraction of sites whose ALT allele is targetable (mean of the boolean column)
clinvar_data_flank['Target on ALT allele'].mean()

0.9855673126644902