# Filter matches and make clusters

In [1]:
import os
from Bio import Seq,SeqIO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from collections import Counter
%matplotlib notebook

## Read Cas9 table and clusters

In [2]:
# Directory holding the Cas9 tables: the annotated input TSV is read from here
# and the per-threshold oriented tables are written back to it.
datadir = 'Cas9_tables'

In [3]:
# Load the annotated Cas9 table; the first column of the TSV becomes the index.
annotated_table_path = os.path.join(
    datadir, 'all_new_working_Cas9_annotated.tsv')
all_working_cas9 = pd.read_csv(annotated_table_path, sep='\t', index_col=0)

In [4]:
# Directory holding the clustering results: the centroid FASTA files and the
# per-threshold MSA<ID>/cluster_* files parsed below.
clustdir = 'Cas9_clustering'

In [5]:
# Identity thresholds (%) for which clustering results are read.
IDs = [100, 99, 98, 97, 96, 95]

# For each threshold, the record IDs listed in its centroid FASTA file.
centroids = {}
for ID in IDs:
    centroid_fasta = os.path.join(
        clustdir, 'centroid_{}_new_working_cas9.faa'.format(ID))
    centroids[ID] = [rec.id for rec in SeqIO.parse(centroid_fasta, 'fasta')]

In [6]:
# Report how many centroids (i.e. clusters) were read for each threshold.
for ID in IDs:
    n_clusters = len(centroids[ID])
    print('{}% ID: {} clusters'.format(ID, n_clusters))

100% ID: 27062 clusters
99% ID: 14332 clusters
98% ID: 10475 clusters
97% ID: 8568 clusters
96% ID: 7538 clusters
95% ID: 6898 clusters


In [7]:
# For each threshold, parse every MSA<ID>/cluster_* file as FASTA and keep the
# record IDs it contains: one list of member IDs per file.
clusters = {}
for ID in IDs:
    cluster_files = glob(os.path.join(clustdir, 'MSA{}/cluster_*'.format(ID)))
    clusters[ID] = [
        [rec.id for rec in SeqIO.parse(cluster_file, 'fasta')]
        for cluster_file in cluster_files]

In [8]:
def get_centroid(seqid, centroids, clusters):
    """Return the centroid of the cluster that contains `seqid`.

    Parameters
    ----------
    seqid : sequence identifier to look up.
    centroids : container of centroid identifiers (membership-tested with `in`).
    clusters : iterable of clusters, each an iterable of member identifiers.

    Returns
    -------
    The first member (in cluster order) that is also in `centroids`, taken from
    the first cluster that contains `seqid` and has such a member; `None` when
    no cluster qualifies.
    """
    for members in clusters:
        if seqid not in members:
            continue
        for candidate in members:
            if candidate in centroids:
                return candidate
    return None

In [9]:
# Annotate every locus with the centroid of its cluster at each threshold.
# The result matches calling get_centroid() once per row, but the
# member -> centroid lookup is built a single time per threshold instead of
# rescanning every cluster (and the centroid list) for every row of the table.
all_working_cas9_oriented = {}
for ID in IDs:
    centroid_ids = set(centroids[ID])
    centroid_of = {}
    for members in clusters[ID]:
        # first member of this cluster that is a centroid, if there is one
        centroid = next((m for m in members if m in centroid_ids), None)
        if centroid is None:
            continue
        for member in members:
            # setdefault keeps the first cluster (in list order) containing
            # the member, mirroring get_centroid()'s first-match behaviour
            centroid_of.setdefault(member, centroid)

    to_add = {
        # loci whose SeqID belongs to no cluster with a centroid get None
        'Cluster ID': [centroid_of.get(seqid)
                       for seqid in all_working_cas9['SeqID']],
        # placeholder; filled in when locus orders are compared to the centroid
        'Orientation': [''] * all_working_cas9.shape[0],
    }
    all_working_cas9_oriented[ID] = all_working_cas9.assign(**to_add)

In [10]:
def _locus_order(rows):
    """Rank Cas1, Cas2 and Cas9 by start coordinate for the first row of `rows`.

    `Other_genes` is parsed as ';'-separated '<name> <start>' entries keyed by
    the first four characters of the name.  Returns the argsort of
    [Cas1 start, Cas2 start, Cas9 start].
    """
    cas9_start = rows['Cas9_start'].iloc[0]
    other_starts = {}
    for entry in rows['Other_genes'].iloc[0].split(';'):
        fields = entry.split(' ')
        other_starts[fields[0][0:4]] = int(fields[1])
    return np.array(
        [other_starts['Cas1'], other_starts['Cas2'], cas9_start]).argsort()


# Label each locus by comparing its gene order with its cluster centroid's:
# '+' for the same order, '-' for the exactly reversed order, and
# 'Not conserved' for anything else.
for ID in IDs:
    print('Processing ID: {}'.format(ID))
    oriented = all_working_cas9_oriented[ID]
    for centroid in centroids[ID]:
        centroid_rows = all_working_cas9[all_working_cas9['SeqID'] == centroid]
        centroid_order = _locus_order(centroid_rows)
        centroid_order_flipped = centroid_order[::-1]

        member_seqids = oriented[
            oriented['Cluster ID'] == centroid]['SeqID'].values
        for seqid in member_seqids:
            member_rows = all_working_cas9[all_working_cas9['SeqID'] == seqid]
            member_order = _locus_order(member_rows)

            if all(member_order == centroid_order):
                orientation = '+'
            elif all(member_order == centroid_order_flipped):
                orientation = '-'
            else:
                orientation = 'Not conserved'

            # write the label on the first table row carrying this SeqID
            oriented.loc[member_rows.index[0], 'Orientation'] = orientation

Processing ID: 100
Processing ID: 99
Processing ID: 98
Processing ID: 97
Processing ID: 96
Processing ID: 95


In [11]:
# Write one oriented table per identity threshold as a tab-separated file.
for ID in IDs:
    out_name = 'all_new_working_Cas9_clust_{}_oriented.tsv'.format(ID)
    all_working_cas9_oriented[ID].to_csv(
        os.path.join(datadir, out_name), sep='\t')

In [12]:
# Tally the orientation labels assigned at each identity threshold.
for ID in IDs:
    orientation_labels = all_working_cas9_oriented[ID]['Orientation'].values
    print(Counter(orientation_labels))

Counter({'+': 62420, '-': 29615, 'Not conserved': 52})
Counter({'+': 54509, '-': 37511, 'Not conserved': 67})
Counter({'+': 51857, '-': 40162, 'Not conserved': 68})
Counter({'+': 52493, '-': 39530, 'Not conserved': 64})
Counter({'+': 50512, '-': 41511, 'Not conserved': 64})
Counter({'+': 51465, '-': 40558, 'Not conserved': 64})


## Fix bad centroids

In [15]:
# For every cluster, record the fraction and the count of members labelled
# 'Not conserved', plus the cluster size.  Each list is ordered like
# centroids[ID].
perc_not_conserved = {}
number_not_conserved = {}
total_seqids = {}
for ID in IDs:
    print('Processing ID: {}'.format(ID))
    oriented = all_working_cas9_oriented[ID]
    fractions, counts, sizes = [], [], []
    for centroid in centroids[ID]:
        orientations = oriented[oriented['Cluster ID'] == centroid]['Orientation']
        n_not_conserved = sum(orientations == 'Not conserved')
        n_members = len(orientations)
        fractions.append(n_not_conserved / n_members)
        counts.append(n_not_conserved)
        sizes.append(n_members)
    perc_not_conserved[ID] = fractions
    number_not_conserved[ID] = counts
    total_seqids[ID] = sizes

Processing ID: 100
Processing ID: 99
Processing ID: 98
Processing ID: 97
Processing ID: 96
Processing ID: 95


In [16]:
# One summary frame per threshold, indexed by centroid ID, with the fraction
# ('% NC') and count ('# NC') of 'Not conserved' members and the cluster size.
perc_not_conserved_df = {
    ID: pd.DataFrame(
        {'% NC': perc_not_conserved[ID],
         '# NC': number_not_conserved[ID],
         'Total': total_seqids[ID]},
        index=centroids[ID])
    for ID in IDs}

In [17]:
# Clusters in which more than half of the members are 'Not conserved'.
many_not_conserved = {}
for ID in IDs:
    summary = perc_not_conserved_df[ID]
    many_not_conserved[ID] = summary[summary['% NC'] > 0.5]

In [18]:
# Of those clusters, keep the centroid IDs of the ones with more than 3 members.
bad_centroids = {}
for ID in IDs:
    candidates = many_not_conserved[ID]
    bad_centroids[ID] = candidates[candidates['Total'] > 3].index

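A cluster is considered bad when it has more than 3 members and more than half of them are labelled 'Not conserved' relative to the current centroid. For each bad cluster, choose as the new centroid the member whose locus order is shared, in either orientation, by the largest number of loci in the cluster.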

In [20]:
# For every member of every bad cluster, store the argsort of its
# [Cas1 start, Cas2 start, Cas9 start] coordinates, keyed by threshold,
# then by current centroid, then by SeqID.
bad_orientations = {}
for ID in IDs:
    oriented = all_working_cas9_oriented[ID]
    orders_by_centroid = {}
    for centroid in bad_centroids[ID]:
        member_seqids = oriented[
            oriented['Cluster ID'] == centroid]['SeqID'].values
        member_orders = {}
        for seqid in member_seqids:
            seqid_rows = all_working_cas9[all_working_cas9['SeqID'] == seqid]
            cas9_start = seqid_rows['Cas9_start'].iloc[0]
            # 'Other_genes' holds ';'-separated '<name> <start>' entries
            other_starts = {}
            for gene in seqid_rows['Other_genes'].iloc[0].split(';'):
                fields = gene.split(' ')
                other_starts[fields[0][0:4]] = int(fields[1])
            starts = [other_starts['Cas1'], other_starts['Cas2'], cas9_start]
            member_orders[seqid] = np.array(starts).argsort()
        orders_by_centroid[centroid] = member_orders
    bad_orientations[ID] = orders_by_centroid

In [21]:
# For each bad cluster, pick the member whose locus order (as is or reversed)
# matches the most members; ties keep the member encountered first.
new_centroids = {}
for ID in IDs:
    replacement_for = {}
    for old_centroid, member_orders in bad_orientations[ID].items():
        best_seqid, best_count = None, 0
        for seqid, reference in member_orders.items():
            flipped = reference[::-1]
            count = sum(
                all(order == reference) or all(order == flipped)
                for order in member_orders.values())
            if count > best_count:
                best_seqid, best_count = seqid, count
        replacement_for[old_centroid] = best_seqid
    new_centroids[ID] = replacement_for

In [22]:
# Re-label every member of each bad cluster against its replacement centroid:
# '+' for the same locus order, '-' for the reversed order, and
# 'Not conserved' for anything else.
new_orientations = {}
for ID in IDs:
    relabelled = {}
    for old_centroid, member_orders in bad_orientations[ID].items():
        reference_order = member_orders[new_centroids[ID][old_centroid]]
        labels = {}
        for seqid, member_order in member_orders.items():
            if all(reference_order == member_order):
                labels[seqid] = '+'
            elif all(reference_order == member_order[::-1]):
                labels[seqid] = '-'
            else:
                labels[seqid] = 'Not conserved'
        relabelled[old_centroid] = labels
    new_orientations[ID] = relabelled

In [23]:
# Write the replacement centroid and the recomputed orientation back into the
# oriented tables, on the first row carrying each SeqID.
for ID in IDs:
    oriented = all_working_cas9_oriented[ID]
    for old_centroid, labels in new_orientations[ID].items():
        replacement = new_centroids[ID][old_centroid]
        for seqid, orientation in labels.items():
            row_label = oriented.index[(oriented['SeqID'] == seqid).values][0]
            oriented.loc[row_label, 'Cluster ID'] = replacement
            oriented.loc[row_label, 'Orientation'] = orientation

In [24]:
# Tally the orientation labels again, now that bad centroids were replaced.
for ID in IDs:
    orientation_labels = all_working_cas9_oriented[ID]['Orientation'].values
    print(Counter(orientation_labels))

Counter({'+': 62439, '-': 29613, 'Not conserved': 35})
Counter({'+': 54519, '-': 37518, 'Not conserved': 50})
Counter({'+': 51866, '-': 40169, 'Not conserved': 52})
Counter({'+': 52498, '-': 39537, 'Not conserved': 52})
Counter({'+': 50517, '-': 41518, 'Not conserved': 52})
Counter({'+': 51470, '-': 40565, 'Not conserved': 52})


In [25]:
# Write the updated oriented tables, one tab-separated file per threshold.
for ID in IDs:
    out_name = 'all_new_working_Cas9_clust_{}_oriented.tsv'.format(ID)
    all_working_cas9_oriented[ID].to_csv(
        os.path.join(datadir, out_name), sep='\t')

In [37]:
# Percentage of loci whose orientation label is anything other than
# 'Not conserved', per identity threshold.
for ID in IDs:
    oriented = all_working_cas9_oriented[ID]
    n_conserved = sum(oriented['Orientation'] != 'Not conserved')
    conserved = n_conserved / oriented.shape[0]
    print('{}% identity clustering: {:.2f} loci with conserved orientation'.format(
        ID, conserved*100))

100% identity clustering: 99.96 loci with conserved orientation
99% identity clustering: 99.95 loci with conserved orientation
98% identity clustering: 99.94 loci with conserved orientation
97% identity clustering: 99.94 loci with conserved orientation
96% identity clustering: 99.94 loci with conserved orientation
95% identity clustering: 99.94 loci with conserved orientation
