# Association of PAM clusters with the phylogenetic tree

In [38]:
import os
import sys
import glob
import shutil
import numpy as np
import subprocess as sbp
import pandas as pd
from Bio import Seq, SeqIO
import logomaker
import itertools
import matplotlib.pyplot as plt
from collections import Counter
from Bio import SearchIO
import math
from scipy import stats
from matplotlib import cm
from matplotlib import colors
import matplotlib.patches as mpatches
import multiprocessing as mp
from Bio import Phylo
from matplotlib.colors import ListedColormap
from matplotlib import transforms
import matplotlib
%matplotlib notebook

## Read PAM clusters

In [2]:
# Root directory of the PAM-clustering analysis: the PAM cluster assignment
# file is read from here and the contingency tables are written back to it.
workdir = 'PAM_clustering'

In [3]:
# Tab-separated PAM cluster assignments; read below as two columns
# (cluster label, ID).
cluster_file = os.path.join(workdir, 'JS_PAM_clusters_avg.txt')

In [4]:
# Load the PAM cluster assignments. The file is read without a header row, so
# the two tab-separated columns are named explicitly.
clustering = pd.read_csv(
    cluster_file,
    sep='\t',
    header=None,
    names=['Cluster', 'ID'],
)

In [5]:
# Collapse the long table into one entry per PAM cluster:
# index = cluster label, value = list of the IDs assigned to that cluster.
ids_by_pam_cluster = clustering.groupby('Cluster')['ID']
clustering = ids_by_pam_cluster.apply(list)

In [6]:
# Keep only the PAM clusters that contain at least 20 members.
pam_cluster_sizes = clustering.apply(len)
large_clusters = clustering[pam_cluster_sizes >= 20]

## Read Cas9 data

In [7]:
# Directory holding the tab-separated Cas9 tables loaded in this section.
datadir = 'Cas9_tables/'

In [8]:
# Values substituted into the per-clustering table file name
# ('all_new_working_Cas9_clust_{}_oriented.tsv') and used as dictionary keys.
# NOTE(review): presumably the percent-identity threshold of the sequence
# clustering -- confirm.
IDs = [98]

In [9]:
# One oriented Cas9 table per entry of `IDs`, keyed by that entry; the first
# column of each file becomes the DataFrame index.
cas9_datasets = {
    clust_id: pd.read_csv(
        os.path.join(
            datadir,
            'all_new_working_Cas9_clust_{}_oriented.tsv'.format(clust_id)),
        sep='\t',
        index_col=0)
    for clust_id in IDs
}

In [10]:
# Annotated table of all working Cas9 sequences; the first column of the file
# becomes the DataFrame index.
annotated_table_path = os.path.join(datadir, 'all_new_working_Cas9_annotated.tsv')
all_working_cas9 = pd.read_csv(annotated_table_path, sep='\t', index_col=0)

In [11]:
# Re-index the annotated table by its 'SeqID' column (the column itself is
# kept) so that rows can be looked up by sequence ID with `.loc`, as done when
# the taxonomy of each tree cluster is collected further down.
all_working_cas9.index = all_working_cas9['SeqID'].values

## Make phylogenetic tree of the 98% cluster centroids

In [12]:
# Every ID that appears in any PAM cluster (all clusters, not only the large
# ones), flattened into a single set for fast membership tests.
all_centroids_with_prediction = set(itertools.chain.from_iterable(clustering))

In [13]:
# Rows of the 98 dataset whose 'Cluster ID' is one of the IDs that have a PAM
# cluster assignment (presumably: every sequence whose cluster centroid has a
# PAM prediction -- confirm the meaning of 'Cluster ID').
# `Series.isin` does the membership test in one vectorised call instead of
# building the boolean mask element by element in Python.
cas9_data_with_prediction = cas9_datasets[98][
    cas9_datasets[98]['Cluster ID'].isin(all_centroids_with_prediction)]

In [14]:
# Rows of the 98 dataset whose own 'SeqID' has a PAM cluster assignment; these
# are the sequences written to FASTA and placed on the tree below.
# `Series.isin` does the membership test in one vectorised call instead of
# building the boolean mask element by element in Python.
centroids_data = cas9_datasets[98][
    cas9_datasets[98]['SeqID'].isin(all_centroids_with_prediction)]

In [15]:
# Directory for the centroid FASTA file written below; the tree-clustering
# result files are later read from the same directory.
aln_dir = 'PAM_clustering/phylogenetic_tree'

In [16]:
# Protein sequences of the selected rows with every '*' character removed
# (presumably stop-codon markers, which should not go into the alignment).
seqs_to_write = centroids_data['Cas9_aa_seq'].map(
    lambda protein: protein.replace('*', ''))

In [17]:
# Build one sequence record per selected row, using the row's SeqID as the
# record id and leaving name/description empty.
# `seqs_to_write` was derived from `centroids_data` with an element-wise
# transform, so both share the same row order; pairing them in lockstep
# avoids two `.loc` label lookups per row, which would also return a Series
# rather than a scalar if an index label were ever duplicated.
records = [SeqIO.SeqRecord(Seq.Seq(aa_seq), id=seq_id, name='', description='')
           for aa_seq, seq_id in zip(seqs_to_write, centroids_data['SeqID'])]

In [32]:
# Output path of the centroid protein FASTA; the same file name is the input
# of the alignment command shown below.
fasta_file = os.path.join(aln_dir, 'Cas9_centroids_98.faa')

In [33]:
# Write all records in FASTA format. The value echoed by the cell is the
# return value of SeqIO.write (presumably the number of records written).
SeqIO.write(records, fasta_file, 'fasta')

2539

In [35]:
# Alignment step: MAFFT command line for the centroid FASTA (--maxiter 10,
# --thread 10), with the alignment redirected to Cas9_centroids_98_aln.faa.
# The string is only built and displayed here; the notebook does not execute
# it. NOTE(review): the bare file names suggest it was run by hand from inside
# `aln_dir` -- confirm.
command = 'mafft --maxiter 10 --thread 10 Cas9_centroids_98.faa > Cas9_centroids_98_aln.faa'
command

'mafft --maxiter 10 --thread 10 Cas9_centroids_98.faa > Cas9_centroids_98_aln.faa'

In [36]:
# Tree-building step: FastTree command line that reads the alignment via
# shell redirection and writes the tree to Cas9_centroids_98.nwk.
# The string is only built and displayed here; the notebook does not execute
# it.
command = 'fasttree -spr 4 -mlacc 2 -slownni < Cas9_centroids_98_aln.faa > Cas9_centroids_98.nwk'
command

'fasttree -spr 4 -mlacc 2 -slownni < Cas9_centroids_98_aln.faa > Cas9_centroids_98.nwk'

In [17]:
# Tree-clustering step: TreeCluster.py with threshold 0.9 (-t) and method
# max_clade (-m). The string is only built and displayed here; the notebook
# does not execute it.
# NOTE(review): the input is Cas9_centroids_98_rooted.nwk, whereas the
# FastTree command in this notebook writes Cas9_centroids_98.nwk -- the
# rooting step is not shown here.
# The '09' suffix of the output file matches the '09' entry of `clust_thresh`;
# presumably the other entries were produced by re-running this command with
# other -t values -- confirm.
command = 'TreeCluster.py -i Cas9_centroids_98_rooted.nwk -o Cas9_tree_clustered_09.txt -t 0.9 -m max_clade'
command

'TreeCluster.py -i Cas9_centroids_98_rooted.nwk -o Cas9_tree_clustered_09.txt -t 0.9 -m max_clade'

## Phylogenetic tree clustering

In [18]:
# Directory containing the tree-clustering result files. Same value as the
# definition in the tree-building section; repeated here presumably so this
# section can be run on its own.
aln_dir = 'PAM_clustering/phylogenetic_tree'

In [19]:
# Threshold tags; each one is substituted into 'Cas9_tree_clustered_{}.txt'
# to locate a clustering result and is used as the dictionary key for it.
# NOTE(review): '09' corresponds to `-t 0.9` in the TreeCluster command shown
# in this notebook; the numeric thresholds behind the other tags are not
# visible here -- confirm.
clust_thresh = ['03','05','07','09','15','20','25', '40']

In [21]:
# Path of the tree-clustering result file for every threshold tag.
cluster_files = {
    thresh: os.path.join(aln_dir, 'Cas9_tree_clustered_{}.txt'.format(thresh))
    for thresh in clust_thresh
}

In [22]:
# Load each tab-separated tree-clustering result (header row included) into a
# DataFrame, keyed by threshold tag.
cas9_clusterings = {
    thresh: pd.read_csv(cluster_files[thresh], sep='\t')
    for thresh in clust_thresh
}

In [23]:
# For every threshold, turn the per-sequence table into a Series indexed by
# 'ClusterNumber' whose values are the lists of 'SequenceName' in that cluster.
cas9_clusterings = {
    thresh: (cas9_clusterings[thresh]
             .groupby('ClusterNumber')['SequenceName']
             .apply(list))
    for thresh in clust_thresh
}

In [25]:
# Same layout as `cas9_clusterings`, but every member name is replaced by the
# 'Taxonomy' value of the matching row in `all_working_cas9` (looked up by
# index label).
cas9_clusterings_taxonomy = {
    thresh: cas9_clusterings[thresh].apply(
        lambda members: [all_working_cas9.loc[seq_name, 'Taxonomy']
                         for seq_name in members])
    for thresh in clust_thresh
}

## Make contingency tables

In [26]:
# Build one contingency table per tree-clustering threshold.
# Columns are the large PAM clusters, rows are the tree clusters, and each
# cell counts how many members of the PAM cluster fall into the tree cluster.
# Cluster number -1 is skipped (TreeCluster uses -1 for leaves that were not
# placed in any cluster -- see the TreeCluster documentation).
contingency_tables = {}
for i in cas9_clusterings:
    # Convert every tree cluster to a set once per threshold, so each
    # membership test is O(1) instead of a scan of the member list, and the
    # `.loc[cc]` lookup is not repeated for every PAM-cluster member.
    tree_members = {cc: set(seqs) for cc, seqs in cas9_clusterings[i].items()
                    if cc != -1}
    ct = {}
    for pc in large_clusters.index:
        pam_members = large_clusters.loc[pc]
        # Count per element (not via set intersection) so that the result is
        # the same as before even if a PAM cluster lists an ID more than once.
        ct[pc] = {cc: sum(x in members for x in pam_members)
                  for cc, members in tree_members.items()}
    contingency_tables[i] = pd.DataFrame.from_dict(ct)

In [52]:
# Display the contingency table for threshold tag '40'. The output shown
# already contains the 'Tot' column, i.e. this cell was executed after the
# cell that adds it (see the execution counters).
contingency_tables['40']

Unnamed: 0,38,51,62,75,86,94,95,96,103,108,...,139,142,144,146,148,150,151,153,156,Tot
1,0,0,0,0,0,0,0,0,0,0,...,22,0,0,0,0,0,0,0,0,39
2,0,0,0,0,11,6,0,0,0,0,...,0,0,0,0,0,0,0,0,0,59
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,6
4,0,1,4,0,6,7,44,58,10,0,...,1,25,40,0,27,62,0,2,0,809
5,20,0,17,3,6,1,0,0,1,32,...,1,0,4,1,1,0,0,44,1,331
6,0,0,3,3,3,0,0,0,0,1,...,9,0,0,0,6,0,0,0,0,205
7,0,1,3,0,1,0,0,0,0,0,...,12,1,1,17,10,0,0,11,0,269
8,0,0,0,0,0,8,0,0,0,0,...,0,0,0,1,0,0,0,1,0,27
9,0,0,0,3,2,0,1,0,1,0,...,0,0,0,0,0,0,22,0,55,277
10,0,0,0,2,0,0,0,0,2,0,...,0,2,0,0,0,0,0,0,0,9


In [27]:
# Add a 'Tot' column holding the total number of sequences in each tree
# cluster (row) of every contingency table.
for i in cas9_clusterings:
    tree_cluster_sizes = cas9_clusterings[i].apply(len)
    # The tables have no row for cluster -1, so remove it by label when it is
    # present. This replaces a positional `[1:]` slice that relied on -1
    # being the first group, and needs no separate branch when -1 is absent.
    contingency_tables[i] = contingency_tables[i].assign(
        Tot=tree_cluster_sizes.drop(-1, errors='ignore'))

## Write tables

In [51]:
# Save every contingency table as a tab-separated file in `workdir`, one file
# per threshold tag.
for i in clust_thresh:
    table_path = os.path.join(workdir, 'contingency_table_{}.tsv'.format(i))
    contingency_tables[i].to_csv(table_path, sep='\t')