In [2]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
if sys.version_info[0] < 3:
    print("Python 3 is required, but you are using Python %i.%i.%i") % (
        sys.version_info[0], sys.version_info[1], sys.version_info[2])
    sys.exit(1)
    
# Retrieve the specific functions from ind and proteins.py
indels_path="/home/maya/InDelScanner"  # /PATH/TO/InDelScanner
if indels_path not in sys.path:
    sys.path.append(indels_path)
#from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign

from ipynb.fs.defs.Kinase_statistics import convert_variant_to_dict

os.chdir("/mnt/c/Users/Maya/Dropbox/mek_results")

with open('Remkes_protein.p', 'rb') as f:
    all_ref = pickle.load(f)
with open('Remkes_protein_low.p', 'rb') as f:
    low = pickle.load(f)

all_ref['mek']['low-v2'] = low['mek']['low-v2']

mek = {}
for fraction in ['high', 'med']:
    mek[fraction] = Counter(all_ref['mek'][fraction])
mek['low-t'] = Counter(all_ref['mek']['low']) + Counter(all_ref['mek']['low-v2'])

In [4]:
# Set general restrictions stemming from SpliMLib library design
aa_2 = ['A', 'Δ']
aa_12 = ['A','G','P','Y','D','K','M','V','I','L','F','W']
aa_13 = aa_12 +  ['Δ']
pos_aa = {'6': aa_12, '9': aa_12, '11': aa_12, '13': aa_12, '7a': aa_13, '8a': aa_2}

In [5]:
df_all = pd.DataFrame.from_dict(mek).fillna(0).sort_values(by=['high', 'med', 'low-t'], ascending=False).astype(int)
df_50p = df_all.loc[(df_all['high'] >= 50) & ((df_all['high']+df_all['med']) > 2*df_all['low-t'])]
df_20to50 = df_all.loc[(df_all['high'].isin(range(10,50))) & 
                       (df_all['high'] > df_all['med']) & 
                       ((df_all['high']+df_all['med']) > 5*df_all['low-t']) ]

df_pos = df_50p.append(df_20to50)
pos = df_pos.to_dict()

## Clustering: k-medoids and hiearchical

In [6]:
import gower
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

In [7]:
active_ls = df_pos.index.tolist()
active = {short : convert_variant_to_dict(short) for short in active_ls}

In order to perform hiearchical clustering, I need a dataframe where each column represents the AA present at one randomised position. Then, each column is a categorical variable and the columns can be compared with Gowle dissimilarity, followed by K-medioid clustering or hiearchical clustering.

https://healthcare.ai/clustering-non-continuous-variables/

https://www.thinkdatascience.com/post/2019-12-16-introducing-python-package-gower/

https://www.researchgate.net/publication/272351873_NumPy_SciPy_Recipes_for_Data_Science_k-Medoids_Clustering

In [8]:
valid_pos = ['6', '7a', '8a', '9', '11', '13']

data = {}
for short, m_to_pos in active.items():
    if len(m_to_pos) != len(valid_pos):
        continue
    else:
        data[short] = [m_to_pos[i] for i in valid_pos]

factors = pd.DataFrame.from_dict(data, orient='index', columns=valid_pos)

In [7]:
factors.head()

Unnamed: 0,6,7a,8a,9,11,13
6L/7aI/8aA/9L/11F/13M,L,I,A,L,F,M
6F/7aP/9W/11L/13M,F,P,Δ,W,L,M
6L/7aF/9L/11I/13I,L,F,Δ,L,I,I
6A/7aI/8aA/9L/11L/13I,A,I,A,L,L,I
6W/7aI/9F/11L/13V,W,I,Δ,F,L,V


In [8]:
# gower_matrix = gower.gower_matrix(factors)

In [9]:
# np.savez_compressed('Full_gower_matrix.npz', gw=gower_matrix)

In [14]:
gower_matrix = np.load('Full_gower_matrix.npz')['gw']

In [15]:
gower_matrix.shape

(32375, 32375)

In [18]:
def kMedoids(D, k, tmax=100):
    # determine dimensions of distance matrix D
    m, n = D.shape
    
    # randomly initialize an array of k medoid indices
    M = np.sort(np.random.choice(n, k))
    
    # create a copy of the array of medoid indices
    Mnew = np.copy(M)
    
    # initialize a dictionary to represent clusters
    C = {}
    
    for t in range(tmax): # t iterates over the convergence cycles
        # determine clusters, i.e. arrays of data indices
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
        
        # update cluster medoids
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
            np.sort(Mnew)
                
        # check for convergence
        if np.array_equal(M, Mnew):
            break
                
        M = np.copy(Mnew)
    
    else:
        # final update of cluster memberships
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
                
    # return results
    return M, C

In [19]:
medoids, clusters = kMedoids(gower_matrix, 5, tmax=20)

In [21]:
medoids[0]

402

In [23]:
active[medoids[0]]

KeyError: 402

In [None]:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/

## GraphML

In [24]:
import random

In [64]:
test = random.sample(active.items(), 1000)
test_dict = {}
for i in range(len(test)):
    test_dict[test[i][0]] = test[i][1]

In [65]:
header = '<?xml version="1.0" encoding="utf-8"?>'
graphml_open = '<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" \
    xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns  http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">'
graph_open = '<graph edgedefault="undirected">'

close = '</graph>\n</graphml>'

In [70]:
# hammet distance = 0 if the aa matches at all positions
def hammet_distance(s1, s2, pos_aa):
    d = 0
    if (len(s1) != len(pos_aa)) or (len(s2) != len(pos_aa)):
        d += 2

    for p in pos_aa.keys():
        if s1[p] != s2[p]:
            d += 1
    return d

In [None]:
# make a list of edges
def list_edge_nodes(graph_dict, pos_aa):
    node_pairs = []
    ordered_variants = list(graph_dict.keys())
    for i in range(len(ordered_variants)):
        short1 = ordered_variants[i]
        s1 = graph_dict[short1]
        for j in range(i+1, len(ordered_variants)):
            short2 = ordered_variants[j]
            s2 = graph_dict[short2]

            h_dis = hammet_distance(s1, s2, pos_aa)
            if h_dis == 1:
                node_pairs.append((short1, short2))
    return node_pairs

In [105]:
# GRAPHML functions

# create nodes
def gml_write_node(short, file):
    print('<node id=\"{node}"/>'.format(node=short), sep="", file=file)

def gml_list_edge_strings(node_pairs):
    edge_strings = []
    for pair in node_pairs:
        edge_id = '-'.join(pair)
        one_edge = '<edge id="{id}" source="{s}" target="{t}"/>'.format(
            id=edge_id, s=pair[0], t=pair[1])
        edge_strings.append(one_edge)
    return edge_strings

Now write the graph file.

In [110]:
def write_graphml(graph_dict, filename):
    with open (filename, 'w') as f:
        # print the GRAPHML header definitions
        print(header, graphml_open, graph, file=f, sep='\n')

        # write nodes
        for short in graph_dict.keys():
            gml_write_node(short, f)

        # write edges
        node_pairs = list_edge_nodes(graph_dict, pos_aa)
        edge_strings = gml_list_edge_strings(node_pairs)
        f.writelines(edge_strings)

        # end the graph and graphml
        print(close, file=f)
    
    return node_pairs

In [111]:
test_edges = write_graphml(test_dict, 'test.graphml')

In [114]:
active_edges = write_graphml(active, 'active.graphml')

## Sequence similarity network analysis with NetworkX

In [1]:
import networkx as nx

ModuleNotFoundError: No module named 'networkx'