# PhenoGraph clustering of methylation probes

This notebook clusters the methylation probes (GEO accession GSE18700) with PhenoGraph to discover subgroups of probes.

In [1]:
import phenograph
import pandas as pd
import numpy as np
import networkx as nx
from collections import Counter

In [2]:
# Read the tab-separated methylation table: the first row supplies the column
# names and the first column becomes the row index. Then preview two rows.
methylation = pd.read_csv(
    '../../data/Figueroa/methyl-successiveNormalized.tsv',
    sep='\t',
    header=0,
    index_col=0,
)
methylation.head(2)

Unnamed: 0_level_0,GSM464771,GSM464772,GSM464773,GSM464774,GSM464775,GSM464776,GSM464777,GSM464778,GSM464779,GSM464780,...,GSM465020,GSM465024,GSM465026,GSM465030,GSM465060,GSM465061,GSM465062,GSM465063,GSM465064,GSM465065
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LOC100133331,0.565665,-0.276251,0.546472,1.390992,-1.250427,-0.723163,-1.363963,-0.621765,-0.090072,-0.598943,...,-0.2578,0.402758,-0.649402,-0.430032,0.267208,2.583242,-0.050794,-0.68156,1.849207,0.31907
AK091100,0.836235,0.945424,1.065551,-1.012361,-1.288259,-0.319922,-0.336841,-0.9482,-0.632316,0.028876,...,-0.980434,0.360955,0.093377,-1.229339,-0.429456,1.59134,-0.495726,-1.219191,0.706632,0.152214


In [8]:
# Seed NumPy's global RNG before clustering.
# NOTE(review): the run log shows Louvain working from a graph written to a
# binary file; confirm that this seed alone makes the partition reproducible.
np.random.seed(1)
# k is spelled out (30 matches the "Finding 30 nearest neighbors" log line and
# the "k30" suffix of the exported CSV) so the result does not hinge on the
# library default.
communities, graph, Q = phenograph.cluster(methylation, k=30)

Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm
Neighbors computed in 102.98744797706604 seconds
Jaccard graph constructed in 3.765331983566284 seconds
Wrote graph to binary file in 1.6877951622009277 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.826156
After 5 runs, maximum modularity is Q = 0.827243
Louvain completed 25 runs in 17.176079988479614 seconds
PhenoGraph complete in 125.85422682762146 seconds


In [9]:
# Count how many rows of the table were assigned to each community label.
Counter(communities)

Counter({4: 1660,
         3: 2061,
         0: 3467,
         1: 2334,
         12: 754,
         13: 695,
         15: 500,
         8: 970,
         6: 1331,
         11: 855,
         9: 941,
         7: 1273,
         2: 2327,
         5: 1363,
         17: 337,
         10: 888,
         14: 506,
         16: 377,
         18: 65,
         19: 55})

In [11]:
# Build a NetworkX graph from the sparse graph returned by PhenoGraph.
# `from_scipy_sparse_matrix` was removed in NetworkX 3.0; prefer its
# replacement `from_scipy_sparse_array` and fall back on older releases.
_sparse_to_graph = getattr(nx, 'from_scipy_sparse_array', None)
if _sparse_to_graph is None:
    _sparse_to_graph = nx.from_scipy_sparse_matrix
G = _sparse_to_graph(graph)
# Colour every node by its community label; thin edges and tiny nodes are used
# because the graph has one node per clustered row.
nx.draw(G, node_color=communities, edge_color='#63666A', width=0.1, node_size=0.5)

  distance = np.sqrt((delta**2).sum(axis=0))


PhenoGraph discovered 20 communities of probes. A sensitivity analysis is still needed to assess the stability of these clusters.

In [10]:
# Attach the community labels to a *copy* of the table. A plain
# `out = methylation` only aliases the frame, so adding the column would also
# modify `methylation`, and re-running the clustering cell would then treat the
# labels as an extra feature column.
out = methylation.assign(communities=communities)
# Keep the row index in the CSV (index=True).
out.to_csv('../../data/Figueroa/methylc-cluster-k30.csv', index=True)

In [12]:
import scipy.sparse as sparse
import scipy.io
import numpy as np

def save_sparse_matrix(filename, x):
    """Write a sparse matrix to ``filename`` as a NumPy ``.npz`` archive.

    The matrix is converted to COO form and stored as four arrays named
    ``row``, ``col``, ``data`` and ``shape``.

    Parameters
    ----------
    filename
        Target handed unchanged to ``np.savez``.
    x
        Object exposing ``tocoo()`` (e.g. a SciPy sparse matrix).
    """
    coo = x.tocoo()
    np.savez(filename, row=coo.row, col=coo.col, data=coo.data, shape=coo.shape)

def load_sparse_matrix(filename):
    """Load a sparse matrix from an ``.npz`` archive written by ``save_sparse_matrix``.

    The archive must contain the arrays ``row``, ``col``, ``data`` and
    ``shape``.

    Parameters
    ----------
    filename
        Path (or open file object) of the archive. A path without the
        ``.npz`` suffix is accepted when only the suffixed file exists,
        because ``np.savez`` appends ``.npz`` to names that lack it.

    Returns
    -------
    scipy.sparse.coo_matrix
        The matrix rebuilt from the stored coordinates and values.
    """
    import os  # local import: only needed for the extension fallback below

    path = filename
    if isinstance(path, (str, os.PathLike)):
        path = os.fspath(path)
        # Mirror np.savez, which silently adds ".npz" on save.
        if isinstance(path, str) and not path.endswith('.npz') and not os.path.exists(path):
            path = path + '.npz'

    # NpzFile holds the archive open until closed; the arrays are read into
    # memory inside the `with`, so they stay valid after the file is released.
    with np.load(path) as archive:
        entries = (archive['data'], (archive['row'], archive['col']))
        shape = tuple(int(dim) for dim in archive['shape'])
    return sparse.coo_matrix(entries, shape=shape)

In [14]:
# Persist the PhenoGraph graph as an .npz archive.
save_sparse_matrix(
    filename='../../data/Figueroa/methyl-graph.npz',
    x=graph,
)