/
ClusterCollection.py
255 lines (199 loc) · 7.7 KB
/
ClusterCollection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# -*- Mode: python; tab-width: 4; indent-tabs-mode:nil; coding:utf-8 -*-
# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
#
# MDAnalysis --- https://www.mdanalysis.org
# Copyright (c) 2006-2017 The MDAnalysis Development Team and contributors
# (see the file AUTHORS for the full list of names)
#
# Released under the GNU Public Licence, v2 or any higher version
#
# Please cite your use of MDAnalysis in published work:
#
# R. J. Gowers, M. Linke, J. Barnoud, T. J. E. Reddy, M. N. Melo, S. L. Seyler,
# D. L. Dotson, J. Domanski, S. Buchoux, I. M. Kenney, and O. Beckstein.
# MDAnalysis: A Python package for the rapid analysis of molecular dynamics
# simulations. In S. Benthall and S. Rostrup editors, Proceedings of the 15th
# Python in Science Conference, pages 102-109, Austin, TX, 2016. SciPy.
#
# N. Michaud-Agrawal, E. J. Denning, T. B. Woolf, and O. Beckstein.
# MDAnalysis: A Toolkit for the Analysis of Molecular Dynamics Simulations.
# J. Comput. Chem. 32 (2011), 2319--2327, doi:10.1002/jcc.21787
#
"""
Cluster representation --- :mod:`MDAnalysis.analysis.encore.clustering.ClusterCollection`
=========================================================================================

The module contains the Cluster and ClusterCollection classes which are
designed to store results from clustering algorithms.

:Author: Matteo Tiberti, Wouter Boomsma, Tone Bengtsen

.. versionadded:: 0.16.0

"""
from __future__ import absolute_import
import numpy as np
import six
class Cluster(object):
    """
    Generic Cluster class for clusters with centroids.

    Attributes
    ----------
    id : int
        Cluster ID number. Useful for the ClusterCollection class
    metadata : dict
        maps each metadata label to a numpy.array holding one value per
        cluster element.
    size : int
        number of elements.
    centroid : element object
        cluster centroid.
    elements : numpy.array
        array containing the cluster elements.
    """

    def __init__(self, elem_list=None, centroid=None, idn=None, metadata=None):
        """Class constructor. If elem_list is None, an empty cluster is created
        and the centroid and metadata arguments are ignored.

        Parameters
        ----------
        elem_list : numpy.array, sequence or None
            cluster elements. Anything accepted by ``numpy.asarray`` with at
            least one dimension; a numpy array is stored as-is.
        centroid : None or element object
            centroid; must be one of the values in `elem_list`.
        idn : int
            cluster ID
        metadata : dict or None
            maps a label to an iterable holding one value for each cluster
            element. Every iterable must have the same length as the
            elements array.

        Raises
        ------
        LookupError
            if `centroid` is not contained in `elem_list`.
        TypeError
            if a metadata entry does not have one value per element.
        """
        self.id = idn
        self.metadata = {}

        if elem_list is None:
            self.size = 0
            self.elements = np.array([])
            self.centroid = None
            return

        # asarray returns an ndarray input unchanged, and lets plain
        # sequences work with the .shape access below.
        self.elements = np.asarray(elem_list)
        if centroid not in self.elements:
            raise LookupError("Centroid of cluster not found in the element list")
        self.centroid = centroid
        self.size = self.elements.shape[0]

        if metadata:
            # add_metadata performs the per-label length validation.
            for name, data in metadata.items():
                self.add_metadata(name, data)

    def __iter__(self):
        """
        Iterate over elements in cluster
        """
        return iter(self.elements)

    def __len__(self):
        """
        Size of cluster
        """
        return len(self.elements)

    def add_metadata(self, name, data):
        """Attach one metadata value per element under the label `name`.

        Parameters
        ----------
        name : hashable
            label used as key in the ``metadata`` dict; an existing entry
            with the same label is replaced.
        data : sized iterable
            one value per cluster element; stored as a numpy.array copy.

        Raises
        ------
        TypeError
            if ``len(data)`` differs from the cluster size.
        """
        if len(data) != self.size:
            raise TypeError(
                "Size of metadata having label \"{0}\" is not equal to the "
                "number of cluster elements".format(name))
        self.metadata[name] = np.array(data)

    def __repr__(self):
        """
        Textual representation (the repr of the elements array)
        """
        return repr(self.elements)
class ClusterCollection(object):
    """Clusters collection class; this class represents the results of a full
    clustering run. It stores a group of clusters defined as
    encore.clustering.Cluster objects.

    Attributes
    ----------
    clusters : list or None
        list of Cluster objects which are part of the Cluster collection;
        None when the collection was created without elements. The
        container methods of this class treat None as an empty collection.
    """

    def __init__(self, elements=None, metadata=None):
        """Class constructor. If elements is None, an empty cluster collection
        will be created. Otherwise, the constructor takes as input an
        iterable of ints, for instance:

        [ a, a, a, a, b, b, b, c, c, ... , z, z ]

        the variables a,b,c,...,z are cluster centroids, here as cluster
        element numbers (i.e. 3 means the 4th element of the ordered input
        for clustering). The array maps a correspondence between
        cluster elements (which are implicitly associated with the
        position in the array) with centroids, i. e. defines clusters.
        For instance:

        [ 1, 1, 1, 4, 4, 5 ]

        means that elements 0, 1, 2 form a cluster which has 1 as centroid,
        elements 3 and 4 form a cluster which has 4 as centroid, and
        element 5 has its own cluster.

        Parameters
        ----------
        elements : iterable of ints or None
            clustering results. See the previous description for details
        metadata : {str:list, str:list,...} or None
            metadata for the data elements. The list must be of the same
            size as the elements array, with one value per element.

        Raises
        ------
        TypeError
            if the elements are not all of the same type (this includes an
            empty `elements`).
        ValueError
            if an element used as centroid is not assigned to itself.
        """
        if elements is None:
            self.clusters = None
            return

        if len(set(type(el) for el in elements)) != 1:
            raise TypeError("all the elements must have the same type")

        self.clusters = []
        elements_array = np.array(elements)
        centroids = np.unique(elements_array)

        # Every centroid must point to itself, otherwise the mapping does
        # not describe a consistent clustering.
        for i in centroids:
            if elements[i] != i:
                raise ValueError(
                    "element {0}, which is a centroid, doesn't "
                    "belong to its own cluster".format(i))

        # Cluster IDs are assigned in the (sorted) order of np.unique.
        for idn, c in enumerate(centroids):
            members = np.where(elements_array == c)
            this_metadata = {}
            if metadata:
                for k, v in metadata.items():
                    this_metadata[k] = np.asarray(v)[members]
            self.clusters.append(
                Cluster(elem_list=members[0], idn=idn, centroid=c,
                        metadata=this_metadata))

    def _cluster_list(self):
        """Return the stored clusters, or an empty list if there are none."""
        return self.clusters if self.clusters is not None else []

    def get_ids(self):
        """
        Get the ID numbers of the clusters

        Returns
        -------
        ids : list of int
            list of cluster ids
        """
        # Cluster stores its ID number in the ``id`` attribute.
        return [v.id for v in self._cluster_list()]

    def get_centroids(self):
        """
        Get the centroids of the clusters

        Returns
        -------
        centroids : list of cluster element objects
            list of cluster centroids
        """
        return [v.centroid for v in self._cluster_list()]

    def __iter__(self):
        """
        Iterate over clusters
        """
        return iter(self._cluster_list())

    def __len__(self):
        """
        Length of clustering collection
        """
        return len(self._cluster_list())

    def __repr__(self):
        """
        Textual representation: one line per cluster with its id, size,
        centroid and elements
        """
        return "".join(
            "{0} (size:{1},centroid:{2}): {3}\n".format(cluster.id,
                                                        len(cluster),
                                                        cluster.centroid,
                                                        repr(cluster))
            for cluster in self._cluster_list())