/
clustering.py
executable file
·354 lines (273 loc) · 11.3 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
"""
Author - Luke Quinn
Capabilties - 1) Creating numpy pickle file to imporve load speeds
2) General purpose PCA
Note this requires you to have the exon.csv file present
"""
import numpy as np
import csv
import pickle
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import time
# see https://github.com/jacoblevine/PhenoGraph
import phenograph
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
from sklearn.manifold import TSNE
def PCAWrapper(data):
    """Project *data* onto its first two principal components and save a scatter plot.

    Uses SVDWrapper (TruncatedSVD on the transposed matrix), so the
    decomposition stays cheap for sparse gene-count matrices.
    """
    projected, variance_ratios = SVDWrapper(data, dim=2)
    axis_x = "PC-1, Var=" + str(variance_ratios[0])
    axis_y = "PC-2, Var=" + str(variance_ratios[1])
    DrawScatterPlot(projected.T, "PCA of Gene data", labelX=axis_x, labelY=axis_y)
def BuildNumpyArray(save=False):
    """Read the raw exon-count CSV into a (genes x samples) float64 matrix.

    The CSV's first row is a header and its first column holds gene ids,
    so both are skipped.  When *save* is true the matrix is also written
    to 'matrixPickle.npy' for faster subsequent loads.
    """
    matrix = np.zeros((50281, 15928), dtype=np.float64)
    with open('human_MTG_2018-06-14_exon-matrix.csv') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quoting=csv.QUOTE_NONE)
        out_row = 0
        for line_no, line in enumerate(reader):
            if line_no == 0:
                continue  # header row
            matrix[out_row] = np.array(line[1:], dtype=np.float64)
            out_row += 1
    if save:
        print("Saving matrix")
        np.save("matrixPickle", matrix)
    print("Data read but not reduced here")
    return matrix
def get_genes_to_delete():
    """Return the entrez ids (column 3) of genes whose chromosome field
    (column 2) contains an 'X' or a 'Y', i.e. sex-chromosome genes.

    Reads 'human_MTG_2018-06-14_genes-rows.csv'; every line is parsed
    (no header is skipped here).
    """
    with open('human_MTG_2018-06-14_genes-rows.csv') as handle:
        reader = csv.reader(handle, quotechar='"', delimiter=',',
                            quoting=csv.QUOTE_ALL, skipinitialspace=True)
        return [record[2] for record in reader
                if 'X' in record[1] or 'Y' in record[1]]
def get_samples_to_delete():
    """Return names of low-quality samples from the samples-columns CSV.

    A sample is rejected when any QC rule fails:
      * percent of unique reads (col 25) <= 50
      * percent of aligned reads total (col 28) <= 40
      * reads aligned to exons+introns (estimated from percent-exon,
        percent-intron and total reads) <= 500000
    The header line is skipped before parsing.
    """
    COL_NAME = 0
    COL_TOTAL_READS = 19
    COL_PCT_EXON = 20
    COL_PCT_INTRON = 21
    COL_READS_UNIQUE = 25
    COL_PCT_ALIGNED_TOTAL = 28
    rejected = []
    with open('human_MTG_2018-06-14_samples-columns.csv') as handle:
        next(handle)  # skip header
        reader = csv.reader(handle, quotechar='"', delimiter=',',
                            quoting=csv.QUOTE_ALL, skipinitialspace=True)
        for record in reader:
            # estimated absolute read count mapped to exon + intron regions
            exon_intron_reads = ((float(record[COL_PCT_EXON]) +
                                  float(record[COL_PCT_INTRON])) *
                                 float(record[COL_TOTAL_READS]) / 100.0)
            if (float(record[COL_READS_UNIQUE]) <= 50
                    or float(record[COL_PCT_ALIGNED_TOTAL]) <= 40
                    or exon_intron_reads <= 500000):
                rejected.append(record[COL_NAME])
    return rejected
def BuildFilteredNumpyArray(save=False):
    """Read the exon matrix, dropping sex-chromosome genes and low-quality samples.

    Returns
    -------
    data : np.ndarray
        Filtered (genes x samples) count matrix with all-zero gene rows removed.
    columnIndicesToDelete : list[int]
        Indices of the removed samples *in the exon-matrix header row*.
        These are intentionally left header-based (offset +1 vs. data
        columns, because the header's first cell is the gene-id column);
        SubsetsBasedOnType consumes them as row numbers of the
        samples-columns CSV, which carries the same +1 offset due to its
        own header line.
    """
    genesToDelete = get_genes_to_delete()
    samplesToDelete = get_samples_to_delete()
    gene_blacklist = set(genesToDelete)        # O(1) membership in the row loop
    sample_blacklist = set(samplesToDelete)
    data = np.zeros((50281 - len(genesToDelete), 15928), dtype=np.float64)
    with open('human_MTG_2018-06-14_exon-matrix.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quoting=csv.QUOTE_NONE)
        matCount = 0
        samples_header = None
        for i, row in enumerate(csv_reader):
            if i == 0:
                samples_header = row           # cell 0 is the gene-id column
            elif row[0].strip('"') not in gene_blacklist:
                data[matCount] = np.array(row[1:], dtype=np.float64)
                matCount += 1
    columnIndicesToDelete = [i for i, name in enumerate(samples_header)
                             if name.strip('"') in sample_blacklist]
    # BUG FIX: header indices are offset by one relative to data columns
    # (data columns come from row[1:]), so shift them before deleting;
    # previously the unshifted indices removed the wrong columns.
    data = np.delete(data, [i - 1 for i in columnIndicesToDelete], axis=1)
    # remove genes that are zero in every remaining sample
    data = data[~np.all(data == 0, axis=1)]
    if save:
        print("Saving matrix")
        np.save("matrixPickle", data)
    print("Data read but not reduced here")
    return data, columnIndicesToDelete
def DisplayPCAOnExonMatrix(data):
    # Plot how much variance the leading principal components capture and
    # save the per-component variances to 'Variances.npy'.
    #
    # NOTE(review): PCA() is not defined anywhere in this file and no import
    # provides it -- calling this as-is raises NameError.  Presumably a
    # full-PCA helper (returning projected data, per-component variances and
    # eigenvectors) existed elsewhere; confirm before using this function.
    #if not data:
    #    data = np.load('./matrixPickle.npy')
    new_data, variances, eigenvectors = PCA(data)
    # Accumulate variances until the running total exceeds 50 (presumably
    # percent -- TODO confirm the units PCA() returns), recording how many
    # components that takes.
    y = []
    total = 0
    for point in variances:
        y.append(point)
        total += point
        if total > 50:
            print("It took " + str(len(y)) + " to account for " + str(total) + " of the variance")
            break
    x = [i for i in range(len(y))]
    # Scree-style plot: captured variance per dimension.
    plt.figure()
    plt.plot(x, y)
    plt.xlabel('Dimension')
    plt.ylabel('Captured Variance')
    plt.savefig('./VariancePlot.png')
    np.save('Variances', variances)
    # Commented-out 2-D scatter of the projected data, kept from an
    # earlier experiment.
    """
    plt.figure()
    plt.plot(new_data[0,:], new_data[1:], 'x')
    plt.title('PCA modded data')
    plt.savefig('./2dPCAPlot.png')
    """
def Testsetup():
    """Smoke-test the headless matplotlib setup with a stem plot of random ints.

    Writes './Temp.png'; returns nothing.
    """
    # BUG FIX: np.random.random_integers was deprecated and later removed
    # from NumPy; randint with an exclusive upper bound of 101 draws from
    # the same inclusive range [1, 100].
    y = np.random.randint(1, 101, 100)
    plt.figure()
    plt.stem(y.ravel())
    plt.xlabel('Linear')
    plt.ylabel('Random')
    plt.savefig('./Temp.png')
def DrawScatterPlot(data, name, labelX='X', labelY='Y', colour_seq=None):
    """Scatter-plot the first two rows of *data* and save the figure as a PNG.

    The output filename is the title with spaces removed plus a unix
    timestamp, so repeated calls never overwrite each other.
    """
    plt.figure()
    plt.scatter(data[0, :], data[1, :], c=colour_seq, s=1)
    plt.title(name)
    plt.xlabel(labelX)
    plt.ylabel(labelY)
    out_path = "./" + name.replace(" ", "") + str(time.time()) + ".png"
    plt.savefig(out_path, transparent=True)
def VariableSubset(data):
    """Filter and log-CPM-normalise a (genes x cells) count matrix.

    Keeps only cells (columns) expressing more than 1000 genes and genes
    (rows) with non-zero variance, then converts counts to log(CPM + 1).
    Note: the surviving sub-array is normalised in place.
    """
    # drop cells expressing too few genes
    data = data[:, np.count_nonzero(data, axis=0) > 1000]
    # drop genes that never vary across the remaining cells
    data = data[np.std(data, axis=1) > 0]
    # counts -> counts-per-million, then log(1 + x) transform
    data /= np.sum(data, axis=0)
    data *= 1e6
    data += 1
    data = np.log(data)
    print("The variable subset shape is :", data.shape)
    return data
def SVDWrapper(data, dim=50):
    """Run TruncatedSVD on the transposed matrix (samples become rows).

    Returns the (samples x dim) projection and the per-component
    explained-variance ratios.  random_state is fixed for reproducibility.
    """
    samples = sparse.csr_matrix(data.T)
    reducer = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
    reducer.fit(samples)
    return reducer.transform(samples), reducer.explained_variance_ratio_
def TSNEWrapper(data):
    """Embed *data* into 2-D with t-SNE (sklearn defaults)."""
    model = TSNE(n_components=2)
    return model.fit_transform(data)
def TSNEPipeline(data):
    """Reduce *data* to 20 dims with truncated SVD, then embed with t-SNE.

    Prints wall-clock timestamps around each stage and returns the
    transposed (2 x samples) embedding.
    """
    print("Starting SVD at: ", time.time())
    reduced = SVDWrapper(data, dim=20)[0]  # SVDWrapper already works on data.T
    print("SVD complete at: ", time.time())
    print("result shape is ", reduced.shape)
    print("Starting TSNE ... ")
    embedding = TSNEWrapper(reduced)
    print("TSNE finished at ", time.time())
    print(embedding.shape)
    return embedding.T
def SubsetsBasedOnType(data, ignoreIdx):
    """Partition sample row numbers by broad cell class.

    Scans the samples-columns CSV (rows numbered from 0, header included)
    and buckets each row number by whether its last column mentions "Inh"
    (inhibitory), "Exc" (excitatory) or neither (non-neuronal).  Row
    numbers listed in *ignoreIdx* are skipped.  *data* is unused.
    """
    excitatory = []
    inhibitory = []
    non_neuronal = []
    with open('human_MTG_2018-06-14_samples-columns.csv') as csv_file:
        for row_no, row in enumerate(csv.reader(csv_file, delimiter=',')):
            if row_no in ignoreIdx:
                continue
            cluster_label = row[-1]
            if "Inh" in cluster_label:
                inhibitory.append(row_no)
            elif "Exc" in cluster_label:
                excitatory.append(row_no)
            else:
                non_neuronal.append(row_no)
    return (excitatory, inhibitory, non_neuronal)
def PhenoClusterSubsets(data, subsets_indices):
    """Cluster each cell-class subset separately with PhenoGraph.

    Labels are shifted so the three subsets occupy disjoint, increasing
    ranges (usable as one global colour sequence).
    NOTE(review): subsets are selected with data[indices], i.e. by ROW --
    confirm the matrix orientation puts the items to cluster on rows.
    """
    print("Clustering 1")
    exc_labels, _, _ = phenograph.cluster(data[subsets_indices[0]])
    print("Clustering 2")
    inb_labels, _, _ = phenograph.cluster(data[subsets_indices[1]])
    print("Clustering 3")
    non_labels, _, _ = phenograph.cluster(data[subsets_indices[2]])
    print(np.max(exc_labels))
    print(np.max(inb_labels))
    print(np.max(non_labels))
    # shift each subset's labels past the previous subset's maximum
    exc_labels += 1
    inb_labels += (np.max(exc_labels) + 1)
    non_labels += (np.max(inb_labels) + 1)
    return (exc_labels, inb_labels, non_labels)
def merge_arrays_inorder(idx, poll_array):
    """Merge two sorted index arrays and their parallel payload arrays.

    Parameters
    ----------
    idx : pair of sorted index sequences.
    poll_array : pair of payload sequences parallel to idx[0] / idx[1].

    Returns
    -------
    (orderedIdx, orderedPoll) : the index VALUES of both inputs merged in
    ascending order (ties taken from the second array, matching the
    original comparison), with payloads carried alongside.

    BUG FIX: the original appended the loop counters (positions within
    each input) to orderedIdx instead of the index values idx[k][pos],
    so the merged "indices" were meaningless and a second-level merge
    (as done in ColourPlot) compared counters against real indices.
    """
    pos_a = 0
    pos_b = 0
    orderedIdx = []
    orderedPoll = []
    len_a = len(idx[0])
    len_b = len(idx[1])
    while pos_a < len_a or pos_b < len_b:
        # take from the first array only while it has elements left AND
        # its next index is strictly smaller (or the second is exhausted)
        if pos_a < len_a and (pos_b >= len_b or idx[0][pos_a] < idx[1][pos_b]):
            orderedIdx.append(idx[0][pos_a])
            orderedPoll.append(poll_array[0][pos_a])
            pos_a += 1
        else:
            orderedIdx.append(idx[1][pos_b])
            orderedPoll.append(poll_array[1][pos_b])
            pos_b += 1
    return orderedIdx, orderedPoll
def AgglogClusterSubsets(data, subsets_indices):
    # Agglomerative-clustering counterpart of PhenoClusterSubsets: cluster
    # each cell-class subset with a fixed cluster count (24 excitatory,
    # 45 inhibitory, 6 non-neuronal) and shift the labels so the three
    # subsets occupy disjoint, increasing ranges.
    #
    # NOTE(review): `agg` is not defined or imported anywhere in this file --
    # presumably sklearn.cluster.AgglomerativeClustering aliased as `agg`.
    # Restore that import before using this function, or it raises NameError.
    print("Clustering Exc ")
    cluster1 = agg(n_clusters=24).fit(data[subsets_indices[0]])
    print("Clustering Inb ")
    cluster2 = agg(n_clusters=45).fit(data[subsets_indices[1]])
    print("Clustering Non-Neural ")
    cluster3 = agg(n_clusters=6).fit(data[subsets_indices[2]])
    exc_colours = cluster1.labels_
    inb_colours = cluster2.labels_
    non_neural_colours = cluster3.labels_
    print(np.max(exc_colours))
    print(np.max(inb_colours))
    print(np.max(non_neural_colours))
    # builds a colour sequences that increases in number
    inb_colours += (np.max(exc_colours) + 1)
    non_neural_colours += (np.max(inb_colours) + 1)
    print(np.max(non_neural_colours))
    return (exc_colours, inb_colours, non_neural_colours)
def ColourPlot(data, name, subsets_indices, colours, labelX1="", labelY1=""):
    """Scatter-plot *data* coloured by per-subset cluster labels.

    The three (indices, colours) subset pairs are merged back into one
    colour sequence ordered by sample index via two pairwise merges,
    then handed to DrawScatterPlot.
    """
    print("Building Colour Sequence")
    merged_idx, merged_colours = merge_arrays_inorder(
        (subsets_indices[0], subsets_indices[1]), (colours[0], colours[1]))
    _, colour_seq = merge_arrays_inorder(
        (merged_idx, subsets_indices[2]), (merged_colours, colours[2]))
    colour_seq = colour_seq[:-1]  # chop the last one off
    print("Ploting Data")
    DrawScatterPlot(data, name, labelX=labelX1, labelY=labelY1,
                    colour_seq=colour_seq)
def ColourPlotAdv(data, name, subsets_indices, colours):
    """Like ColourPlot, but with a custom discrete colormap.

    Builds the ordered colour sequence via two pairwise merges, then
    scatters with a gist_ncar-derived colormap whose first entry is
    grey and a 75-step BoundaryNorm over [0, 1].
    """
    merged_idx, merged_colours = merge_arrays_inorder(
        (subsets_indices[0], subsets_indices[1]), (colours[0], colours[1]))
    _, colour_seq = merge_arrays_inorder(
        (merged_idx, subsets_indices[2]), (merged_colours, colours[2]))
    colour_seq = colour_seq[:-1]  # chop the last one off
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    base = plt.cm.gist_ncar
    palette = [base(i) for i in range(base.N)]
    palette[0] = (.5, .5, .5, 1.0)  # grey out the first colour
    cmap = mpl.colors.LinearSegmentedColormap.from_list('Custom cmap', palette, base.N)
    bounds = np.linspace(0, 1, 75)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
    ax.scatter(data[0, :], data[1, :], c=colour_seq, s=1, cmap=cmap, norm=norm)
    ax.set_title("Better colors?")
    out_path = "./" + name.replace(" ", "") + str(time.time()) + ".png"
    plt.savefig(out_path, transparent=True)
def Main():
    """Full pipeline: load & filter counts, normalise, cluster, t-SNE, plot."""
    # load the filtered matrix (sex-chromosome genes / bad samples removed)
    data, removedIdx = BuildFilteredNumpyArray()
    # data = np.load('matrixPickle.npy')  # alternative: reuse a saved matrix
    data = VariableSubset(data)
    print("Spliting Based on type")
    subsets_indices = SubsetsBasedOnType(data, removedIdx)
    # colours = AgglogClusterSubsets(data, subsets_indices)  # agglomerative alternative
    colours = PhenoClusterSubsets(data, subsets_indices)
    print("Start TSNE")
    projection = TSNEPipeline(data)
    ColourPlot(projection,
               "Gene data TSNE projection coupled with Phenograph clustering",
               subsets_indices, colours,
               labelX1="TSNE Dim 1", labelY1="TSNE Dim 2")
if __name__ == "__main__":
Main()