'''
Functions for the CSE546 final project: predicting words from fMRI images
using least squares regression on PCA-reduced voxel data.
Authors: Stella Stylianidou, Tomasz Sakrejda
'''
import scipy.io as io
import scipy.stats as stats
import numpy as np
from fmridataloader import *  # provides fmri_train, fmri_test, wordid_test, ytrain, wordfeature_std, dictionary
from sklearn.decomposition import PCA
import test_suite
import matplotlib.pyplot as plt
# least squares calculation using the pseudo-inverse
def least_squares(X, Y):
    pseudo_inv_X = np.linalg.pinv(X)
    w_least_squares = np.dot(pseudo_inv_X, Y)
    return w_least_squares
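# Illustrative sketch (not part of the original pipeline): because least_squares
# uses the Moore-Penrose pseudo-inverse, it recovers the generating weights on a
# noiseless, well-conditioned synthetic problem. Hypothetical helper, never called
# by the rest of this module.
def _example_least_squares():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)               # 100 samples, 5 features
    w_true = np.arange(1.0, 6.0)        # known weights [1, 2, 3, 4, 5]
    y = X.dot(w_true)                   # noiseless targets
    w_hat = least_squares(X, y)         # pseudo-inverse solution
    assert np.allclose(w_hat, w_true)   # recovers w_true up to numerical error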
def pcaData(dimensions, training_data, testing_data):
    '''
    Finds the principal components of the training data for the given
    number of dimensions and returns the transformed training
    and testing data.
    :param dimensions: number of principal components to keep
    :param training_data: data for the model to be trained on
    :param testing_data: data for the model to be tested on
    :return: xtrainPCA: transformed training data, xtestPCA: transformed testing data
    '''
    pca = PCA(n_components=dimensions)
    pca.fit(training_data)
    xtrainPCA = pca.transform(training_data)
    xtestPCA = pca.transform(testing_data)
    print(pca.explained_variance_ratio_)
    variance = pca.explained_variance_ratio_  # plot the explained variance of the components kept
    plt.plot(variance)
    plt.show()
    return [xtrainPCA, xtestPCA]
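# Illustrative sketch: pcaData fits PCA on the training matrix only and applies the
# same projection to the test matrix, so both come back with `dimensions` columns.
# Hypothetical shapes; never called by the rest of this module (note that pcaData
# also pops up a variance plot).
def _example_pcaData():
    rng = np.random.RandomState(0)
    train = rng.randn(50, 200)                       # e.g. 50 images, 200 voxels
    test = rng.randn(10, 200)                        # 10 images, same voxels
    [xtrainPCA, xtestPCA] = pcaData(10, train, test)
    assert xtrainPCA.shape == (50, 10) and xtestPCA.shape == (10, 10)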
def nonlinearFeatures(dimensions):
    '''
    Applies PCA for the given number of dimensions. It then multiplies
    each principal component with each other principal component to create
    nonlinear (pairwise product) features and finds the least squares solution.
    :param dimensions: number of principal components to keep
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong]
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, fmri_train, fmri_test)
    n = xtrainPCA.shape[0]
    dimPCA = xtrainPCA.shape[1]
    num_pairs = dimPCA * (dimPCA - 1) // 2  # number of i < j component pairs
    xNonlinear = np.zeros((n, dimPCA + num_pairs))
    xtestNonlinear = np.zeros((xtestPCA.shape[0], dimPCA + num_pairs))
    counter = dimPCA
    xNonlinear[:, 0:dimPCA] = xtrainPCA
    xtestNonlinear[:, 0:dimPCA] = xtestPCA
    for i in range(dimPCA):
        for j in range(i + 1, dimPCA):
            xNonlinear[:, counter] = xtrainPCA[:, i] * xtrainPCA[:, j]
            xtestNonlinear[:, counter] = xtestPCA[:, i] * xtestPCA[:, j]
            counter += 1
    num_features = ytrain.shape[1]
    d = xNonlinear.shape[1]
    ntotdata = xNonlinear.shape[0]
    bestw = np.zeros([num_features, d])
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        y = ytrain[:, i].reshape(ntotdata, 1)
        x = xNonlinear[:, :]
        w = least_squares(x, y)
        bestw[i, :] = w.reshape(d)
    wfile = "w_lse_nonlinear.mtx"
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std, xtestNonlinear)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestNonlinear, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xNonlinear, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestNonlinear, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
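# Illustrative sketch of the pairwise-product expansion used above: with d principal
# components, the products x_i * x_j for i < j add d*(d-1)/2 columns, giving
# d + d*(d-1)/2 features in total (e.g. d = 300 gives 300 + 44850 = 45150).
# Hypothetical helper, not used by nonlinearFeatures itself.
def _example_pairwise_features(x):
    n, d = x.shape
    cols = [x]
    for i in range(d):
        for j in range(i + 1, d):
            cols.append((x[:, i] * x[:, j]).reshape(n, 1))
    return np.hstack(cols)  # shape (n, d + d * (d - 1) // 2)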
def findw_PCA_LSE(dimensions, train_data, test_data, wordid_test, ytrain, wfile="w_lse_dim299.mtx"):
    '''
    Projects the data onto the given number of principal components and
    solves least squares on the projected data to obtain the weights for each semantic
    feature. Using the weights obtained it calculates the predicted semantic features
    on the test data set and returns the accuracy on a guess out of 2 words. It also
    returns rmse on test and training data, and on the wrong column of the test data.
    :param dimensions: number of pca components
    :param train_data: training data
    :param test_data: test data
    :param wordid_test: ids of the words in the test data set
    :param ytrain: semantic feature values for the training data
    :param wfile: name of the file where the w's will be written
    :return: [accuracy, rmsetrain, rmsetest, rmsetestwrong]
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, train_data, test_data)
    num_features = ytrain.shape[1]
    d = xtrainPCA.shape[1]
    ntotdata = xtrainPCA.shape[0]
    bestw = np.zeros([num_features, d])
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        print('looking at feature ', i)
        y = ytrain[:, i].reshape(ntotdata, 1)
        x = xtrainPCA[:, :]
        w = least_squares(x, y)
        bestw[i, :] = w.reshape(d)
    io.mmwrite(wfile, bestw)
    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std, xtestPCA)
    print(accuracy)
    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestPCA, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xtrainPCA, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestPCA, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
def word_ranking(x, w, wordid_true):
    '''
    For each x datapoint (fmri image) it calculates the predicted
    semantic feature values. It then finds the rmse between the predicted semantic
    feature values and those of each word in the data set, (y_word - x.w) ^ 2.
    Each word is ranked according to this value.
    :param x: voxel values (or PCAed x datapoints)
    :param w: weights of our model
    :param wordid_true: id of the correct word for each datapoint
    :return: ranks
    '''
    ntest = x.shape[0]
    num_words = wordfeature_std.shape[0]
    ranks = []
    for i in range(ntest):
        x_current = x[i]
        print('word at location : ', i)
        rmse_per_word = []
        for j in range(num_words):  # go through every single word
            y = wordfeature_std[j]  # semantic features for the jth word
            rmse = rmse_per_semantic_feature(x_current, y, w)
            rmse_per_word.append(rmse)
        sorted_indexes = stats.rankdata(rmse_per_word)  # rank 1 is the word with the smallest rmse
        correct_word = int(wordid_true[i] - 1)  # word ids are numbered from 1
        rank_of_word = sorted_indexes[correct_word]
        ranks.append(rank_of_word)
    print(ranks)
    return ranks
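# Illustrative sketch: scipy.stats.rankdata assigns rank 1 to the smallest value,
# so the word whose semantic features best match the prediction (smallest rmse)
# gets rank 1. Hypothetical example, never called elsewhere in this module.
def _example_rankdata():
    errors = [0.9, 0.1, 0.5]
    ranks = stats.rankdata(errors)  # array([3., 1., 2.])
    assert ranks[1] == 1.0          # the word with the smallest error ranks first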
def two_class_rankings(x, w, wordid_test):
    '''
    Ranks the correct and the incorrect word of each test pair and collects
    the (correct, incorrect) word pairs that were misclassified.
    :param x: test data (or PCAed test data)
    :param w: weights of our model
    :param wordid_test: ids of the (correct, incorrect) word pairs in the test set
    :return: [true_ranks, false_ranks, true_false_words]
    '''
    wordid_true = wordid_test[:, 0]
    wordid_false = wordid_test[:, 1]
    true_ranks = word_ranking(x, w, wordid_true)
    false_ranks = word_ranking(x, w, wordid_false)
    incorrect_tests = [i for i in range(len(true_ranks)) if true_ranks[i] > false_ranks[i]]
    incorrect_true_ids = [int(wordid_test[i][0] - 1) for i in incorrect_tests]  # compensate for word ids numbered from 1 to 60
    incorrect_false_ids = [int(wordid_test[i][1] - 1) for i in incorrect_tests]
    incorrect_true_words = [dictionary[i] for i in incorrect_true_ids]
    incorrect_false_words = [dictionary[i] for i in incorrect_false_ids]
    true_false_words = [(incorrect_true_words[i], incorrect_false_words[i]) for i in range(len(incorrect_true_words))]
    return [true_ranks, false_ranks, true_false_words]
def differentPCADimensions():
    '''
    Goes through different numbers of pca dimensions and, for each, calculates the w,
    the accuracy and the rmse for test and training data. It plots rmse and accuracy
    versus the number of pca dimensions.
    :return: list of semantic features whose rmse increased by more than 0.5
    '''
    dimension_list = list(range(1, 300, 10))
    ndimensions = len(dimension_list)
    accuracy_list = []
    rmsetrain_list = []
    rmsetest_list = []
    n_semantic = 218
    rmsetest_matrix = np.zeros((ndimensions, n_semantic))
    rmsetestwrong_list = []
    j = 0
    for i in dimension_list:
        print(i)
        [accuracy, rmsetrain, rmsetest, rmsetestwrong] = findw_PCA_LSE(i, fmri_train, fmri_test, wordid_test, ytrain)
        accuracy_list.append(accuracy)
        rmsetest_matrix[j] = rmsetest
        rmsetrain_list.append(np.sum(rmsetrain) / n_semantic)
        rmsetest_list.append(np.sum(rmsetest) / n_semantic)
        rmsetestwrong_list.append(np.sum(rmsetestwrong) / n_semantic)
        j = j + 1
    plt.plot(dimension_list, accuracy_list)
    plt.xlabel("PCA Dimensions")
    plt.ylabel("Accuracy")
    plt.show()
    train_line, = plt.plot(dimension_list, rmsetrain_list, label='Training Data')
    test_line, = plt.plot(dimension_list, rmsetest_list, label='Test Data')
    wrong_test_line, = plt.plot(dimension_list, rmsetestwrong_list, label='Wrong Test Data')
    plt.xlabel("PCA Dimensions")
    plt.ylabel("RMSE")
    plt.legend(handles=[train_line, test_line, wrong_test_line])
    plt.show()
    plt.plot(dimension_list, rmsetest_matrix)
    plt.xlabel("PCA Dimensions")
    plt.ylabel("RMSE")
    plt.show()
    dif = rmsetest_matrix[j - 1] - rmsetest_matrix[0]  # change in rmse from the fewest to the most pca dimensions
    a = dif < 0  # good semantic features should have a decreasing root mean square error with more pca dimensions
    print("semantic features for which the rmse decreased", a.nonzero())
    b = dif > 0.5
    print("semantic features for which the rmse increased by more than 0.5", b.nonzero())
    bad_semantic_feature = b.nonzero()
    bad_semantic_features = bad_semantic_feature[0].tolist()
    return bad_semantic_features
def testing(w, xtrain, ytrain, xtest, ytest):
    '''
    Runs the different testing functions we have.
    :param w: weights
    :param xtrain: training data
    :param ytrain: target training data
    :param xtest: test data
    :param ytest: target test data
    '''
    rmseTrain = rmse_per_semantic_feature(xtrain, ytrain, w)
    rmseTest = rmse_per_semantic_feature(xtest, ytest, w)
    train_line, = plt.plot(rmseTrain, label='Training Data')
    test_line, = plt.plot(rmseTest, label='Testing Data')
    plt.xlabel("semantic feature")  # rmse per semantic feature
    plt.ylabel("RMSE")
    plt.legend(handles=[train_line, test_line])
    plt.show()
differentPCADimensions ()
def findBadWords():
    '''
    Plots different information about the words that were incorrectly guessed,
    such as the semantic feature rmse's and counts of words versus the semantic
    feature at which they didn't perform well.
    :return: [rmsecorrect, rmseincorrect]
    '''
    w = io.mmread("w_lse_dim299.mtx")
    [xtrainPCA, xtestPCA] = pcaData(299, fmri_train, fmri_test)
    [yright, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    [guessed_words, percentage] = test_suite.word_guesser(xtestPCA, w, 0, yright, ywrong)
    indexes_incorrect_guesses = np.where(guessed_words == False)[0].tolist()
    n = yright.shape[0]
    f = yright.shape[1]
    rmsecorrect = np.zeros((n, f))
    rmseincorrect = np.zeros((n, f))
    for j in range(len(indexes_incorrect_guesses)):
        i = indexes_incorrect_guesses[j]
        ypredict = xtestPCA[i].dot(w.T)
        correct_word = dictionary[int(wordid_test[i][0] - 1)]
        ycorrect = yright[i]
        rmsecorrect[j] = np.sqrt(np.square(ycorrect - ypredict))
        incorrect_word = dictionary[int(wordid_test[i][1] - 1)]
        yincorrect = ywrong[i]
        rmseincorrect[j] = np.sqrt(np.square(yincorrect - ypredict))
        print(correct_word, sum(rmsecorrect[j]) / f, incorrect_word, sum(rmseincorrect[j]) / f)
    correct_line, = plt.plot(rmsecorrect[0], label='bear')  # labels refer to the first incorrectly guessed pair in our run
    incorrect_line, = plt.plot(rmseincorrect[0], label='airplane')
    plt.legend(handles=[correct_line, incorrect_line])
    plt.xlabel('semantic feature')
    plt.ylabel('RMSE')
    plt.show()
    difference_rmse = rmsecorrect - rmseincorrect  # positive where the correct word has the larger error, which is bad
    indexes_bad_rmse = np.where(difference_rmse > 0)
    plt.hist(indexes_bad_rmse[1], range(f))
    plt.xlabel('semantic feature')
    plt.ylabel('Counts of Words')
    plt.show()
    differences_summed = np.sum(difference_rmse, axis=0)
    plt.plot(differences_summed)
    plt.show()
    return [rmsecorrect, rmseincorrect]
def rmse_per_semantic_feature(x, y, w):
    '''
    Root mean square error per semantic feature.
    :param x: data
    :param y: target values
    :param w: weights
    :return: root mean square error per semantic feature
    '''
    y_predict = x.dot(w.T)
    rmse = np.sqrt(np.mean(np.square(y_predict - y), axis=0))  # rmse per semantic feature
    return rmse
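# Illustrative sketch of the quantity computed above, per semantic feature j:
#   rmse_j = sqrt( (1/n) * sum_i (x_i . w_j - y_ij)^2 )
# Hypothetical example with tiny arrays; never called elsewhere in this module.
def _example_rmse_per_semantic_feature():
    x = np.array([[1.0, 0.0], [0.0, 1.0]])  # 2 samples, 2 inputs
    w = np.array([[1.0, 1.0]])              # weights for a single semantic feature
    y = np.array([[2.0], [0.0]])            # targets for that feature
    # predictions are [1.0, 1.0], errors are [-1.0, 1.0], so the rmse is 1.0
    assert np.allclose(rmse_per_semantic_feature(x, y, w), [1.0])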
def drop_10_words(fmri_train, wordid_train, ytrain):
    '''
    Keeps only the images of the first words_to_keep words of the training set
    (and their semantic features) and trains the model on that reduced set.
    :param fmri_train: training data set
    :param wordid_train: word id training data set
    :param ytrain: training target values
    :return: short_fmri_train, short_wordid_train, short_ytrain
    '''
    words_to_keep = 20
    num_train_words = 5 * words_to_keep  # each word appears 5 times in the training set
    short_wordid_train = []
    short_fmri_train = np.ones((num_train_words, 21764))
    short_ytrain = np.ones((num_train_words, 218))
    drop_short = [i[0] if i[0] <= words_to_keep else -1 for i in wordid_train]
    q = 0
    for i in range(num_train_words):
        while drop_short[q] == -1:
            q += 1
        short_fmri_train[i] = fmri_train[q]
        short_ytrain[i] = ytrain[q]
        short_wordid_train.append(wordid_train[q])
        q += 1
    [xtrainPCA, xtestPCA] = pcaData(num_train_words, fmri_train, fmri_test)
    wfile = "50_word_w_lse_dim299.mtx"
    [accuracy, rmsetrain, rmsetest, rmsetestwrong] = findw_PCA_LSE(num_train_words, short_fmri_train, fmri_test, wordid_test, short_ytrain, wfile=wfile)
    return short_fmri_train, short_wordid_train, short_ytrain
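# Illustrative sketch: the -1 markers in drop_10_words just flag training rows whose
# word id exceeds words_to_keep; the same row selection can be written as a boolean
# mask. Hypothetical helper, not used by drop_10_words itself.
def _example_rows_to_keep(wordid_train, words_to_keep=20):
    ids = np.asarray(wordid_train).ravel()
    return np.where(ids <= words_to_keep)[0]  # indices of the training rows to keep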
def main():
    print('hello from fmri starter, type run() or findlambda_crossvalidation()')
    nonlinearFeatures(10)
if __name__ == '__main__':
    main()