# p07-perceptron.py (forked from jjfiv/cs451-practicals)
#%%
from collections import defaultdict
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
# new helpers:
from shared import (
    dataset_local_path,
)
# stdlib:
from dataclasses import dataclass, field
import json
from typing import List, DefaultDict
#%% load up the data
examples = []
ys = []
##
# Notice that we're using hand-designed features here, not text features:
##
with open(dataset_local_path("poetry_id.jsonl")) as fp:
    for line in fp:
        info = json.loads(line)
        # Note: the data contains a whole bunch of extra stuff; we just want the numeric features for now.
        keep = info["features"]
        # Whether or not it's poetry is our label.
        ys.append(info["poetry"])
        # Hold onto this single dictionary.
        examples.append(keep)
## CONVERT TO MATRIX:
feature_numbering = DictVectorizer(sort=True, sparse=False)
X = feature_numbering.fit_transform(examples)
print("Features as {} matrix.".format(X.shape))
## SPLIT DATA:
RANDOM_SEED = 12345678
# Numpy arrays are more useful than Python lists here.
y = np.array(ys)
# Split into train+validate (tv) and test pieces:
rX_tv, rX_test, y_tv, y_test = train_test_split(
    X, y, train_size=0.75, shuffle=True, random_state=RANDOM_SEED
)
# Split the (tv) piece into train and validate:
rX_train, rX_vali, y_train, y_vali = train_test_split(
    rX_tv, y_tv, train_size=0.66, shuffle=True, random_state=RANDOM_SEED
)
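# Net effect: roughly 49.5% train (0.75 * 0.66), 25.5% validation, and 25% test.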
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# TODO: Exploration 2: What normalization is best for your models?
# THINK: Why didn't we normalize for decision trees?
#
# These are the three approaches to scaling I see in practice:
# variance / standard-deviation, min/max, and nothing at all.
# This replaces the X / 1000 hack we did a few weeks ago.
norm = "var"
if norm == "var":
    scale = StandardScaler()
    X_train = scale.fit_transform(rX_train)
    X_vali = scale.transform(rX_vali)
    X_test = scale.transform(rX_test)
elif norm == "max":
    scale = MinMaxScaler()
    X_train = scale.fit_transform(rX_train)
    X_vali = scale.transform(rX_vali)
    X_test = scale.transform(rX_test)
else:
    X_train = rX_train
    X_vali = rX_vali
    X_test = rX_test
print(X_train.shape, X_vali.shape, X_test.shape)
# Delete these generic variables; now that the data is fully prepared, we shouldn't use them anymore.
del X, y, ys, rX_train, rX_vali, rX_test
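# For reference (standard scaler formulas, not specific to this assignment):
#   StandardScaler: z = (x - mean) / std, per feature.
#   MinMaxScaler:   z = (x - min) / (max - min), per feature.
# Both are fit on the training data only; vali/test are merely transformed,
# which avoids leaking their statistics into preprocessing.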
#%% Define and Train Models
@dataclass
class LinearModel:
    weights: np.ndarray  # note: we can't specify that this is 1-dimensional
    bias: float = 0.0

    def decision_function(self, X: np.ndarray) -> np.ndarray:
        """Compute the signed distance from the self.weights hyperplane."""
        (N, D) = X.shape
        assert self.weights.shape == (D, 1)
        # Matrix multiplication; sprinkle transpose and assert to get the shapes
        # you want (or remember Linear Algebra)... or both!
        output = np.dot(self.weights.transpose(), X.transpose())
        assert output.shape == (1, N)
        return (output + self.bias).reshape((N,))

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by whether each point lands above or below our hyperplane."""
        return self.decision_function(X) > 0

    def score(self, X: np.ndarray, y: np.ndarray) -> float:
        """Take predictions and compute accuracy."""
        y_hat = self.predict(X)
        return metrics.accuracy_score(np.asarray(y), y_hat)

    def compute_auc(self, X: np.ndarray, y: np.ndarray) -> float:
        """The distance to the hyperplane doubles as a score for AUC-style metrics."""
        return metrics.roc_auc_score(y, self.decision_function(X))
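# A minimal shape sanity-check for LinearModel (illustrative toy values only):
_toy = LinearModel(weights=np.ones((3, 1)), bias=0.5)
assert _toy.decision_function(np.zeros((5, 3))).shape == (5,)  # one score per row
del _toy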
@dataclass
class ModelTrainingCurve:
    train: List[float] = field(default_factory=list)
    validation: List[float] = field(default_factory=list)

    def add_sample(
        self,
        m,  # anything with a .score(X, y) method: LinearModel or a sklearn classifier
        X: np.ndarray,
        y: np.ndarray,
        X_vali: np.ndarray,
        y_vali: np.ndarray,
    ) -> None:
        self.train.append(m.score(X, y))
        self.validation.append(m.score(X_vali, y_vali))
# These are the named lines that will be plotted:
learning_curves: DefaultDict[str, ModelTrainingCurve] = defaultdict(ModelTrainingCurve)
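# (defaultdict means the first access to a new name creates an empty curve, so the
#  training loops below can just call learning_curves["Name"].add_sample(...).)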
def train_perceptron(y, X, y_vali, X_vali, num_iter=100, seed=1231) -> LinearModel:
    rand = np.random.default_rng(seed)
    (num_examples, num_features) = X.shape
    assert len(y) == num_examples
    w = np.zeros((num_features, 1))
    b = 0.0
    indices = list(range(num_examples))
    for iteration in range(num_iter):
        rand.shuffle(indices)
        wrong = 0
        for i in indices:
            # Convert the {0, 1} label to {-1, +1} for the update rule:
            if y[i]:
                y_val = 1
            else:
                y_val = -1
            x_i = X[i, :].reshape((num_features, 1))
            activation = np.dot(w.transpose(), x_i) + b
            if y[i] != (activation > 0):
                wrong += 1
                # We got it wrong! Update!
                w += y_val * x_i
                b += y_val
        if wrong == 0:
            break
        tmp = LinearModel(w, b)
        learning_curves["Perceptron"].add_sample(tmp, X, y, X_vali, y_vali)
    return LinearModel(w, b)
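# The loop above is the classic perceptron: with labels mapped to y in {-1, +1},
# each mistake applies w <- w + y * x_i and b <- b + y, nudging the hyperplane
# toward classifying x_i correctly; training stops early after a mistake-free pass.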
def train_averaged_perceptron(
    y, X, y_vali, X_vali, num_iter=100, seed=1231
) -> LinearModel:
    rand = np.random.default_rng(seed)
    (num_examples, num_features) = X.shape
    assert len(y) == num_examples
    w_avg = np.zeros((num_features, 1))
    b_avg = 0.0
    w = np.zeros((num_features, 1))
    b = 0.0
    current_correct = 0
    indices = list(range(num_examples))
    for iteration in range(num_iter):
        rand.shuffle(indices)
        wrong = 0
        for i in indices:
            if y[i]:
                y_val = 1
            else:
                y_val = -1
            x_i = X[i, :].reshape((num_features, 1))
            activation = np.dot(w.transpose(), x_i) + b
            if y[i] != (activation > 0):
                # Fold the 'current' vector into the average, weighted by how long it survived:
                w_avg += current_correct * w
                b_avg += current_correct * b
                current_correct = 0
                # Update the 'current' vector:
                wrong += 1
                # We got it wrong! Update!
                w += y_val * x_i
                b += y_val
            else:
                current_correct += 1
        if wrong == 0:
            break
        tmp = LinearModel(w_avg, b_avg)
        learning_curves["Averaged-Perceptron"].add_sample(tmp, X, y, X_vali, y_vali)
    # Fold in whatever the final 'current' vector earned as well; without this, a
    # correct streak at the very end of training would be dropped from the average.
    w_avg += current_correct * w
    b_avg += current_correct * b
    return LinearModel(w_avg, b_avg)
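# Why averaging helps: each intermediate hyperplane contributes in proportion to how
# many consecutive examples it classified correctly, so the final vote is dominated
# by weight vectors that survived a long time. Only the direction of w_avg matters
# for prediction, not its scale.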
model = train_perceptron(y_train, X_train, y_vali, X_vali, num_iter=1000)
print("P. Train-Accuracy: {:.3}".format(model.score(X_train, y_train)))
print("P. Vali-Accuracy: {:.3}".format(model.score(X_vali, y_vali)))
model = train_averaged_perceptron(y_train, X_train, y_vali, X_vali, num_iter=1000)
print("AP. Train-Accuracy: {:.3}".format(model.score(X_train, y_train)))
print("AP. Vali-Accuracy: {:.3}".format(model.score(X_vali, y_vali)))
# Note that scikit-learn's Perceptron uses an alternative method of training.
# Is it an averaged perceptron or a regular perceptron?
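# (For reference: the scikit-learn docs describe Perceptron() as equivalent to
#  SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None),
#  i.e. a plain perceptron, not an averaged one.)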
skP = Perceptron()
print("Train sklearn-Perceptron (skP)")
for _ in range(1000):
    # Note: we use partial_fit rather than fit to expose the training loop to our code!
    skP.partial_fit(X_train, y_train, classes=(0, 1))
    learning_curves["skPerceptron"].add_sample(skP, X_train, y_train, X_vali, y_vali)
print("skP. Train-Accuracy: {:.3}".format(skP.score(X_train, y_train)))
print("skP. Vali-Accuracy: {:.3}".format(skP.score(X_vali, y_vali)))
## TODO Exploration 1: use a loop around partial_fit to generate another graph!
#
## TODO Exploration 1A: Try an MLP (Multi-Layer Perceptron).
mlp = MLPClassifier(hidden_layer_sizes=(32,))
print("Train MLPClassifier (mlp)")
for _ in range(1000):
    # Note: we use partial_fit rather than fit to expose the training loop to our code!
    mlp.partial_fit(X_train, y_train, classes=(0, 1))
    learning_curves["MLPClassifier"].add_sample(mlp, X_train, y_train, X_vali, y_vali)
print("mlp. Train-Accuracy: {:.3}".format(mlp.score(X_train, y_train)))
print("mlp. Vali-Accuracy: {:.3}".format(mlp.score(X_vali, y_vali)))
## TODO Exploration 1B: Try another linear model.
sgdc = SGDClassifier()
print("Train SGDClassifier (sgdc)")
for _ in range(1000):
    # Note: we use partial_fit rather than fit to expose the training loop to our code!
    sgdc.partial_fit(X_train, y_train, classes=(0, 1))
    learning_curves["SGDClassifier"].add_sample(sgdc, X_train, y_train, X_vali, y_vali)
print("sgdc. Train-Accuracy: {:.3}".format(sgdc.score(X_train, y_train)))
print("sgdc. Vali-Accuracy: {:.3}".format(sgdc.score(X_vali, y_vali)))

## TODO Think: Why can't we make a graph like this for DecisionTreeClassifier?
# A decision tree isn't trained iteratively: fit() builds the whole tree in one shot
# and there is no partial_fit, so there's no per-iteration score to plot.
#%% Plot!
#
# This is the first time we're seeing how to make a line plot.
# It's also the first time we're creating plots in a loop! (It gets too busy if
# they're all on one chart, IMO.) Matplotlib *does* have subplots, but they're fiddly.
#
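# plt.savefig below writes into a graphs/ directory; creating it up front is a small
# safeguard added here, in case it doesn't exist yet:
import os

os.makedirs("graphs", exist_ok=True)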
for key, dataset in learning_curves.items():
    xs = np.array(list(range(len(dataset.train))))
    # line-plot:
    plt.plot(xs, dataset.train, label="{} Train".format(key), alpha=0.7)
    plt.plot(xs, dataset.validation, label="{} Validate".format(key), alpha=0.7)
    # scatter-plot: (maybe these look nicer to you?)
    # plt.scatter(xs, dataset.train, label=key, alpha=0.7, marker=".")
    plt.ylim((0.75, 1.0))
    plt.title("{} Learning Curves [norm={}]".format(key, norm))
    plt.xlabel("Iteration")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.tight_layout()
    plt.savefig("graphs/p07-{}-curve-{}.png".format(key, norm))
    plt.show()