In [1]:
from sklearn.datasets import load_iris
import numpy as np

In [2]:
# Load the Iris dataset and keep only the feature columns at index 2 and 3
# (petal length and petal width in scikit-learn's column order).
data = load_iris()
X = data['data'][:, [2, 3]]
y = data['target']


In [40]:
# Show the dimensions of the selected feature matrix as (rows, columns).
X.shape

(150, 2)

In [3]:
# Bias term: prepend a column of ones so the first row of the parameter
# matrix acts as the intercept.
bias_column = np.ones((len(X), 1))
X_with_bias = np.hstack([bias_column, X])

# Fix the global NumPy seed so the shuffle and random initialisations below
# are repeatable.
np.random.seed(1238)

In [4]:
# Shuffle the row indices once, then cut them into train / validation / test
# parts according to the two ratios below.
test_ratio = 0.2
validation_ratio = 0.2
total_size = len(X_with_bias)

test_size = int(total_size * test_ratio)
validation_size = int(total_size * validation_ratio)
train_size = total_size - test_size - validation_size

rnd_indicies = np.random.permutation(total_size)

# Use explicit non-negative boundaries. Negative-offset slices misbehave when
# test_size == 0: [-0:] selects the whole array and [train_size:-0] selects
# nothing, which would swap the roles of the test and validation sets.
valid_end = train_size + validation_size
train_idx = rnd_indicies[:train_size]
valid_idx = rnd_indicies[train_size:valid_end]
test_idx = rnd_indicies[valid_end:]

X_train = X_with_bias[train_idx]
y_train = y[train_idx]
X_test = X_with_bias[test_idx]
y_test = y[test_idx]
X_valid = X_with_bias[valid_idx]
y_valid = y[valid_idx]

In [5]:
# Convert a vector of class indices into a one-hot matrix.
def to_one_hot(y, n_classes=None):
    """One-hot encode a 1-D sequence of integer class indices.

    Parameters
    ----------
    y : array-like of int
        Class index of every sample; values are used as column indices.
    n_classes : int, optional
        Number of columns in the result. When omitted it is taken as
        ``max(y) + 1``. Pass it explicitly when encoding several subsets
        that must share one width, since a subset that happens to lack the
        highest class would otherwise produce a narrower matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(y), n_classes)`` with a single 1 per row.
    """
    y = np.asarray(y)
    m = len(y)
    if n_classes is None:
        # max() is undefined for an empty vector, so fall back to zero columns.
        n_classes = int(y.max()) + 1 if m else 0
    Y_one_hot = np.zeros((m, n_classes))
    if m:
        Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

In [44]:
#проверим функцию
print(y_train[:10])
to_one_hot(y_train[:10])

[2 1 2 1 2 2 0 0 1 0]


array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [6]:
# One-hot encode the labels of each split (train, test, validation).
y_train_one_hot, y_test_one_hot, y_valid_one_hot = map(
    to_one_hot, (y_train, y_test, y_valid)
)


In [7]:
def softmax(logits):
    """Turn a 2-D array of class scores into row-wise probabilities.

    Parameters
    ----------
    logits : array-like, shape (n_samples, n_classes)
        Raw scores, one row per sample.

    Returns
    -------
    numpy.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    logits = np.asarray(logits, dtype=float)
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf (and the ratio from becoming
    # NaN) when the scores are large.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums


In [8]:
# Model dimensions: one input per column of X_train, one output per
# distinct label value seen in the training set.
_, n_inputs = X_train.shape
n_outputs = np.unique(y_train).size


In [50]:
# Batch gradient descent for softmax regression, without regularization.
eta = 0.01            # learning rate
n_iterations = 5001
m = len(X_train)      # number of training samples
eplison = 1e-7        # added inside the log so its argument is never zero
Theta = np.random.randn(n_inputs, n_outputs)

for iterations in range(n_iterations):
    logits = np.dot(X_train, Theta)
    y_proba = softmax(logits)
    # Report the mean cross-entropy on the training set every 500 steps.
    if not iterations % 500:
        loss = -(y_train_one_hot * np.log(y_proba + eplison)).sum(axis=1).mean()
        print(iterations, loss)
    error = y_proba - y_train_one_hot
    gradients = 1 / m * np.dot(X_train.T, error)
    Theta = Theta - eta * gradients

0 4.29637560309506
500 0.6593354752556455
1000 0.5889612997099655
1500 0.5401417012181138
2000 0.5040257999626816
2500 0.47600657844272043
3000 0.45347822549106304
3500 0.43485813175779714
4000 0.4191275186720931
4500 0.40559896625729197
5000 0.393790596450582


In [51]:
# Display the current parameter matrix.
Theta


array([[ 4.01405007,  0.35467851, -1.89989067],
       [-0.28904872,  0.56741465,  0.38997154],
       [-2.10952097, -0.32079596,  1.78183178]])

In [55]:
#проверка точности модели
logits = X_valid.dot(Theta)
y_proba = softmax(logits)
y_predict = np.argmax(y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9

In [61]:
#добавим регуляризацию
eta = 0.1
n_iterations = 5001
m = len(X_train)
eplison = 1e-7
alpha = 0.1  #гиперпараметр регуляризации

Theta = np.random.randn(n_inputs, n_outputs)

for iterations in range(n_iterations):
    logits = X_train.dot(Theta)
    y_proba = softmax(logits)
    if iterations % 500 == 0:
        xentropy_loss = -np.mean(np.sum(y_train_one_hot * np.log(y_proba + eplison), axis = 1))
        l2_loss = 1/2 *  np.sum(np.square(Theta[1:]))
        loss = xentropy_loss + alpha * l2_loss
        print(iterations, loss)
    error = y_proba - y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

0 7.78166571360394
500 0.5505590462012283
1000 0.5271994152258911
1500 0.5200967069923391
2000 0.5173417498405739
2500 0.5161802386155885
3000 0.5156697370823268
3500 0.5154399275387939
4000 0.5153349412472252
4500 0.5152865261824984
5000 0.5152640615616183


In [74]:
# Validation accuracy of the regularized model: correct predictions divided
# by the number of validation samples.
logits = np.dot(X_valid, Theta)
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)
is_correct = y_predict == y_valid
y_true = y_predict[is_correct]  # the predictions that match their label
acc = y_true.size / y_valid.size
acc
# The same value can be obtained with np.mean(y_predict == y_valid).

0.9333333333333333

In [14]:
# Regularized gradient descent with early stopping: after every update the
# loss is measured on the validation set, and training stops the first time
# that loss fails to improve on the best value seen so far.
eta = 0.1
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7
alpha = 0.1  # regularization hyperparameter
best_loss = np.inf  # np.infty was removed in NumPy 2.0; np.inf works everywhere

Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    # Training step on the training set.
    logits = X_train.dot(Theta)
    y_proba = softmax(logits)
    error = y_proba - y_train_one_hot
    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
    Theta = Theta - eta * gradients

    # Validation loss (cross-entropy + L2 term) of the freshly updated Theta.
    logits = X_valid.dot(Theta)
    y_proba = softmax(logits)
    xentropy_loss = -np.mean(np.sum(y_valid_one_hot * np.log(y_proba + epsilon), axis=1))
    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))
    loss = xentropy_loss + alpha * l2_loss
    if iteration % 500 == 0:
        print(iteration, loss)
    if loss < best_loss:
        best_loss = loss
    else:
        # NOTE(review): Theta at this point is the update that made the
        # validation loss worse; the parameters that achieved best_loss are
        # not kept. Confirm whether they should be restored here.
        print(iteration - 1, best_loss)
        print(iteration, loss, "early stopping!")
        break

0 1.3155876510898834
500 0.5737008422554332
1000 0.5369783259344605
1500 0.5227301964984776
2000 0.5158919344193085
2500 0.5122450298839996
3000 0.5101585146528007
3500 0.5089027104923389
4000 0.5081182780573976
4500 0.5076148147050745
5000 0.5072852924297672


In [15]:
# Validation accuracy of the early-stopping model: correct predictions
# divided by the number of validation samples.
logits = np.dot(X_valid, Theta)
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)
is_correct = y_predict == y_valid
y_true = y_predict[is_correct]  # the predictions that match their label
acc = y_true.size / y_valid.size
acc

0.9333333333333333