In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris

# Load the iris dataset and keep only feature columns 2 and 3 (petal length and
# petal width in scikit-learn's documented iris feature order) plus the labels.
iris = load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

In [3]:
# Prepend a column of ones to the features so that the first row of the weight
# matrix plays the role of the bias term in X.dot(theta).
X_with_bias = np.hstack([np.ones((len(X), 1)), X])

In [4]:
# Seed NumPy's global RNG so the np.random.permutation shuffle and the
# np.random.randn weight initialisation are reproducible across runs.
np.random.seed(seed=42)

In [5]:
def train_valid_test_split(X, y, valid_ratio=0.2, test_ratio=0.2):
    """Shuffle ``X`` and ``y`` together and split them into train/valid/test parts.

    The shuffle uses ``np.random.permutation``, i.e. NumPy's global RNG, so
    seed it beforehand for reproducible splits.

    Parameters
    ----------
    X, y : array-like
        Samples and targets of equal length along the first axis. Both must
        support indexing with an integer index array (e.g. ``np.ndarray``).
    valid_ratio, test_ratio : float, default 0.2
        Fraction of the samples assigned to the validation and test sets.
        Each size is truncated with ``int(size * ratio)``; whatever remains
        goes to the training set. A ratio of 0 yields an empty split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, a ratio is negative, or the two
        ratios together ask for more samples than are available.
    """
    if len(X) != len(y):
        raise ValueError(f'X and y must have the same length, got {len(X)} and {len(y)}')
    if valid_ratio < 0 or test_ratio < 0:
        raise ValueError('valid_ratio and test_ratio must be non-negative')

    size = len(X)
    valid_size = int(size * valid_ratio)
    test_size = int(size * test_ratio)
    train_size = size - valid_size - test_size
    if train_size < 0:
        raise ValueError('valid_ratio + test_ratio must not exceed 1')

    rnd_indices = np.random.permutation(size)

    # Slice with explicit forward offsets. Negative offsets such as
    # [train_size:-test_size] and [-test_size:] silently break when
    # test_size == 0, because -0 == 0: the validation slice becomes empty and
    # the "test" slice becomes the entire permutation.
    valid_end = train_size + valid_size
    train_idx = rnd_indices[:train_size]
    valid_idx = rnd_indices[train_size:valid_end]
    test_idx = rnd_indices[valid_end:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [6]:
# Split the bias-augmented features and labels using the function's default
# ratios (20% validation, 20% test, remainder for training).
(X_train, y_train,
 X_valid, y_valid,
 X_test, y_test) = train_valid_test_split(X_with_bias, y)

In [7]:
def to_one_hot(y, n_classes=None):
    """One-hot encode a 1-D vector of integer class labels.

    Parameters
    ----------
    y : array-like of int
        Class labels in ``0 .. n_classes - 1``.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``y.max() + 1``. Pass it explicitly when encoding several splits of
        the same dataset, otherwise a split that happens to lack the highest
        class produces a narrower matrix than the others.

    Returns
    -------
    np.ndarray
        Float array of shape ``(len(y), n_classes)`` with a single 1 per row.

    Raises
    ------
    ValueError
        If a label is negative or not smaller than ``n_classes``.
    """
    y = np.asarray(y)

    # Empty input: there is no max to infer from and nothing to index.
    if y.size == 0:
        return np.zeros((0, 0 if n_classes is None else n_classes))

    if n_classes is None:
        n_classes = int(y.max()) + 1

    # Negative labels would otherwise wrap around to the last columns silently.
    if y.min() < 0 or y.max() >= n_classes:
        raise ValueError(f'labels must lie in [0, {n_classes}), got range [{y.min()}, {y.max()}]')

    m = len(y)
    y_one_hot = np.zeros((m, n_classes))
    y_one_hot[np.arange(m), y] = 1

    return y_one_hot

In [8]:
# Peek at the first ten integer class labels of the training split.
y_train[0:10]

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1])

In [9]:
# The same ten labels one-hot encoded: row i has its single 1 in column y_train[i].
to_one_hot(y_train[0:10])

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [10]:
# One-hot encode the labels of every split, in train/valid/test order.
# NOTE: called this way, to_one_hot infers the column count from each split's
# own largest label, so the three matrices only share a width if every split
# contains the highest class.
y_train_one_hot, y_valid_one_hot, y_test_one_hot = map(
    to_one_hot, (y_train, y_valid, y_test)
)

In [11]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of logits.

    Parameters
    ----------
    logits : array-like, shape (n_samples, n_classes)
        Unnormalised class scores, one row per sample.

    Returns
    -------
    np.ndarray
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    logits = np.asarray(logits)

    # Softmax is invariant to adding a constant to every entry of a row.
    # Subtracting the row maximum makes the largest exponent exp(0) == 1, so
    # np.exp can no longer overflow to inf and produce inf / inf == NaN.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)

    return exps / exp_sums

In [12]:
# Model dimensions: one weight row per column of X_train (bias column included)
# and one weight column per distinct class label present in the training set.
n_inputs = X_train.shape[-1]
n_outputs = np.unique(y_train).size

In [13]:
# Gradient-descent hyperparameters.
eta = 1e-2            # learning rate (step size of each update)
n_iters = 5_000       # the training loop runs iterations 0..n_iters inclusive
m = X_train.shape[0]  # number of training samples, used to average the gradient
epsilon = 1e-7        # added inside log() so the loss never evaluates log(0)

# Random initial weights drawn from a standard normal via the global RNG.
theta = np.random.randn(n_inputs, n_outputs)

In [14]:
# Batch gradient descent on the cross-entropy loss of the softmax model.
# The loop variable is called `iteration` rather than `iter` so that the
# built-in iter() is not shadowed for the rest of the kernel session.
# NOTE: theta is updated starting from its current value, so re-running this
# cell without re-running the initialisation cell continues training.
for iteration in range(n_iters + 1):
    logits = X_train.dot(theta)
    y_proba = softmax(logits)
    if iteration % 500 == 0:
        # Mean cross-entropy over the training set, reported every 500 steps;
        # epsilon guards against log(0) when a probability underflows.
        loss = -np.mean(np.sum(y_train_one_hot * np.log(y_proba + epsilon), axis=1))
        print(f'iterration {iteration}, loss : {loss}')
    # Gradient of the mean cross-entropy with respect to theta.
    error = y_proba - y_train_one_hot
    gradients = 1 / m * X_train.T.dot(error)
    theta = theta - eta * gradients

iterration 0, loss : 3.5356045081790177
iterration 500, loss : 0.7698276617097016
iterration 1000, loss : 0.6394784332731978
iterration 1500, loss : 0.5618741363839648
iterration 2000, loss : 0.5095831080853224
iterration 2500, loss : 0.471273775599093
iterration 3000, loss : 0.44155863305230325
iterration 3500, loss : 0.4175598664804123
iterration 4000, loss : 0.3975941721521857
iterration 4500, loss : 0.38060484552797946
iterration 5000, loss : 0.3658905593000994


In [15]:
# Learned weight matrix: one row per column of X_train (bias column first),
# one column per class.
theta

array([[ 2.44942005, -1.63172695, -3.63642175],
       [-0.61947541,  0.50273412,  0.22142236],
       [-0.96378971,  0.39312153,  2.48742003]])

In [16]:
# Score the trained model on the validation split: class probabilities from
# the learned weights, then the most probable class for each sample.
logits = X_valid @ theta
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)

# Accuracy = fraction of validation samples whose predicted class is correct.
accuracy_score = (y_predict == y_valid).mean()
accuracy_score

0.9333333333333333