In [362]:
import numpy as np
from sklearn import datasets
np.random.seed(10)

In [381]:
iris = datasets.load_iris()
X = iris.data[:,(2,3)]
y = iris.target.astype(int)

X[:5]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]])

In [383]:
X_bias = np.c_[np.ones((len(X),1)),X]
X_bias[:5]

array([[1. , 1.4, 0.2],
       [1. , 1.4, 0.2],
       [1. , 1.3, 0.2],
       [1. , 1.5, 0.2],
       [1. , 1.4, 0.2]])

In [365]:
def train_test_val_split(X,y,shuffle=True,test_ratio=0.2,val_ratio=0.2):
    size = len(X)
    if shuffle:
        indexes = np.random.permutation(len(X))
    else:
        indexes = range(len(X))

    test_size = int(size*test_ratio)
    val_size = int(test_size*val_ratio)
    train_size = size-test_size-val_size

    X_train = X[indexes[:train_size]]
    y_train = y[indexes[:train_size]]
    X_val = X[indexes[train_size:-test_size]]
    y_val = y[indexes[train_size:-test_size]]
    X_test = X[indexes[-test_size:]]
    y_test = y[indexes[-test_size:]]

    return X_train,y_train,X_val,y_val,X_test,y_test

In [384]:
X_train,y_train,X_val,y_val,X_test,y_test = train_test_val_split(X_bias,y)
X_train[:5]

array([[1. , 1.9, 0.2],
       [1. , 3.6, 1.3],
       [1. , 5.9, 2.1],
       [1. , 6.7, 2.2],
       [1. , 6.7, 2. ]])

In [385]:
def hot_encode(y):
    num_classes = y.max()+1
    m = len(y)
    y_hot_encode = np.zeros((m,num_classes))
    y_hot_encode[np.arange(m), y] = 1 
    return y_hot_encode

y_train_encode = hot_encode(y_train)
y_test_encode = hot_encode(y_test)
y_val_encode = hot_encode(y_val)
y_train_encode[:5]

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [368]:
def score(X,Theta):
    return X.dot(Theta)

def softmax_function(scores):
    dividend = np.exp(scores)
    divisor = np.sum(dividend, axis=1, keepdims=True)
    return dividend/divisor

def cross_entropy_cost_function(y,probabilities):
    return -np.mean(np.sum(y_train_encode*np.log(probabilities),axis=1))

def cross_entropy_gradient(X,y,probabilities):
    m = len(X)
    return (1/m)*X.T.dot(probabilities-y)

In [373]:
num_features = X_bias.shape[1] 
num_classes = len(np.unique(y))
Theta = np.random.rand(num_features,num_classes)
epsilon = 1e-6
epochs = 5001
learning_rate = 0.01

for i in range(epochs):
    scores = score(X_train,Theta)
    probabilities = softmax_function(scores)
    if i%500 == 0:
        print('Iteration #',i,'     Cost function:',cross_entropy_cost_function(y_train_encode,probabilities))
    gradient = cross_entropy_gradient(X_train,y_train_encode,probabilities)
    Theta = Theta - learning_rate*gradient


Iteration # 0      Cost function: 1.7809351823164337
Iteration # 500      Cost function: 0.7853082500658067
Iteration # 1000      Cost function: 0.662175381833871
Iteration # 1500      Cost function: 0.5866044593945988
Iteration # 2000      Cost function: 0.535453912319336
Iteration # 2500      Cost function: 0.49804949859141173
Iteration # 3000      Cost function: 0.46909122544766735
Iteration # 3500      Cost function: 0.4457124769766781
Iteration # 4000      Cost function: 0.4262389179734997
Iteration # 4500      Cost function: 0.40962717736569243
Iteration # 5000      Cost function: 0.3951912238521942


In [386]:
Theta

array([[ 3.71815309, -0.26937708, -2.47500322],
       [-0.36943744,  0.99537361,  0.74931853],
       [-0.79703177, -0.24043082,  1.92060092]])

In [378]:
def val_score(X_val,y_val,Theta):
    scores = score(X_val,Theta)
    probabilities = softmax_function(scores)
    prediction = np.argmax(probabilities,axis=1)
    accuracy = np.mean(prediction == y_val)
    print('Accuracy:',accuracy)
    return accuracy

val_score(X_val,y_val,Theta);

Accuracy: 0.8333333333333334
