In [148]:
import numpy as np
from sklearn import datasets
np.random.seed(2042)

In [149]:
iris = datasets.load_iris()
X = iris.data[:,(2,3)]
y = iris.target.astype(int)

In [150]:
X_bias = np.c_[np.ones((len(X),1)),X]

In [154]:
def train_test_val_split(X,y,shuffle=True,test_ratio=0.2,val_ratio=0.2):
    size = len(X)
    if shuffle:
        indexes = np.random.permutation(len(X))
    else:
        indexes = range(len(X))

    test_size = int(size*test_ratio)
    val_size = int(test_size*val_ratio)
    train_size = size-test_size-val_size

    X_train = X[indexes[:train_size]]
    y_train = y[indexes[:train_size]]
    X_val = X[indexes[train_size:-test_size]]
    y_val = y[indexes[train_size:-test_size]]
    X_test = X[indexes[-test_size:]]
    y_test = y[indexes[-test_size:]]

    return X_train,y_train,X_val,y_val,X_test,y_test

In [155]:
X_train,y_train,X_val,y_val,X_test,y_test = train_test_val_split(X_bias,y)

In [158]:
def hot_encode(y):
    num_classes = y.max()+1
    m = len(y)
    y_hot_encode = np.zeros((m,num_classes))
    y_hot_encode[np.arange(m), y] = 1 
    return y_hot_encode

y_train_encode = hot_encode(y_train)
y_test_encode = hot_encode(y_test)
y_val_encode = hot_encode(y_val)

In [159]:
def score(X,Theta):
    return X.dot(Theta)

In [160]:
def softmax_function(scores):
    dividend = np.exp(scores)
    divisor = np.sum(dividend, axis=1, keepdims=True)
    return dividend/divisor

In [188]:
def cross_entropy_cost_function(y,probabilities):
    return -np.mean(np.sum(y_train_encode*np.log(probabilities),axis=1))

In [185]:
def cross_entropy_gradient(X,y,probabilities):
    m = len(X)
    return (1/m)*X.T.dot(probabilities-y)

In [164]:
num_features = X_bias.shape[1] 
num_classes = len(np.unique(y))

In [194]:
Theta = np.random.rand(num_features,num_classes)
epsilon = 1e-6
epochs = 10**6
learning_rate = 0.01

for i in range(epochs):
    scores = score(X_train,Theta)
    probabilities = softmax_function(scores)
    if i%1000 == 0:
        print('Iteration #',i,'     Cost function:',cross_entropy_cost_function(y_train_encode,probabilities))
    gradient = cross_entropy_gradient(X_train,y_train_encode,probabilities)
    Theta = Theta - learning_rate*gradient


Iteration # 0      Cost function: 2.254005338027965
Iteration # 1000      Cost function: 0.6706762427206363
Iteration # 2000      Cost function: 0.5347580164506129
Iteration # 3000      Cost function: 0.46650471614900035
Iteration # 4000      Cost function: 0.4237422073817545
Iteration # 5000      Cost function: 0.39335911992940525
Iteration # 6000      Cost function: 0.37008306659424184
Iteration # 7000      Cost function: 0.35135918378165787
Iteration # 8000      Cost function: 0.33577861607154263
Iteration # 9000      Cost function: 0.3224895829437495
Iteration # 10000      Cost function: 0.31094085649890885
Iteration # 11000      Cost function: 0.30075660908287793
Iteration # 12000      Cost function: 0.2916697979622943
Iteration # 13000      Cost function: 0.28348420086705056
Iteration # 14000      Cost function: 0.27605156180106294
Iteration # 15000      Cost function: 0.2692572021518554
Iteration # 16000      Cost function: 0.2630106160755168
Iteration # 17000      Cost function