In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.io import loadmat
from scipy.optimize import minimize
from sklearn.metrics import classification_report

In [2]:
def one_hot(Y, num_classes=10):
    """Encode 1-based integer class labels as one-hot row vectors.

    Label ``k`` (with ``1 <= k <= num_classes``) becomes a float vector of
    length ``num_classes`` holding a 1 at index ``k - 1`` and 0 elsewhere.

    Parameters
    ----------
    Y : iterable of int
        Class labels, numbered starting from 1.
    num_classes : int, optional
        Length of every encoded vector. Defaults to 10.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float array per label, in input order.

    Raises
    ------
    ValueError
        If any label lies outside ``1..num_classes``.
    """
    # Widen to a signed integer type before subtracting 1, so that labels
    # stored in an unsigned dtype cannot wrap around.
    labels = np.asarray(Y, dtype=np.int64).reshape(-1)

    # Reject labels that would otherwise index out of range or, for 0,
    # silently land on the last column through negative indexing.
    if labels.size and (labels.min() < 1 or labels.max() > num_classes):
        raise ValueError(
            "labels must lie in 1..%d, got range %d..%d"
            % (num_classes, labels.min(), labels.max())
        )

    encoded = np.zeros((labels.size, num_classes))
    encoded[np.arange(labels.size), labels - 1] = 1
    # Callers receive a list of rows, exactly as before.
    return list(encoded)

In [3]:
# Input files: the data set and the weight file (both MATLAB .mat files).
path_1 = 'ex4data1.mat'
path_2 = 'ex4weights.mat'

# Read both files into dictionaries keyed by variable name.
data, theta = loadmat(path_1), loadmat(path_2)

# Pull out the features/labels and the two weight matrices.
X, Y = data['X'], data['y']
theta_1, theta_2 = theta['Theta1'], theta['Theta2']

# Flatten the labels, one-hot encode them, and stack the rows into one array.
YY = np.array(one_hot(Y.reshape(-1)))

# Prepend a column of ones to X (the bias input of the first layer).
X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)

In [4]:
def serialize(theta_1,theta_2):
    """Flatten two weight matrices and join them into a single 1-D vector.

    The elements of ``theta_1`` (row-major order) come first, followed by
    the elements of ``theta_2``.
    """
    flat_parts = [np.ravel(matrix) for matrix in (theta_1, theta_2)]
    return np.concatenate(flat_parts)

In [5]:
def deserialize(theta_all, input_size=400, hidden_size=25, num_labels=10):
    """Split a flat parameter vector back into the two weight matrices.

    Inverse of ``serialize`` for a network with one hidden layer. Each
    matrix carries one extra column for the bias weights.

    Parameters
    ----------
    theta_all : array_like
        Flat vector: first-layer weights followed by second-layer weights.
    input_size : int, optional
        Number of input features without the bias column. Defaults to 400.
    hidden_size : int, optional
        Number of hidden units. Defaults to 25.
    num_labels : int, optional
        Number of output units. Defaults to 10.

    Returns
    -------
    tuple of numpy.ndarray
        ``(theta_1, theta_2)`` with shapes
        ``(hidden_size, input_size + 1)`` and ``(num_labels, hidden_size + 1)``;
        with the defaults these are ``(25, 401)`` and ``(10, 26)``.

    Raises
    ------
    ValueError
        If ``theta_all`` does not hold exactly the expected number of values.
    """
    theta_all = np.ravel(theta_all)
    split = hidden_size * (input_size + 1)
    expected = split + num_labels * (hidden_size + 1)
    if theta_all.size != expected:
        raise ValueError(
            "expected %d parameters, got %d" % (expected, theta_all.size)
        )
    first = theta_all[:split].reshape(hidden_size, input_size + 1)
    second = theta_all[split:].reshape(num_labels, hidden_size + 1)
    return first, second

In [6]:
def sigmoid(X):
    """Element-wise logistic function ``1 / (1 + exp(-X))``.

    Evaluated in a numerically stable way: the exponential is only ever
    taken of a non-positive number, so large-magnitude inputs saturate to
    0 or 1 without triggering an overflow warning.

    Parameters
    ----------
    X : scalar or array_like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Values in ``[0, 1]`` with the same shape as ``X``.
    """
    X = np.asarray(X)
    # exp(-|X|) always lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(X))
    # For X >= 0 use 1 / (1 + e^-X); for X < 0 use the algebraically
    # identical e^X / (1 + e^X).
    result = np.where(X >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of one or more dimensions unchanged.
    return result[()]

In [7]:
def feed_forward(theta_all,X):
    """Run a forward pass through the two-layer network.

    Parameters
    ----------
    theta_all : numpy.ndarray
        Flat parameter vector, split into the two weight matrices by
        ``deserialize``.
    X : numpy.ndarray
        Input matrix, one sample per row. It is multiplied directly with the
        first weight matrix, so it must already contain the bias column.

    Returns
    -------
    tuple
        ``(a1, z2, a2, z3, a3)``: the input, the hidden pre-activation, the
        hidden activation with a leading column of ones, the output
        pre-activation, and the output activation.
    """
    w_1, w_2 = deserialize(theta_all)

    # Hidden layer.
    a1 = X
    z2 = a1 @ w_1.T
    a2 = sigmoid(z2)
    # Prepend the bias unit so a2 lines up with the columns of w_2.
    a2 = np.insert(a2, 0, np.ones(a2.shape[0]), axis=1)

    # Output layer.
    z3 = a2 @ w_2.T
    a3 = sigmoid(z3)

    return a1, z2, a2, z3, a3

In [8]:
# Sanity check: print the shapes of the weights, the one-hot targets, and the
# hidden-layer pre-activation (item 1 of the tuple returned by feed_forward).
print(theta_2.shape)
print(theta_1.shape)
print(YY.shape)
print(feed_forward(serialize(theta_1, theta_2), X)[1].shape)

(10, 26)
(25, 401)
(5000, 10)
(5000, 25)


In [9]:
def cost(theta, X, y):
    """Mean cross-entropy cost of the network, without regularisation.

    Parameters
    ----------
    theta : numpy.ndarray
        Flat parameter vector passed on to ``feed_forward``.
    X : numpy.ndarray
        Input matrix, one sample per row.
    y : numpy.ndarray
        Target matrix with the same shape as the network output.

    Returns
    -------
    float
        The cross-entropy summed over all outputs and samples, divided by
        the number of rows of ``y``.
    """
    h = feed_forward(theta, X)[-1]
    # Keep activations strictly inside (0, 1): a saturated output of exactly
    # 0 or 1 would otherwise produce log(0) = -inf and a NaN/inf cost.
    eps = np.finfo(h.dtype).eps
    h = np.clip(h, eps, 1 - eps)
    tmp = -y * np.log(h) - (1 - y) * np.log(1 - h)
    return tmp.sum() / y.shape[0]

In [10]:
def sigmoid_gradient(X):
    """Derivative of the logistic function, ``sigmoid(X) * (1 - sigmoid(X))``.

    The sigmoid is evaluated once and reused for both factors.
    """
    activation = sigmoid(X)
    return activation * (1 - activation)

In [11]:
def gradient(theta_all,X,Y):
    """Backpropagate to get the unregularised gradient for both weight matrices.

    Parameters
    ----------
    theta_all : numpy.ndarray
        Flat parameter vector, as accepted by ``feed_forward``.
    X : numpy.ndarray
        Input matrix, one sample per row.
    Y : numpy.ndarray
        Target matrix with the same shape as the network output.

    Returns
    -------
    numpy.ndarray
        Flat vector produced by ``serialize(D1, D2)``, laid out like
        ``theta_all``.
    """
    # Only the second-layer weights are needed to push the error backwards.
    _, w_2 = deserialize(theta_all)
    a1, z2, a2, z3, a3 = feed_forward(theta_all, X)
    m = len(X)

    # Output-layer error.
    d3 = a3 - Y
    # Hidden-layer error: drop the bias column of w_2 (the bias unit has no
    # incoming weights), then scale by the sigmoid derivative.
    d2 = (d3 @ w_2[:, 1:]) * sigmoid_gradient(z2)

    # Average the per-sample outer products over the m samples.
    D2 = (d3.T @ a2) / m
    D1 = (d2.T @ a1) / m
    return serialize(D1, D2)

In [12]:
def reg_cost(theta_serialize,X,y,lamda):
    """Cost of the network plus an L2 penalty on the non-bias weights.

    The penalty is ``lamda / (2 * len(X))`` times the sum of squared weights,
    taken over every column except column 0 of each weight matrix.
    """
    weight_matrices = deserialize(theta_serialize)
    # Note: the slices start at column 1, so the bias weights are excluded.
    squared_total = sum(
        np.sum(np.power(weights[:, 1:], 2)) for weights in weight_matrices
    )
    penalty = squared_total * lamda / (2 * len(X))
    # Note: the returned value is the penalty added to the plain cost().
    return penalty + cost(theta_serialize, X, y)

In [13]:
def reg_gradient(theta_serialize,X,y,lamda):
    """Gradient of the network plus the L2 regularisation term.

    Starts from ``gradient`` and adds ``lamda / len(X)`` times the weight to
    every entry except those in column 0 of each matrix.
    """
    grad_1, grad_2 = deserialize(gradient(theta_serialize, X, y))
    theta1, theta2 = deserialize(theta_serialize)
    sample_count = len(X)

    # Column 0 (bias weights) is left without a regularisation term.
    for grad, weights in ((grad_1, theta1), (grad_2, theta2)):
        grad[:, 1:] = grad[:, 1:] + weights[:, 1:] * lamda / sample_count

    return serialize(grad_1, grad_2)

In [14]:
def training(X, Y, lamda=1, seed=None):
    """Fit the network weights by minimising the regularised cost.

    Parameters
    ----------
    X : numpy.ndarray
        Input matrix, one sample per row, passed through to ``reg_cost``.
    Y : numpy.ndarray
        Target matrix, passed through to ``reg_cost``.
    lamda : float, optional
        Regularisation strength handed to ``reg_cost`` and ``reg_gradient``.
        Defaults to 1.
    seed : int or None, optional
        Seed for the random initial weights. With ``None`` (the default) the
        global numpy random state is used, so results vary between runs.

    Returns
    -------
    scipy.optimize.OptimizeResult
        Result of ``scipy.optimize.minimize``; the fitted flat parameter
        vector is in its ``x`` attribute.
    """
    # Total parameter count for the (25, 401) and (10, 26) weight matrices
    # that deserialize() produces.
    n_params = 25 * 401 + 10 * 26

    # Small random starting weights drawn uniformly from [-0.12, 0.12).
    rng = np.random if seed is None else np.random.RandomState(seed)
    theta = rng.uniform(-0.12, 0.12, n_params)

    result = minimize(
        fun=reg_cost,
        x0=theta,
        args=(X, Y, lamda),
        method='TNC',
        jac=reg_gradient,
    )
    return result

In [15]:
# Fit the network on X and the one-hot targets YY.
res = training(X,YY)
# Run the fitted parameters forward; the last item of the returned tuple is
# the output-layer activation.
y_p = feed_forward(res.x,X)[-1]

In [16]:
# Display the optimiser result returned by training().
res

 message: Converged (|f_n-f_(n-1)| ~= 0)
 success: True
  status: 1
     fun: 0.2974060293738159
       x: [-2.068e+00 -5.269e-11 ... -1.622e+00 -1.094e+00]
     nit: 165
     jac: [ 8.808e-08 -1.054e-14 ...  8.438e-08  3.500e-08]
    nfev: 4348

In [17]:
# The column with the largest activation is the predicted class; argmax gives
# a 0-based column index, so add 1 to get back to the 1-based labels.
y_pred = y_p.argmax(axis=1) + 1
print(y_pred)

[10 10 10 ...  9  9  9]


In [18]:
# Compare the predicted labels against the true labels Y, class by class.
print(classification_report(y_true=Y, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.99      1.00      1.00       500
           2       1.00      1.00      1.00       500
           3       1.00      0.99      1.00       500
           4       1.00      0.99      1.00       500
           5       1.00      1.00      1.00       500
           6       1.00      1.00      1.00       500
           7       0.99      1.00      1.00       500
           8       1.00      1.00      1.00       500
           9       0.99      0.99      0.99       500
          10       1.00      1.00      1.00       500

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000

