# Многослойный перцептрон

## MSE и MAE функции ошибки

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np

In [2]:
class FullyConnectedNetwork:
    __REGULARIZATION_GRAD = {None: lambda _w: 0, "l1": lambda _w: np.sign(_w), "l2": lambda _w: 2*_w}
    __REGULARIZATION_FUNC = {None: lambda _w: 0, "l1": lambda _w: np.abs(_w), "l2": lambda _w: _w ** 2}
    __LOSS = 0
    # создание нейронной сети: alpha — скорость обучения (шаг градиентного спуска), reg_type — тип регуляризации (если есть), lambda — параметр регуляризации; слои будут храниться в списке layers
    def __init__(self, alpha=0.01, reg_type=None, lambda_=0):
        self.__layers = list()
        self.__alpha = alpha
        self.__reg_type = reg_type
        self.__lambda = lambda_

    # метод, позволяющий добавить новый слой: указываем правильные размеры слоя, название функции активации, class_number — количество классов в случае использования Sotmax'а на последнем слое, параметр a — параметр LeakyReLU
    def add_layer(self, size: tuple, activation_func: str, a=0, loss_ = 'None'):
        if not self.__layers or self.__layers[-1].size[1] == size[0]:
            self.__layers.append(FullyConnectedLayer(size, activation_func, a, loss_))
        else:
            raise Exception("Wrong size of the layer!")

    def change_alpha(self, alpha):
        self.__alpha = alpha

    def get_loss(self):
        return FullyConnectedNetwork.__LOSS

    # метод, выдающий предсказания для заданного набора данных после обучения модели
    def predict(self, data):
        current_output = data
        for layer in self.__layers:
            layer_weights, layer_biases = layer.get_weights()
            current_output = np.matmul(current_output, layer_weights) - layer_biases
        return current_output

    def score(self, data, answers):
        return np.round((np.sum((answers - self.predict(data)) ** 2)) / len(answers), 2)

    def fit(self, data, answers):
        # выход входного слоя совпадает с фичами входных данных
        layer_outputs = [data]
        current_output = layer_outputs[0]
        grads = []
        # forward pass и вычисление градиентов функций активации
        for layer in self.__layers:
            current_output, gradient = layer.forward(current_output, answers)
            #print(current_output[0], gradient[0])
            layer_outputs.append(current_output)
            grads.append(gradient)
        # для вычисления градиентов по правилу цепочки, удобно развернуть массив
        grads = grads[::-1]
        # для градиента параметров самого первого слоя, умножаем на «производную» независимой переменной
        grads.append(1)
        current_gradient = grads[0]
        FullyConnectedNetwork.__LOSS = layer_outputs[-1]
        for i, layer in enumerate(self.__layers[::-1]):
            layer_weights, layer_biases = layer.get_weights()
            FullyConnectedNetwork.__LOSS += self.__lambda * (np.sum(FullyConnectedNetwork.__REGULARIZATION_FUNC[self.__reg_type](layer_weights) + FullyConnectedNetwork.__REGULARIZATION_FUNC[self.__reg_type](layer_biases)))
            d_weights = np.matmul(layer_outputs[-2 - i].T, current_gradient)
            # вычисление градиента параметров db слоя layer
            d_bias = -np.matmul(np.ones(layer_outputs[-2 - i].shape[0]), current_gradient) / layer_outputs[-2 - i].shape[0]
            # выполнение шага градиентного спуска
            layer.update_weights(self.__alpha * (d_weights + self.__lambda * FullyConnectedNetwork.__REGULARIZATION_GRAD[self.__reg_type](layer_weights)) , self.__alpha * (d_bias + self.__lambda * FullyConnectedNetwork.__REGULARIZATION_GRAD[self.__reg_type](layer_biases)))
            # правило цепочки
            current_gradient = np.matmul(current_gradient, layer_weights.T) * grads[i + 1]

# класс, отвечающий за слой в нейронной сети
class FullyConnectedLayer:
    # мы предполагаем, что реализованы следующие функции активации, на последнем слое возможно решение задачи классификации с Softmax
    __ACTIVATION_FUNCTIONS = {'ReLU': {'func': lambda a, x: np.maximum(x, 0), 'derivative': lambda a, x: np.where(x >= 0, 1, 0)},
                              'LReLU': {'func': lambda a, x: np.where(x >= 0, x, a*x), 'derivative': lambda a, x: np.where(x >= 0, 1, a)},
                              'None': {'func': lambda a, x: x, 'derivative': lambda a, x: 1},
                              'Sigmoid': {'func': lambda a, x: np.exp(x) / (1 + np.exp(x)), 'derivative': lambda a, x: np.exp(x) / (1 + np.exp(x)) ** 2}}
    # создание нового слоя: задание размеров слоя, случайная (равномерная на [-1/2, 1/2]) инициализация весов, запоминание функции активации, фиксация количества классов в случае решения задачи классификации
    def __init__(self, size: tuple, activation_func: str, a=0, loss_ = 'None'):
        self.size = size
        self.__weights = np.random.random((size[0], size[1])) - 0.5
        self.__bias = np.random.random((1, size[1])) - 0.5
        self.__a = a
        self.__loss = loss_
        if activation_func in FullyConnectedLayer.__ACTIVATION_FUNCTIONS.keys():
            self.__activation_func = activation_func
        else:
            raise Exception("No such activation function!")

    # метод, возвращающий значения весов: веса и смещения
    def get_weights(self):
        return self.__weights, self.__bias

    # метод, модифицирующий веса после градиентного шага
    def update_weights(self, d_weights, d_biases):
        self.__weights -= d_weights
        self.__bias -= d_biases

    # метод, возвращающий градиент
    def __get_grad(self, data, answers):
        if self.__loss == 'MSE':
            return 2*(data-answers) / answers.shape[0]
        elif self.__loss == 'MAE':
            return -((answers - data) / (abs(answers - data) + 10**-100))/ answers.shape[0]
        else:
            return FullyConnectedLayer.__ACTIVATION_FUNCTIONS[self.__activation_func]['derivative'](self.__a, data)

    # проход по слою с вычислением градиента функции активации на текущей итерации и текущем наборе данных, для последнего слоя нет нужды вычислять значение, если только не хочется узнать что-то про функцию потерь
    def forward(self, data, answers):
        matrix_pass = np.matmul(data, self.__weights) - self.__bias
        activation = FullyConnectedLayer.__ACTIVATION_FUNCTIONS[self.__activation_func]['func'](self.__a, matrix_pass)
        gradient = self.__get_grad(matrix_pass, answers)
        return activation, gradient

In [3]:
import pandas as pd

Данные: https://www.kaggle.com/datasets/bumba5341/advertisingcsv

In [4]:
df = pd.read_csv('Advertising.csv', index_col=0)
df

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9
...,...,...,...,...
196,38.2,3.7,13.8,7.6
197,94.2,4.9,8.1,9.7
198,177.0,9.3,6.4,12.8
199,283.6,42.0,66.2,25.5


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Делим на обучающую и тестовую выборку

In [6]:
X = df.drop(columns='Sales').to_numpy()
y = df.Sales.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=343, shuffle=True)

Приводим к виду (количество образцов, данные)

In [7]:
y_train = y_train.reshape(-1, 1).astype("float32")
y_test = y_test.reshape(-1, 1).astype("float32")

Стандартизируем данные

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
X_train = X_train.reshape(-1, 3).astype("float32")
X_test = X_test.reshape(-1, 3).astype("float32")

In [10]:
NN = FullyConnectedNetwork(alpha=0.00001, reg_type=None, lambda_=0.2)
NN.add_layer((3,1),'ReLU', loss_ = 'MSE')

In [11]:
%%time
loss = []
batch_size = int(len(X_train) / 100)
for ep in range(2000):
    for _ in range(0, 100):
        X_batch = X_train[_ * batch_size : (_ + 1) * batch_size]
        y_batch = y_train[_ * batch_size : (_ + 1) * batch_size]
        NN.fit(X_batch, y_batch)
        loss.append((ep, NN.get_loss()))
    if (ep + 1) % 100 == 0:
        print('Training MSE: ', NN.score(X_train, y_train), 'Test MSE:', NN.score(X_test, y_test))

Training MSE:  140.9 Test MSE: 147.39
Training MSE:  92.46 Test MSE: 94.2
Training MSE:  61.4 Test MSE: 60.97
Training MSE:  41.31 Test MSE: 40.02
Training MSE:  28.21 Test MSE: 26.71
Training MSE:  19.63 Test MSE: 18.19
Training MSE:  13.97 Test MSE: 12.7
Training MSE:  10.24 Test MSE: 9.15
Training MSE:  7.77 Test MSE: 6.84
Training MSE:  6.13 Test MSE: 5.34
Training MSE:  5.05 Test MSE: 4.35
Training MSE:  4.33 Test MSE: 3.71
Training MSE:  3.86 Test MSE: 3.29
Training MSE:  3.56 Test MSE: 3.01
Training MSE:  3.36 Test MSE: 2.83
Training MSE:  3.23 Test MSE: 2.71
Training MSE:  3.16 Test MSE: 2.64
Training MSE:  3.11 Test MSE: 2.59
Training MSE:  3.08 Test MSE: 2.56
Training MSE:  3.06 Test MSE: 2.54
Wall time: 8.61 s


In [12]:
NN = FullyConnectedNetwork(alpha=0.0001, reg_type=None, lambda_=0.2)
NN.add_layer((3,1),'ReLU', loss_ = 'MAE')

In [13]:
%%time
loss = []
batch_size = int(len(X_train) / 100)
for ep in range(3000):
    for _ in range(0, 100):
        X_batch = X_train[_ * batch_size : (_ + 1) * batch_size]
        y_batch = y_train[_ * batch_size : (_ + 1) * batch_size]
        NN.fit(X_batch, y_batch)
        loss.append((ep, NN.get_loss()))
    if (ep + 1) % 100 == 0:
        print('Training MAE: ', NN.score(X_train, y_train), 'Test MAE:', NN.score(X_test, y_test))

Training MAE:  194.25 Test MAE: 211.17
Training MAE:  168.68 Test MAE: 182.73
Training MAE:  145.17 Test MAE: 156.59
Training MAE:  123.73 Test MAE: 132.75
Training MAE:  104.31 Test MAE: 111.1
Training MAE:  86.87 Test MAE: 91.63
Training MAE:  71.42 Test MAE: 74.57
Training MAE:  57.92 Test MAE: 59.9
Training MAE:  46.1 Test MAE: 47.24
Training MAE:  35.97 Test MAE: 36.59
Training MAE:  27.5 Test MAE: 27.82
Training MAE:  20.4 Test MAE: 20.49
Training MAE:  14.67 Test MAE: 14.59
Training MAE:  10.38 Test MAE: 10.2
Training MAE:  7.32 Test MAE: 7.1
Training MAE:  5.49 Test MAE: 5.25
Training MAE:  4.39 Test MAE: 4.17
Training MAE:  3.73 Test MAE: 3.47
Training MAE:  3.38 Test MAE: 3.04
Training MAE:  3.27 Test MAE: 2.89
Training MAE:  3.26 Test MAE: 2.86
Training MAE:  3.26 Test MAE: 2.82
Training MAE:  3.25 Test MAE: 2.76
Training MAE:  3.23 Test MAE: 2.7
Training MAE:  3.21 Test MAE: 2.67
Training MAE:  3.2 Test MAE: 2.67
Training MAE:  3.2 Test MAE: 2.66
Training MAE:  3.2 Test MAE