In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Design matrix: 10 samples x 4 columns. The first column is a constant 1
# (presumably the intercept term; the meaning of the other columns is not stated here).
X = np.array([ [   1,    1,  500,    1],
               [   1,    1,  700,    1],
               [   1,    2,  750,    2],
               [   1,    5,  600,    1],
               [   1,    3, 1450,    2],
               [   1,    0,  800,    1],
               [   1,    5, 1500,    3],
               [   1,   10, 2000,    3],
               [   1,    1,  450,    1],
               [   1,    2, 1000,    2]], dtype=np.float64)

# Binary class labels (0 or 1), one per row of X.
y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1], dtype=np.float64)

# Задание №1

Измените функцию calc_logloss так, чтобы нули по возможности не попадали в np.log.

In [3]:
def calc_logloss(y, y_pred):
    """Return the mean binary cross-entropy (log loss).

    Predicted probabilities are clipped to ``[eps, 1 - eps]`` (``eps`` is the
    machine epsilon of the prediction dtype) so that ``np.log`` never
    receives an exact zero.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities of class 1. Integer input such as
        ``[1, 0]`` is accepted and treated as float64.

    Returns
    -------
    float
        The mean log loss over all samples.
    """
    y = np.asarray(y, dtype=np.float64)
    y_pred = np.asarray(y_pred)
    # np.finfo only accepts floating dtypes; hard 0/1 predictions usually
    # arrive as integers, so promote them before asking for eps.
    if not np.issubdtype(y_pred.dtype, np.floating):
        y_pred = y_pred.astype(np.float64)

    eps = np.finfo(y_pred.dtype).eps
    # Vectorized clipping keeps both log arguments strictly positive.
    P = np.clip(y_pred, eps, 1 - eps)
    err = - np.mean(y * np.log(P) + (1.0 - y) * np.log(1.0 - P))

    return err

In [4]:
# Usage example
y1 = np.array([1, 0])
y_pred1 = np.array([0.8, 0.1])
calc_logloss(y1, y_pred1)

0.164252033486018

In [5]:
# Bad usage example: a predicted probability of exactly 1 makes 1 - p equal to 0,
# which would reach np.log unless the prediction is clipped first
y1 = np.array([1, 0])
y_pred1 = np.array([1, 0.2])
calc_logloss(y1, y_pred1)

0.11157177565710497

Проверка

In [6]:
from sklearn.metrics import log_loss

In [7]:
# Reference value from scikit-learn for the same inputs
log_loss(y1, y_pred1)

0.11157177565710535

In [8]:
# Gap between the hand-written loss and scikit-learn's result for the same inputs
calc_logloss(y1, y_pred1) - log_loss(y1, y_pred1)

-3.885780586188048e-16

# Задание №2

Подберите аргументы функции eval_model для логистической регрессии таким образом, чтобы log loss был минимальным.

In [9]:
def standard_scale(X):
    """Standardize each column to zero mean and unit standard deviation.

    Parameters
    ----------
    X : array-like
        Data whose statistics are computed along axis 0.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` per column. A column with zero standard
        deviation (a constant feature) is mapped to zeros instead of NaN.
    """
    X = np.asarray(X)
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0 and would produce 0/0 = NaN; dividing by 1
    # instead leaves the centred (all-zero) values untouched.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

In [10]:
# Standardize columns 1..3 on a copy of X; column 0 (the constant 1) is left unchanged
X_st = X.copy().astype(np.float64)
X_st[:, 1:4] = standard_scale(X_st[:, 1:4])
X_st

array([[ 1.        , -0.70710678, -0.97958969, -0.89625816],
       [ 1.        , -0.70710678, -0.56713087, -0.89625816],
       [ 1.        , -0.35355339, -0.46401617,  0.38411064],
       [ 1.        ,  0.70710678, -0.77336028, -0.89625816],
       [ 1.        ,  0.        ,  0.97958969,  0.38411064],
       [ 1.        , -1.06066017, -0.36090146, -0.89625816],
       [ 1.        ,  0.70710678,  1.08270439,  1.66447944],
       [ 1.        ,  2.47487373,  2.11385144,  1.66447944],
       [ 1.        , -0.70710678, -1.08270439, -0.89625816],
       [ 1.        , -0.35355339,  0.05155735,  0.38411064]])

In [11]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Values in ``[0, 1]`` with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp is only ever taken of a non-positive number, so it cannot overflow
    # (the naive exp(-z) emits a RuntimeWarning for large negative z).
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same value.
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as is.
    return np.where(z >= 0, 1 / (1 + e), e / (1 + e))[()]

In [12]:
def eval_model(X, y, iterations, eta=1):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix with one row per sample and one column per weight.
    y : numpy.ndarray
        Binary labels (0 or 1), one per row of ``X``.
    iterations : int
        Number of gradient steps.
    eta : float, optional
        Learning rate (step size).

    Returns
    -------
    W : numpy.ndarray
        Weights after the last update.
    final_error : float
        Log loss of the returned ``W`` on ``(X, y)``.

    Notes
    -----
    Reseeds NumPy's global random generator with 42, so the initial weights
    are the same on every call. Prints roughly ten progress lines per run.
    """
    np.random.seed(42)
    W = np.random.randn(X.shape[1])
    n = X.shape[0]

    # Integer logging period: `iterations / 10` is a float and gives an erratic
    # schedule when `iterations` is not a multiple of 10.
    log_every = max(1, iterations // 10)

    for i in range(iterations):
        z = np.dot(X, W)
        y_pred = sigmoid(z)
        err = calc_logloss(y, y_pred)

        # Gradient of the mean log loss with respect to W.
        dQ = 1/n * X.T @ (y_pred - y)
        W -= eta * dQ
        if i % log_every == 0:
            print(f'Iter - {i}, Weights - {W}, error - {err}')

    # Re-evaluate with the final weights: the last in-loop y_pred predates the
    # final update (and does not exist at all when iterations == 0).
    final_error = calc_logloss(y, sigmoid(np.dot(X, W)))
    return W, final_error

In [13]:
# Train on the standardized features: 100 gradient steps with learning rate 1
W, error = eval_model(X_st, y, iterations=100, eta=1)

Iter - 0, Weights - [ 0.46121499 -0.40908382  0.41426183  1.39855008], error - 0.760958797591889
Iter - 10, Weights - [ 0.0387634  -1.30386446 -0.15001914  1.74758184], error - 0.4254600283700163
Iter - 20, Weights - [-0.01161527 -1.64023671 -0.31196552  2.27533881], error - 0.38108858271134743
Iter - 30, Weights - [-0.01156172 -1.88360488 -0.46035705  2.69723081], error - 0.3542235298084087
Iter - 40, Weights - [ 9.33444488e-04 -2.07189803e+00 -5.97280973e-01  3.05064764e+00], error - 0.3358002141655348
Iter - 50, Weights - [ 0.01934395 -2.22448855 -0.72422385  3.35653644], error - 0.3221745410588311
Iter - 60, Weights - [ 0.04127542 -2.35245169 -0.84255667  3.6277315 ], error - 0.31154425165004745
Iter - 70, Weights - [ 0.06539266 -2.46266676 -0.95347146  3.87262358], error - 0.3029143996576552
Iter - 80, Weights - [ 0.090846   -2.55967079 -1.05796576  4.09697143], error - 0.2956909276718162
Iter - 90, Weights - [ 0.11707191 -2.64659092 -1.15686395  4.30488038], error - 0.28949721467

# Задание №3

Создайте функцию calc_pred_proba, возвращающую предсказанную вероятность класса 1. На вход подаётся W, который уже посчитан функцией eval_model, и X, на выходе — массив y_pred_proba.

In [14]:
def calc_pred_proba(W, X, decimals=4):
    """Return the predicted probability of class 1 for every row of ``X``.

    Parameters
    ----------
    W : numpy.ndarray
        Weight vector, e.g. as returned by ``eval_model``.
    X : numpy.ndarray
        Design matrix with one column per weight.
    decimals : int or None, optional
        Number of decimal places to round to (default 4, the previous fixed
        behaviour). Pass ``None`` to get the unrounded probabilities.

    Returns
    -------
    numpy.ndarray
        ``sigmoid(X @ W)``, rounded unless ``decimals`` is ``None``.
    """
    y_pred_proba = sigmoid(np.dot(X, W))
    if decimals is None:
        return y_pred_proba
    return np.round(y_pred_proba, decimals)

In [15]:
# Class-1 probabilities for the same standardized samples the model was fitted on
calc_pred_proba(W, X_st)

array([0.3238, 0.223 , 0.9677, 0.0079, 0.656 , 0.3674, 0.987 , 0.1477,
       0.3524, 0.9404])

# Задание №4

Создайте функцию calc_pred, возвращающую предсказанный класс. На вход подаётся W, который уже посчитан функцией eval_model, и X, на выходе — массив y_pred.

In [16]:
def calc_pred(W, X):
    """Return the predicted class (0 or 1) for every row of ``X``.

    A strictly negative linear score ``X @ W`` maps to class 0; every other
    score (including exactly zero) maps to class 1.
    """
    scores = np.dot(X, W)
    is_negative = scores < 0
    return np.where(is_negative, 0, 1)

In [17]:
# Hard class predictions for the standardized samples; reused by the metric cells below
y_pred = calc_pred(W, X_st)
y_pred

array([0, 0, 1, 0, 1, 0, 1, 0, 0, 1])

# Задание №5

Посчитайте Accuracy, матрицу ошибок, точность и полноту, а также F1 score.

In [18]:
def accuracy(y, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, same shape as ``y``.

    Returns
    -------
    float
        Share of positions where ``y`` and ``y_pred`` agree.

    Raises
    ------
    ValueError
        If the two inputs have different shapes.
    """
    # Plain lists compare as a whole (`[..] == [..]` is a single bool), so
    # convert first to get an element-wise comparison.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    if y.shape != y_pred.shape:
        # Broadcasting mismatched shapes would silently give a meaningless score.
        raise ValueError(
            f'y and y_pred must have the same shape, got {y.shape} and {y_pred.shape}'
        )
    return np.mean(y == y_pred)

In [19]:
# Share of samples whose predicted class matches the label
accuracy(y, y_pred)

0.9

In [20]:
def conf_matrix(y, y_pred):
    """Build a 2x2 confusion matrix laid out as ``[[TP, FP], [FN, TN]]``.

    Row 0 counts samples predicted as class 1, row 1 those predicted as
    class 0. Samples whose prediction is neither 0 nor 1 are not counted.
    The counts are returned as a float array.
    """
    counts = np.zeros((2, 2))
    for idx in range(len(y)):
        actual, predicted = y[idx], y_pred[idx]
        if predicted == 1:
            # Predicted positive: a hit is a TP, a miss is an FP.
            counts[0, 0 if actual == predicted else 1] += 1
        elif predicted == 0:
            # Predicted negative: a hit is a TN, a miss is an FN.
            counts[1, 1 if actual == predicted else 0] += 1
    return counts

In [21]:
# Layout of the result is [[TP, FP], [FN, TN]]
conf_matrix(y, y_pred)

array([[4., 0.],
       [1., 5.]])

In [22]:
def precision(y, y_pred):
    """Return precision, ``TP / (TP + FP)``, for class 1.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    float
        Precision, or 0.0 when no sample was predicted as class 1 (the ratio
        is undefined there; 0.0 avoids a 0/0 NaN).
    """
    # Build the confusion matrix once; its layout is [[TP, FP], [FN, TN]].
    matrix = conf_matrix(y, y_pred)
    TP = matrix[0][0]
    FP = matrix[0][1]
    predicted_positive = TP + FP
    if predicted_positive == 0:
        return 0.0
    return TP / predicted_positive

In [23]:
# Precision for class 1: TP / (TP + FP)
precision(y, y_pred)

1.0

In [24]:
def recall(y, y_pred):
    """Return recall, ``TP / (TP + FN)``, for class 1.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    float
        Recall, or 0.0 when the counted samples contain no true class-1
        label (the ratio is undefined there; 0.0 avoids a 0/0 NaN).
    """
    # Build the confusion matrix once; its layout is [[TP, FP], [FN, TN]].
    matrix = conf_matrix(y, y_pred)
    TP = matrix[0][0]
    FN = matrix[1][0]
    actual_positive = TP + FN
    if actual_positive == 0:
        return 0.0
    return TP / actual_positive

In [25]:
# Recall for class 1: TP / (TP + FN)
recall(y, y_pred)

0.8

In [26]:
def f1_score(y, y_pred):
    """Return the F1 score: the harmonic mean of precision and recall.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted labels (0 or 1).

    Returns
    -------
    float
        ``2 * P * R / (P + R)``, or 0.0 when ``P + R`` is zero or undefined
        (NaN), where the harmonic mean cannot be computed.
    """
    # Evaluate each metric once instead of twice.
    p = precision(y, y_pred)
    r = recall(y, y_pred)
    total = p + r
    # `not (total > 0)` is also true for NaN, so an undefined input yields 0.0.
    if not (total > 0):
        return 0.0
    return 2 * p * r / total

In [27]:
# F1 score for class 1
f1_score(y, y_pred)

0.888888888888889

Проверка

In [28]:
from sklearn.metrics import classification_report

In [29]:
# Cross-check the hand-written metrics against scikit-learn's per-class report
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      1.00      0.91         5
         1.0       1.00      0.80      0.89         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10

