# Подготовка данных

In [510]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import math
from prettytable import PrettyTable

In [511]:
data = pd.read_csv("/content/drive/MyDrive/ais-datasets/diabetes.csv")
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Предварительная обработка

In [512]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [513]:
data.shape

(768, 9)

In [514]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Отсутствующих значений не обнаружено

# Разделение датасета на матрицу признаков `X` и вектор зависимых переменных `Y`

In [515]:
X = data.drop('Outcome', axis=1)
Y = data['Outcome']

In [516]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [517]:
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

# Кодирование категориальных признаков

Не требуется:

In [518]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


# Разделение данных на обучающую и тестовую выборки

In [519]:
# Список индексов данных
indices = list(range(len(X)))

# Размер тестовой выборки
test_size = 0.2 # 20%

# Начальное состояние генератора случайных чисел
random.seed(42)

# Шафлим данные (чтобы потом не балансировать)
random.shuffle(indices)

split_index = int(len(X) * test_size)

X_train = X.iloc[indices[split_index:]]
X_test = X.iloc[indices[:split_index]]
Y_train = Y.iloc[indices[split_index:]]
Y_test = Y.iloc[indices[:split_index]]

Данные после разделения:

In [520]:
print('X_train ->', X_train.shape)
print('X_test ->', X_test.shape)
print('Y_train ->', Y_train.shape)
print('Y_test ->', Y_test.shape)

X_train -> (615, 8)
X_test -> (153, 8)
Y_train -> (615,)
Y_test -> (153,)


# Масштабирование данных

## Min-max scaler

Для обучающей выборки

In [521]:
for column_name, params in X_train.items():
  minimum = min(params)
  maximum = max(params)
  difference = maximum - minimum
  X_train[column_name] = (X_train[column_name] - minimum) / difference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[column_name] = (X_train[column_name] - minimum) / difference


Для тестовой выборки

In [522]:
for column_name, params in X_test.items():
  minimum = min(params)
  maximum = max(params)
  difference = maximum - minimum
  X_test[column_name] = (X_test[column_name] - minimum) / difference

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[column_name] = (X_test[column_name] - minimum) / difference


Данные после масштабирования

In [523]:
X_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
489,0.470588,0.974874,0.655738,0.0,0.0,0.439394,0.198542,0.766667
451,0.117647,0.673367,0.57377,0.0,0.0,0.486532,0.194683,0.033333
637,0.117647,0.472362,0.622951,0.181818,0.08871,0.531987,0.240566,0.033333
399,0.176471,0.969849,0.57377,0.313131,0.0,0.587542,0.065609,0.066667
452,0.0,0.457286,0.557377,0.323232,0.282258,0.671717,0.125643,0.066667


In [524]:
X_test.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
490,0.133333,0.42132,0.570175,0.518519,0.078014,0.548435,0.323547,0.06383
593,0.133333,0.416244,0.45614,0.407407,0.135934,0.424739,0.95185,0.085106
719,0.333333,0.492386,0.666667,0.5,0.0,0.530551,0.17616,0.659574
174,0.133333,0.380711,0.561404,0.444444,0.065012,0.442623,0.171462,0.255319
595,0.0,0.954315,0.719298,0.259259,0.218676,0.4769,0.354668,0.021277


# Реализация метода логистической регрессии

## Определим сигмоидную функцию и функцию потерь

In [525]:
def sig(t):
    return 1 / (1 + np.exp(-t))

In [526]:
def cost(Y_actual, Y_predicted):
    return -np.mean(Y_actual * np.log(Y_predicted) + (1 - Y_actual) * np.log(1 - Y_predicted))

## Определим методы обучения

### Градиентный спуск

In [527]:
def gradient_descent(X_train, Y_train, iterations, learning_rate):
    objects_num, characteristics_num = X_train.shape

    weights = np.zeros(characteristics_num)
    losses = []
    bias = 0

    for iteration in range(1, iterations + 1):

        t = np.dot(X_train, weights) + bias
        #  prediction
        z = sig(t)

        #  ЧП стоимости по весам
        dw = (1 / objects_num) * np.dot(X_train.T, (z - Y_train))
        #  ЧП стоимости по смещению
        db = (1 / objects_num) * np.sum(z - Y_train)

        weights -= learning_rate * dw
        bias -= learning_rate * db

        if iteration % 100 == 0:
            loss = cost(Y_train, z)
            losses.append(loss)
            # print(f'{iteration}) cost = {loss}')

    coeff = {'weights': weights, 'bias': bias}
    return coeff, losses

# gradient_descent(X_train, Y_train, 100, 0.01)

### Оптимизация Ньютона

In [528]:
def newton_optimization(X_train, Y_train, iterations):
    objects_num, characteristics_num = X_train.shape

    weights = np.zeros(characteristics_num)
    losses = []
    bias = 0

    for iteration in range(1, iterations + 1):

        t = np.dot(X_train, weights) + bias
        #  prediction
        z = sig(t)

        #  ЧП стоимости по весам
        dw = (1 / objects_num) * np.dot(X_train.T, (z - Y_train))
        #  ЧП стоимости по смещению
        db = (1 / objects_num) * np.sum(z - Y_train)

        hessian = (1 / objects_num) * (X_train.T @ ((z * (1 - z)) * X_train.T).T)

        weights -= np.linalg.inv(hessian) @ dw
        bias -= db

        if iteration % 100 == 0:
            loss = cost(Y_train, z)
            losses.append(loss)
            # print(f'{iteration}) cost = {loss}')

    coeff = {'weights': weights, 'bias': bias}
    return coeff, losses

# newton_optimization(X_train, Y_train, 100, 0.01)

## Определим функцию предсказания

In [529]:
def predict(X_test, coeff):
    weights = coeff['weights']
    bias = coeff['bias']

    t = np.dot(X_test, weights) + bias

    z = sig(t)

    return (z > 0.6).astype(int)

# coeff, losses = newton_optimization(X_train, Y_train, 100, 0.01)
# predict(X_train, coeff)

# Оценка модели

Определим функцию для подсчета метрик

In [530]:
def calculate_metrics(Y_prediction, Y_test):
    TP = np.sum((Y_prediction == 1) & (Y_test == 1))
    TN = np.sum((Y_prediction == 0) & (Y_test == 0))
    FP = np.sum((Y_prediction == 1) & (Y_test == 0))
    FN = np.sum((Y_prediction == 0) & (Y_test == 1))

    accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) != 0 else 0
    precision = TP / (TP + FP) if (TP + FP) != 0 else 0
    recall = TP / (TP + FN) if (TP + FN) != 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0

    return {'accuracy': accuracy, 'precision': precision,  'recall': recall, 'f1_score': f1_score}

# Исследование гиперпараметров

Создадим вариации гиперпараметров

In [531]:
rates = [0.01, 0.2, 0.375, 0.5]
iterations = [100, 1000, 5000]

In [532]:
max_f1_score = 0
best_params = {}
table = PrettyTable(['method', 'rate', 'iterations', 'accuracy', 'precision', 'recall', 'f1_score', 'losses'])
table.align['rate'] = "l"
table.align['iterations'] = "l"
table.align['accuracy'] = "l"
table.align['precision'] = "l"
table.align['recall'] = "l"
table.align['f1_score'] = "l"
table.align['losses'] = "l"

#  Для метода градиентного спуска
for rate in rates:
    for iteration in iterations:

        coeff, losses = gradient_descent(X_train, Y_train, iteration, rate)
        Y_prediction = predict(X_test, coeff)

        metrics = calculate_metrics(Y_prediction, Y_test)

        if (metrics['f1_score'] > max_f1_score):
            best_params = {'method': gradient_descent.__name__, 'rate': rate, 'iterations': iteration, 'accuracy': metrics['accuracy'], 'precision': metrics['precision'], 'recall': metrics['recall'], 'f1_score': metrics['f1_score'], 'losses': losses[0] - losses[len(losses) - 1]}

        table.add_row([gradient_descent.__name__, rate, iteration, metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1_score'], losses[0] - losses[len(losses) - 1]])

#  Для метода Ньютона
for iteration in iterations:

    coeff, losses = newton_optimization(X_train, Y_train, iteration)
    Y_prediction = predict(X_test, coeff)

    metrics = calculate_metrics(Y_prediction, Y_test)

    if (metrics['f1_score'] > max_f1_score):
        best_params = {'method': newton_optimization.__name__, 'rate': '-', 'iterations': iteration, 'accuracy': metrics['accuracy'], 'precision': metrics['precision'], 'recall': metrics['recall'], 'f1_score': metrics['f1_score'], 'losses': losses[0] - losses[len(losses) - 1]}

    table.add_row([newton_optimization.__name__, '-', iteration, metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1_score'], losses[0] - losses[len(losses) - 1]])

print(table)

+---------------------+-------+------------+--------------------+--------------------+---------------------+---------------------+----------------------+
|        method       | rate  | iterations | accuracy           | precision          | recall              | f1_score            | losses               |
+---------------------+-------+------------+--------------------+--------------------+---------------------+---------------------+----------------------+
|   gradient_descent  | 0.01  | 100        | 0.6013071895424836 | 0                  | 0.0                 | 0                   | 0.0                  |
|   gradient_descent  | 0.01  | 1000       | 0.6013071895424836 | 0                  | 0.0                 | 0                   | 0.029989532332527657 |
|   gradient_descent  | 0.01  | 5000       | 0.6013071895424836 | 0                  | 0.0                 | 0                   | 0.08038821404821395  |
|   gradient_descent  | 0.2   | 100        | 0.6013071895424836 | 0         

Выведем лучшую калибровку гиперпараметров

In [533]:
best_params_table = PrettyTable(['method', 'rate', 'iterations', 'accuracy', 'precision', 'recall', 'f1_score', 'losses'])
best_params_table.add_row([best_params['method'], best_params['rate'], best_params['iterations'], best_params['accuracy'], best_params['precision'], best_params['recall'], best_params['f1_score'], best_params['losses']])

print(best_params_table)

+---------------------+------+------------+--------------------+-----------+--------------------+-------------------+---------------------+
|        method       | rate | iterations |      accuracy      | precision |       recall       |      f1_score     |        losses       |
+---------------------+------+------------+--------------------+-----------+--------------------+-------------------+---------------------+
| newton_optimization |  -   |    5000    | 0.7189542483660131 |    0.75   | 0.4426229508196721 | 0.556701030927835 | 0.05782816516573308 |
+---------------------+------+------------+--------------------+-----------+--------------------+-------------------+---------------------+


# Выводы

В ходе анализа представленной таблицы сравнения был сделан следующий вывод:

- Метод ньютона в среднем работает точнее
- Для выбора оптимального количества итераций методу ньютона нет надобности варьировать значение learning_rate (шаг) в связи с вычислением гессиана (второй производной функции)
- Метод градиентного спуска не всегда сходится к оптимальному решению  (из-за сложной формы функции и множества локальных минимумов)