### Построить указанные в задании классификаторы.

#### Классификаторы

Классификатор 1 - Байес
Классификатор 2 - Логистическая регрессия

#### Признаки

malic_acid , nonflavanoid_phenols , magnesium

Лучшей парой из задания 4 были: nonflavanoid_phenols , magnesium


In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [116]:
# 1. Загрузить датасет WINE.
from sklearn.datasets import load_wine
wine = load_wine(as_frame=True)

In [117]:
# 2. Подготовить выборки: обучающую 70% и тестовую 30%.
def split_dataset(
        x: np.ndarray,
        y: np.ndarray,
        propotion=80) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:

    def get_intervals(ind):
        return ind[0][0], ind[0][-1]
    total_size = x.shape[0]
    train_size = int((propotion * total_size) / 100) - 1
    each_class_on_train = int(train_size / len(set(y)))
    each_class_on_test = int((total_size - train_size) / len(set(y)))

    x_train = np.empty((train_size, ), dtype=x.dtype)
    y_train = np.empty((train_size, ))
    x_test = np.empty((total_size - train_size, ), dtype=x.dtype)
    y_test = np.empty((total_size - train_size, ))

    for idx, c in enumerate(set(y)):
        min, max = get_intervals(np.where(y == c))

        inds = np.random.randint(min, max, size=each_class_on_train)
        x_train[
            idx * each_class_on_train:
            idx * each_class_on_train + each_class_on_train] = x[inds]
        y_train[
            idx * each_class_on_train:
            idx * each_class_on_train + each_class_on_train
        ] = y[inds]

        inds = np.random.randint(min, max, size=each_class_on_test)
        x_test[
            idx * each_class_on_test:
            idx * each_class_on_test + each_class_on_test] = x[inds]
        y_test[
            idx * each_class_on_test:
            idx * each_class_on_test + each_class_on_test
        ] = y[inds]
    return x_train, y_train, x_test, y_test


def extract_x_factors(
    data: pd.DataFrame,
    factors: list[str]
) -> np.ndarray:
    dtype = [(factor, data[factor].dtype) for factor in factors]
    result = np.zeros((len(data), ), dtype=dtype)
    for factor in factors:
        result[factor] = data[factor].to_numpy()
    return result


FACTORS = [
    'malic_acid',
    'nonflavanoid_phenols',
    'magnesium'
]
data = extract_x_factors(
    wine.data,
    FACTORS
)

x_train, y_train, x_test, y_test = split_dataset(
    data,
    wine.target,
    propotion=70
)

In [118]:
def merge_x_y(x, y) -> np.ndarray:
    new_dt = np.dtype(
        list(x.dtype.descr) +
        [('target', y_train.dtype)])
    merged = np.zeros(
        x.shape[0],
        dtype=new_dt)
    for name in x.dtype.names:
        merged[name] = x[name]
    merged['target'] = y
    return merged

In [119]:
def structed_to_reg_np(x: np.ndarray) -> np.ndarray:
    return x.view(np.float64).reshape(x.shape + (-1,))

In [120]:
from sklearn.metrics import classification_report


def report(model, x_test, y_true):
    y_pred = model.predict(x_test)
    print(classification_report(y_true.astype('int32'), y_pred.astype('int32')))

In [121]:
# 3. Построить указанные в задании классификаторы.
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

CLASSES = ['nonflavanoid_phenols', 'magnesium']
xt = structed_to_reg_np(x_train)
# # Классификатор 1 - Байес
br = GaussianNB()
br.fit(xt, y_train)
print('Bayes')
report(br, structed_to_reg_np(x_test), y_test)

# Классификатор 2 - Логистическая регрессия
print('LogisticRegression')
lr = LogisticRegression(max_iter=10000)
lr.fit(xt, y_train)
report(lr, structed_to_reg_np(x_test), y_test)

Bayes
              precision    recall  f1-score   support

           0       0.60      0.67      0.63        18
           1       0.71      0.53      0.61        19
           2       0.71      0.83      0.77        18

    accuracy                           0.67        55
   macro avg       0.68      0.68      0.67        55
weighted avg       0.68      0.67      0.67        55

LogisticRegression
              precision    recall  f1-score   support

           0       0.52      0.61      0.56        18
           1       0.62      0.53      0.57        19
           2       0.78      0.78      0.78        18

    accuracy                           0.64        55
   macro avg       0.64      0.64      0.64        55
weighted avg       0.64      0.64      0.64        55



In [122]:
# Провести анализ главных компонент с числом компонент =2.
from sklearn.decomposition import PCA
pm = PCA(n_components=2, copy=True)
pm.fit(xt, y_train)

In [123]:
xt = structed_to_reg_np(x_train)
pca_xt = pm.transform(xt)

In [124]:
# 5. Получить оценку качества классификации в новом пространстве
# признаков с использованием указанных классификаторов
print('Bayes')
print('Before')
report(br, structed_to_reg_np(x_test), y_test)

pca_br = GaussianNB()
pca_br.fit(pca_xt, y_train)
print('After')
report(pca_br, pm.transform(structed_to_reg_np(x_test)), y_test)

Before
              precision    recall  f1-score   support

           0       0.60      0.67      0.63        18
           1       0.71      0.53      0.61        19
           2       0.71      0.83      0.77        18

    accuracy                           0.67        55
   macro avg       0.68      0.68      0.67        55
weighted avg       0.68      0.67      0.67        55

After
              precision    recall  f1-score   support

           0       0.56      0.78      0.65        18
           1       0.83      0.53      0.65        19
           2       0.78      0.78      0.78        18

    accuracy                           0.69        55
   macro avg       0.72      0.69      0.69        55
weighted avg       0.73      0.69      0.69        55



In [127]:
print('LogisticRegression')
print('Before')
report(lr, structed_to_reg_np(x_test), y_test)

pca_lr = LogisticRegression()
pca_lr.fit(pca_xt, y_train)
print('After')
report(pca_lr, pm.transform(structed_to_reg_np(x_test)), y_test)

LogisticRegression
Before
              precision    recall  f1-score   support

           0       0.52      0.61      0.56        18
           1       0.62      0.53      0.57        19
           2       0.78      0.78      0.78        18

    accuracy                           0.64        55
   macro avg       0.64      0.64      0.64        55
weighted avg       0.64      0.64      0.64        55

After
              precision    recall  f1-score   support

           0       0.53      0.56      0.54        18
           1       0.58      0.58      0.58        19
           2       0.71      0.67      0.69        18

    accuracy                           0.60        55
   macro avg       0.60      0.60      0.60        55
weighted avg       0.60      0.60      0.60        55



In [138]:
# 6. Провести линейный дискриминантный анализ путем построения одного признака.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=1)
X_train = lda.fit_transform(xt, y_train)
X_test = lda.transform(structed_to_reg_np(x_test))

In [139]:
# 7. Получить оценку качества классификации в новом пространстве
# признаков с использованием указанных классификаторов
print('Bayes')
print('Before')
report(br, structed_to_reg_np(x_test), y_test)

lda_br = GaussianNB()
lda_br.fit(X_train, y_train)
print('After')
report(lda_br, X_test, y_test)

Bayes
Before
              precision    recall  f1-score   support

           0       0.60      0.67      0.63        18
           1       0.71      0.53      0.61        19
           2       0.71      0.83      0.77        18

    accuracy                           0.67        55
   macro avg       0.68      0.68      0.67        55
weighted avg       0.68      0.67      0.67        55

After
              precision    recall  f1-score   support

           0       0.52      0.72      0.60        18
           1       0.42      0.26      0.32        19
           2       0.83      0.83      0.83        18

    accuracy                           0.60        55
   macro avg       0.59      0.61      0.59        55
weighted avg       0.59      0.60      0.58        55



In [140]:
print('LogisticRegression')
print('Before')
report(lr, structed_to_reg_np(x_test), y_test)

lda_lr = LogisticRegression()
lda_lr.fit(X_train, y_train)
print('After')
report(lda_lr, X_test, y_test)

LogisticRegression
Before
              precision    recall  f1-score   support

           0       0.52      0.61      0.56        18
           1       0.62      0.53      0.57        19
           2       0.78      0.78      0.78        18

    accuracy                           0.64        55
   macro avg       0.64      0.64      0.64        55
weighted avg       0.64      0.64      0.64        55

After
              precision    recall  f1-score   support

           0       0.57      0.67      0.62        18
           1       0.50      0.42      0.46        19
           2       0.83      0.83      0.83        18

    accuracy                           0.64        55
   macro avg       0.63      0.64      0.64        55
weighted avg       0.63      0.64      0.63        55



### Сравнить результаты классификации в п5 и п 7 с лучшей парой признаков и классификатором из Задания 4.

#### Итоговые результаты

```
---------------- Logistic regression ----------------
None modified                                          PCA Modified                                           LDA Modified
              precision    recall  f1-score   support |              precision    recall  f1-score   support |              precision    recall  f1-score   support

           0       0.52      0.61      0.56        18 |           0       0.53      0.56      0.54        18 |           0       0.57      0.67      0.62        18
           1       0.62      0.53      0.57        19 |           1       0.58      0.58      0.58        19 |           1       0.50      0.42      0.46        19
           2       0.78      0.78      0.78        18 |           2       0.71      0.67      0.69        18 |           2       0.83      0.83      0.83        18

    accuracy                           0.64        55 |    accuracy                           0.60        55 |    accuracy                           0.64        55
   macro avg       0.64      0.64      0.64        55 |   macro avg       0.60      0.60      0.60        55 |   macro avg       0.63      0.64      0.64        55
weighted avg       0.64      0.64      0.64        55 |weighted avg       0.60      0.60      0.60        55 |weighted avg       0.63      0.64      0.63        55
----------------                     ----------------
------------------ Bayes            -----------------
None modified                                          PCA Modified                                           LDA Modified
              precision    recall  f1-score   support |              precision    recall  f1-score   support |              precision    recall  f1-score   support

           0       0.60      0.67      0.63        18 |           0       0.56      0.78      0.65        18 |           0       0.52      0.72      0.60        18
           1       0.71      0.53      0.61        19 |           1       0.83      0.53      0.65        19 |           1       0.42      0.26      0.32        19
           2       0.71      0.83      0.77        18 |           2       0.78      0.78      0.78        18 |           2       0.83      0.83      0.83        18

    accuracy                           0.67        55 |    accuracy                           0.69        55 |    accuracy                           0.60        55
   macro avg       0.68      0.68      0.67        55 |   macro avg       0.72      0.69      0.69        55 |   macro avg       0.59      0.61      0.59        55
weighted avg       0.68      0.67      0.67        55 |weighted avg       0.73      0.69      0.69        55 |weighted avg       0.59      0.60      0.58        55
------------------                  -----------------
------------------ PR4 Bayes        -----------------
              precision    recall  f1-score   support

           0       0.75      0.79      0.77        19
           1       0.61      0.61      0.61        18
           2       0.71      0.67      0.69        18

    accuracy                           0.69        55
   macro avg       0.69      0.69      0.69        55
weighted avg       0.69      0.69      0.69        55
------------------                  -----------------
```

#### Сделать вывод

Для PCA случаев изменение размерности привело к увелиечнию f1 точности для Баесовского классификатора, но при этом снижение этой же метрики для логистической регрессии.

LDA для одного признака даёт увеличение метрик точности для этого признака, при этом точность остальных признаков снижается

Что кастаельно сравнения с данными из предыдущей практики, то сравнение с ними будет некорректным (так как отсутствует использованный сид, поэтому фактически используемые при обучении выборки отличаются), но можно предположить, что снижение размерности данных также приведёт к увеличению точности