## **Модуль ML-7. Прогнозирование биологического ответа (HW-3)**
### ***Итоговое задание***: 

---

#### **1. Описание задачи:**  
Наша практика будет основана на соревновании Kaggle: Predicting a Biological Response (Прогнозирование биологического ответа).

Необходимо предсказать биологический ответ молекул (столбец 'Activity') по их химическому составу (столбцы D1-D1776). 

Данные представлены в формате CSV.  Каждая строка представляет молекулу. 

- Первый столбец Activity содержит экспериментальные данные, описывающие фактический биологический ответ [0, 1]; 
- Остальные столбцы D1-D1776 представляют собой молекулярные дескрипторы — это вычисляемые свойства, которые могут фиксировать некоторые характеристики молекулы, например размер, форму или состав элементов.

Предварительная обработка не требуется, данные уже закодированы и нормализованы.

В качестве метрики будем использовать F1-score.

Необходимо обучить две модели: логистическую регрессию и случайный лес. Далее нужно сделать подбор гиперпараметров с помощью базовых и продвинутых методов оптимизации. Важно использовать все четыре метода (GridSearchCV, RandomizedSearchCV, Hyperopt, Optuna) хотя бы по разу, максимальное количество итераций не должно превышать 50.

In [110]:
#импорт библиотек
import numpy as np #для матричных вычислений
import pandas as pd #для анализа и предобработки данных
import matplotlib.pyplot as plt #для визуализации
import seaborn as sns #для визуализации

from sklearn import linear_model #линейные модели
from sklearn import tree #деревья решений
from sklearn import ensemble #ансамбли
from sklearn import metrics #метрики
from sklearn import preprocessing #предобработка
from sklearn.model_selection import train_test_split #сплитование выборки

#### **2. Знакомство с данными и их исследование:**

In [111]:
# Location of the training set (CSV) relative to the notebook:
csv_path = 'data/train_sem09.csv'
# Read the file into a DataFrame:
data = pd.read_csv(csv_path)
# Preview the first rows (last expression, so the notebook renders it):
data.head()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Check for missing values. A notebook cell renders only its last expression,
# so the total count is printed explicitly instead of being silently dropped:
print('Missing values in dataset: {}'.format(data.isnull().sum().sum()))
# Summary statistics for every column (rendered as the cell output):
data.describe()

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
count,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,...,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0,3751.0
mean,0.542255,0.076948,0.592436,0.068142,0.03899,0.212112,0.686653,0.274713,0.455133,0.749517,...,0.026926,0.014663,0.013863,0.021861,0.015196,0.016796,0.012263,0.01173,0.020261,0.011197
std,0.498278,0.079989,0.10586,0.078414,0.115885,0.102592,0.078702,0.090017,0.162731,0.071702,...,0.161889,0.120215,0.116938,0.146249,0.122348,0.128522,0.110074,0.107683,0.140911,0.105236
min,0.0,0.0,0.282128,0.0,0.0,0.00263,0.137873,0.00613,0.0,0.27559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0333,0.517811,0.0,0.0,0.138118,0.625627,0.207374,0.378062,0.707339,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0667,0.585989,0.05,0.0,0.190926,0.674037,0.277845,0.499942,0.738961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.1,0.668395,0.1,0.0,0.261726,0.740663,0.335816,0.569962,0.788177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,0.964381,0.95,1.0,1.0,0.994735,0.790831,0.98987,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [113]:
# Target vector y is the 'Activity' column; feature matrix X is everything else:
y = data['Activity']
X = data.drop(columns='Activity')

# Hold out 20% of the rows for testing. Passing stratify=y keeps the class
# proportions of the target equal in both parts; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Report the size of each part:
for template, part in (('Train shape: {}', X_train), ('Test shape: {}', X_test)):
    print(template.format(part.shape))

Train shape: (3000, 1776)
Test shape: (751, 1776)


In [114]:
# Show the normalized class frequencies of the target for both parts of the
# split, separated by a blank-line print exactly once (between the two reports).
for position, (title, target) in enumerate((('Train :', y_train), ('Test :', y_test))):
    if position:
        print('\n')
    print(title)
    display(target.value_counts(normalize=True))

Train :


Activity
1    0.542333
0    0.457667
Name: proportion, dtype: float64



Test :


Activity
1    0.541944
0    0.458056
Name: proportion, dtype: float64

### Оптимизация гиперпараметров модели:

#### **Логистическая регрессия:**

In [115]:
# Baseline logistic regression: build the estimator (up to 1000 solver
# iterations) and fit it on the training part in one chained call
# (fit returns the estimator itself).
log_reg = linear_model.LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions for the training and the test parts:
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)

# Test metrics computed from the predictions above; accuracy_score on the
# predicted labels is what the estimator's score() reports for a classifier.
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_f1 = metrics.f1_score(y_test, y_test_pred)

print("accuracy на тестовом наборе: {:.2f}".format(test_accuracy))
print('f1_score на тестовом наборе: {:.2f}'.format(test_f1))

accuracy на тестовом наборе: 0.75
f1_score на тестовом наборе: 0.78


#### **Случайный лес:**

In [116]:
# Baseline random forest. Hyperparameters are gathered in one place:
rf_params = {
    'n_estimators': 500,       # number of trees
    'criterion': 'entropy',    # split quality criterion
    'max_depth': 8,            # maximum depth of each tree
    'min_samples_leaf': 10,    # minimum number of samples in a leaf
    'random_state': 42,        # seed for reproducibility
}
rf = ensemble.RandomForestClassifier(**rf_params)

# Fit the forest on the training part:
rf.fit(X_train, y_train)

# Class predictions for the training and the test parts:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Test metrics computed from the predictions above; accuracy_score on the
# predicted labels is what the estimator's score() reports for a classifier.
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_f1 = metrics.f1_score(y_test, y_test_pred)

print("accuracy на тестовом наборе: {:.2f}".format(test_accuracy))
print('f1_score на тестовом наборе: {:.2f}'.format(test_f1))

accuracy на тестовом наборе: 0.78
f1_score на тестовом наборе: 0.80
