# Неделя 3. Понедельник
## Обучение с учителем

### Применение базовых методов классификации

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, KFold
from category_encoders import TargetEncoder
# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import accuracy_score


# tunning hyperparamters model
import optuna

#### 0. Ознакомьтесь с датасетом

In [55]:
train = pd.read_csv('/home/UBkarima/1_Понедельник_3/heart.csv')
test = pd.read_csv('/home/UBkarima/1_Понедельник_3/heart.csv')


* __Age__: age of the patient [years]
* __Sex__: sex of the patient [M: Male, F: Female]
* __ChestPainType__: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* __RestingBP__: resting blood pressure [mm Hg]
* __Cholesterol__: serum cholesterol [mm/dl]
* __FastingBS__: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* __RestingECG__: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite * left ventricular hypertrophy by Estes' criteria]
* __MaxHR__: maximum heart rate achieved [Numeric value between 60 and 202]
* __ExerciseAngina__: exercise-induced angina [Y: Yes, N: No]
* __Oldpeak__: oldpeak = ST [Numeric value measured in depression]
* __ST_Slope__: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* __HeartDisease__: output class [1: heart disease, 0: Normal]

* Таргетом является столбец `HeartDisease`. Необходимо предсказать по имеющимся данным, есть ли проблемы с сердцем

#### 1. Небольшие рекомендации ниже 


* __Baseline pipeline (базовый пайплайн)__ - это простой пайплайн, который используется как отправная точка или точка сравнения при разработке и оценке более сложных моделей или алгоритмов. 

* Для этого сначала используйте самые простые идеи по заполнению пропусков(средними, медианами, модами) и кодированию категориальных данных, которые вам приходят в голову. 

* После того, как вы построите модели провалидируете их. Можно будет приступать к попыткам улучшить свою модель с помощью ваших идей - пробовать создавать новые фичи, кодировать данные по-другому, заполнять иначе NaN и тд

#### 2. Заполните пропущенные значения(`Imputing`), как считаете нужным.  

- Не забывайте памятку выше, сначала заполняйте самыми тривиальными идеями. Наприсер, средними, медианами и т.д

In [56]:
pd.DataFrame(data={'Nan_count': train.isna().sum(), 'data_type':train.dtypes})

Unnamed: 0,Nan_count,data_type
Age,10,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [57]:
num_features = train.select_dtypes(exclude='object')
cat_features = train.select_dtypes(include='object')

In [58]:
X, y = train.drop('HeartDisease', axis=1), train['HeartDisease']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


##### 2.1 Оберните в `ColumnTransformer` свой `Imputing` данных. Проверьте корректность его работы. Для этого необходимо сделать:

1. Обучить и трансформировать свой `Imputer` с помощью `your_imputer.fit_transform` - на тренировочных данных
2. Заполнить с помощью `your_imputer.transform` - на тестовых данных

Убедитесь, что данные прошли через этап `Imputing'а` и пропусков в них больше нет

In [59]:
X_train

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
485,63.0,M,ATA,139,217,1,ST,128,Y,1.2,Flat
486,55.0,M,ATA,110,214,1,ST,180,N,0.4,Up
117,59.0,F,ASY,130,338,1,ST,130,Y,1.5,Flat
361,47.0,M,ASY,160,0,0,Normal,124,Y,0.0,Flat
296,50.0,M,ASY,145,0,1,Normal,139,Y,0.7,Flat
...,...,...,...,...,...,...,...,...,...,...,...
276,51.0,M,NAP,135,160,0,Normal,150,N,2.0,Flat
201,46.0,M,NAP,120,230,0,Normal,150,N,0.0,Up
462,59.0,M,ASY,122,233,0,Normal,117,Y,1.3,Down
252,61.0,M,ASY,125,292,0,ST,115,Y,0.0,Up


In [60]:
drop_features=['Cholesterol']

my_imputer = ColumnTransformer(
    transformers = [
        ('drop_features','drop', drop_features),
        ('num_imputer', SimpleImputer(strategy='mean'), ['Age'])# SimpleImputer Позволяет заполнить каким-либо простым показателем (средним, модой, медианой)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)    


In [61]:
filled_data = my_imputer.fit_transform(X_train)

In [62]:
pd.DataFrame(data={'Nan_count': filled_data.isna().sum(), 'data_type':filled_data.dtypes})

Unnamed: 0,Nan_count,data_type
Age,0,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64
ST_Slope,0,object


In [63]:
my_imputer.transform(X_valid).isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

#### 3. Закодируйте категориальные переменные, как считаете нужным

* `OneHotEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  
* `TargetEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html)  
* `OrdinalEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)  
* `CatBoostEncoding` (https://www.geeksforgeeks.org/categorical-encoding-with-catboost-encoder/)  

In [64]:
ordinal_encoding_columns = ['Sex','ExerciseAngina'] # Столбец, который планируем кодировать порядково, с помощью OrdinalEncoder 
one_hot_encoding_columns = ['ST_Slope', 'RestingECG'] # Столбец, который планируем кодировать с помощью OneHotEncoder 
cb_encoding_columns = ['ChestPainType']

##### 3.1 Оберните в `ColumnTransformer` свой `Encoding` данных. Проверьте корректность его работы. 

In [65]:
encoder = ColumnTransformer(
    [
        ('ordinal_encoding', OrdinalEncoder(), ordinal_encoding_columns),
        ('one_hot_encoding_columns', OneHotEncoder(sparse_output=False),one_hot_encoding_columns),
        ('cb_encoding_columns', TargetEncoder(smoothing=0.0), cb_encoding_columns)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

In [66]:
processed_data = encoder.fit_transform(filled_data, y)

ValueError: Lengths must match to compare

In [173]:
processed_data

Unnamed: 0,Sex,ExerciseAngina,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,Age,RestingBP,FastingBS,MaxHR,Oldpeak
485,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,63.0,139,1,128,1.2
486,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,55.0,110,1,180,0.4
117,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,59.0,130,1,130,1.5
361,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,47.0,160,0,124,0.0
296,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,50.0,145,1,139,0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,51.0,135,0,150,2.0
201,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,46.0,120,0,150,0.0
462,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,59.0,122,0,117,1.3
252,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,61.0,125,0,115,0.0


#### 4. То же самое проделать с нормализацией данных

* `StandardScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* `MinMaxScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
* `RobustScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)

In [174]:
standard_scaler_columns = ['Age', 'RestingBP','MaxHR','Oldpeak'] # Числовые столбцы, которые необходимо пронормировать




#### 4.1 Оберните в `ColumnTransformer` свой `Scaling` данных, проверьте корректность работы.

In [175]:
scaler = ColumnTransformer(
    [
        ('scaling_num_columns', StandardScaler(), standard_scaler_columns)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

In [176]:
processed_data = scaler.fit_transform(filled_data, y)

In [None]:
processed_data

Unnamed: 0,Age,RestingBP,MaxHR,Oldpeak,Sex,ChestPainType,FastingBS,RestingECG,ExerciseAngina,ST_Slope
485,0.971201,0.339016,-0.324520,0.317046,M,ATA,1,ST,Y,Flat
486,0.118353,-1.266031,1.689837,-0.440356,M,ATA,1,ST,N,Up
117,0.544777,-0.159102,-0.247045,0.601071,F,ASY,1,ST,Y,Flat
361,-0.734495,1.501291,-0.479470,-0.819056,M,ASY,0,Normal,Y,Flat
296,-0.414677,0.671094,0.101594,-0.156330,M,ASY,1,Normal,Y,Flat
...,...,...,...,...,...,...,...,...,...,...
276,-0.308071,0.117630,0.527708,1.074447,M,NAP,0,Normal,N,Flat
201,-0.841101,-0.712567,0.527708,-0.819056,M,NAP,0,Normal,N,Up
462,0.544777,-0.601874,-0.750634,0.411721,M,ASY,0,Normal,Y,Down
252,0.757989,-0.435834,-0.828109,-0.819056,M,ASY,0,ST,Y,Up


In [None]:
pd.DataFrame(data={'Nan_count': processed_data.isna().sum(), 'data_type':processed_data.dtypes})

Unnamed: 0,Nan_count,data_type
Age,0,float64
RestingBP,0,float64
MaxHR,0,float64
Oldpeak,0,float64
Sex,0,object
ChestPainType,0,object
FastingBS,0,int64
RestingECG,0,object
ExerciseAngina,0,object
ST_Slope,0,object


#### 5. Соберите весь препроцессинг в общий Pipeline.

In [179]:
preprocessor = Pipeline(
    [
        ('imputer', my_imputer),
        ('encoder', encoder),
        ('scaler', scaler)
    ]
)

##### 5.1 Прогоните свои данные через `preprocessor` и убедитесь, что ваши данные проходят через него корректно и уже готовы к ML-модели

In [180]:
preprocessor.fit_transform(X_train)

Unnamed: 0,Age,RestingBP,MaxHR,Oldpeak,Sex,ExerciseAngina,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,FastingBS
485,0.971201,0.339016,-0.324520,0.317046,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
486,0.118353,-1.266031,1.689837,-0.440356,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
117,0.544777,-0.159102,-0.247045,0.601071,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
361,-0.734495,1.501291,-0.479470,-0.819056,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
296,-0.414677,0.671094,0.101594,-0.156330,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,-0.308071,0.117630,0.527708,1.074447,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
201,-0.841101,-0.712567,0.527708,-0.819056,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
462,0.544777,-0.601874,-0.750634,0.411721,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
252,0.757989,-0.435834,-0.828109,-0.819056,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [None]:
preprocessor.transform(X_valid)

Unnamed: 0,Age,RestingBP,MaxHR,Oldpeak,Sex,ExerciseAngina,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,FastingBS
356,-0.841101,-0.989299,-0.905584,0.601071,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
763,0.438171,-0.048409,1.418673,2.210549,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0
817,0.651383,-0.435834,0.179069,1.831848,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
735,-0.521283,-0.712567,0.101594,1.074447,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
892,-1.587343,0.283669,0.605183,-0.819056,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,-0.947707,0.394362,-0.556946,-0.819056,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
752,0.224959,-0.435834,0.295282,0.317046,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
492,-0.521283,-0.159102,0.334020,2.021198,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
622,0.544777,-1.266031,0.217807,0.317046,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


#### 6.ML-модели

* `LogisticRegression` (из `sklearn.linear_model`)  
* `LogisticRegression with regularization` (из `sklearn.linear_model`)  
* `KNeighborsClassifier` (из `sklearn.neighbors`)  
* `DecisionTree` (из `sklearn.tree`)  

##### 6.1 Обучите свой `Pipeline` с помощью метода `.fit()` с разными моделями.

In [182]:
ml_pipeline = Pipeline(
     [
         ('preprocessor', preprocessor),
         ('model', LogisticRegression())
     ]
)

In [None]:
ml_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



#### 7. С помощью метода `.predict()` (на вход поступают только матрица признаков, без целевой переменной) предсказать значения на обучающей выборке (`X_train`) и валидационной выборке (`X_valid`).

In [None]:
print('train accuracy:', accuracy_score(y_train, ml_pipeline.predict(X_train))) # Доля правильных ответов на выборке, которая была использована для обучения
print('valid accuracy:', accuracy_score(y_valid, ml_pipeline.predict(X_valid))) # Доля правильных ответов на выборке, которую обученный алгоритм еще не видел

train accuracy: 0.8569482288828338
valid accuracy: 0.8858695652173914


##### 7.1 С помощью функции оценки качества (`accuracy_score`) собрать следующую таблицу ниже

* значение функции на обучающих данных
* значение функции на валидационных данных 
    
Результатом выполнения этого пункта будет `DataFrame` формата: 
    
|  |train|valid|
|--|-----|-----|
|**LogReg**|  train_score  | valid_score    |
|**LogReg with l1**|  train_score  | valid_score    |
|**LogReg with l2**|  train_score  | valid_score    |
|**KNN**| train_score  |  valid_score   |
|**Tree**| train_score | valid_score    |

#### 8. Теперь реализуйте __кросс-валидацию__ с KFold=5 и выведите средний __score__

In [None]:
cv = KFold(n_splits=5, random_state=66, shuffle=True)

cross_validation_result = cross_val_score(
    ml_pipeline,
    X, # Подаем датасет целиком!!! Разделение на train и valid происходит внутри
    y,
    cv = cv
)

# cross_validation_result
cross_validation_result.mean()

NameError: name 'ml_pipeline' is not defined

|  |cross_val_score|
|--|-----|
|**LogReg**|  your_score |
|**LogReg with l1**|  your_score  |
|**LogReg with l2**|  your_score  |
|**KNN**| your_score  |
|**SVC**| your_score  |
|**Tree**| your_score |

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. 

1. Перейди в командной строке в папку, в которой расположен этот нотбук. 
2. Выполни команду `git add 06-01-task.ipynb`
3. Выполни команду `git commit -m "base models in progress"`
4. Выполни команду `git push`

##### 9. Теперь, когда вы проделали весь pipeline и обучили базовую модель, можно вернуться к началу и пробовать новые идеи и искать точки роста для ваших моделей, в том числе и добавление новых фичей

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Сохрани файл для __github__ и выполни команду `!git status` в ячейке ниже.


In [186]:
# code