# Неделя 3. Понедельник
## Обучение с учителем

### Применение базовых методов классификации

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Важная настройка для корректной настройки pipeline!
import sklearn
sklearn.set_config(transform_output="pandas")

# Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, KFold
from category_encoders import TargetEncoder
# for model learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score

#models
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from catboost import CatBoostRegressor

# Metrics
from sklearn.metrics import accuracy_score


# tunning hyperparamters model
import optuna

#### 0. Ознакомьтесь с датасетом

In [78]:
df = pd.read_csv('/home/UBkarima/phase_1_hw/1_Понедельник_3/heart.csv')
df


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40.0,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49.0,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37.0,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48.0,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54.0,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45.0,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68.0,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57.0,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57.0,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


* __Age__: age of the patient [years]
* __Sex__: sex of the patient [M: Male, F: Female]
* __ChestPainType__: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
* __RestingBP__: resting blood pressure [mm Hg]
* __Cholesterol__: serum cholesterol [mm/dl]
* __FastingBS__: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* __RestingECG__: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite * left ventricular hypertrophy by Estes' criteria]
* __MaxHR__: maximum heart rate achieved [Numeric value between 60 and 202]
* __ExerciseAngina__: exercise-induced angina [Y: Yes, N: No]
* __Oldpeak__: oldpeak = ST [Numeric value measured in depression]
* __ST_Slope__: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* __HeartDisease__: output class [1: heart disease, 0: Normal]

* Таргетом является столбец `HeartDisease`. Необходимо предсказать по имеющимся данным, есть ли проблемы с сердцем

#### 1. Небольшие рекомендации ниже 


* __Baseline pipeline (базовый пайплайн)__ - это простой пайплайн, который используется как отправная точка или точка сравнения при разработке и оценке более сложных моделей или алгоритмов. 

* Для этого сначала используйте самые простые идеи по заполнению пропусков(средними, медианами, модами) и кодированию категориальных данных, которые вам приходят в голову. 

* После того, как вы построите модели провалидируете их. Можно будет приступать к попыткам улучшить свою модель с помощью ваших идей - пробовать создавать новые фичи, кодировать данные по-другому, заполнять иначе NaN и тд

#### 2. Заполните пропущенные значения(`Imputing`), как считаете нужным.  

- Не забывайте памятку выше, сначала заполняйте самыми тривиальными идеями. Наприсер, средними, медианами и т.д

In [79]:
pd.DataFrame(data={'Nan_count': df.isna().sum(), 'data_type':df.dtypes})

Unnamed: 0,Nan_count,data_type
Age,10,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [80]:
num_features = df.select_dtypes(exclude='object')
cat_features = df.select_dtypes(include='object')

In [81]:
X, y = df.drop('HeartDisease', axis=1), df['HeartDisease']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


##### 2.1 Оберните в `ColumnTransformer` свой `Imputing` данных. Проверьте корректность его работы. Для этого необходимо сделать:

1. Обучить и трансформировать свой `Imputer` с помощью `your_imputer.fit_transform` - на тренировочных данных
2. Заполнить с помощью `your_imputer.transform` - на тестовых данных

Убедитесь, что данные прошли через этап `Imputing'а` и пропусков в них больше нет

In [82]:
my_imputer = ColumnTransformer(
    transformers = [
        ("num_imputer", SimpleImputer(strategy="mean"), ["Age"])
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough'
)

In [83]:


#drop_features=['Cholesterol']

#my_imputer = ColumnTransformer(
#    transformers = [
#        ('drop_features','drop', drop_features),
 #       ('num_imputer', SimpleImputer(strategy='mean'), ['Age'])# SimpleImputer Позволяет заполнить каким-либо простым показателем (средним, модой, медианой)
  #  ],
   # verbose_feature_names_out = False,
    #remainder = 'passthrough' 
#)    


In [84]:
filled_data = my_imputer.fit_transform(X_train)

In [85]:
pd.DataFrame(data={'Nan_count': filled_data.isna().sum(), 'data_type':filled_data.dtypes})

Unnamed: 0,Nan_count,data_type
Age,0,float64
Sex,0,object
ChestPainType,0,object
RestingBP,0,int64
Cholesterol,0,int64
FastingBS,0,int64
RestingECG,0,object
MaxHR,0,int64
ExerciseAngina,0,object
Oldpeak,0,float64


In [86]:
my_imputer.transform(X_valid).isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
dtype: int64

#### 3. Закодируйте категориальные переменные, как считаете нужным

* `OneHotEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  
* `TargetEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html)  
* `OrdinalEncoding` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)  
* `CatBoostEncoding` (https://www.geeksforgeeks.org/categorical-encoding-with-catboost-encoder/)  

In [87]:
ordinal_encoding_columns = ['Sex','ExerciseAngina'] # Столбец, который планируем кодировать порядково, с помощью OrdinalEncoder 
one_hot_encoding_columns = ['ST_Slope', 'RestingECG','ChestPainType'] # Столбец, который планируем кодировать с помощью OneHotEncoder b


##### 3.1 Оберните в `ColumnTransformer` свой `Encoding` данных. Проверьте корректность его работы. 

In [88]:
encoder = ColumnTransformer(
    [
        ('ordinal_encoding', OrdinalEncoder(), ordinal_encoding_columns),
        ('one_hot_encoding_columns', OneHotEncoder(sparse_output=False),one_hot_encoding_columns),
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

#### 4. То же самое проделать с нормализацией данных

* `StandardScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* `MinMaxScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)
* `RobustScaler` (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)

In [89]:
standard_scaler_columns = ['Age', 'RestingBP','MaxHR','Oldpeak'] # Числовые столбцы, которые необходимо пронормировать

#### 4.1 Оберните в `ColumnTransformer` свой `Scaling` данных, проверьте корректность работы.

In [90]:
scaler = ColumnTransformer(
    [
        ('scaling_num_columns', StandardScaler(), standard_scaler_columns)
    ],
    verbose_feature_names_out = False,
    remainder = 'passthrough' 
)

#### 5. Соберите весь препроцессинг в общий Pipeline.

In [91]:
preprocessor = Pipeline(
    [
        ('imputer', my_imputer),
        ('encoder', encoder),
        ('scaler', scaler)
    ]
)

##### 5.1 Прогоните свои данные через `preprocessor` и убедитесь, что ваши данные проходят через него корректно и уже готовы к ML-модели

In [92]:
preprocessor.fit_transform(X_train)

Unnamed: 0,Age,RestingBP,MaxHR,Oldpeak,Sex,ExerciseAngina,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Cholesterol,FastingBS
485,0.971201,0.339016,-0.324520,0.317046,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,217,1
486,0.118353,-1.266031,1.689837,-0.440356,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,214,1
117,0.544777,-0.159102,-0.247045,0.601071,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,338,1
361,-0.734495,1.501291,-0.479470,-0.819056,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0
296,-0.414677,0.671094,0.101594,-0.156330,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,-0.308071,0.117630,0.527708,1.074447,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,160,0
201,-0.841101,-0.712567,0.527708,-0.819056,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,230,0
462,0.544777,-0.601874,-0.750634,0.411721,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,233,0
252,0.757989,-0.435834,-0.828109,-0.819056,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,292,0


In [93]:
preprocessor.transform(X_valid)

Unnamed: 0,Age,RestingBP,MaxHR,Oldpeak,Sex,ExerciseAngina,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,Cholesterol,FastingBS
356,-0.841101,-0.989299,-0.905584,0.601071,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0
763,0.438171,-0.048409,1.418673,2.210549,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,224,0
817,0.651383,-0.435834,0.179069,1.831848,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,258,0
735,-0.521283,-0.712567,0.101594,1.074447,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,188,0
892,-1.587343,0.283669,0.605183,-0.819056,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,220,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,-0.947707,0.394362,-0.556946,-0.819056,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,224,1
752,0.224959,-0.435834,0.295282,0.317046,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,249,1
492,-0.521283,-0.159102,0.334020,2.021198,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0
622,0.544777,-1.266031,0.217807,0.317046,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,239,0


#### 6.ML-модели

* `LogisticRegression` (из `sklearn.linear_model`)  
* `LogisticRegression with regularization` (из `sklearn.linear_model`)  
* `KNeighborsClassifier` (из `sklearn.neighbors`)  
* `DecisionTree` (из `sklearn.tree`)  

##### 6.1 Обучите свой `Pipeline` с помощью метода `.fit()` с разными моделями.

In [94]:
ml_pipeline = Pipeline(
     [
         ('preprocessor', preprocessor),
         ('model', LogisticRegression())
     ]
)

In [95]:
ml_pipeline.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



#### 7. С помощью метода `.predict()` (на вход поступают только матрица признаков, без целевой переменной) предсказать значения на обучающей выборке (`X_train`) и валидационной выборке (`X_valid`).

In [96]:
print('train accuracy:', accuracy_score(y_train, ml_pipeline.predict(X_train))) # Доля правильных ответов на выборке, которая была использована для обучения
print('valid accuracy:', accuracy_score(y_valid, ml_pipeline.predict(X_valid))) # Доля правильных ответов на выборке, которую обученный алгоритм еще не видел

train accuracy: 0.8583106267029973
valid accuracy: 0.8858695652173914


In [97]:
coeffs = np.round(ml_pipeline['model'].coef_[0], 3)
features = list(preprocessor.transform(X_train).columns)

In [98]:
pd.DataFrame(data={'weight': coeffs}, index=features).sort_values(by='weight', key=lambda x: abs(x), ascending=False)

Unnamed: 0,weight
ST_Slope_Up,-1.363
ChestPainType_ASY,1.163
Sex,1.114
ST_Slope_Flat,1.052
FastingBS,0.96
ExerciseAngina,0.86
ChestPainType_NAP,-0.675
ChestPainType_ATA,-0.465
Oldpeak,0.283
ChestPainType_TA,-0.239


In [99]:
equation_terms = [f"({coef} * {col})" for coef, col in zip(coeffs, features)]
equation_string = " + ".join(equation_terms)

print(f'y = {equation_string}')

y = (0.03 * Age) + (0.048 * RestingBP) + (-0.191 * MaxHR) + (0.283 * Oldpeak) + (1.114 * Sex) + (0.86 * ExerciseAngina) + (0.095 * ST_Slope_Down) + (1.052 * ST_Slope_Flat) + (-1.363 * ST_Slope_Up) + (0.178 * RestingECG_LVH) + (-0.2 * RestingECG_Normal) + (-0.194 * RestingECG_ST) + (1.163 * ChestPainType_ASY) + (-0.465 * ChestPainType_ATA) + (-0.675 * ChestPainType_NAP) + (-0.239 * ChestPainType_TA) + (-0.005 * Cholesterol) + (0.96 * FastingBS)


##### 7.1 С помощью функции оценки качества (`accuracy_score`) собрать следующую таблицу ниже

In [100]:
logreg = LogisticRegression(penalty=None, random_state=42)
logreg.fit(X_train_scaled, y_train)

# Логистическая регрессия с L1-регуляризацией
logreg_l1 = LogisticRegression(penalty="l1", solver="liblinear", random_state=42)
logreg_l1.fit(X_train_scaled, y_train)

# Логистическая регрессия с L2-регуляризацией
logreg_l2 = LogisticRegression(penalty="l2", random_state=42)
logreg_l2.fit(X_train_scaled, y_train)

# KNN (k-ближайших соседей)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

# Дерево решений
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

# Вычисляем точность (accuracy)
results = pd.DataFrame({
    "Model": ["LogReg", "LogReg with L1", "LogReg with L2", "KNN", "Tree"],
    "Train Accuracy": [
        accuracy_score(y_train, logreg.predict(scaler))),
        accuracy_score(y_train, logreg_l1.predict(scaler)),
        accuracy_score(y_train, logreg_l2.predict(scaler)),
        accuracy_score(y_train, knn.predict(scaler)),
        accuracy_score(y_train, tree.predict(X_train))
    ],
    "Validation Accuracy": [
        accuracy_score(y_valid, logreg.predict(scaled)),
        accuracy_score(y_valid, logreg_l1.predict(X_valid_scaled)),
        accuracy_score(y_valid, logreg_l2.predict(X_valid_scaled)),
        accuracy_score(y_valid, knn.predict(X_valid_scaled)),
        accuracy_score(y_valid, tree.predict(X_valid))
    ]
})

# Устанавливаем имя индексов
results.set_index("Model", inplace=True)

# Выводим результат
print(results)

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' on line 23 (1144670035.py, line 24)

* значение функции на обучающих данных
* значение функции на валидационных данных 
    
Результатом выполнения этого пункта будет `DataFrame` формата: 
    
|  |train|valid|
|--|-----|-----|
|**LogReg**|  train_score  | valid_score    |
|**LogReg with l1**|  train_score  | valid_score    |
|**LogReg with l2**|  train_score  | valid_score    |
|**KNN**| train_score  |  valid_score   |
|**Tree**| train_score | valid_score    |

#### 8. Теперь реализуйте __кросс-валидацию__ с KFold=5 и выведите средний __score__

In [60]:
cv = KFold(n_splits=5, random_state=66, shuffle=True)

cross_validation_result = cross_val_score(
    ml_pipeline,
    X, # Подаем датасет целиком!!! Разделение на train и valid происходит внутри
    y,
    cv = cv
)

# cross_validation_result
cross_validation_result.mean()

0.8595034449988119

In [61]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold


models = {
    'LogReg': LogisticRegression(),
    'LogReg with l1': LogisticRegression(penalty='l1', solver='liblinear'),
    'LogReg with l2': LogisticRegression(penalty='l2', solver='liblinear'),
    'KNN': KNeighborsClassifier(n_neighbors=15, p=1, weights='uniform'),
    'SVC': SVC(),
    'Tree': DecisionTreeClassifier(max_depth=3, criterion='entropy', min_samples_split=3)
}


cv = KFold(n_splits=5, random_state=666, shuffle=True)

results = []

for model_name, model in models.items():
    ml_pipeline = Pipeline([
        ('preprocessor', preprocessor), 
        ('model', model) 
    ])


    cross_validation_result = cross_val_score(
        ml_pipeline,
        X, 
        y,
        cv=cv,
        scoring='accuracy'
    )
    

    results.append({
        'Model': model_name,
        'Mean Score (cross-val)': cross_validation_result.mean()
    })

accuracy_df = pd.DataFrame(results)
print(accuracy_df)

            Model  Mean Score (cross-val)
0          LogReg                0.857294
1  LogReg with l1                0.859474
2  LogReg with l2                0.856207
3             KNN                0.857306
4             SVC                0.861654
5            Tree                0.843152


|  |cross_val_score|
|--|-----|
|**LogReg**|  your_score |
|**LogReg with l1**|  your_score  |
|**LogReg with l2**|  your_score  |
|**KNN**| your_score  |
|**SVC**| your_score  |
|**Tree**| your_score |

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Пора сохранить изменения для __github__. 

1. Перейди в командной строке в папку, в которой расположен этот нотбук. 
2. Выполни команду `git add 06-01-task.ipynb`
3. Выполни команду `git commit -m "base models in progress"`
4. Выполни команду `git push`

##### 9. Теперь, когда вы проделали весь pipeline и обучили базовую модель, можно вернуться к началу и пробовать новые идеи и искать точки роста для ваших моделей, в том числе и добавление новых фичей

<img src="https://icons.iconarchive.com/icons/icons8/windows-8/256/Programming-Github-icon.png" width=32 /> Сохрани файл для __github__ и выполни команду `!git status` в ячейке ниже.


In [62]:
!git status

Текущая ветка: main
Эта ветка соответствует «origin/main».

Изменения, которые не в индексе для коммита:
  (используйте «git add <файл>...», чтобы добавить файл в индекс)
  (используйте «git restore <файл>...», чтобы отменить изменения в рабочем каталоге)
	[31mизменено:      06-01-task.ipynb[m

Неотслеживаемые файлы:
  (используйте «git add <файл>...», чтобы добавить в то, что будет включено в коммит)
	[31m"../1_\320\222\321\202\320\276\321\200\320\275\320\270\320\272_3/"[m

индекс пуст (используйте «git add» и/или «git commit -a»)
