In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.base import BaseEstimator
from sklearn.dummy import DummyClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator=Ridge()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

class Scaler(BaseEstimator):

    def __init__(self, normalizer=StandardScaler()):
        self.normalizer = normalizer

    def fit_transform(self, X, y=None, **kwargs):
        return self.normalizer.fit_transform(X)

    def transform(self, X, **kwargs):
        return self.normalizer.transform(X)

    def fit(self, X, y=None, **kwargs):
        self.normalizer.fit(X, y)
        return self


In [3]:
RANDOM_STATE = 42

results_regression = pd.DataFrame(columns = ['model', 'task', 'R2'])
results_classification = pd.DataFrame(columns = ['model', 'task', 'f1', 'accuracy'])

https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

In [4]:
data = pd.read_csv('boston.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%, предварительно выделив целевую переменную (колонка 'MEDV').

In [5]:
### Ваш код ###
X_train, X_test, y_train, y_test = \
train_test_split(data.loc[:, data.columns != 'MEDV'], \
                 data.loc[:, data.columns == 'MEDV'], \
                 train_size=0.8,
                 shuffle=RANDOM_STATE)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404, 1), (102, 1))

2. Обучите стандартную регрессию, а также Ridge и  Lasso с параметрами по умолчанию и выведите их R2 на тестовой выборке

In [6]:
lr = LinearRegression().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)

r2_lr = r2_score(y_test, lr.predict(X_test))
r2_ridge = r2_score(y_test, ridge.predict(X_test))
r2_lasso = r2_score(y_test, lasso.predict(X_test))
results_regression.loc[0] = ['LR', 'task2', r2_lr]
results_regression.loc[1] = ['Ridge', 'task2', r2_ridge]
results_regression.loc[2] = ['Lasso', 'task2', r2_lasso]

3. Для Ridge и Lasso подберите коэффициент регуляризации двумя способами 1) GridSearchCV, 2) RidgeCV и LassoCV, в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по всем моделям и сравните с предыдущими результатами.

In [7]:
### Ваш код ###
parameters_grid = {
    'alpha': list(map(lambda x: 10**x, range(-5, 6))),
}
grid_ridge = GridSearchCV(Ridge(), parameters_grid).fit(X_train, y_train)
grid_lasso = GridSearchCV(Lasso(), parameters_grid).fit(X_train, y_train)
ridge_cv = RidgeCV(alphas=list(map(lambda x: 10**x, range(-5, 6)))).fit(X_train, y_train)
lasso_cv = LassoCV(alphas=list(map(lambda x: 10**x, range(-5, 6)))).fit(X_train, y_train.values.ravel())

r2_ridge_grid_search = r2_score(y_test, grid_ridge.predict(X_test))
r2_ridge_cv = r2_score(y_test, ridge_cv.predict(X_test))
r2_lasso_grid_search = r2_score(y_test, grid_lasso.predict(X_test))
r2_lasso_cv = r2_score(y_test, lasso_cv.predict(X_test))
results_regression.loc[3] = ['Ridge_GridSearchCV', 'task3', r2_ridge_grid_search]
results_regression.loc[4] = ['RidgeCV', 'task3', r2_ridge_cv]
results_regression.loc[5] = ['Lasso_GridSearchCV', 'task3', r2_lasso_grid_search]
results_regression.loc[6] = ['LassoCV', 'task3', r2_lasso_cv]

4. Проведите масштабирование выборки (используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [8]:
pipe_ridge_st = Pipeline(steps=[('scaler', StandardScaler()),
                                ('ridge', Ridge())]).fit(X_train, y_train)
pipe_ridge_mm = Pipeline(steps=[('scaler', MinMaxScaler()),
                                ('ridge', Ridge())]).fit(X_train, y_train)
pipe_lasso_st = Pipeline(steps=[('scaler', StandardScaler()),
                                ('lasso', Lasso())]).fit(X_train, y_train)
pipe_lasso_mm = Pipeline(steps=[('scaler', MinMaxScaler()),
                                ('lasso', Lasso())]).fit(X_train, y_train)

r2_ridge_standart_scaler = r2_score(y_test, pipe_ridge_st.predict(X_test))
r2_ridge_min_max_scaler = r2_score(y_test, pipe_ridge_mm.predict(X_test))
r2_lasso_standart_scaler = r2_score(y_test, pipe_lasso_st.predict(X_test))
r2_lasso_min_max_scaler = r2_score(y_test, pipe_lasso_mm.predict(X_test))
results_regression.loc[7] = ['Ridge_StandardScaler', 'task4', r2_ridge_standart_scaler]
results_regression.loc[8] = ['Ridge_MinMaxScaler', 'task4', r2_ridge_min_max_scaler]
results_regression.loc[9] = ['Lasso_StandardScaler', 'task4', r2_lasso_standart_scaler]
results_regression.loc[10] = ['Lasso_MinMaxScaler', 'task4', r2_lasso_min_max_scaler]

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами.

In [9]:
### Ваш код ###
ridge = GridSearchCV(Ridge(), parameters_grid)
lasso = GridSearchCV(Lasso(), parameters_grid)

pipe_ridge_st_cv = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('ridge', ridge)]).fit(X_train, y_train)
pipe_ridge_mm_cv = Pipeline(steps=[('scaler', MinMaxScaler()),
                                   ('ridge', ridge)]).fit(X_train, y_train)
pipe_lasso_st_cv = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('lasso', lasso)]).fit(X_train, y_train.values.ravel())
pipe_lasso_mm_cv = Pipeline(steps=[('scaler', MinMaxScaler()),
                                   ('lasso', lasso)]).fit(X_train, y_train.values.ravel())

r2_ridge_standart_scaler_cv = r2_score(y_test, pipe_ridge_st_cv.predict(X_test))
r2_ridge_min_max_scaler_cv = r2_score(y_test, pipe_ridge_mm_cv.predict(X_test))
r2_lasso_standart_scaler_cv = r2_score(y_test, pipe_lasso_st_cv.predict(X_test))
r2_lasso_min_max_scaler_cv = r2_score(y_test, pipe_lasso_mm_cv.predict(X_test))
results_regression.loc[11] = ['Ridge_StandardScaler_CV', 'task5', r2_ridge_standart_scaler_cv]
results_regression.loc[12] = ['Ridge_MinMaxScaler_CV', 'task5', r2_ridge_min_max_scaler_cv]
results_regression.loc[13] = ['Lasso_StandardScaler_CV', 'task5', r2_lasso_standart_scaler_cv]
results_regression.loc[14] = ['Lasso_MinMaxScaler_CV', 'task5', r2_lasso_min_max_scaler_cv]


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 для Ridge и Lasso с параметрами по умолчанию и сравните с предыдущими результатами.

In [10]:
### Ваш код ###
poly = PolynomialFeatures(degree=2)
xp_train = poly.fit_transform(X_train)
xp_test = poly.fit_transform(X_test)

pipe_ridge_st_poly = Pipeline(steps=[('scaler', StandardScaler()),
                                ('ridge', Ridge())]).fit(xp_train, y_train)
pipe_ridge_mm_poly = Pipeline(steps=[('scaler', MinMaxScaler()),
                                ('ridge', Ridge())]).fit(xp_train, y_train)
pipe_lasso_st_poly = Pipeline(steps=[('scaler', StandardScaler()),
                                ('lasso', Lasso())]).fit(xp_train, y_train)
pipe_lasso_mm_poly = Pipeline(steps=[('scaler', MinMaxScaler()),
                                ('lasso', Lasso())]).fit(xp_train, y_train)

r2_ridge_standart_scaler_poly = r2_score(y_test, pipe_ridge_st_poly.predict(xp_test))
r2_ridge_min_max_scaler_poly = r2_score(y_test, pipe_ridge_mm_poly.predict(xp_test))
r2_lasso_standart_scaler_poly = r2_score(y_test, pipe_lasso_st_poly.predict(xp_test))
r2_lasso_min_max_scaler_poly = r2_score(y_test, pipe_lasso_mm_poly.predict(xp_test))
results_regression.loc[15] = ['Ridge_StandardScaler_Poly', 'task6', r2_ridge_standart_scaler_poly]
results_regression.loc[16] = ['Ridge_MinMaxScaler_Poly', 'task6', r2_ridge_min_max_scaler_poly]
results_regression.loc[17] = ['Lasso_StandardScaler_Poly', 'task6', r2_lasso_standart_scaler_poly]
results_regression.loc[18] = ['Lasso_MinMaxScaler_Poly', 'task6', r2_lasso_min_max_scaler_poly]


7. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, добавив PolynomialFeatures, посчитайте R2 и сравните с предыдущими результатами.

In [11]:
### Ваш код ###
poly = PolynomialFeatures(degree=2)
standart_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
xp_st_train = poly.fit_transform(standart_scaler.fit_transform(X_train))
xp_mm_train = poly.fit_transform(minmax_scaler.fit_transform(X_train))
xp_st_test = poly.fit_transform(standart_scaler.fit_transform(X_test))
xp_mm_test = poly.fit_transform(minmax_scaler.fit_transform(X_test))
ridge = GridSearchCV(Ridge(), parameters_grid)
lasso = GridSearchCV(Lasso(), parameters_grid)

pipe_ridge_st_poly_cv = Pipeline(steps=[
                                ('ridge', ridge)]).fit(xp_st_train, y_train)
pipe_ridge_mm_poly_cv = Pipeline(steps=[
                                ('ridge', ridge)]).fit(xp_mm_train, y_train)
pipe_lasso_st_poly_cv = Pipeline(steps=[
                                ('lasso', lasso)]).fit(xp_st_train, y_train)
pipe_lasso_mm_poly_cv = Pipeline(steps=[
                                ('lasso', lasso)]).fit(xp_mm_train, y_train)

r2_ridge_standart_scaler_poly_cv = r2_score(y_test, pipe_ridge_st_poly_cv.predict(xp_st_test))
r2_ridge_min_max_scaler_poly_cv = r2_score(y_test, pipe_ridge_mm_poly_cv.predict(xp_mm_test))
r2_lasso_standart_scaler_poly_cv = r2_score(y_test, pipe_lasso_st_poly_cv.predict(xp_st_test))
r2_lasso_min_max_scaler_poly_cv = r2_score(y_test, pipe_lasso_mm_poly_cv.predict(xp_mm_test))
results_regression.loc[19] = ['Ridge_StandardScaler_Poly_CV', 'task7', r2_ridge_standart_scaler_poly_cv]
results_regression.loc[20] = ['Ridge_MinMaxScaler_Poly_CV', 'task7', r2_ridge_min_max_scaler_poly_cv]
results_regression.loc[21] = ['Lasso_StandardScaler_Poly_CV', 'task7', r2_lasso_standart_scaler_poly_cv]
results_regression.loc[22] = ['Lasso_MinMaxScaler_Poly_CV', 'task7', r2_lasso_min_max_scaler_poly_cv]


8. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2.

In [12]:
### Ваш код ###
pipeline = Pipeline([
    ('transformer', Scaler()),
    ('poly', PolynomialFeatures()),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        'clf__estimator': [Ridge()],
        'transformer__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'clf__estimator__alpha': list(map(lambda x: 10**x, range(-5, 6))),
        'poly__degree': list(range(1, 6)),
    },
    {
        'clf__estimator': [Lasso()],
        'transformer__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'clf__estimator__alpha': list(map(lambda x: 10**x, range(-5, 6))),
        'poly__degree': list(range(1, 6)),
    },
]

gs = GridSearchCV(pipeline, parameters)
gs.fit(X_train, y_train)

best_params = gs.best_params_
print('Параметры лучшей модели:\n', best_params)
r2_best_model = r2_score(y_test, gs.predict(X_test))
results_regression.loc[23] = ['Best_Model', 'task8', r2_best_model]

Параметры лучшей модели:
 {'clf__estimator': Ridge(alpha=0.1), 'clf__estimator__alpha': 0.1, 'poly__degree': 4, 'transformer__normalizer': MinMaxScaler()}


In [13]:
results_regression

Unnamed: 0,model,task,R2
0,LR,task2,0.727225
1,Ridge,task2,0.718555
2,Lasso,task2,0.66822
3,Ridge_GridSearchCV,task3,0.727225
4,RidgeCV,task3,0.727096
5,Lasso_GridSearchCV,task3,0.727221
6,LassoCV,task3,0.727221
7,Ridge_StandardScaler,task4,0.727207
8,Ridge_MinMaxScaler,task4,0.720468
9,Lasso_StandardScaler,task4,0.660821


http://archive.ics.uci.edu/ml/datasets/Adult

In [14]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


9. Разделите выборку на признаки и целевую переменную(колонка class). Замените целевую переменную на числовые значения ('<=50K' - 1, '>50K' - 0).

In [15]:
### Ваш код ###
data['class'] = np.where(data['class']=='<=50K', 1, 0)
data.head()
X = data.loc[:, data.columns != 'class']
y = data.loc[:, data.columns == 'class']
X_train, X_test, y_train, y_test = \
train_test_split(X, y, train_size=0.8, shuffle=RANDOM_STATE)
print(X.shape, y.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(48842, 14) (48842, 1) (39073, 14) (9769, 14) (39073, 1) (9769, 1)


10. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [16]:
### Ваш код ###
dc = DummyClassifier(strategy='most_frequent').fit(X, y)
y_pred = dc.predict(X)

f1_most_frequent = f1_score(y, y_pred)
acc_most_frequent = accuracy_score(y, y_pred)
results_classification.loc[0] = ['Most Frequent class', 'task10', f1_most_frequent, acc_most_frequent]

11. Выясните, присутствуют ли в данных пропуски. Если присутствуют, заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [17]:
### Ваш код ###
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       48842 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      48842 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48842 non-null  object
 14  class           48842 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 5.6+ MB


12. Выберите колонки с числовыми и категориальными переменными (используя возможности pandas).

In [18]:
### Ваш код ###
list(X.select_dtypes(['int']).columns), list(X.select_dtypes(['object']).columns)

(['age',
  'fnlwgt',
  'education-num',
  'capital-gain',
  'capital-loss',
  'hours-per-week'],
 ['workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country'])

13. Создайте пайплайн по обработке числовых и категориальных значений колонок (используйте OneHotEncoder,MinMaxScaler) и посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [19]:
### Ваш код ###
numeric_features = list(X.select_dtypes(['int']).columns)
numeric_transformer = Pipeline(
    steps=[("scaler", MinMaxScaler())]
)

categorical_features = list(X.select_dtypes(['object']).columns)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf_lr = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
clf_knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier())]
)
clf_svm = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

clf_lr.fit(X_train, y_train)
clf_knn.fit(X_train, y_train)
clf_svm.fit(X_train, y_train)

f1_LR = f1_score(y_test, clf_lr.predict(X_test))
acc_LR = accuracy_score(y_test, clf_lr.predict(X_test))
f1_KNN = f1_score(y_test, clf_knn.predict(X_test))
acc_KNN = accuracy_score(y_test, clf_knn.predict(X_test))
f1_SVM = f1_score(y_test, clf_svm.predict(X_test))
acc_SVM = accuracy_score(y_test, clf_svm.predict(X_test))
results_classification.loc[1] = ['LogisticRegression', 'task13', f1_LR, acc_LR]
results_classification.loc[2] = ['KNeighborsClassifier', 'task13', f1_KNN, acc_KNN]
results_classification.loc[3] = ['LinearSVC', 'task13', f1_SVM, acc_SVM]

14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями, (испольуйте SimpleImputer). Посчитайте cross_val_score по алгоритмам LogisticRegression, KNeighborsClassifier, LinearSVC по метрикам accuracy и f1_score.

In [20]:
### Ваш код ###
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf_lr = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
clf_knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier())]
)
clf_svm = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

f1_LR = cross_val_score(clf_lr, X, y, scoring='f1').mean()
acc_LR = cross_val_score(clf_lr, X, y, scoring='accuracy').mean()
f1_KNN = cross_val_score(clf_knn, X, y, scoring='f1').mean()
acc_KNN = cross_val_score(clf_knn, X, y, scoring='accuracy').mean()
f1_SVM = cross_val_score(clf_svm, X, y, scoring='f1').mean()
acc_SVM = cross_val_score(clf_svm, X, y, scoring='accuracy').mean()
results_classification.loc[4] = ['LogisticRegression_impute', 'task14', f1_LR, acc_LR]
results_classification.loc[5] = ['KNeighborsClassifier_impute', 'task14', f1_KNN, acc_KNN]
results_classification.loc[6] = ['LinearSVC_impute', 'task14', f1_SVM, acc_SVM]

15. Посчитайте cross_val_score по тем же алгоритмам и метрикам, если просто удалить значения '?'.

In [21]:
### Ваш код ###
X_del = X.drop(X[(X == '?').any(1)].index)
y_del = y.drop(X[(X == '?').any(1)].index)

categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

clf_lr = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
clf_knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier())]
)
clf_svm = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LinearSVC())]
)

f1_LR_del_missings = cross_val_score(clf_lr, X_del, y_del, scoring='f1').mean()
acc_LR_del_missings = cross_val_score(clf_lr, X_del, y_del, scoring='accuracy').mean()
f1_KNN_del_missings = cross_val_score(clf_knn, X_del, y_del, scoring='f1').mean()
acc_KNN_del_missings = cross_val_score(clf_knn, X_del, y_del, scoring='accuracy').mean()
f1_SVM_del_missings = cross_val_score(clf_svm, X_del, y_del, scoring='f1').mean()
acc_SVM_del_missings = cross_val_score(clf_svm, X_del, y_del, scoring='accuracy').mean()
results_classification.loc[7] = ['LogisticRegression_delete_missings', 'task15', f1_LR_del_missings, acc_LR_del_missings]
results_classification.loc[8] = ['KNeighborsClassifier_delete_missings', 'task15', f1_KNN_del_missings, acc_KNN_del_missings]
results_classification.loc[9] = ['LinearSVC_delete_missings', 'task15', f1_SVM_del_missings, acc_SVM_del_missings]

 16. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier на данных с замененными значениями '?' на самые частые значения.

In [22]:
### Ваш код ###

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

rf_lr = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)
gb_knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier())]
)


f1_RF = cross_val_score(rf_lr, X, y, scoring='f1').mean()
acc_RF = cross_val_score(rf_lr, X, y, scoring='accuracy').mean()
f1_GB = cross_val_score(gb_knn, X, y, scoring='f1').mean()
acc_GB = cross_val_score(gb_knn, X, y, scoring='accuracy').mean()
results_classification.loc[10] = ['RandomForestClassifier', 'task16', f1_RF, acc_RF]
results_classification.loc[11] = ['GradientBoostingClassifier', 'task16', f1_GB, acc_GB]

17. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [28]:
### Ваш код ###
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)
numeric_transformer = Pipeline(
    steps=[('scaler', Scaler()),
           ('poly', PolynomialFeatures())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

pipeline = Pipeline([
    ('transformer', preprocessor),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        'clf__estimator': [LogisticRegression()],
        'transformer__num__scaler__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'transformer__num__poly__degree': list(range(1, 5)),
        'transformer__cat__imputer': [SimpleImputer(missing_values='?', strategy='most_frequent'), 'passthrough'],
    },
    {
        'clf__estimator': [RandomForestClassifier()],
        'transformer__num__scaler__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'transformer__num__poly__degree': list(range(1, 5)),
        'transformer__cat__imputer': [SimpleImputer(missing_values='?', strategy='most_frequent'), 'passthrough'],
    },
    {
        'clf__estimator': [GradientBoostingClassifier()],
        'transformer__num__scaler__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'transformer__num__poly__degree': list(range(1, 5)),
        'transformer__cat__imputer': [SimpleImputer(missing_values='?', strategy='most_frequent'), 'passthrough'],
    },
    {
        'clf__estimator': [LinearSVC()],
        'transformer__num__scaler__normalizer': [StandardScaler(), MinMaxScaler(), 'passthrough'],
        'transformer__num__poly__degree': list(range(1, 5)),
        'transformer__cat__imputer': [SimpleImputer(missing_values='?', strategy='most_frequent'), 'passthrough'],
    },
]

gs = GridSearchCV(pipeline, parameters)
gs.fit(X_train, y_train)

best_params = gs.best_params_
print('Параметры лучшей модели:\n', best_params)
f1_best = f1_score(y_test, gs.predict(X_test))
acc_best = accuracy_score(y_test, gs.predict(X_test))
results_classification.loc[12] = ['Best_Model', 'task17', f1_best, acc_best]

Параметры лучшей модели:
 {'clf__estimator': GradientBoostingClassifier(), 'transformer__cat__imputer': 'passthrough', 'transformer__num__poly__degree': 2, 'transformer__num__scaler__normalizer': StandardScaler()}


In [29]:
results_classification

Unnamed: 0,model,task,f1,accuracy
0,Most Frequent class,task10,0.8641,0.760718
1,LogisticRegression,task13,0.904374,0.849831
2,KNeighborsClassifier,task13,0.887408,0.825366
3,LinearSVC,task13,0.905705,0.851776
4,LogisticRegression_impute,task14,0.904876,0.850866
5,KNeighborsClassifier_impute,task14,0.887343,0.825253
6,LinearSVC_impute,task14,0.905422,0.851255
7,LogisticRegression_delete_missings,task15,0.901149,0.846845
8,KNeighborsClassifier_delete_missings,task15,0.882951,0.820574
9,LinearSVC_delete_missings,task15,0.902403,0.848503
