In [None]:
!pip install category_encoders

In [13]:
%matplotlib inline


import category_encoders
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import t
from scipy.stats import norm
from sklearn.datasets import make_classification
from sklearn.linear_model import \
    LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier 
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, \
    RobustScaler, QuantileTransformer, PowerTransformer, OneHotEncoder
from warnings import filterwarnings
filterwarnings('ignore')


sns.set()

## Задача: с помощью разных моделей добиться лучшего результата на датасете "Australian Credit Approval".

- Использовать разные методы кодирования категориальных признаков;
- Протестировать разные способы нормализации;
- Отобрать оптимальной набор признаков для каждой модели.

In [6]:
# загрузим данные

df = pd.read_csv(
    '/content/Australian Credit Approval.txt', 
    sep=' ', 
    header=None)
df.columns = ['A' + str(i) for i in range(1, 15)] + ['loan']

test_size = 200

# рандомизируем элементы датасета

shuffle = np.array(np.arange(0, 690))
random.shuffle(shuffle)

df = df.iloc[shuffle, :]

In [7]:
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,loan
407,0,33.25,3.0,1,6,4,2.0,0,0,0,0,2,180,1,0
58,1,42.5,4.915,1,9,4,3.165,1,0,0,1,2,52,1443,1
131,0,20.83,0.5,1,10,2,1.0,0,0,0,0,2,260,1,0
505,1,19.33,9.5,2,11,4,1.0,1,0,0,1,2,60,401,1
46,0,24.5,1.75,1,8,4,0.165,0,0,0,0,2,132,1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 690 entries, 407 to 279
Data columns (total 15 columns):
A1      690 non-null int64
A2      690 non-null float64
A3      690 non-null float64
A4      690 non-null int64
A5      690 non-null int64
A6      690 non-null int64
A7      690 non-null float64
A8      690 non-null int64
A9      690 non-null int64
A10     690 non-null int64
A11     690 non-null int64
A12     690 non-null int64
A13     690 non-null int64
A14     690 non-null int64
loan    690 non-null int64
dtypes: float64(3), int64(12)
memory usage: 86.2 KB


In [None]:
train_df = df.iloc[:-test_size, :]
test_df = df.iloc[-test_size:, :]

train_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,loan
407,0,33.25,3.0,1,6,4,2.0,0,0,0,0,2,180,1,0
58,1,42.5,4.915,1,9,4,3.165,1,0,0,1,2,52,1443,1
131,0,20.83,0.5,1,10,2,1.0,0,0,0,0,2,260,1,0
505,1,19.33,9.5,2,11,4,1.0,1,0,0,1,2,60,401,1
46,0,24.5,1.75,1,8,4,0.165,0,0,0,0,2,132,1,0


In [None]:
features = list(df.columns.drop('loan'))
target_var = 'loan'

In [None]:
# выберем лучший scaler для KNeighborsClassifier

scalers = MinMaxScaler(), StandardScaler(), RobustScaler(), QuantileTransformer(), PowerTransformer()
scalers_name = [
      'Минимакс(MinMaxScaler)',
      'Z-нормализация(StandardScaler)',
      'Устойчивая нормализация(RobustScaler)',
      'Квантильное преобразование(QuantileTransformer)',
      'Степенное преобразование(PowerTransformer)'
      ]
n = 0
  
while n < len(scalers_name):
    for scaler in scalers:
        scaled_train = scaler.fit_transform(train_df[features])
        scaled_test = scaler.transform(test_df[features])

        knn_model = KNeighborsClassifier()
        knn_model.fit(scaled_train, train_df['loan'])
        y_pred = knn_model.predict(scaled_test)
        acc_knn = accuracy_score(test_df['loan'], y_pred)

        print('Scaler:', scalers_name[n])
        print('accuracy_score:', acc_knn)
        print()
        n += 1

Scaler: Минимакс(MinMaxScaler)
accuracy_score: 0.83

Scaler: Z-нормализация(StandardScaler)
accuracy_score: 0.84

Scaler: Устойчивая нормализация(RobustScaler)
accuracy_score: 0.81

Scaler: Квантильное преобразование(QuantileTransformer)
accuracy_score: 0.82

Scaler: Степенное преобразование(PowerTransformer)
accuracy_score: 0.84



In [None]:
# разделим числовые и категориальные признаки

feat_num = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14']
feat_cat = ['A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12']

In [None]:
# к числовым признакам применим z-нормализацию

scaler_std = StandardScaler()
scaled_train = scaler_std.fit_transform(train_df[feat_num])
scaled_test = scaler_std.transform(test_df[feat_num])

scaled_train = pd.DataFrame(scaled_train)
scaled_train.columns = feat_num
scaled_train = pd.concat([scaled_train, train_df[feat_cat].reset_index(), train_df['loan'].reset_index()], axis=1, sort=False)
scaled_train.drop(['index'], axis=1, inplace=True)

scaled_test = pd.DataFrame(scaled_test)
scaled_test.columns = feat_num
scaled_test = pd.concat([scaled_test, test_df[feat_cat].reset_index(), test_df['loan'].reset_index()], axis=1, sort=False)
scaled_test.drop(['index'], axis=1, inplace=True)

In [None]:
# для каждого признака выведем степень корреляции с целевой переменной

corr_matrix = scaled_train.corr()
corr_matrix = corr_matrix['loan']
corr_matrix

A2      0.168715
A3      0.192402
A7      0.338011
A10     0.386084
A13    -0.089429
A14     0.175548
A1     -0.014093
A4      0.214921
A5      0.387014
A6      0.269119
A8      0.735384
A9      0.445031
A11     0.050711
A12     0.094447
loan    1.000000
Name: loan, dtype: float64

In [None]:
# отсортируем признаки по убыванию корреляции

order = corr_matrix.abs().sort_values(ascending = False)
order = corr_matrix[order.index]
order

loan    1.000000
A8      0.735384
A9      0.445031
A5      0.387014
A10     0.386084
A7      0.338011
A6      0.269119
A4      0.214921
A3      0.192402
A14     0.175548
A2      0.168715
A12     0.094447
A13    -0.089429
A11     0.050711
A1     -0.014093
Name: loan, dtype: float64

In [None]:
# сохраним новый порядок признаков в датасете

new_features = order.index.drop('loan')
new_features = list(new_features)
target_var = 'loan'

In [None]:
X_train = scaled_train[new_features]
y_train = scaled_train[[target_var]]
X_test = scaled_test[new_features]
y_test = scaled_test[[target_var]]

In [None]:
# отберем оптимальные признаки и протестируем модели
# KNeighborsClassifier

knn_model = KNeighborsClassifier()

metrics = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
kvalues = list(range(1, 30))
cv = 6
leaf_size = list(range(1, 50))

i = 1
q_feat = np.arange(1, 15) # количество признаков

while i <= len(new_features):
  for q in q_feat:
  
    gs_knn = GridSearchCV(
        knn_model,
        param_grid={'n_neighbors': kvalues, 'metric': metrics, 'leaf_size': leaf_size},
        cv=cv,
        scoring='accuracy'
        )
    gs_knn.fit(X_train.iloc[:, :i], y_train.to_numpy().ravel())

    knn_model = knn_model.set_params(**gs_knn.best_params_)
    knn_model.fit(X_train.iloc[:, :i].to_numpy(), y_train.to_numpy().ravel())

    y_pred = knn_model.predict(X_test.iloc[:, :i].to_numpy())
    acc_knn = accuracy_score(y_test.to_numpy().ravel(), y_pred)

    best_parameters = gs_knn.best_params_

    print('Количество признаков:', q)
    print('best_parameters:', best_parameters)
    print('accuracy:', acc_knn)
    print()
    i += 1

Количество признаков: 1
best_parameters: {'leaf_size': 1, 'metric': 'euclidean', 'n_neighbors': 1}
accuracy: 0.835

Количество признаков: 2
best_parameters: {'leaf_size': 13, 'metric': 'euclidean', 'n_neighbors': 17}
accuracy: 0.835

Количество признаков: 3
best_parameters: {'leaf_size': 26, 'metric': 'euclidean', 'n_neighbors': 10}
accuracy: 0.86

Количество признаков: 4
best_parameters: {'leaf_size': 4, 'metric': 'manhattan', 'n_neighbors': 20}
accuracy: 0.84

Количество признаков: 5
best_parameters: {'leaf_size': 7, 'metric': 'euclidean', 'n_neighbors': 3}
accuracy: 0.845

Количество признаков: 6
best_parameters: {'leaf_size': 1, 'metric': 'manhattan', 'n_neighbors': 16}
accuracy: 0.84

Количество признаков: 7
best_parameters: {'leaf_size': 7, 'metric': 'manhattan', 'n_neighbors': 5}
accuracy: 0.83

Количество признаков: 8
best_parameters: {'leaf_size': 1, 'metric': 'manhattan', 'n_neighbors': 27}
accuracy: 0.845

Количество признаков: 9
best_parameters: {'leaf_size': 1, 'metric': '

In [None]:
# лучший результат модель показала на первых 3-х признаках
# проверим accuracy, указав параметры из best_params_

knn_model = KNeighborsClassifier(leaf_size=26, metric='euclidean', n_neighbors=10)

knn_model.fit(X_train.iloc[:, :3].to_numpy(), y_train.to_numpy().ravel())

y_pred = knn_model.predict(X_test.iloc[:, :3].to_numpy())
acc_knn = accuracy_score(y_test.to_numpy().ravel(), y_pred)

print('accuracy:', acc_knn)

accuracy: 0.86


In [None]:
# все признаки, давшие лучший результат, категориальные
# применим one-hot кодирование к признакам, у которых больше 2-х уникальных значений

X_train.iloc[:, :3].nunique()

A8     2
A9     2
A5    14
dtype: int64

In [None]:
new_features_2 = list(X_train.iloc[:, :3].columns)

In [None]:
X_train_2 = scaled_train[new_features_2]
y_train_2 = scaled_train[[target_var]]
X_test_2 = scaled_test[new_features_2]
y_test_2 = scaled_test[[target_var]]

X_train_2.head()

Unnamed: 0,A8,A9,A5
0,0,0,6
1,1,0,9
2,0,0,10
3,1,0,11
4,0,0,8


In [None]:
df_encoded_train = pd.get_dummies(X_train_2, prefix='dummy', columns=['A5'])
df_encoded_test = pd.get_dummies(X_test_2, prefix='dummy', columns=['A5'])

In [None]:
# обучим модель на новых признаках

knn_model = KNeighborsClassifier(leaf_size=26, metric='euclidean', n_neighbors=10)

knn_model.fit(df_encoded_train.to_numpy(), y_train_2.to_numpy().ravel())

y_pred = knn_model.predict(df_encoded_test.to_numpy())
acc_knn = accuracy_score(y_test_2.to_numpy().ravel(), y_pred)

print('accuracy:', acc_knn)

accuracy: 0.84


In [None]:
# протестируем среднее кодирование

df_encode_train = pd.concat([X_train_2, y_train_2.reset_index()], axis=1, sort=False)
df_encode_train.drop(['index'], axis=1, inplace=True)

df_encode_test = pd.concat([X_test_2, y_test_2.reset_index()], axis=1, sort=False)
df_encode_test.drop(['index'], axis=1, inplace=True)

mean_encode_train = df_encode_train.groupby('A5')['loan'].mean()
mean_encode_test = df_encode_test.groupby('A5')['loan'].mean()

df_encode_train.loc[:, 'A5_mean_enc'] = df_encode_train['A5'].map(mean_encode_train)
df_encode_train.drop(['A5'], axis=1, inplace=True)
df_encode_test.loc[:, 'A5_mean_enc'] = df_encode_test['A5'].map(mean_encode_test)
df_encode_test.drop(['A5'], axis=1, inplace=True)

In [None]:
# обучим модель на новых признаках

knn_model = KNeighborsClassifier(leaf_size=26, metric='euclidean', n_neighbors=10)

knn_model.fit(df_encode_train.drop(['loan'], axis=1).to_numpy(), df_encode_train['loan'].to_numpy().ravel())

y_pred = knn_model.predict(df_encode_test.drop(['loan'], axis=1).to_numpy())
acc_knn = accuracy_score(df_encode_test['loan'].to_numpy().ravel(), y_pred)

print('accuracy:', acc_knn)

accuracy: 0.865


In [None]:
# LogisticRegression

log_model = LogisticRegression()

C = np.arange(0.1, 10, 0.1)
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l1', 'l2', 'elasticnet', 'none']
cv = 10

i = 1
q_feat = np.arange(1, 15)

while i <= len(new_features):
  for q in q_feat:
  
    gs_log = GridSearchCV(
        log_model,
        param_grid={'C': C, 'penalty': penalty, 'solver': solver},
        cv=cv,
        scoring='accuracy',
        )
    
    gs_log.fit(X_train.iloc[:, :i], y_train.to_numpy().ravel())

    log_model = log_model.set_params(**gs_log.best_params_)
    log_model.fit(X_train.iloc[:, :i].to_numpy(), y_train.to_numpy().ravel())

    y_pred_log = log_model.predict(X_test.iloc[:, :i].to_numpy())
    acc_log = accuracy_score(y_test.to_numpy().ravel(), y_pred_log)

    best_parameters = gs_log.best_params_

    print('Количество признаков:', q)
    print('best_parameters:', best_parameters)
    print('accuracy:', acc_log)
    print()
    i += 1

Количество признаков: 1
best_parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy: 0.835

Количество признаков: 2
best_parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
accuracy: 0.835

Количество признаков: 3
best_parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
accuracy: 0.835

Количество признаков: 4
best_parameters: {'C': 0.8, 'penalty': 'l2', 'solver': 'liblinear'}
accuracy: 0.835

Количество признаков: 5
best_parameters: {'C': 0.1, 'penalty': 'none', 'solver': 'newton-cg'}
accuracy: 0.83

Количество признаков: 6
best_parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
accuracy: 0.825

Количество признаков: 7
best_parameters: {'C': 0.30000000000000004, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy: 0.85

Количество признаков: 8
best_parameters: {'C': 0.4, 'penalty': 'l2', 'solver': 'newton-cg'}
accuracy: 0.845

Количество признаков: 9
best_parameters: {'C': 0.1, 'penalty': 'none', 'solver': 'newton-cg'}
accuracy: 0.84

Количество п

In [None]:
# лучший результат модель показала на первых 7-и признаках
# проверим accuracy, указав параметры из best_params_

log_model = LogisticRegression(C=0.30000000000000004, penalty='l2', solver='newton-cg')

log_model.fit(X_train.iloc[:, :7].to_numpy(), y_train.to_numpy().ravel())

y_pred = log_model.predict(X_test.iloc[:, :7].to_numpy())
acc_log = accuracy_score(y_test.to_numpy().ravel(), y_pred)

print('accuracy:', acc_log)

accuracy: 0.85


In [None]:
# применим среднее кодирование к категориальным признакам, у которых больше 2-х уникальных значений

X_train.iloc[:, :7].nunique()

A8       2
A9       2
A5      14
A10     22
A7     114
A6       8
A4       3
dtype: int64

In [None]:
new_features_3 = ['A8', 'A9', 'A5', 'A10', 'A7', 'A6', 'A4']

In [None]:
X_train_3 = scaled_train[new_features_3]
y_train_3 = scaled_train[[target_var]]
X_test_3 = scaled_test[new_features_3]
y_test_3 = scaled_test[[target_var]]

In [None]:
df_encode_train = pd.concat([X_train_3, y_train_3.reset_index()], axis=1, sort=False)
df_encode_train.drop(['index'], axis=1, inplace=True)

df_encode_test = pd.concat([X_test_3, y_test_3.reset_index()], axis=1, sort=False)
df_encode_test.drop(['index'], axis=1, inplace=True)

mean_encode_train_a5 = df_encode_train.groupby('A5')['loan'].mean()
mean_encode_train_a6 = df_encode_train.groupby('A6')['loan'].mean()
mean_encode_test_a5 = df_encode_test.groupby('A5')['loan'].mean()
mean_encode_test_a6 = df_encode_test.groupby('A6')['loan'].mean()

df_encode_train.loc[:, 'A5_mean_enc'] = df_encode_train['A5'].map(mean_encode_train_a5)
df_encode_train.drop(['A5'], axis=1, inplace=True)
df_encode_train.loc[:, 'A6_mean_enc'] = df_encode_train['A6'].map(mean_encode_train_a6)
df_encode_train.drop(['A6'], axis=1, inplace=True)

df_encode_test.loc[:, 'A5_mean_enc'] = df_encode_test['A5'].map(mean_encode_test_a5)
df_encode_test.drop(['A5'], axis=1, inplace=True)
df_encode_test.loc[:, 'A6_mean_enc'] = df_encode_test['A6'].map(mean_encode_test_a6)
df_encode_test.drop(['A6'], axis=1, inplace=True)

df_encode_train.head()

Unnamed: 0,A8,A9,A10,A7,A4,loan,A5_mean_enc,A6_mean_enc
0,0,0,-0.483907,-0.070698,1,0,0.394737,0.45993
1,1,0,-0.483907,0.273858,1,1,0.619048,0.45993
2,0,0,-0.483907,-0.366453,1,0,0.6,0.4
3,1,0,-0.483907,-0.366453,2,1,0.660377,0.45993
4,0,0,-0.483907,-0.613409,1,0,0.46729,0.45993


In [None]:
# обучим модель на новых признаках

log_model = LogisticRegression(C=0.30000000000000004, penalty='l2', solver='newton-cg')

log_model.fit(df_encode_train.drop(['loan'], axis=1).to_numpy(), df_encode_train['loan'].to_numpy().ravel())

y_pred = log_model.predict(df_encode_test.drop(['loan'], axis=1).to_numpy())
acc_log = accuracy_score(df_encode_test['loan'].to_numpy().ravel(), y_pred)

print('accuracy:', acc_log)

accuracy: 0.84


In [None]:
# выберем лучший scaler для Random Forest

scalers = MinMaxScaler(), StandardScaler(), RobustScaler(), QuantileTransformer(), PowerTransformer()
scalers_name = [
      'Минимакс(MinMaxScaler)',
      'Z-нормализация(StandardScaler)',
      'Устойчивая нормализация(RobustScaler)',
      'Квантильное преобразование(QuantileTransformer)',
      'Степенное преобразование(PowerTransformer)'
      ]
n = 0
  
while n < len(scalers_name):
    for scaler in scalers:
        scaled_train = scaler.fit_transform(train_df[features])
        scaled_test = scaler.transform(test_df[features])

        rf_model = RandomForestClassifier(random_state=7)
        rf_model.fit(scaled_train, train_df['loan'])
        y_pred = rf_model.predict(scaled_test)
        acc_rf = accuracy_score(test_df['loan'], y_pred)

        print('Scaler:', scalers_name[n])
        print('accuracy_score:', acc_rf)
        print()
        n += 1

Scaler: Минимакс(MinMaxScaler)
accuracy_score: 0.85

Scaler: Z-нормализация(StandardScaler)
accuracy_score: 0.855

Scaler: Устойчивая нормализация(RobustScaler)
accuracy_score: 0.855

Scaler: Квантильное преобразование(QuantileTransformer)
accuracy_score: 0.855

Scaler: Степенное преобразование(PowerTransformer)
accuracy_score: 0.85



In [None]:
# к числовым признакам применим квантильное преобразование

scaler_mm = QuantileTransformer()
scaled_train = scaler_mm.fit_transform(train_df[feat_num])
scaled_test = scaler_mm.transform(test_df[feat_num])

scaled_train = pd.DataFrame(scaled_train)
scaled_train.columns = feat_num
scaled_train = pd.concat([scaled_train, train_df[feat_cat].reset_index(), train_df['loan'].reset_index()], axis=1, sort=False)
scaled_train.drop(['index'], axis=1, inplace=True)

scaled_test = pd.DataFrame(scaled_test)
scaled_test.columns = feat_num
scaled_test = pd.concat([scaled_test, test_df[feat_cat].reset_index(), test_df['loan'].reset_index()], axis=1, sort=False)
scaled_test.drop(['index'], axis=1, inplace=True)

In [None]:
X_train_4 = scaled_train[new_features]
y_train_4 = scaled_train[[target_var]]
X_test_4 = scaled_test[new_features]
y_test_4 = scaled_test[[target_var]]

In [None]:
# Random Forest

rf_model = RandomForestClassifier()

n_estimators = np.arange(1, 30)
max_depth = np.arange(1, 30)
cv = 10

i = 1
q_feat = np.arange(1, 15)

while i <= len(new_features):
  for q in q_feat:
  
    gs_rf = GridSearchCV(
        rf_model,
        param_grid = {'n_estimators': n_estimators,
                      'max_depth': max_depth,
                      'random_state': [7]
                      },
                      cv=cv,
                      scoring='accuracy',
                      n_jobs=-1,
                      )
    
    gs_rf.fit(X_train_4.iloc[:, :i], y_train_4.to_numpy().ravel())
    
    rf_model = rf_model.set_params(**gs_rf.best_params_)
    rf_model.fit(X_train_4.iloc[:, :i].to_numpy(), y_train_4.to_numpy().ravel())
    
    y_pred_rf = rf_model.predict(X_test_4.iloc[:, :i].to_numpy())
    acc_rf = accuracy_score(y_test_4.to_numpy().ravel(), y_pred_rf)

    best_parameters = gs_rf.best_params_

    print('Количество признаков:', q)
    print('best_parameters:', best_parameters)
    print('accuracy:', acc_rf)
    print()
    i += 1

Количество признаков: 1
best_parameters: {'max_depth': 1, 'n_estimators': 1, 'random_state': 7}
accuracy: 0.835

Количество признаков: 2
best_parameters: {'max_depth': 1, 'n_estimators': 1, 'random_state': 7}
accuracy: 0.835

Количество признаков: 3
best_parameters: {'max_depth': 2, 'n_estimators': 5, 'random_state': 7}
accuracy: 0.835

Количество признаков: 4
best_parameters: {'max_depth': 3, 'n_estimators': 8, 'random_state': 7}
accuracy: 0.835

Количество признаков: 5
best_parameters: {'max_depth': 8, 'n_estimators': 24, 'random_state': 7}
accuracy: 0.81

Количество признаков: 6
best_parameters: {'max_depth': 3, 'n_estimators': 24, 'random_state': 7}
accuracy: 0.835

Количество признаков: 7
best_parameters: {'max_depth': 8, 'n_estimators': 29, 'random_state': 7}
accuracy: 0.83

Количество признаков: 8
best_parameters: {'max_depth': 8, 'n_estimators': 14, 'random_state': 7}
accuracy: 0.85

Количество признаков: 9
best_parameters: {'max_depth': 7, 'n_estimators': 29, 'random_state': 7

In [None]:
# лучший результат модель показала на первых 11-и признаках
# проверим accuracy, указав параметры из best_params_

rf_model = RandomForestClassifier(max_depth=9, n_estimators=18, random_state=7)

rf_model.fit(X_train_4.iloc[:, :11].to_numpy(), y_train_4.to_numpy().ravel())

y_pred = rf_model.predict(X_test_4.iloc[:, :11].to_numpy())
acc_rf = accuracy_score(y_test_4.to_numpy().ravel(), y_pred)

print('accuracy:', acc_rf)

accuracy: 0.86


In [None]:
# применим частотное кодирование к категориальным признакам, у которых больше 2-х уникальных значений

X_train_4[feat_cat].iloc[:, :11].nunique() > 2

A1     False
A4      True
A5      True
A6      True
A8     False
A9     False
A11    False
A12     True
dtype: bool

In [None]:
new_features_4 = X_train_4.iloc[:, :11].columns

In [None]:
X_train_5 = scaled_train[new_features_4]
y_train_5 = scaled_train[[target_var]]
X_test_5 = scaled_test[new_features_4]
y_test_5 = scaled_test[[target_var]]

In [None]:
for col in X_train_5[['A4', 'A5', 'A6', 'A12']]:
  freq_encoded = X_train_5.groupby(col).size() / len(X_train_5)
  X_train_5.loc[:, col] = X_train_5[col].map(freq_encoded)

for col in X_test_5[['A4', 'A5', 'A6', 'A12']]:
  freq_encoded = X_test_5.groupby(col).size() / len(X_test_5)
  X_test_5.loc[:, col] = X_test_5[col].map(freq_encoded)

X_train_5.head()

Unnamed: 0,A8,A9,A5,A10,A7,A6,A4,A3,A14,A2,A12
0,0,0,0.077551,0.0,0.660532,0.585714,0.246939,0.507157,0.0,0.630879,0.910204
1,1,0,0.085714,0.0,0.780164,0.585714,0.246939,0.617587,0.875256,0.842536,0.910204
2,0,0,0.040816,0.0,0.490798,0.020408,0.246939,0.118609,0.0,0.179959,0.910204
3,1,0,0.108163,0.0,0.490798,0.585714,0.75102,0.791411,0.737219,0.099182,0.910204
4,0,0,0.218367,0.0,0.255624,0.585714,0.246939,0.368098,0.0,0.347648,0.910204


In [None]:
# обучим модель на новых признаках

rf_model = RandomForestClassifier(max_depth=9, n_estimators=18, random_state=7)

rf_model.fit(X_train_5.to_numpy(), y_train_5.to_numpy().ravel())

y_pred = rf_model.predict(X_test_5.to_numpy())
acc_rf = accuracy_score(y_test_5.to_numpy().ravel(), y_pred)

print('accuracy:', acc_rf)

accuracy: 0.83


###Лучший accuracy (0.865) показала модель KNeighborsClassifier на первых трех признаках с наибольшими значениями корреляции с целевой переменной и применении среднего кодирования к категориальным признакам.