# Оценка эффекта от внедрения полученного решения
Для начала применим модель к отложенной выборке.

In [140]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import recall_score

In [54]:
data_train = pd.read_csv('data_train.csv')
data_test = pd.read_csv('hold_out_dataset.csv')

In [7]:
churn = data_train[data_train.y == 1].shape[0] # число клиентов класса "отток"
not_churn = data_train[data_train.y == -1].shape[0] # число клиентов класса "не отток"
all_ = data_train.shape[0] # число всех клиентов
class_weights = [all_ / (2 * not_churn), all_ / (2 * churn)]

In [55]:
num_col = data_train.columns[1:190]
cat_col = data_train.columns[190:-1]

In [56]:
drop_col = data_train.count(axis=0)[data_train.count(axis=0) == 0].index # столбцы, у которых все значения пропущены
data_train = data_train.drop(drop_col, axis=1)

In [69]:
data_test = data_test.drop(drop_col, axis=1)

In [57]:
cat_columns = [c for c in cat_col if c in data_train.columns]

In [11]:
def fill_missing_values(df):
    num_columns = [c for c in num_col if c in df.columns]
    df_num = df[num_columns].fillna(0)
    df_cat = df[cat_columns].fillna("NA", axis=0)
    X = pd.concat([df_num, df_cat], axis=1)
    return X

In [71]:
X = fill_missing_values(data_train)
y = data_train.y

In [72]:
X_test = fill_missing_values(data_test)
y_test = data_test.y

In [59]:
cv = StratifiedKFold(shuffle=True)
model = CatBoostClassifier(iterations=300,
                           learning_rate=0.1,
                           depth=2,
                           cat_features=cat_columns,
                           verbose=False)
opt = GridSearchCV(model, param_grid={'class_weights': [class_weights, None]}, scoring='roc_auc', n_jobs=-1, cv=cv)
opt.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=<catboost.core.CatBoostClassifier object at 0x000001C000928880>,
             n_jobs=-1,
             param_grid={'class_weights': [[0.540321491287316,
                                            6.700167504187605],
                                           None]},
             scoring='roc_auc')

In [60]:
result = pd.DataFrame({'probability': opt.predict_proba(X)[:, 1], 'labels': y})

In [62]:
error = result[((result.probability > 0.9) & (result.labels == -1)) | 
               ((result.probability < 0.1) & (result.labels == 1))]

In [142]:
new_X = X.drop(error.index, axis=0)
new_y = y.drop(error.index)
opt.fit(new_X, new_y)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=<catboost.core.CatBoostClassifier object at 0x000001C000928880>,
             n_jobs=-1,
             param_grid={'class_weights': [[0.540321491287316,
                                            6.700167504187605],
                                           None]},
             scoring='roc_auc')

In [75]:
result_test = pd.DataFrame({'probability': opt.predict_proba(X_test)[:, 1], 'labels': y_test})

In [143]:
recall_score(result_test.labels, opt.predict(X_test))

0.6360544217687075

In [148]:
roc_auc_score(result_test.labels, result_test.probability)

0.7233184558226959

In [144]:
poor_obj = data_train.count(axis=1)[data_train.count(axis=1) < 53].index
X_2 = new_X.drop(poor_obj, axis=0)
y_2 = new_y.drop(poor_obj)
opt.fit(X_2, y_2)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=<catboost.core.CatBoostClassifier object at 0x000001C000928880>,
             n_jobs=-1,
             param_grid={'class_weights': [[0.540321491287316,
                                            6.700167504187605],
                                           None]},
             scoring='roc_auc')

In [145]:
recall_score(result_test.labels, opt.predict(X_test))

0.6428571428571429

In [146]:
result_test_ = pd.DataFrame({'probability': opt.predict_proba(X_test)[:, 1], 'labels': y_test})

In [147]:
roc_auc_score(result_test_.labels, result_test_.probability)

0.724698136135188

In [155]:
result_test_['p_i'] = np.random.choice(np.arange(100), 8000) / 100

In [157]:
result_test_.sort_values(by='probability', ascending=False)[:80][result_test_.p_i > 0.5].shape

  result_test_.sort_values(by='probability', ascending=False)[:80][result_test_.p_i > 0.5].shape


(35, 3)