# Описание проекта

Из «Бета-Банка» стали уходить клиенты. Каждый месяц. Немного, но заметно. Банковские маркетологи посчитали: сохранять текущих клиентов дешевле, чем привлекать новых.
Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет. Вам предоставлены исторические данные о поведении клиентов и расторжении договоров с банком.
Постройте модель с предельно большим значением F1-меры. Чтобы сдать проект успешно, нужно довести метрику до 0.59. Проверьте F1-меру на тестовой выборке самостоятельно.
Дополнительно измеряйте AUC-ROC, сравнивайте её значение с F1-мерой.

Признаки
* RowNumber — индекс строки в данных
* CustomerId — уникальный идентификатор клиента
* Surname — фамилия
* CreditScore — кредитный рейтинг
* Geography — страна проживания
* Gender — пол
* Age — возраст
* Tenure — сколько лет человек является клиентом банка
* Balance — баланс на счёте
* NumOfProducts — количество продуктов банка, используемых клиентом
* HasCrCard — наличие кредитной карты
* IsActiveMember — активность клиента
* EstimatedSalary — предполагаемая зарплата

Целевой признак
* Exited — факт ухода клиента

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.utils import shuffle

#### Загрузка, подготовка данных

In [2]:
try:
    data=pd.read_csv('Churn.csv')
except:
    data=pd.read_csv('/datasets/Churn.csv')

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [4]:
data.columns = ['row_number', 'customer_id', 'surname', 'credit_score', 'geography', 'gender', 'age',
                            'tenure', 'balance', 'num_of_products', 'has_cr_card', 'is_active_member', 'estimated_salary', 'exited']

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_number        10000 non-null  int64  
 1   customer_id       10000 non-null  int64  
 2   surname           10000 non-null  object 
 3   credit_score      10000 non-null  int64  
 4   geography         10000 non-null  object 
 5   gender            10000 non-null  object 
 6   age               10000 non-null  int64  
 7   tenure            9091 non-null   float64
 8   balance           10000 non-null  float64
 9   num_of_products   10000 non-null  int64  
 10  has_cr_card       10000 non-null  int64  
 11  is_active_member  10000 non-null  int64  
 12  estimated_salary  10000 non-null  float64
 13  exited            10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [6]:
#замена пропусков в tenure на медианное значение
data['tenure'] = data['tenure'].fillna(data['tenure'].median())

In [7]:
# удалим ненужные признаки
data = data.drop(['row_number', 'customer_id', 'surname'], axis=1)

In [8]:
#матрица корреляции. Проверка на мультиколлинеарность
data.corr()

Unnamed: 0,credit_score,age,tenure,balance,num_of_products,has_cr_card,is_active_member,estimated_salary,exited
credit_score,1.0,-0.003965,-6e-05,0.006268,0.012238,-0.005458,0.025651,-0.001384,-0.027094
age,-0.003965,1.0,-0.012606,0.028308,-0.03068,-0.011721,0.085472,-0.007201,0.285323
tenure,-6e-05,-0.012606,1.0,-0.007535,0.011409,0.025979,-0.030681,0.010049,-0.015989
balance,0.006268,0.028308,-0.007535,1.0,-0.30418,-0.014858,-0.010084,0.012797,0.118533
num_of_products,0.012238,-0.03068,0.011409,-0.30418,1.0,0.003183,0.009612,0.014204,-0.04782
has_cr_card,-0.005458,-0.011721,0.025979,-0.014858,0.003183,1.0,-0.011866,-0.009933,-0.007138
is_active_member,0.025651,0.085472,-0.030681,-0.010084,0.009612,-0.011866,1.0,-0.011421,-0.156128
estimated_salary,-0.001384,-0.007201,0.010049,0.012797,0.014204,-0.009933,-0.011421,1.0,0.012097
exited,-0.027094,0.285323,-0.015989,0.118533,-0.04782,-0.007138,-0.156128,0.012097,1.0


In [9]:
#One-Hot Encoding
data = pd.get_dummies(data, drop_first=True)

#### Исследование баланса классов. Обучение модели без учета дисбаланса.

In [22]:
print(f'Доля класса 0: {(data["exited"].value_counts(normalize=True)*100)[0]:.2f}%')
print(f'Доля класса 1: {(data["exited"].value_counts(normalize=True)*100)[1]:.2f}%')

Доля класса 0: 79.63%
Доля класса 1: 20.37%


Имеем дисбаланс классов

#### Разбиение на train и test

In [11]:
features = data.drop(['exited'], axis=1)
targets = data['exited']

In [12]:
features_train, features_test, target_train, target_test = train_test_split(features, targets, 
                                                                            test_size=0.4, random_state=54321, stratify=targets)

In [13]:
print(features_train.shape)
print(target_train.shape)
print(features_test.shape)
print(target_test.shape)

(6000, 11)
(6000,)
(4000, 11)
(4000,)


In [14]:
# функция для определения параметров
def get_main_parameters(model):
    
    #predictions
    predictions_test = model.predict(features_test)
    
    #матрица ошибок для тестовой выборки
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(target_test, predictions_test, labels=[0, 1]).ravel()
    confusion_matrix_test = [tn_test, fp_test, fn_test, tp_test]
    
    #f1-мера для тестовой выборки при average=weighted
    f1_test_weighted = f1_score(target_test, predictions_test, average="weighted")
    
    #f1-мера для тестовой выборки при average=macro
    f1_test_macro = f1_score(target_test, predictions_test, average="macro")
    
    #f1-мера для валидационной и тестовой выборки
    f1_test = f1_score(target_test, predictions_test)
    
    #ROC-AUC для валидационной и тестовой выборки
    proba_test = model.predict_proba(features_test)
    roc_auc_test = roc_auc_score(target_test, proba_test[:,1])
    
    return {
        'confusion_matrix_test': confusion_matrix_test,
        'f1_test_weighted': f1_test_weighted,
        'f1_test_macro': f1_test_macro,
        'f1_test': f1_test,
        'roc_auc_test': roc_auc_test
           }

In [15]:
# подбор гиперпараметров случайного леса
criterion = ['gini', 'entropy']
n_estimators = [x for x in range(10, 1001, 50)]
max_depth = [x for x in range(2, 11)]
hyperparameter_grid = {'criterion': criterion,
    'n_estimators': n_estimators,
    'max_depth': max_depth
                      }

model = RandomForestClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321, scoring='f1',
    n_jobs=-1
                              )
random_cv.fit(features_train, target_train)
best_random_forest = random_cv.best_estimator_
print(best_random_forest)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=160,
                       random_state=54321)


In [16]:
random_forest_results = get_main_parameters(best_random_forest)
random_forest_results

{'confusion_matrix_test': [3102, 83, 441, 374],
 'f1_test_weighted': 0.854050536946881,
 'f1_test_macro': 0.7550834212041668,
 'f1_test': 0.5880503144654088,
 'roc_auc_test': 0.8567402798778785}

### Улучшение качества модели, учитывая дисбаланс классов. Вариации моделей.

In [21]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=54321)
    
    return features_upsampled, target_upsampled

In [22]:
#upsampling
features_train_upsampled, target_train_upsampled = upsample(features_train, target_train, 4)

#### Случайный лес

In [23]:
# подбор гиперпараметров случайного леса на новых данных
criterion = ['gini', 'entropy']
n_estimators = [x for x in range(10, 1001, 50)]
max_depth = [x for x in range(2, 11)]
hyperparameter_grid = {'criterion': criterion,
    'n_estimators': n_estimators,
    'max_depth': max_depth
                      }

model = RandomForestClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    scoring='f1',
    n_jobs=-1
    )
random_cv.fit(features_train_upsampled, target_train_upsampled)
upsampled_best_random_forest = random_cv.best_estimator_
print(upsampled_best_random_forest)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=160,
                       random_state=54321)


In [24]:
upsampled_random_forest_results = get_main_parameters(upsampled_best_random_forest)
upsampled_random_forest_results

{'confusion_matrix_test': [2763, 422, 254, 561],
 'f1_test_weighted': 0.8366065003251669,
 'f1_test_macro': 0.7575147993094195,
 'f1_test': 0.6240266963292548,
 'roc_auc_test': 0.8593945931369243}

#### Градиентный бустинг

In [25]:
loss = ['deviance', 'exponential']
n_estimators = [x for x in range(100, 1501, 50)]
max_depth = [x for x in range(1, 11)]
min_samples_leaf = [x for x in range(1, 6)]
min_samples_split = [x for x in range(2, 6)]
max_features = ['auto', 'sqrt', 'log2', None]
hyperparameter_grid = {'loss': loss,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features,
    }

model = GradientBoostingClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    n_jobs=-1
    )
random_cv.fit(features_train_upsampled, target_train_upsampled)
boosting_model = random_cv.best_estimator_
print(boosting_model)

GradientBoostingClassifier(loss='exponential', max_depth=10,
                           max_features='log2', min_samples_leaf=3,
                           min_samples_split=5, n_estimators=1050,
                           random_state=54321)


In [26]:
upsampled_boosting_results = get_main_parameters(boosting_model)
upsampled_boosting_results

{'confusion_matrix_test': [3032, 153, 429, 386],
 'f1_test_weighted': 0.8426918213818821,
 'f1_test_macro': 0.7412955049871737,
 'f1_test': 0.5701624815361891,
 'roc_auc_test': 0.8382550490701236}

#### Дерево решений

In [27]:
criterion = ['gini', 'entropy']
splitter = ['best', 'random'] 
max_depth = [x for x in range(2, 11)]
hyperparameter_grid = {'criterion': criterion,
    'splitter': splitter,
    'max_depth': max_depth
                      }

model = DecisionTreeClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    n_jobs=-1
      )
random_cv.fit(features_train_upsampled, target_train_upsampled)
upsampled_tree_model = random_cv.best_estimator_
print(upsampled_tree_model)

DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=54321,
                       splitter='random')


In [28]:
upsampled_tree_results = get_main_parameters(upsampled_tree_model)
upsampled_tree_results

{'confusion_matrix_test': [2534, 651, 264, 551],
 'f1_test_weighted': 0.7857968808252036,
 'f1_test_macro': 0.696711331585584,
 'f1_test': 0.5463559742191373,
 'roc_auc_test': 0.7844495381918695}

### Улучшение модели при помощи стандартизации.

In [29]:
# Стандартизация
numeric = ['credit_score', 'age', 'tenure', 'balance', 'num_of_products', 'estimated_salary']
scaler = StandardScaler()
scaler.fit(features_train_upsampled[numeric])
features_train_upsampled[numeric] = scaler.transform(features_train_upsampled[numeric])
features_test[numeric] = scaler.transform(features_test[numeric])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


#### Случайный лес

In [31]:
# подбор гиперпараметров случайного леса на стандартизаированных данных
criterion = ['gini', 'entropy']
n_estimators = [x for x in range(10, 1001, 50)]
max_depth = [x for x in range(2, 11)]
hyperparameter_grid = {'criterion': criterion,
    'n_estimators': n_estimators,
    'max_depth': max_depth
                      }

model = RandomForestClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    n_jobs=-1)
random_cv.fit(features_train_upsampled, target_train_upsampled)
scalered_random_forest = random_cv.best_estimator_
print(scalered_random_forest)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=160,
                       random_state=54321)


In [32]:
scalered_random_forest_results = get_main_parameters(scalered_random_forest)
scalered_random_forest_results

{'confusion_matrix_test': [2762, 423, 253, 562],
 'f1_test_weighted': 0.8366636200716845,
 'f1_test_macro': 0.7577060931899641,
 'f1_test': 0.6244444444444444,
 'roc_auc_test': 0.8593845768604752}

#### Градиентный бустинг

In [33]:
loss = ['deviance', 'exponential']
n_estimators = [x for x in range(100, 1501, 50)]
max_depth = [x for x in range(1, 21)]
hyperparameter_grid = {'loss': loss,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
                      }

model = GradientBoostingClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    n_jobs=-1,
    random_state=54321)
random_cv.fit(features_train_upsampled, target_train_upsampled)
scalered_boosting_model = random_cv.best_estimator_
print(scalered_boosting_model)

GradientBoostingClassifier(max_depth=10, n_estimators=750, random_state=54321)


In [34]:
scalered_boosting_results = get_main_parameters(scalered_boosting_model)
scalered_boosting_results

{'confusion_matrix_test': [3031, 154, 429, 386],
 'f1_test_weighted': 0.8424757662587913,
 'f1_test_macro': 0.7410032791072831,
 'f1_test': 0.5697416974169741,
 'roc_auc_test': 0.8352403424796062}

#### Дерево решений

In [35]:
criterion = ['gini', 'entropy']
splitter = ['best', 'random'] 
max_depth = [x for x in range(2, 21)]
hyperparameter_grid = {'criterion': criterion,
    'splitter': splitter,
    'max_depth': max_depth
                      }

model = DecisionTreeClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    n_jobs=-1)
random_cv.fit(features_train_upsampled, target_train_upsampled)
scalered_tree_model = random_cv.best_estimator_
print(scalered_tree_model)

DecisionTreeClassifier(max_depth=20, random_state=54321)


In [36]:
scalered_tree_results = get_main_parameters(scalered_tree_model)
scalered_tree_results

{'confusion_matrix_test': [2768, 417, 386, 429],
 'f1_test_weighted': 0.8006324743598182,
 'f1_test_macro': 0.6949400797543641,
 'f1_test': 0.5165562913907285,
 'roc_auc_test': 0.6986780441294026}

### Улучшение модели при помощи downsampling

In [37]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=54321)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=54321)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=54321)
    
    return features_downsampled, target_downsampled

In [38]:
features_downsampled, target_downsampled = downsample(features, targets, 0.2)
print(features_downsampled.shape)
print(target_downsampled.shape)

(3630, 11)
(3630,)


In [39]:
features_train_downsampled, features_test_downsampled, target_train_downsampled, target_test_downsampled = train_test_split(features_downsampled, target_downsampled, 
                                                                            test_size=0.4, random_state=54321, stratify=target_downsampled)

In [40]:
print(features_train_downsampled.shape)
print(features_test_downsampled.shape)
print(target_train_downsampled.shape)
print(target_test_downsampled.shape)

(2178, 11)
(1452, 11)
(2178,)
(1452,)


In [41]:
# функция для определения параметров
def get_main_parameters_downsampled(model):
    
    #predictions
    predictions_test = model.predict(features_test_downsampled)
    
    #матрица ошибок для тестовой выборки
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(target_test_downsampled, predictions_test, labels=[0, 1]).ravel()
    confusion_matrix_test = [tn_test, fp_test, fn_test, tp_test]
    
    #f1-мера для тестовой выборки при average=weighted
    f1_test_weighted = f1_score(target_test_downsampled, predictions_test, average="weighted")
    
    #f1-мера для тестовой выборки при average=macro
    f1_test_macro = f1_score(target_test_downsampled, predictions_test, average="macro")
    
    #f1-мера для валидационной и тестовой выборки
    f1_test = f1_score(target_test_downsampled, predictions_test)
    
    #ROC-AUC для валидационной и тестовой выборки
    proba_test = model.predict_proba(features_test_downsampled)
    roc_auc_test = roc_auc_score(target_test_downsampled, proba_test[:,1])
    
    return {
        'confusion_matrix_test': confusion_matrix_test,
        'f1_test_weighted': f1_test_weighted,
        'f1_test_macro': f1_test_macro,
        'f1_test': f1_test,
        'roc_auc_test': roc_auc_test
           }

#### Случайный лес

In [42]:
criterion = ['gini', 'entropy']
n_estimators = [x for x in range(10, 1001, 50)]
max_depth = [x for x in range(2, 11)]
hyperparameter_grid = {'criterion': criterion,
    'n_estimators': n_estimators,
    'max_depth': max_depth
                      }

model = RandomForestClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    random_state=54321,
    n_jobs=-1, scoring='f1')
random_cv.fit(features_train_downsampled, target_train_downsampled)
downsampled_random_forest = random_cv.best_estimator_
print(scalered_random_forest)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=160,
                       random_state=54321)


In [43]:
random_cv.best_score_

0.8051044574275578

In [44]:
downsampled_random_forest_results = get_main_parameters_downsampled(downsampled_random_forest)
downsampled_random_forest_results

{'confusion_matrix_test': [468, 169, 158, 657],
 'f1_test_weighted': 0.7745674845477583,
 'f1_test_macro': 0.7709119490027663,
 'f1_test': 0.8007312614259599,
 'roc_auc_test': 0.8511850988625748}

#### Градиентный бустинг

In [45]:
loss = ['deviance', 'exponential']
n_estimators = [x for x in range(100, 1501, 50)]
max_depth = [x for x in range(1, 21)]
hyperparameter_grid = {'loss': loss,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
                      }

model = GradientBoostingClassifier(random_state = 54321)
random_cv = RandomizedSearchCV(estimator=model,
    param_distributions=hyperparameter_grid,
    return_train_score = True,
    n_jobs=-1,
    random_state=54321, scoring='f1')
random_cv.fit(features_train_downsampled, target_train_downsampled)
downsampled_boosting_model = random_cv.best_estimator_
print(downsampled_boosting_model)

GradientBoostingClassifier(loss='exponential', n_estimators=150,
                           random_state=54321)


In [46]:
random_cv.best_score_

0.8046641408015353

In [47]:
downsampled_boosting_results = get_main_parameters_downsampled(downsampled_boosting_model)
downsampled_boosting_results

{'confusion_matrix_test': [466, 171, 153, 662],
 'f1_test_weighted': 0.7764791745349673,
 'f1_test_macro': 0.7727181374064683,
 'f1_test': 0.8033980582524272,
 'roc_auc_test': 0.8528724562028682}

## Вывод

Лучший результат получился у модели градиентного бустинга после downsampling:
* F1-мера 0.8033980582524272
* ROC-AUC 0.8528724562028682
* Количество TP 466
* Количество FN  171
* Количество TN 153
* Количество FP 662

Можно сказать, что downsampling данных улучшил модель градиентного бустинга, т.к. согласно ТЗ для банка удержать старых клиентов (т.е. правильно спрогнозировать отток), т.е. получить больше предсказаний класса 1.