# Отток клиентов

Из «Бета-Банка» стали уходить клиенты. Каждый месяц. Немного, но заметно. Банковские маркетологи посчитали: сохранять текущих клиентов дешевле, чем привлекать новых.

Нужно спрогнозировать, уйдёт клиент из банка в ближайшее время или нет. Вам предоставлены исторические данные о поведении клиентов и расторжении договоров с банком. 

Постройте модель с предельно большим значением *F1*-меры. Чтобы сдать проект успешно, нужно довести метрику до 0.59. Проверьте *F1*-меру на тестовой выборке самостоятельно.

Дополнительно измеряйте *AUC-ROC*, сравнивайте её значение с *F1*-мерой.

# 1. Подготовка данных

In [1]:
import warnings
warnings.filterwarnings('ignore') # Чтобы не выводить предупреждения о нулевых значениях метрики F1

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

data = pd.read_csv('/datasets/Churn.csv')
data.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2.0,134603.88,1,1,1,71725.73,0


In [2]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9091.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,4.99769,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.894723,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [3]:
#Переименуем название столбцов
data.columns = ['row_number', 'customer_id', 'surname', 'credit_score', 'geography', 'gender', 'age', 
                'tenure', 'balance', 'num_of_products', 'has_cr_card', 'is_active_member', 'estimated_salary', 'exited']

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
row_number          10000 non-null int64
customer_id         10000 non-null int64
surname             10000 non-null object
credit_score        10000 non-null int64
geography           10000 non-null object
gender              10000 non-null object
age                 10000 non-null int64
tenure              9091 non-null float64
balance             10000 non-null float64
num_of_products     10000 non-null int64
has_cr_card         10000 non-null int64
is_active_member    10000 non-null int64
estimated_salary    10000 non-null float64
exited              10000 non-null int64
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


In [5]:
data[data['tenure'] == 0]

Unnamed: 0,row_number,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_cr_card,is_active_member,estimated_salary,exited
29,30,15656300,Lucciano,411,France,Male,29,0.0,59697.17,2,1,1,53483.21,0
35,36,15794171,Lombardo,475,France,Female,45,0.0,134264.04,1,1,0,27822.99,1
57,58,15647091,Endrizzi,725,Germany,Male,19,0.0,75888.20,1,0,0,45613.75,0
72,73,15812518,Palermo,657,Spain,Female,37,0.0,163607.18,1,0,1,44203.55,0
127,128,15782688,Piccio,625,Germany,Male,56,0.0,148507.24,1,1,0,46824.08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9793,9794,15772363,Hilton,772,Germany,Female,42,0.0,101979.16,1,1,0,90928.48,0
9799,9800,15722731,Manna,653,France,Male,46,0.0,119556.10,1,1,0,78250.13,1
9843,9844,15778304,Fan,646,Germany,Male,24,0.0,92398.08,1,1,1,18897.29,0
9868,9869,15587640,Rowntree,718,France,Female,43,0.0,93143.39,1,1,0,167554.86,0


<hr>

Отделить целеовой признак от остальных <br>
**row_number**, **customer_id** и **surname** всего лишь идентификаторы клиентов и не являются признаками по которым стоит обучать модель. Их исключаем <br> 
**tenure** - пропуски вероятно означают, что у клиентов нет недвижимости, поэтому заполним нулями и преобразуем тип данных в целое <br>
**geography** и **gender** необходимо закодировать, т.к. эти призниаки категориальные <br>
Все оставшие числовые признаки надо масштабировать <br>
Разделим данные на обучающую и валидационую выборки

<hr>

Отделяем целевой признак

In [7]:
features = data.drop('exited', axis=1)
target = data['exited']

<hr>

Исключаем row_number, customer_id и surname

In [8]:
features = features.drop(columns=['row_number', 'customer_id', 'surname'], axis=1)

<hr>

Заполняем пропуски и преобразовываем тип данных tenure

In [9]:
features['tenure'] = features['tenure'].fillna(0).astype('int')

<hr>

Кодирование категориальных признаков

In [10]:
features = pd.get_dummies(features, drop_first=True)

<hr>

Масштабирование числовых признаков

In [11]:
numeric = ['credit_score', 'age', 'tenure', 'balance', 'num_of_products', 'estimated_salary']
scaler = StandardScaler()
features[numeric] = scaler.fit_transform(features[numeric])
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
credit_score         10000 non-null float64
age                  10000 non-null float64
tenure               10000 non-null float64
balance              10000 non-null float64
num_of_products      10000 non-null float64
has_cr_card          10000 non-null int64
is_active_member     10000 non-null int64
estimated_salary     10000 non-null float64
geography_Germany    10000 non-null uint8
geography_Spain      10000 non-null uint8
gender_Male          10000 non-null uint8
dtypes: float64(6), int64(2), uint8(3)
memory usage: 654.4 KB


In [12]:
features.describe()

Unnamed: 0,credit_score,age,tenure,balance,num_of_products,has_cr_card,is_active_member,estimated_salary,geography_Germany,geography_Spain,gender_Male
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-4.824585e-16,2.318146e-16,-6.643575e-17,-6.252776000000001e-17,1.634248e-17,0.7055,0.5151,-2.8776980000000004e-17,0.2509,0.2477,0.5457
std,1.00005,1.00005,1.00005,1.00005,1.00005,0.45584,0.499797,1.00005,0.433553,0.431698,0.497932
min,-3.109504,-1.994969,-1.460235,-1.225848,-0.9115835,0.0,0.0,-1.740268,0.0,0.0,0.0
25%,-0.6883586,-0.6600185,-0.817441,-1.225848,-0.9115835,0.0,0.0,-0.8535935,0.0,0.0,0.0
50%,0.01522218,-0.1832505,-0.1746471,0.3319639,-0.9115835,1.0,1.0,0.001802807,0.0,0.0,1.0
75%,0.6981094,0.4842246,0.7895438,0.8199205,0.8077366,1.0,1.0,0.8572431,1.0,0.0,1.0
max,2.063884,5.061197,1.753735,2.795323,4.246377,1.0,1.0,1.7372,1.0,1.0,1.0


<hr>

Разделяем данные на обучающую, валидационую и тестовую выборки

In [13]:
features_train, features_valid_test, target_train, target_valid_test = train_test_split(
    features, target, test_size=0.25, random_state=42)

features_valid, features_test, target_valid, target_test = train_test_split(
    features_valid_test, target_valid_test, test_size=0.25, random_state=42)

print('Обучающая выборка:', features_train.shape)
print('Валидационая выборка:', features_valid.shape)
print('Тестовая выборка:', features_valid.shape)

Обучающая выборка: (7500, 11)
Валидационая выборка: (1875, 11)
Тестовая выборка: (1875, 11)


<hr>

**Вывод**: <br>
- Отделили целевой признак **exited**
- Удалили незначимые признаки **row_number**, **customer_id** и **surname**
- Заполнили и преобразовали тип данных **tenure**
- Проведи кодирование категориальных признаков 
- Масштабировали числовые признаки
- Разделили данные на обучающую и валидационую выборки 

# 2. Исследование задачи

Посмотрим на баланс классов

In [14]:
target.value_counts()

0    7963
1    2037
Name: exited, dtype: int64

<hr>

Создадим таблицу для хранения результатов

In [15]:
res_dict = {
    ('', 'type'):['without balance', 'downsampling', 'upsampling', 'test'],
    ('LogisticRegression', 'F1'):[0, 0, 0, 0], ('LogisticRegression', 'roc_auc'):[0, 0, 0, 0], 
    ('RandomForestClassifier', 'F1'):[0, 0, 0, 0], ('RandomForestClassifier', 'roc_auc'):[0, 0, 0, 0],
    ('DecisionTreeClassifier', 'F1'):[0, 0, 0, 0], ('DecisionTreeClassifier', 'roc_auc'):[0, 0, 0, 0],
}

result_table = pd.DataFrame(res_dict).set_index(('','type'))
result_table.index.name = 'type'
result_table

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,RandomForestClassifier,RandomForestClassifier,DecisionTreeClassifier,DecisionTreeClassifier
Unnamed: 0_level_1,F1,roc_auc,F1,roc_auc,F1,roc_auc
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
without balance,0,0,0,0,0,0
downsampling,0,0,0,0,0,0
upsampling,0,0,0,0,0,0
test,0,0,0,0,0,0


<hr>

Обучим модели: логистическую регрессию, случайный лес и дерево решений. 

<hr>

## 2.1 Логистическая регрессия

In [16]:
model_lr = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced')
model_lr.fit(features_train, target_train)

predicted_valid = model_lr.predict(features_valid)
probabilities_valid = model_lr.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
f1 = f1_score(target_valid, predicted_valid)
roc_auc = roc_auc_score(target_valid, probabilities_one_valid) 
result_table.loc['without balance', [('LogisticRegression', 'F1'), ('LogisticRegression', 'roc_auc')]] = [f1, roc_auc]

print("F1:", f1)
print("roc_auc:", roc_auc)

F1: 0.48066298342541436
roc_auc: 0.7694887900045246


<hr>

## 2.2 Случайный лес

In [17]:
best_model_rfc = None # Лучшая модель
best_score_rfc = 0 # Оценка лучшей модели 
score_rfc_list = [] # Список оценок на валидационой выборке
# Сделаем перебор циклами
for est in range(1, 15, 3):
    for depth in range(1, 20, 3):
        for leaf in range(2, 30, 3):
            # Создаем модель
            model_rfc = RandomForestClassifier(random_state=42, 
                                               n_estimators=est, 
                                               max_depth=depth, 
                                               max_leaf_nodes=leaf
                                              )
            
            model_rfc.fit(features_train, target_train) # Обучаем модель
            predicted_valid = model_rfc.predict(features_valid)
            m_score = f1_score(target_valid, predicted_valid) # Оценка модели на валидационой выборке
            # Заносим оценки в списки
            score_rfc_list.append(m_score)
            # Если оценка текущей модели больше лучшей сохраненной, то сохраняем текущу.
            if m_score > best_score_rfc :
                best_score_rfc = m_score
                best_model_rfc = model_rfc
probabilities_valid = model_rfc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['without balance', 
                 [('RandomForestClassifier', 'F1'), ('RandomForestClassifier', 'roc_auc')]] = [best_score_rfc, roc_auc]
print('F1:', best_score_rfc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_rfc)

F1: 0.541958041958042
roc_auc: 0.8484744573136626
Лучшая модель: 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=29,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


<hr>

## 2.3 Дерево решений

In [18]:
best_model_dtc = None
best_score_dtc = 0
score_dtc_list = [] 
for depth in range(1, 30, 1):
    for leaf in range(2, 30, 3):
        model_dtc = DecisionTreeClassifier(random_state=42, max_depth=depth, max_leaf_nodes=leaf)
        model_dtc.fit(features_train, target_train) # Обучаем модель
        predicted_valid = model_dtc.predict(features_valid)
        m_score = f1_score(target_valid, predicted_valid)
        score_dtc_list.append(m_score)
        if m_score > best_score_dtc:
            best_score_dtc = m_score
            best_model_dtc = model_dtc
            
probabilities_valid = model_dtc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['without balance', 
                 [('DecisionTreeClassifier', 'F1'), ('DecisionTreeClassifier', 'roc_auc')]] = [best_score_dtc, roc_auc]
print('F1:', best_score_dtc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_dtc)

F1: 0.5917721518987341
roc_auc: 0.8387705878790256
Лучшая модель: 
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=29,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


<hr>

## 2 Вывод



In [19]:
result_table

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,RandomForestClassifier,RandomForestClassifier,DecisionTreeClassifier,DecisionTreeClassifier
Unnamed: 0_level_1,F1,roc_auc,F1,roc_auc,F1,roc_auc
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
without balance,0.480663,0.769489,0.541958,0.848474,0.591772,0.838771
downsampling,0.0,0.0,0.0,0.0,0.0,0.0
upsampling,0.0,0.0,0.0,0.0,0.0,0.0
test,0.0,0.0,0.0,0.0,0.0,0.0


Лучшая модель по метрике F1: Дерево решений (0.59), по метрике AUC-ROC: Случайный лес (0.85). <br>
Логистическая регрессия довольно сильно отстаает по обеим метрикам

# 3. Борьба с дисбалансом

<hr>

## 3.1 Уменьшение выборки
Поробуем обучить модели с понижением выборки методом downsampling

In [20]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=42)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=42)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=42)
    
    return features_downsampled, target_downsampled

down_features_train, down_target_train = downsample(features_train, target_train, 0.33)
print('Баланс классов')
down_target_train.value_counts()

Баланс классов


0    1967
1    1540
Name: exited, dtype: int64

<hr>

**Логистическая регрессия**

In [21]:
model_lr = LogisticRegression(random_state=42, solver='liblinear')
model_lr.fit(down_features_train, down_target_train)
predicted_valid = model_lr.predict(features_valid)
probabilities_valid = model_lr.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
f1 = f1_score(target_valid, predicted_valid)
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['downsampling', [('LogisticRegression', 'F1'), ('LogisticRegression', 'roc_auc')]] = [f1, roc_auc]
print("F1:", f1)
print("roc_auc:", roc_auc)

F1: 0.4901531728665208
roc_auc: 0.7669004977074106


<hr>

**Случайный лес**

In [22]:
best_model_rfc = None # Лучшая модель
best_score_rfc = 0 # Оценка лучшей модели 
score_rfc_list = [] # Список оценок на валидационой выборке
# Сделаем перебор циклами
for est in range(1, 15, 3):
    for depth in range(1, 20, 3):
        for leaf in range(2, 30, 3):
            # Создаем модель
            model_rfc = RandomForestClassifier(random_state=42, 
                                               n_estimators=est, 
                                               max_depth=depth, 
                                               max_leaf_nodes=leaf
                                              )
            
            model_rfc.fit(down_features_train, down_target_train) # Обучаем модель
            predicted_valid = model_rfc.predict(features_valid)
            m_score = f1_score(target_valid, predicted_valid) # Оценка модели на валидационой выборке
            # Заносим оценки в списки
            score_rfc_list.append(m_score)
            # Если оценка текущей модели больше лучшей сохраненной, то сохраняем текущу.
            if m_score > best_score_rfc :
                best_score_rfc = m_score
                best_model_rfc = model_rfc
probabilities_valid = model_rfc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['downsampling', 
                 [('RandomForestClassifier', 'F1'), ('RandomForestClassifier', 'roc_auc')]] = [best_score_rfc, roc_auc]
print('F1:', best_score_rfc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_rfc)

F1: 0.6227106227106226
roc_auc: 0.8476897754438218
Лучшая модель: 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=23,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=13,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


<hr>

**Дерево решений**

In [23]:
best_model_dtc = None
best_score_dtc = 0
score_dtc_list = [] 
for depth in range(1, 30, 1):
    for leaf in range(2, 30, 3):
        model_dtc = DecisionTreeClassifier(random_state=42, max_depth=depth, max_leaf_nodes=leaf)
        model_dtc.fit(down_features_train, down_target_train) # Обучаем модель
        predicted_valid = model_dtc.predict(features_valid)
        m_score = f1_score(target_valid, predicted_valid)
        score_dtc_list.append(m_score)
        if m_score > best_score_dtc:
            best_score_dtc = m_score
            best_model_dtc = model_dtc
            
probabilities_valid = model_dtc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['downsampling', 
                 [('DecisionTreeClassifier', 'F1'), ('DecisionTreeClassifier', 'roc_auc')]] = [best_score_dtc, roc_auc]
print('F1:', best_score_dtc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_dtc)

F1: 0.5898004434589801
roc_auc: 0.8369651961081205
Лучшая модель: 
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=29,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


## 3.1 Вывод


In [24]:
result_table

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,RandomForestClassifier,RandomForestClassifier,DecisionTreeClassifier,DecisionTreeClassifier
Unnamed: 0_level_1,F1,roc_auc,F1,roc_auc,F1,roc_auc
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
without balance,0.480663,0.769489,0.541958,0.848474,0.591772,0.838771
downsampling,0.490153,0.7669,0.622711,0.84769,0.5898,0.836965
upsampling,0.0,0.0,0.0,0.0,0.0,0.0
test,0.0,0.0,0.0,0.0,0.0,0.0


Теперь по обоим параметрам F1(0.62), AUC-ROC(0.85) лидирует Случайный лес <br>
Дерево решение даже стало хуже, а Логистическая регрессия немного лучше

<hr>

## 3.2 Увеличение выборки

Используем метод upsampling для достижения баланса классов

In [25]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=42)
    
    return features_upsampled, target_upsampled

up_features_train, up_target_train = upsample(features_train, target_train, 3)
print('Баланс классов')
up_target_train.value_counts()

Баланс классов


0    5960
1    4620
Name: exited, dtype: int64

<hr>

**Логистическая регрессия**

In [26]:
model_lr = LogisticRegression(random_state=42, solver='liblinear')
model_lr.fit(up_features_train, up_target_train)
predicted_valid = model_lr.predict(features_valid)
probabilities_valid = model_lr.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
f1 = f1_score(target_valid, predicted_valid)
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['upsampling', [('LogisticRegression', 'F1'), ('LogisticRegression', 'roc_auc')]] = [f1, roc_auc]
print("F1:", f1)
print("roc_auc:", roc_auc)

F1: 0.5005417118093175
roc_auc: 0.7691645854635234


<hr>

**Случайный лес**

In [27]:
best_model_rfc = None # Лучшая модель
best_score_rfc = 0 # Оценка лучшей модели 
score_rfc_list = [] # Список оценок на валидационой выборке
# Сделаем перебор циклами
for est in range(1, 15, 3):
    for depth in range(1, 20, 3):
        for leaf in range(2, 30, 3):
            # Создаем модель
            model_rfc = RandomForestClassifier(random_state=42, 
                                               n_estimators=est, 
                                               max_depth=depth, 
                                               max_leaf_nodes=leaf
                                              )
            
            model_rfc.fit(up_features_train, up_target_train) # Обучаем модель
            predicted_valid = model_rfc.predict(features_valid)
            m_score = f1_score(target_valid, predicted_valid) # Оценка модели на валидационой выборке
            # Заносим оценки в списки
            score_rfc_list.append(m_score)
            # Если оценка текущей модели больше лучшей сохраненной, то сохраняем текущу.
            if m_score > best_score_rfc :
                best_score_rfc = m_score
                best_model_rfc = model_rfc
probabilities_valid = model_rfc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['upsampling', 
                 [('RandomForestClassifier', 'F1'), ('RandomForestClassifier', 'roc_auc')]] = [best_score_rfc, roc_auc]
print('F1:', best_score_rfc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_rfc)

F1: 0.6342710997442454
roc_auc: 0.8544241450441239
Лучшая модель: 
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='auto', max_leaf_nodes=20,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


<hr>

**Дерево решений**

In [28]:
best_model_dtc = None
best_score_dtc = 0
score_dtc_list = [] 
for depth in range(1, 30, 1):
    for leaf in range(2, 30, 3):
        model_dtc = DecisionTreeClassifier(random_state=42, max_depth=depth, max_leaf_nodes=leaf)
        model_dtc.fit(up_features_train, up_target_train) # Обучаем модель
        predicted_valid = model_dtc.predict(features_valid)
        m_score = f1_score(target_valid, predicted_valid)
        score_dtc_list.append(m_score)
        if m_score > best_score_dtc:
            best_score_dtc = m_score
            best_model_dtc = model_dtc
            
probabilities_valid = model_dtc.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
roc_auc = roc_auc_score(target_valid, probabilities_one_valid)
result_table.loc['upsampling', 
                 [('DecisionTreeClassifier', 'F1'), ('DecisionTreeClassifier', 'roc_auc')]] = [best_score_dtc, roc_auc]
print('F1:', best_score_dtc)
print("roc_auc:", roc_auc)
print('Лучшая модель: \n', best_model_dtc)

F1: 0.6243386243386243
roc_auc: 0.8491576025964864
Лучшая модель: 
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
                       max_features=None, max_leaf_nodes=26,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')


## 3 Вывод

In [29]:
result_table

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,RandomForestClassifier,RandomForestClassifier,DecisionTreeClassifier,DecisionTreeClassifier
Unnamed: 0_level_1,F1,roc_auc,F1,roc_auc,F1,roc_auc
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
without balance,0.480663,0.769489,0.541958,0.848474,0.591772,0.838771
downsampling,0.490153,0.7669,0.622711,0.84769,0.5898,0.836965
upsampling,0.500542,0.769165,0.634271,0.854424,0.624339,0.849158
test,0.0,0.0,0.0,0.0,0.0,0.0


Подобрав аргумент repeat для функции upsample, получили лучшее значение мер F1 = 0.63 и AUC-ROC = 0.85 для модели Случайный лес

# 4. Тестирование модели

<hr>

**Логистическая регрессия**

In [30]:
probabilities_test = model_lr.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]
predicted_test = model_lr.predict(features_test)
f1 = f1_score(target_test, predicted_test)
roc_auc = roc_auc_score(target_test, probabilities_one_test)
result_table.loc['test', [('LogisticRegression', 'F1'), ('LogisticRegression', 'roc_auc')]] = [f1, roc_auc]
print("F1:", f1)
print("roc_auc:", roc_auc)

F1: 0.5049833887043189
roc_auc: 0.7967965536229067


<hr>

**Случайный лес**

In [31]:
probabilities_test = model_rfc.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]
predicted_test = model_rfc.predict(features_test)
roc_auc = roc_auc_score(target_test, probabilities_one_test)
f1 = f1_score(target_test, predicted_test)
result_table.loc['test', [('RandomForestClassifier', 'F1'), ('RandomForestClassifier', 'roc_auc')]] = [f1, roc_auc]
print('F1:', best_score_rfc)
print("roc_auc:", roc_auc)

F1: 0.6342710997442454
roc_auc: 0.8548893855472419


<hr>

**Дерево решений**

In [32]:
probabilities_test = model_dtc.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]
predicted_test = model_dtc.predict(features_test)
roc_auc = roc_auc_score(target_test, probabilities_one_test)
f1 = f1_score(target_test, predicted_test)
result_table.loc['test', [('DecisionTreeClassifier', 'F1'), ('DecisionTreeClassifier', 'roc_auc')]] = [f1, roc_auc]
print('F1:', best_score_dtc)
print("roc_auc:", roc_auc)

F1: 0.6243386243386243
roc_auc: 0.8488161176432483


# Вывод

In [33]:
cm = sns.light_palette("green", as_cmap=True)
s = result_table.style.background_gradient(cmap=cm)
s

Unnamed: 0_level_0,LogisticRegression,LogisticRegression,RandomForestClassifier,RandomForestClassifier,DecisionTreeClassifier,DecisionTreeClassifier
Unnamed: 0_level_1,F1,roc_auc,F1,roc_auc,F1,roc_auc
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
without balance,0.480663,0.769489,0.541958,0.848474,0.591772,0.838771
downsampling,0.490153,0.7669,0.622711,0.84769,0.5898,0.836965
upsampling,0.500542,0.769165,0.634271,0.854424,0.624339,0.849158
test,0.504983,0.796797,0.611511,0.854889,0.617761,0.848816


На тестовой выборке Случайный лес и Дерево решений почти совпадают по метрике F1, но по метрике ROC-AUC и в влиадционной выборке Случайный лес лучше по обеим метрикам.
Поэтому лучшая модель: Случайный лес с параметрами:

In [34]:
model_rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=19, max_features='auto', max_leaf_nodes=29,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=13,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)