# Описание кейса «Прогнозирование оттока клиентов»
 
Суть бизнес задачи:
Каждый месяц компания пролонгирует полисы Каско клиентов - физических лиц. Для оптимизации работы со списками на пролонгацию необходимо прогнозировать с какой вероятностью каждый из клиентов пролонгируется и какие факторы на это влияют. В зависимости от этого расставляются приоритеты операторам колл-центра, которые обрабатывают список (осуществляют обзвон клиентов), а также принимаются решения по дополнительной мотивации клиентов к пролонгации.
 
Описание массива:
Дана выборка полисов на пролонгацию в формате CSV-файла с набором полей, характеризующих сам полис, клиента (и его историю страхования) и транспортное средство. 
Полис на пролонгацию – это полис, период действия заканчивается и который нужно пролонгировать.
Прогнозируемая переменная – факт пролонгации полиса «POLICY_IS_RENEWED», где «1» – клиент пролонгировался, «0» - клиент не пролонгировался.
Массив случайным образом разбит на 2 части: 80% данных – тренировочная выборка, 20% данных – тестовая выборка.
 

In [1]:
#Присоединяем библиотеки для работы расчетов - pandas, numpy, matplotlib
#Import libraries to porogram pandas, numpym matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Открываем файл csv содержащий данные программы, импортируем данные в датафрейм pandas
#Open csv file with data, import data to pandas dataframe
url = 'https://github.com/MindSetLib/PythonUsefull/raw/master/BinaryClassificationInsuranceRenewal/InsuranceRenewalDB.csv'
data = pd.read_csv(url, sep=';', encoding='utf-8')

In [3]:
#Исследуем данные - посмотрим на верхние 5 строк
#Explortory data analysis - have a loot at the data
data.head()

Unnamed: 0,DATA_TYPE,POLICY_ID,POLICY_BEGIN_MONTH,POLICY_END_MONTH,POLICY_IS_RENEWED,POLICY_SALES_CHANNEL,POLICY_SALES_CHANNEL_GROUP,POLICY_BRANCH,POLICY_MIN_AGE,POLICY_MIN_DRIVING_EXPERIENCE,...,POLICY_PRV_CLM_GLT_N,CLIENT_HAS_DAGO,CLIENT_HAS_OSAGO,POLICY_COURT_SIGN,CLAIM_AVG_ACC_ST_PRD,POLICY_HAS_COMPLAINTS,POLICY_YEARS_RENEWED_N,POLICY_DEDUCT_VALUE,CLIENT_REGISTRATION_REGION,POLICY_PRICE_CHANGE
0,TRAIN,1,1,1,1,39,1,Москва,51,12,...,N,1,0,0,0,0,0,0.0,Тульская,-1.0
1,TRAIN,2,1,1,1,50,5,Москва,35,7,...,0,1,1,0,0,0,4,0.0,Москва,-0.05
2,TRAIN,3,1,1,1,52,6,Москва,41,6,...,1L,0,0,0,0,0,1,12518.0,Московская,-0.07
3,TRAIN,4,1,1,1,50,5,Москва,36,12,...,0,1,1,0,0,0,6,15000.0,Москва,0.05
4,TRAIN,5,1,1,0,52,6,Санкт-Петербург,42,5,...,N,0,0,0,0,0,0,50000.0,Ленинградская,0.17


In [4]:
# Посмотрим на размер датафрейма
# Let's have a loot at dataframe shape
data.shape

(95352, 30)

In [5]:
# Детальнее взглянем на данные и ключевые статистики по ним
# Lets have a look at details and key statistics
data.describe()

Unnamed: 0,POLICY_ID,POLICY_BEGIN_MONTH,POLICY_END_MONTH,POLICY_IS_RENEWED,POLICY_SALES_CHANNEL,POLICY_SALES_CHANNEL_GROUP,POLICY_MIN_AGE,POLICY_MIN_DRIVING_EXPERIENCE,VEHICLE_ENGINE_POWER,VEHICLE_IN_CREDIT,VEHICLE_SUM_INSURED,CLIENT_HAS_DAGO,CLIENT_HAS_OSAGO,POLICY_COURT_SIGN,POLICY_HAS_COMPLAINTS,POLICY_DEDUCT_VALUE,POLICY_PRICE_CHANGE
count,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0,95352.0
mean,48294.907385,6.88166,6.860779,0.506754,47.393007,5.129614,42.604015,35.075122,153.788627,0.321472,976013.2,0.277771,0.550245,0.000965,0.007446,6036.711308,-0.063786
std,27889.234251,3.401097,3.404167,0.499957,13.986007,1.577412,10.697854,200.723291,53.690138,0.467044,683857.9,0.447902,0.497472,0.031047,0.085969,10457.380954,0.766108
min,1.0,1.0,1.0,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
25%,24132.75,4.0,4.0,0.0,50.0,5.0,34.0,8.0,122.0,0.0,557600.0,0.0,0.0,0.0,0.0,0.0,-0.18
50%,48309.5,7.0,7.0,1.0,52.0,6.0,41.0,14.0,146.0,0.0,806872.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,72444.25,10.0,10.0,1.0,53.0,6.0,50.0,19.0,171.0,1.0,1157020.0,1.0,1.0,0.0,0.0,10000.0,0.05
max,96605.0,12.0,12.0,1.0,63.0,8.0,86.0,2015.0,2000.0,1.0,9449000.0,1.0,1.0,1.0,1.0,120873.0,60.02


In [6]:
#В датафрейме есть два типа данных тестовые и тренировочные. Возмем тренировчные данные, тестовые данные не содержат целевой перменной
#Thare are two types of data - test and train. Lets take train dataset
data_train = data.loc[ data['DATA_TYPE'] == 'TRAIN' ]
data_test = data.loc[ data['DATA_TYPE'] == 'TEST ' ]

In [7]:
#Зададим целевую переменную POLICY_IS_RENEWED - полис был пролонгирован
#Set target variable - Policy was renewed
Y = data_train['POLICY_IS_RENEWED']

In [8]:
#Удалим ID полиса из обучающей выборки
#Remove policy ID from train data
data_train=data_train.drop(('POLICY_ID'), axis=1)

In [9]:
#Удалим целевую переменную из обучающей выборки
#Remove target variable from train data
data_train=data_train.drop(('POLICY_IS_RENEWED'), axis=1)

In [10]:
#Удалим тип данных из обучающей выборки
#Remove data type from train data
data_train=data_train.drop(('DATA_TYPE'), axis=1)

In [11]:
#Преобразуем признак изменения цены полиса в числовую переменную
#Transform POLICY_PRICE_CHANGE to numeric variable
data_train.POLICY_PRICE_CHANGE = pd.to_numeric(data_train.POLICY_PRICE_CHANGE)

In [12]:
#Определим по типу данных категориальные признаки и числовые для дальнейших преобразований
#Find categorical and numerical columns for further transformation
categorical_columns = [c for c in data_train.columns if data_train[c].dtype.name == 'object']
numerical_columns   = [c for c in data_train.columns if data_train[c].dtype.name != 'object']

In [13]:
# Посмотрим на категориальные признаки
# Have a look at categorical variables
data_train[categorical_columns].describe()

Unnamed: 0,POLICY_BRANCH,VEHICLE_MAKE,VEHICLE_MODEL,POLICY_INTERMEDIARY,INSURER_GENDER,POLICY_CLM_N,POLICY_CLM_GLT_N,POLICY_PRV_CLM_N,POLICY_PRV_CLM_GLT_N,CLAIM_AVG_ACC_ST_PRD,POLICY_YEARS_RENEWED_N,CLIENT_REGISTRATION_REGION
count,76407,76407,76407,76407,76407,76407,76407,76407,76407,76407,76407,76407
unique,2,80,525,1321,2,7,7,7,7,639,12,83
top,Москва,Kia,RAV4,N,M,0,0,0,0,0,0,Санкт-Петербург
freq,40168,8754,3301,16337,48307,54298,57688,35335,38137,59753,26336,30717


In [14]:
#Заполним пропущенные значения категориальных признаков самыми популярными значениеми
#Fill fields with not available date by top values
data_describe = data_train.describe(include=[object])
for c in categorical_columns:
    data_train[c] = data_train[c].fillna(data_describe[c]['top'])

In [15]:
# Заполним пропущенные значения числовых признаков медианным значанием
# Fill numerical data train values by median values
data_train = data_train.fillna(data.median(axis=0), axis=0)

In [16]:
# Посмотрим на корреляцию между данными
# Look at variables correlation
data_train.corr()

Unnamed: 0,POLICY_BEGIN_MONTH,POLICY_END_MONTH,POLICY_SALES_CHANNEL,POLICY_SALES_CHANNEL_GROUP,POLICY_MIN_AGE,POLICY_MIN_DRIVING_EXPERIENCE,VEHICLE_ENGINE_POWER,VEHICLE_IN_CREDIT,VEHICLE_SUM_INSURED,CLIENT_HAS_DAGO,CLIENT_HAS_OSAGO,POLICY_COURT_SIGN,POLICY_HAS_COMPLAINTS,POLICY_DEDUCT_VALUE,POLICY_PRICE_CHANGE
POLICY_BEGIN_MONTH,1.0,0.99803,0.004473,0.016706,-0.00742,0.022147,0.01935,0.071432,0.053374,-0.015745,-0.009642,-0.002862,0.002293,-0.057753,-0.001045
POLICY_END_MONTH,0.99803,1.0,0.004714,0.016645,-0.007338,0.022516,0.019782,0.071989,0.054115,-0.016697,-0.010289,-0.002664,0.002605,-0.057774,-0.000923
POLICY_SALES_CHANNEL,0.004473,0.004714,1.0,0.683366,0.027907,0.010776,-0.079471,0.129964,-0.029085,-0.078392,-0.145705,0.002072,0.004939,-0.028384,-0.030716
POLICY_SALES_CHANNEL_GROUP,0.016706,0.016645,0.683366,1.0,0.01636,0.02127,-0.019368,0.172941,0.045989,-0.067206,-0.168341,-0.003868,-0.001025,-0.077879,-0.046947
POLICY_MIN_AGE,-0.00742,-0.007338,0.027907,0.01636,1.0,0.024462,-0.063048,-0.076707,-0.04756,-0.093187,-0.022502,-0.017133,-0.021082,-0.157575,-0.009067
POLICY_MIN_DRIVING_EXPERIENCE,0.022147,0.022516,0.010776,0.02127,0.024462,1.0,-0.000148,0.033575,0.049069,-0.038847,-0.013643,-0.003508,-0.000102,0.042662,-0.006395
VEHICLE_ENGINE_POWER,0.01935,0.019782,-0.079471,-0.019368,-0.063048,-0.000148,1.0,-0.067548,0.777611,0.040625,-0.064448,0.008783,0.005008,-0.033455,0.031338
VEHICLE_IN_CREDIT,0.071432,0.071989,0.129964,0.172941,-0.076707,0.033575,-0.067548,1.0,-0.002353,-0.127672,-0.082482,0.007051,0.003648,-0.113565,-0.041193
VEHICLE_SUM_INSURED,0.053374,0.054115,-0.029085,0.045989,-0.04756,0.049069,0.777611,-0.002353,1.0,-0.012728,-0.1065,0.015542,0.005109,-0.060281,0.007798
CLIENT_HAS_DAGO,-0.015745,-0.016697,-0.078392,-0.067206,-0.093187,-0.038847,0.040625,-0.127672,-0.012728,1.0,0.220065,0.002575,0.01171,0.014427,-0.001053


In [17]:
# Определим переменные с количеством категорий  2 (бинарные) и более двух (многоклассовые)
# Find variables with 2 categories and more than 2
binary_columns    = [c for c in categorical_columns if data_describe[c]['unique'] == 2]
nonbinary_columns = [c for c in categorical_columns if data_describe[c]['unique'] > 2]
print ("Binary: ", binary_columns)
print("Nonbinary: ", nonbinary_columns)

Binary:  ['POLICY_BRANCH', 'INSURER_GENDER']
Nonbinary:  ['VEHICLE_MAKE', 'VEHICLE_MODEL', 'POLICY_INTERMEDIARY', 'POLICY_CLM_N', 'POLICY_CLM_GLT_N', 'POLICY_PRV_CLM_N', 'POLICY_PRV_CLM_GLT_N', 'CLAIM_AVG_ACC_ST_PRD', 'POLICY_YEARS_RENEWED_N', 'CLIENT_REGISTRATION_REGION']


In [18]:
# Заменим значения бинарных категорий признаков на значения 0 или 1
# Change binary categroies to 0 or 1
for c in binary_columns[0:]:
    top = data_describe[c]['top']
    top_items = data_train[c] == top
    data_train.loc[top_items, c] = 0
    data_train.loc[np.logical_not(top_items), c] = 1

In [19]:
# Создадим новый датафрейм с категориальными признаками преобразовынными в dummy переменные 1 и 0 вместо значения каждой категории
# Create new dataframe with categorical variables transformed to dummy variable instead 
data_nonbinary = pd.get_dummies(data_train[nonbinary_columns])
print (data_nonbinary.columns)

Index(['VEHICLE_MAKE_<Пусто>', 'VEHICLE_MAKE_Acura', 'VEHICLE_MAKE_Alfa Romeo',
       'VEHICLE_MAKE_Audi', 'VEHICLE_MAKE_BMW', 'VEHICLE_MAKE_BYD',
       'VEHICLE_MAKE_Bentley', 'VEHICLE_MAKE_Bogdan',
       'VEHICLE_MAKE_Brilliance', 'VEHICLE_MAKE_Cadillac',
       ...
       'CLIENT_REGISTRATION_REGION_Ульяновская',
       'CLIENT_REGISTRATION_REGION_Хабаровский',
       'CLIENT_REGISTRATION_REGION_Хакасия',
       'CLIENT_REGISTRATION_REGION_Ханты-Мансийский Автономный округ - Югра',
       'CLIENT_REGISTRATION_REGION_Челябинская',
       'CLIENT_REGISTRATION_REGION_Чеченская',
       'CLIENT_REGISTRATION_REGION_Чувашская Республика -',
       'CLIENT_REGISTRATION_REGION_Чукотский',
       'CLIENT_REGISTRATION_REGION_Ямало-Ненецкий',
       'CLIENT_REGISTRATION_REGION_Ярославская'],
      dtype='object', length=2688)


In [20]:
# Ряд алгоритмов требует нормализованного пространства признаков. Нормализуем пространство
# Some of ML algorithms need to normalise dataset
data_numerical = data_train[numerical_columns]
data_numerical = (data_numerical - data_numerical.mean()) / data_numerical.std()
data_numerical.describe()

Unnamed: 0,POLICY_BEGIN_MONTH,POLICY_END_MONTH,POLICY_SALES_CHANNEL,POLICY_SALES_CHANNEL_GROUP,POLICY_MIN_AGE,POLICY_MIN_DRIVING_EXPERIENCE,VEHICLE_ENGINE_POWER,VEHICLE_IN_CREDIT,VEHICLE_SUM_INSURED,CLIENT_HAS_DAGO,CLIENT_HAS_OSAGO,POLICY_COURT_SIGN,POLICY_HAS_COMPLAINTS,POLICY_DEDUCT_VALUE,POLICY_PRICE_CHANGE
count,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0,76407.0
mean,3.200282e-15,-4.230603e-15,-7.468457e-16,-1.769577e-15,-9.476716e-17,-1.613131e-16,-1.994084e-15,2.01585e-15,1.856889e-16,-4.318837e-15,3.127659e-15,-2.846885e-15,-3.610073e-15,-5.455403e-16,-1.346274e-14
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.730366,-1.72232,-3.316435,-2.618019,-2.299901,-0.1750199,-2.859027,-0.685923,-1.432033,-0.6178201,-1.105773,-0.03028159,-0.08444466,-0.5775672,-1.246675
25%,-0.8472508,-0.8400286,0.1858454,-0.08244024,-0.8070509,-0.1353427,-0.5736631,-0.685923,-0.6135227,-0.6178201,-1.105773,-0.03028159,-0.08444466,-0.5775672,-0.154811
50%,0.03586461,0.04226298,0.3287956,0.5514544,-0.1539288,-0.1055848,-0.1463188,-0.685923,-0.2502116,-0.6178201,0.9043332,-0.03028159,-0.08444466,-0.5775672,0.0848665
75%,0.91898,0.9245546,0.4002707,0.5514544,0.6857996,-0.08078655,0.318186,1.45787,0.2695888,1.618573,0.9043332,-0.03028159,-0.08444466,0.3805021,0.1514436
max,1.507724,1.512749,1.115022,1.819244,4.044713,9.818677,34.30135,1.45787,12.44006,1.618573,0.9043332,33.02294,11.84192,11.0029,80.004


In [21]:
# Объедним массивы данных и преоброзуем все значения к типу float, количество признаков вырасло с 28 до 2705
# Concatenate data arrays and transform all values to float, number of variables increase from 28 to 2705
data_model = pd.concat((data_numerical, data_train[binary_columns], data_nonbinary), axis=1)
data_model = pd.DataFrame(data_model, dtype=float)
print ("Shape: ",data_model.shape)
print ("Columns: ",data_model.columns)

Shape:  (76407, 2705)
Columns:  Index(['POLICY_BEGIN_MONTH', 'POLICY_END_MONTH', 'POLICY_SALES_CHANNEL',
       'POLICY_SALES_CHANNEL_GROUP', 'POLICY_MIN_AGE',
       'POLICY_MIN_DRIVING_EXPERIENCE', 'VEHICLE_ENGINE_POWER',
       'VEHICLE_IN_CREDIT', 'VEHICLE_SUM_INSURED', 'CLIENT_HAS_DAGO',
       ...
       'CLIENT_REGISTRATION_REGION_Ульяновская',
       'CLIENT_REGISTRATION_REGION_Хабаровский',
       'CLIENT_REGISTRATION_REGION_Хакасия',
       'CLIENT_REGISTRATION_REGION_Ханты-Мансийский Автономный округ - Югра',
       'CLIENT_REGISTRATION_REGION_Челябинская',
       'CLIENT_REGISTRATION_REGION_Чеченская',
       'CLIENT_REGISTRATION_REGION_Чувашская Республика -',
       'CLIENT_REGISTRATION_REGION_Чукотский',
       'CLIENT_REGISTRATION_REGION_Ямало-Ненецкий',
       'CLIENT_REGISTRATION_REGION_Ярославская'],
      dtype='object', length=2705)


In [22]:
# Назовем входные параметры для модели X
# Lets name X input parameters 
X = data_model.copy()
feature_names = X.columns

In [30]:
#Импортируем стандартный набор компопнентов бибилотеки sklearn
#Import components of sklearn library
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [24]:
# Поделим выборку для модели на 2 части - тестовую (30 процентов) - измерение качества и тренировочную (70 процентов) - построение модели.
# Split data to 2 parts - test - measure of quality and train - model building
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 11)

In [25]:
# Построим модель случайного леса, с числом деревьев 100 и фиксированным параметром случайности
# Build model of random forest with number of trees 100 and fixed random state = 11
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_estimators=100, random_state=11)
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=11,
            verbose=0, warm_start=False)

In [26]:
# Рассчитаем различные метрики качества модели
# Calculate different quality metrics of random forest model
print("precision:",metrics.precision_score(y_test, rf.predict(X_test)))
print("recall:",metrics.recall_score(y_test, rf.predict(X_test)))
print("roc_auc:",roc_auc_score(y_test, rf.predict(X_test)))
print("gini:",2*roc_auc_score(y_test, rf.predict(X_test))-1)
print ("accuracy:",accuracy_score(y_test, rf.predict(X_test)))

precision: 0.7013809910641755
recall: 0.8978162911611786
roc_auc: 0.6244788681035358
gini: 0.24895773620707162
accuracy: 0.6951533394407364


In [27]:
# Сортируем метрики качества по степени их убывания
# Sort metrics by metrics importance

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature importances:")
for f, idx in enumerate(indices):
    print("{:2d}. feature '{:5s}' ({:.4f})".format(f + 1, feature_names[idx], importances[idx]))

Feature importances:
 1. feature 'POLICY_PRICE_CHANGE' (0.0824)
 2. feature 'VEHICLE_SUM_INSURED' (0.0623)
 3. feature 'POLICY_MIN_AGE' (0.0566)
 4. feature 'POLICY_MIN_DRIVING_EXPERIENCE' (0.0537)
 5. feature 'VEHICLE_ENGINE_POWER' (0.0439)
 6. feature 'POLICY_BEGIN_MONTH' (0.0378)
 7. feature 'POLICY_END_MONTH' (0.0376)
 8. feature 'POLICY_DEDUCT_VALUE' (0.0251)
 9. feature 'POLICY_SALES_CHANNEL' (0.0248)
10. feature 'INSURER_GENDER' (0.0148)
11. feature 'VEHICLE_IN_CREDIT' (0.0128)
12. feature 'CLIENT_HAS_OSAGO' (0.0126)
13. feature 'POLICY_SALES_CHANNEL_GROUP' (0.0124)
14. feature 'CLIENT_HAS_DAGO' (0.0112)
15. feature 'CLIENT_REGISTRATION_REGION_Санкт-Петербург' (0.0082)
16. feature 'CLIENT_REGISTRATION_REGION_Москва' (0.0078)
17. feature 'POLICY_INTERMEDIARY_N' (0.0074)
18. feature 'POLICY_YEARS_RENEWED_N_1' (0.0072)
19. feature 'POLICY_CLM_GLT_N_0' (0.0070)
20. feature 'CLAIM_AVG_ACC_ST_PRD_0' (0.0067)
21. feature 'POLICY_PRV_CLM_GLT_N_0' (0.0066)
22. feature 'POLICY_BRANCH' (0.

In [28]:
# Построим альтернативную модель с помощью градиентного бустинга над деревьями с тем же количеством деревьев 
# Build alternative model with trees gradient boosting with the same number of trees

from sklearn import ensemble
gbt = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11)
gbt.fit(X_train, y_train)



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=11,
              subsample=1.0, verbose=0, warm_start=False)

In [29]:
# Рассчитаем различные метрики качества модели
# Calculate different quality metrics of gradient boosting model

print("precision:",metrics.precision_score(y_test, gbt.predict(X_test)))
print("recall:",metrics.recall_score(y_test, gbt.predict(X_test)))
print("roc_auc:",roc_auc_score(y_test, gbt.predict(X_test)))
print("gini:",2*roc_auc_score(y_test, gbt.predict(X_test))-1)
print ("accuracy:",accuracy_score(y_test, gbt.predict(X_test)))

precision: 0.7078700675822978
recall: 0.9003812824956673
roc_auc: 0.6348223192897258
gini: 0.26964463857945153
accuracy: 0.7034855821663831
