In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import itertools

from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Задание 1

Для нашего пайплайна (Case1) поэкспериментировать с разными моделями:
- бустинг;
- логистическая регрессия;

In [2]:
df = pd.read_csv('../ML_в_бизнесе/churn_data.csv')
df.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df, df['Exited'], random_state=0)

In [4]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.column]
    
    
class NumberSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []
        
    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self
    
    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [5]:
categorical_columns = ['Geography', 'Gender', 'Tenure', 'HasCrCard', 'IsActiveMember']
continuous_columns = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']

Теперь нам нужно под каждый признак создать трансформер и объединить их в список.

In [6]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
        ('selector', NumberSelector(key=cont_col)),
        ('scaler', StandardScaler())
    ])
    
    final_transformers.append((cont_col, cont_transformer))
    
    
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

#### LogisticRegression

In [7]:
# обычная логистическая регрессия
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42)),
])

In [8]:
pipeline.fit(X_train, y_train)
preds = pipeline.predict_proba(X_test)[:, 1]

In [9]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.28952195521683305, F-Score=0.510, Precision=0.462, Recall=0.568


In [10]:
roc_auc_med = roc_auc_score(y_test, preds)

In [11]:
total_metrics = pd.DataFrame({'model': ['LogisticRegression'],
                             'threshold': [round(thresholds[ix], 3)],
                 'fscore': [round(fscore[ix], 3)],
                 'precision': [round(precision[ix], 3)],
                 'recall': [round(recall[ix], 3)],
                 'roc_auc': [round(roc_auc_med, 3)]})

In [12]:
total_metrics

Unnamed: 0,model,threshold,fscore,precision,recall,roc_auc
0,LogisticRegression,0.29,0.51,0.462,0.568,0.772


#### XGBClassifier

In [13]:
import xgboost as xgb


pipeline = Pipeline([
    ('features', feats),
    ('classifier', xgb.XGBClassifier(random_state=42)),
])

feats = FeatureUnion(final_transformers)
# обучим наш пайплайн
pipeline.fit(X_train, y_train)

# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]



In [14]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')


Best Threshold=0.37331774830818176, F-Score=0.626, Precision=0.619, Recall=0.633


In [15]:
roc_auc_med = roc_auc_score(y_test, preds)

In [16]:
total_metrics = total_metrics.append({'model': 'XGBClassifier',
                             'threshold': round(thresholds[ix], 3),
                 'fscore': round(fscore[ix], 3),
                 'precision': round(precision[ix], 3),
                 'recall': round(recall[ix], 3),
                 'roc_auc': round(roc_auc_med, 3)}, ignore_index=True)

### Задание 2

Отобрать лучшую модель по метрикам (какая по вашему мнению здесь наиболее подходящая ML-метрика)

In [17]:
total_metrics

Unnamed: 0,model,threshold,fscore,precision,recall,roc_auc
0,LogisticRegression,0.29,0.51,0.462,0.568,0.772
1,XGBClassifier,0.373,0.626,0.619,0.633,0.861


XGBClassifier показал себя лучше. Неплохой баланс между Precision и Recall (хотя сами метрики могли быть и повыше). Вероятно, если перед нами стоит задача остановить тех кто хочет уйти от нас, и при этом никого не спугнуть, то нам важно понимать насколько качественно модель различает классы, а этим занимается Roc_auc. 

### Задание 3

Для отобранной модели (на отложенной выборке) сделать оценку экономической эффективности при тех же вводных, как в вопросе 2:
- 1 доллар на удержание;
- 2 доллара - с каждого правильно классифицированного (True Positive);

In [18]:
# мы уже нашли ранее "оптимальный" порог, когда максимизировали f_score
preds = pipeline.predict_proba(X_test)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)

cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])

In [19]:
TN = cnf_matrix[0][0]
FP = cnf_matrix[0][1]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]


retain_sum = (FP + TP) * 1
income = TP * 2

income - retain_sum

123

### Задание 4

*Провести подбор гиперпараметров лучшей модели по итогам 2-3

In [20]:
from sklearn.model_selection import GridSearchCV

params = {
    'classifier__max_depth': [5,6,7,8], 
    'classifier__min_child_weight':[1,5,10], 
    'classifier__learning_rate': [0.05,0.1, 0.2, 0.3], 
    'classifier__n_estimators':[5,10,20,100]
}

In [21]:
%%time
grid = GridSearchCV(pipeline,
                    param_grid=params,
                    cv=5,
                    refit=False)

search = grid.fit(X_train, y_train)
search.best_params_







































































Wall time: 9min 50s


{'classifier__learning_rate': 0.05,
 'classifier__max_depth': 6,
 'classifier__min_child_weight': 5,
 'classifier__n_estimators': 100}

In [22]:
search.best_params_

{'classifier__learning_rate': 0.05,
 'classifier__max_depth': 6,
 'classifier__min_child_weight': 5,
 'classifier__n_estimators': 100}

In [23]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', xgb.XGBClassifier(learning_rate=0.05, 
                                     max_depth=6, 
                                     n_estimators=100,
                                     min_child_weight=5,
                                     random_state=42)),
])


pipeline.fit(X_train, y_train)

# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]



In [24]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

cnf_matrix = confusion_matrix(y_test, preds > thresholds[ix])

Best Threshold=0.3544279932975769, F-Score=0.646, Precision=0.641, Recall=0.650


### Задание 5

*Еще раз провести оценку экономической эффективности

In [25]:
TN = cnf_matrix[0][0]
FP = cnf_matrix[0][1]
FN = cnf_matrix[1][0]
TP = cnf_matrix[1][1]


retain_sum = (FP + TP) * 1
income = TP * 2

income - retain_sum

145