## Обучение пайплайна

1. Загрузим данные 
   UCI Dataset - Contraceptive Method Choice Data Set:
   - https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice
2. Соберем пайплайн с препроцессингом
3. Обучим catboost и сохраним на диск предобученный пайплайн


**Импорт библиотек**

In [78]:
import numpy as np
import pandas as pd
import itertools
from matplotlib import pyplot as plt
%matplotlib inline

import dill
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from catboost import CatBoostClassifier
from sklearn.metrics import recall_score, roc_curve, precision_score, roc_auc_score, accuracy_score, f1_score, confusion_matrix, precision_recall_curve,confusion_matrix, classification_report


In [79]:
# names.csv has no header row; every line holds one column label.
names = pd.read_csv("data/names.csv", header=None)

In [80]:
names[::1]

Unnamed: 0,0
0,Wife_age
1,Wife_education
2,Husband_education
3,Children
4,Religion
5,Working
6,Husband_Work
7,Standard-of-living
8,Media_exposure
9,Contraceptive method used


In [81]:
# The first field of every row in `names` is a column label for the data file.
columns = names.iloc[:, 0].tolist()

In [82]:
columns

['Wife_age',
 'Wife_education',
 'Husband_education',
 'Children',
 'Religion',
 'Working',
 'Husband_Work',
 'Standard-of-living',
 'Media_exposure',
 'Contraceptive method used']

In [83]:
# Load the data set; column labels are supplied from `columns`.
df = pd.read_csv("data/data.csv", names=columns)
df.head(7)

Unnamed: 0,Wife_age,Wife_education,Husband_education,Children,Religion,Working,Husband_Work,Standard-of-living,Media_exposure,Contraceptive method used
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1
5,19,4,4,0,1,1,3,3,0,1
6,38,2,3,6,1,1,3,2,0,1


In [84]:
df.shape

(1473, 10)

In [85]:
df.describe()

Unnamed: 0,Wife_age,Wife_education,Husband_education,Children,Religion,Working,Husband_Work,Standard-of-living,Media_exposure,Contraceptive method used
count,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0,1473.0
mean,32.538357,2.958588,3.429735,3.261371,0.850645,0.749491,2.137814,3.133741,0.073999,1.919891
std,8.227245,1.014994,0.816349,2.358549,0.356559,0.433453,0.864857,0.976161,0.261858,0.876376
min,16.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
25%,26.0,2.0,3.0,1.0,1.0,0.0,1.0,3.0,0.0,1.0
50%,32.0,3.0,4.0,3.0,1.0,1.0,2.0,3.0,0.0,2.0
75%,39.0,4.0,4.0,4.0,1.0,1.0,3.0,4.0,0.0,3.0
max,49.0,4.0,4.0,16.0,1.0,1.0,4.0,4.0,1.0,3.0


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1473 entries, 0 to 1472
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Wife_age                   1473 non-null   int64
 1   Wife_education             1473 non-null   int64
 2   Husband_education          1473 non-null   int64
 3   Children                   1473 non-null   int64
 4   Religion                   1473 non-null   int64
 5   Working                    1473 non-null   int64
 6   Husband_Work               1473 non-null   int64
 7   Standard-of-living         1473 non-null   int64
 8   Media_exposure             1473 non-null   int64
 9   Contraceptive method used  1473 non-null   int64
dtypes: int64(10)
memory usage: 115.2 KB


Смотрим баланс таргета. Так как в исходном датасете 3 целевых класса (1=No-use, 2=Long-term, 3=Short-term), объединяем классы 2 и 3 в класс 0, а класс 1 (No-use) оставляем классом 1: нам нужно определить, кто не пользуется контрацептивами (категория 1).

In [87]:
df['Contraceptive method used'].value_counts()

1    629
3    511
2    333
Name: Contraceptive method used, dtype: int64

In [88]:
# Binary label: 1 when the original class is 1 (No-use), 0 for classes 2 and 3.
is_no_use = df['Contraceptive method used'] == 1
df['target'] = is_no_use.astype('int64')

df['target'].value_counts()

0    844
1    629
Name: target, dtype: int64

In [89]:
df.head(7)

Unnamed: 0,Wife_age,Wife_education,Husband_education,Children,Religion,Working,Husband_Work,Standard-of-living,Media_exposure,Contraceptive method used,target
0,24,2,3,3,1,1,2,3,0,1,1
1,45,1,3,10,1,1,3,4,0,1,1
2,43,2,3,7,1,1,3,4,0,1,1
3,42,3,2,9,1,1,3,3,0,1,1
4,36,3,3,8,1,1,3,2,0,1,1
5,19,4,4,0,1,1,3,3,0,1,1
6,38,2,3,6,1,1,3,2,0,1,1


Датасет хорошо подготовлен и не требует дополнительного feature engineering.

Делим данные на трейн и тест.

In [90]:
# Select features and label by name rather than by position, so the raw
# 3-class label cannot slip into the features if the column order changes.
label_columns = ['Contraceptive method used', 'target']
x_data = df.drop(columns=label_columns)
y_data = df['target']

In [91]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.33, random_state=42)

In [92]:
# Write both splits to disk without the row index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
# save test
X_test.to_csv("data/X_test.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)
# save train
X_train.to_csv("data/X_train.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)

In [93]:
# Columns handled as categorical: they are selected without scaling and cast
# to int before reaching the classifier.
cat_feats = ["Wife_education",
 "Husband_education",
 "Religion",
 "Working",
 "Husband_Work",
 "Standard-of-living",
 "Media_exposure"]
# Continuous columns: each one is standardised with StandardScaler.
cont_feats = ["Wife_age", "Children"]

In [101]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column via ``X[key]``.

    Single-bracket indexing is used, so for a pandas DataFrame the result
    is the column itself rather than a one-column frame.
    """
    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the entry of ``X`` stored under ``self.key``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column via ``X[[key]]``.

    Double-bracket indexing is used, so for a pandas DataFrame the result
    stays two-dimensional (a one-column frame). Intended for numeric columns.
    """
    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]

class NumpyToDataFrame(BaseEstimator, TransformerMixin):
    """
    Wrap array-like data in a DataFrame with the given column names and
    cast the categorical columns to int.
    """
    def __init__(self, column_names, cat_feats):
        # Labels assigned to the columns of the incoming data, in order.
        self.column_names = column_names
        # Subset of column_names that is converted to int in transform().
        self.cat_feats = cat_feats

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a labelled DataFrame with int categorical columns."""
        frame = pd.DataFrame(X, columns=self.column_names)
        as_int = frame[self.cat_feats].astype(int)
        frame[self.cat_feats] = as_int
        return frame

In [102]:
# One single-column sub-pipeline per feature, each stored as a (name, pipeline) pair.
# Continuous columns are selected and then standardised.
continuos_transformers = [
    (
        col_name,
        Pipeline([
            ('selector', NumberSelector(key=col_name)),
            ('standard', StandardScaler()),
        ]),
    )
    for col_name in cont_feats
]

# Categorical columns are only selected; no scaling is applied.
cat_transformers = [
    (col_name, Pipeline([('selector', NumberSelector(key=col_name))]))
    for col_name in cat_feats
]

# Converts the stacked array back into a DataFrame. The column order
# (categorical first, then continuous) has to match the order in which the
# per-column transformers are concatenated.
numpy_to_df = Pipeline([
    ('numpy_to_df', NumpyToDataFrame(column_names=cat_feats + cont_feats, cat_feats=cat_feats)),
])
    

In [103]:
# Concatenate the per-column outputs side by side: categorical columns first,
# continuous columns after them.
all_column_transformers = [*cat_transformers, *continuos_transformers]
feats = FeatureUnion(all_column_transformers)

In [104]:
# Full preprocessing: build the feature matrix, then restore a labelled DataFrame.
preprocessing_steps = [
    ('feats', feats),
    ('numpy_df', numpy_to_df),
]
preprocessing = Pipeline(preprocessing_steps)

In [105]:
preprocessing.fit_transform(X_train)

Unnamed: 0,Wife_education,Husband_education,Religion,Working,Husband_Work,Standard-of-living,Media_exposure,Wife_age,Children
0,2,2,1,1,3,4,0,0.251546,1.093753
1,4,4,1,1,3,4,0,0.491764,0.680558
2,4,3,1,1,3,3,0,-1.189759,-0.145834
3,3,3,1,1,1,2,0,0.491764,3.159732
4,3,3,1,1,2,2,0,-1.429977,-1.385421
...,...,...,...,...,...,...,...,...,...
981,2,4,1,0,3,4,0,0.131438,1.093753
982,4,4,1,0,3,4,0,-1.189759,-0.972225
983,3,4,1,1,3,2,0,-0.469106,-0.145834
984,4,4,0,0,2,4,0,0.491764,0.680558


In [106]:
preprocessing.steps

[('feats',
  FeatureUnion(transformer_list=[('Wife_education',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Wife_education'))])),
                                 ('Husband_education',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Husband_education'))])),
                                 ('Religion',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Religion'))])),
                                 ('Working',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Worki...
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='Standard-of-living'))])),
                       

In [107]:
# Final model: preprocessing followed by a CatBoost classifier.
# cat_features is taken from cat_feats instead of repeating the list, so the
# columns CatBoost treats as categorical are always the ones the
# preprocessing step casts to int. A copy is passed so later edits to
# cat_feats cannot change the estimator's parameters.
pipeline = Pipeline([
    ('processing', preprocessing),
    ('classifier', CatBoostClassifier(
        random_state=42,
        n_estimators=50,
        cat_features=list(cat_feats),
    )),
])

In [108]:
%%time
pipeline.fit(X_train, y_train)

Learning rate set to 0.159722
0:	learn: 0.6762695	total: 13.4ms	remaining: 658ms
1:	learn: 0.6614401	total: 33.7ms	remaining: 809ms
2:	learn: 0.6443293	total: 57.6ms	remaining: 902ms
3:	learn: 0.6327944	total: 81.1ms	remaining: 932ms
4:	learn: 0.6188992	total: 101ms	remaining: 908ms
5:	learn: 0.6020831	total: 124ms	remaining: 910ms
6:	learn: 0.5887275	total: 147ms	remaining: 905ms
7:	learn: 0.5795847	total: 170ms	remaining: 893ms
8:	learn: 0.5726788	total: 194ms	remaining: 885ms
9:	learn: 0.5667581	total: 221ms	remaining: 884ms
10:	learn: 0.5627313	total: 237ms	remaining: 840ms
11:	learn: 0.5588972	total: 256ms	remaining: 811ms
12:	learn: 0.5557943	total: 279ms	remaining: 795ms
13:	learn: 0.5518951	total: 303ms	remaining: 780ms
14:	learn: 0.5474694	total: 326ms	remaining: 762ms
15:	learn: 0.5431859	total: 349ms	remaining: 742ms
16:	learn: 0.5411997	total: 374ms	remaining: 727ms
17:	learn: 0.5394373	total: 397ms	remaining: 707ms
18:	learn: 0.5374079	total: 421ms	remaining: 687ms
19:	lea

Pipeline(steps=[('processing',
                 Pipeline(steps=[('feats',
                                  FeatureUnion(transformer_list=[('Wife_education',
                                                                  Pipeline(steps=[('selector',
                                                                                   NumberSelector(key='Wife_education'))])),
                                                                 ('Husband_education',
                                                                  Pipeline(steps=[('selector',
                                                                                   NumberSelector(key='Husband_education'))])),
                                                                 ('Religion',
                                                                  Pipeline(steps=[('selector',
                                                                                   NumberSelector(key='Religion'))])),
                    

In [109]:
# Persist the fitted pipeline. The target directory is created first so the
# cell also works when models/ does not exist yet.
# NOTE: loading a dill file can execute arbitrary code; only load trusted artifacts.
from pathlib import Path

model_path = Path("models/catboost_pipeline.dill")
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, "wb") as f:
    dill.dump(pipeline, f)

In [110]:
# Keep only the second probability column — presumably P(target == 1);
# confirm against the fitted classifier's classes_ order.
y_pred = pipeline.predict_proba(X_test)[:, 1]
y_pred[:10]

array([0.20579507, 0.30951517, 0.28916082, 0.34173639, 0.47908796,
       0.24628383, 0.34570022, 0.4002813 , 0.30015241, 0.91360185])

In [111]:
# Precision and recall at every candidate threshold; `thresholds` is one
# element shorter than `precision` and `recall`.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

# F1 per threshold. Where precision + recall == 0 the plain formula is
# 0/0 = NaN and np.argmax would then select that NaN entry, so F1 is
# defined as 0 at those points instead.
pr_sum = precision + recall
f1 = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# The final precision/recall pair has no matching threshold; leave it out of
# the search so that ix is always a valid index into `thresholds`.
ix = np.argmax(f1[:-1])

roc = roc_auc_score(y_test, y_pred)

print(f'Best threshold: {thresholds[ix]}, F-score: {f1[ix]}, precision: {precision[ix]}, recall: {recall[ix]}, roc_auc = {roc}')

Best threshold: 0.34131486006052486, F-score: 0.6932773109243698, precision: 0.6111111111111112, recall: 0.8009708737864077, roc_auc = 0.7959437515115919
