In [1]:
import pandas as pd
pd.set_option('display.max_columns', 250)

import numpy as np

# from scipy.sparse import coo_matrix, csr_matrix, hstack

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scikitplot as skplt

#Imputers
# from impyute.imputation.cs import mice, fast_knn 
from sklearn.impute import SimpleImputer

#Encoders
import category_encoders as ce

from sklearn import metrics
from sklearn.metrics import roc_curve, confusion_matrix


#For pipelines creating
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline, FeatureUnion, _transform_one
from sklearn.preprocessing import FunctionTransformer
from sklearn.externals.joblib import Parallel, delayed

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier


# import itertools
# import pickle


<class 'ModuleNotFoundError'>: No module named 'seaborn'

In [2]:
SEED = 42


In [3]:
class DFFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        # non-optimized default implementation; override when a better
        # method is possible
        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
#             delayed(_transform_one)(trans, weight, X) #original
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())
        return pd.concat(Xs, axis=1, join='inner')

<class 'NameError'>: name 'FeatureUnion' is not defined

In [4]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    Используется для выбора сегментов датафрейма с помощью листа названий признаков
    '''
    
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names]

<class 'NameError'>: name 'BaseEstimator' is not defined

In [5]:
def train_model_cv_cl(X_train, y_train, X_test, params, 
                      encoder = None, 
                      model_type = 'sklearn', eval_metric = 'auc', 
                      sk_model = None, 
                      n_folds = 5,
                      n_estimators = 1000,
                      n_jobs = -1,
                      verbose = 250,
                      early_stopping_rounds = 200
                     ):
    if encoder is not None:
        X_train = encoder.fit_transform(X_train, y_train)
        X_test = encoder.transform(X_test)
    
    # set up scoring parameters for different models
    metrics_dict = {'auc': {'lgb': 'auc',         
                            'cat': 'AUC',
                            'sklearn': metrics.roc_auc_score,
                            'xgb': 'auc'}  
                   }
    cv_scores = []
    result_dict = {}
    pred_test = np.zeros(X_test.shape[0])
    pred_train = np.zeros(X_train.shape[0]) #collect all validation predictions over all folds
    
    fold_method = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = SEED)
    for fold_n, (dev_index, val_index) in enumerate(fold_method.split(X_train, y_train)):
        try: #if data scr or numpy matrices
            X_dev, X_val = X_train[dev_index], X_train[val_index]
            y_dev, y_val = y_train[dev_index], y_train[val_index]
            
        except: #if x data is pandas dataframes
            X_dev, X_val = X_train.iloc[dev_index], X_train.iloc[val_index]
            y_dev, y_val = y_train[dev_index], y_train[val_index]
              
        if model_type == 'sklearn':
            model = sk_model(**params)
            model.fit(X_dev, y_dev)
            pred_y_val = model.predict_proba(X_val)[:, 1]
            pred_y_test = model.predict_proba(X_test)[:, 1]
            
            score = metrics_dict[eval_metric][model_type](y_val, pred_y_val)
            print(f'Fold {fold_n}. {eval_metric}: {score:.6f}.')
            #print('')
            
        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs = n_jobs)
            model.fit(X_dev, y_dev, eval_set = [(X_val, y_val)], 
                      eval_metric = metrics_dict[eval_metric][model_type],
                      verbose=verbose, 
                      early_stopping_rounds=early_stopping_rounds)
            pred_y_val = model.predict_proba(X_val)[:, 1]
            pred_y_test = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
            
        if model_type == 'xgb':
            dev_data = xgb.DMatrix(data = X_dev, label = y_dev)
            valid_data = xgb.DMatrix(data = X_val, label = y_val)            
            watchlist = [(dev_data, 'dev_data'), (valid_data, 'valid_data')]
            
            param = params #outside parameters
            param['eval_metric'] = metrics_dict[eval_metric][model_type] #add parameter from metric_dict
            
            model = xgb.train(dtrain = dev_data, evals=watchlist,
                              num_boost_round = n_estimators, 
                              early_stopping_rounds=early_stopping_rounds, 
                              verbose_eval=verbose, params=params)            
            pred_y_val = model.predict(xgb.DMatrix(X_val), ntree_limit=model.best_ntree_limit)
            pred_y_test = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit)
            
        if model_type == 'cat':
            model = CatBoostClassifier(iterations=n_estimators, eval_metric=metrics_dict[eval_metric][model_type], **params,
                                      loss_function='Logloss', od_wait = early_stopping_rounds)
            model.fit(X_dev, y_dev, eval_set=(X_val, y_val), use_best_model=True, verbose=verbose)
            pred_y_val = model.predict_proba(X_val)[:, 1]
            pred_y_test = model.predict_proba(X_test)[:, 1]
            
        
        pred_train[val_index] += pred_y_val
        pred_test += pred_y_test
        cv_scores.append(metrics_dict[eval_metric]['sklearn'](y_val, pred_y_val))
        
    pred_test /= n_folds
    result_dict['predictions'] = pred_test
    result_dict['overall_val_predictions'] = pred_train
    result_dict['cv_scores'] = cv_scores
    
    print('CV mean score: {0:.6f}, std: {1:.6f}.'.format(np.mean(cv_scores), np.std(cv_scores)))
    
    return result_dict

In [6]:
load data

<class 'ValueError'>: 'data' is a directory, not a regular file.

In [7]:
churn_data_train = pd.read_csv('./Data/kaggle_data/orange_small_churn_train_data.csv')
churn_data_test = pd.read_csv

<class 'FileNotFoundError'>: [Errno 44] No such file or directory: './Data/kaggle_data/orange_small_churn_train_data.csv'

In [8]:
churn_data_train['labels'] = churn_data_train['labels'] > 0 #convert (-1, 1) to (True, False)


<class 'NameError'>: name 'churn_data_train' is not defined

In [9]:
churn_data_test.head()


<class 'NameError'>: name 'churn_data_test' is not defined

In [10]:
print(churn_data_train.shape, churn_data_test.shape)


<class 'NameError'>: name 'churn_data_train' is not defined

In [11]:
churn_data_train.drop(columns = ['ID'], axis = 0, inplace = True)
churn_data_test.drop(columns = ['ID'], axis = 0, inplace = True)

<class 'NameError'>: name 'churn_data_train' is not defined

In [12]:
const_feature = churn_data_train.isna().sum(axis = 0) == len(churn_data_train)
useless_features = const_feature[const_feature == True].index
print('Полностью пустые признаки: \n', useless_features.to_list())

<class 'NameError'>: name 'churn_data_train' is not defined

In [None]:
num_features_names = ['Var{}'.format(i) for i in range(1, 191) if 'Var{}'.format(i) not in useless_features]
cat_features_names = ['Var{}'.format(i) for i in range(191, 231) if 'Var{}'.format(i) not in useless_features]

print('Количество числовых признаков:', len(num_features_names))
print('Количество категориальных признаков:', len(cat_features_names))

In [13]:
num_data = churn_data_train.loc[:, num_features_names]
cat_data = churn_data_train.loc[:, cat_features_names]

<class 'NameError'>: name 'churn_data_train' is not defined

In [14]:
feature_amount = []
for threshold in range(0, 40000, 1000):
    nan_counter = num_data.isna().sum(axis = 0)
    f_amount = len(nan_counter[nan_counter < threshold])
    feature_amount.append(f_amount)

plt.figure(figsize = (10, 5))
plt.plot(range(0, 40000, 1000), feature_amount)
plt.title('Изменение количества числовых признаков в зависимости от фильтрующего порога')
plt.xlabel('Пороговое количество nan объектов')
plt.ylabel('Количество отобранных признаков')
plt.show()

<class 'NameError'>: name 'num_data' is not defined

In [15]:
NUM_THRESHOLD = 18000

In [16]:
nan_count = num_data.isna().sum(axis = 0).to_frame(name = 'count')
useful_num_featues = (nan_count[nan_count['count'] < NUM_THRESHOLD].index).to_list()

print(useful_num_featues)

<class 'NameError'>: name 'num_data' is not defined

In [17]:

print('Итоговое количество отобранных числовых признаков:', len(useful_num_featues))

<class 'NameError'>: name 'useful_num_featues' is not defined

In [18]:
feature_amount = []
for threshold in range(0, 40000, 1000):
    nan_counter = cat_data.isna().sum(axis = 0)
    f_amount = len(nan_counter[nan_counter < threshold])
    feature_amount.append(f_amount)

plt.figure(figsize = (10, 5))
plt.plot(range(0, 40000, 1000), feature_amount)
plt.title('Изменение количества категориальных признаков в зависимости от фильтрующего порога')
plt.xlabel('Пороговое количество not nan объектов')
plt.ylabel('Количество отобранных признаков')
plt.show()

<class 'NameError'>: name 'cat_data' is not defined

In [19]:
useful_cat_featues = cat_features_names


<class 'NameError'>: name 'cat_features_names' is not defined

In [20]:
selected_features = useful_num_featues  + useful_cat_featues
print('Общее число отобранных признаков: ',  len(selected_features))

<class 'NameError'>: name 'useful_num_featues' is not defined

In [21]:
churn_data_test.head()


<class 'NameError'>: name 'churn_data_test' is not defined

In [22]:
X_raw = churn_data_train.drop(columns = ['labels'])
y_raw = churn_data_train['labels']

X_dev_raw, X_val_raw, y_dev, y_val = train_test_split(X_raw, y_raw, stratify = y_raw, 
                                                      test_size = 0.1, random_state = SEED)

y_dev = y_dev.values
y_val = y_val.values

<class 'NameError'>: name 'churn_data_train' is not defined

In [23]:
def fillna_imputer(df):
    '''
    Заменяет Nan значения на строковый маркер 'Nan' и добавляет boolean столбец для каждого признака
    '''
    column_names = df.columns.to_list()
    result = pd.DataFrame()
    for name in column_names:
        result[name] = df[name].fillna(value = 'Nan')
        result[name + '_bool'] = df[name].isna()
    return result.reset_index(drop = True)

bool_fillna_imputer = FunctionTransformer(fillna_imputer, validate=False)

<class 'NameError'>: name 'FunctionTransformer' is not defined

In [24]:
def make_df_from_np(X, column_names):
    '''
    Восстанавливает датафрейм из numpy матрицы после масштабирования данных
    '''
    return pd.DataFrame(X, columns = column_names)

repair_df = FunctionTransformer(make_df_from_np, validate=False)

<class 'NameError'>: name 'FunctionTransformer' is not defined

In [26]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(useful_num_featues)),
    ('imputer', SimpleImputer(strategy = 'mean')),
    ('std_scaler', StandardScaler()),
    ('repair_df', repair_df)
])


# https://ig248.gitlab.io/post/2018-11-21-transformer-factory/
# Как изменять параметры функций внутри пайплайна
num_pipeline = num_pipeline.set_params(repair_df__kw_args={'column_names': useful_num_featues})

<class 'NameError'>: name 'Pipeline' is not defined

In [27]:
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(useful_cat_featues)),
    ('bool_fillna_imputer', bool_fillna_imputer),
    ('label_encoder', ce.CatBoostEncoder(random_state = SEED)) 
])

<class 'NameError'>: name 'Pipeline' is not defined

In [28]:
full_pipeline = DFFeatureUnion(transformer_list = [
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
],
                            n_jobs = -1)

<class 'NameError'>: name 'DFFeatureUnion' is not defined

In [29]:
X_dev = full_pipeline.fit_transform(X_dev_raw, y_dev)
X_val = full_pipeline.transform(X_val_raw)
X_test = full_pipeline.transform(churn_data_test)

print('X_dev shape: {}, X_val shape: {}, X_test shape: {}'.format(X_dev.shape, X_val.shape, X_test.shape))

<class 'NameError'>: name 'full_pipeline' is not defined

In [30]:
def plot_roc_curve(fpr, tpr, label = None):
    plt.plot(fpr, tpr, linewidth = 2, label = label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [31]:
def make_csv_predictions(predictions, path = None):
    df = pd.DataFrame({'ID': range(0, len(predictions)), 'result': predictions})
    if path is not None:
        df.to_csv(path, sep = ',', index = False)
    else:
        return df

In [32]:
naive_result = train_model_cv_cl(X_dev, y_dev, X_test, 
                                 model_type = 'sklearn', eval_metric = 'auc', 
                                 sk_model = LogisticRegression,
                                 params = {'solver': 'lbfgs', 
                                           'max_iter': 1000, 
                                           'n_jobs': -1}) 

<class 'NameError'>: name 'X_dev' is not defined

In [33]:
naive_result.keys()

<class 'NameError'>: name 'naive_result' is not defined

In [34]:
fpr, tpr, thresholds = roc_curve(y_dev, naive_result['overall_val_predictions'])

<class 'NameError'>: name 'roc_curve' is not defined

In [35]:
plot_roc_curve(fpr, tpr)

<class 'NameError'>: name 'fpr' is not defined

In [36]:
skplt.metrics.plot_confusion_matrix(y_dev, np.around(naive_result['overall_val_predictions']), figsize=(7,5))
plt.show()

<class 'NameError'>: name 'skplt' is not defined

In [37]:
make_csv_predictions(naive_result['predictions'], path = './Kaggle_predictions/naive_logistic.csv')


<class 'NameError'>: name 'naive_result' is not defined

In [38]:
default_cat_results = train_model_cv_cl(X_dev, y_dev.astype('int'), X_test, 
                                                        model_type = 'cat', eval_metric = 'auc', 
                                                        params = {}) 

<class 'NameError'>: name 'X_dev' is not defined

In [39]:
make_csv_predictions(default_cat_results['predictions'], path = './Kaggle_predictions/default_catboost.csv')


<class 'NameError'>: name 'default_cat_results' is not defined