In [1]:
from sys import path
import pandas as pd
import gc

In [2]:
dest_path = '/home/y_ksenia/NGWL/sber-churning/'

SEED = 42

In [3]:
# !git clone https://Leggerla:githubcbvdjks7@github.com/Leggerla/sber-churning.git
        
# import google.colab
# from pathlib import Path
# # Determine the locations of auxiliary libraries and datasets.
# google.colab.drive.mount("/content/drive")

# # Change this if you created the shortcut in a different location
# path = 'ngwl-predict-customer-churn'
# AUX_DATA_ROOT = Path("/content/drive/My Drive/"+path)
# dest_path = AUX_DATA_ROOT

# assert AUX_DATA_ROOT.is_dir(), "Have you forgot to 'Add a shortcut to Drive'?"

In [4]:
# patching path is dirty trick but acceptable for notebooks

if dest_path not in path:
    path.insert(0, dest_path)
    

pd.set_option("display.max_columns", 999)

In [5]:
%load_ext autoreload
%autoreload 2

from joining_tables import get_train, get_shipments, train_test_split, get_messages

from feature_extraction import FeatureExtractor

In [6]:
addresses = pd.read_csv('./misc/addresses.csv')
fe = FeatureExtractor()

# Feature extraction

In [7]:
train = get_train('./')
train, holdout = train_test_split(train)

In [8]:
extra = train.merge(addresses, on='phone_id', how='left')
orders = fe.collect_orders(extra)
train_features = fe.exract_all(orders)

In [9]:
train_full = train[['phone_id','month','target']].merge(
    train_features, left_on=['phone_id','month'], right_on=['phone_id','month']
)

In [10]:
train_full.to_csv('./train_full.csv')

In [11]:
train_columns = train_full.columns

In [12]:
del extra, orders, train_features

# Model

In [13]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score

from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier, Dataset

In [14]:
n_estimators = 100
learning_rate = 0.01
n_jobs = -1

In [15]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# params_start = {
#     'objective': 'binary',
#     'metric' : ['f1', 'roc_auc'], 
#     'n_estimators': 1000,
#     'learning_rate': 0.01,
#     'n_jobs': -1,
#     'scale_pos_weight':
#     'num_leaves': [55, 100, 200, 300],
#     'min_child_samples': [100,300,500],
#     'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#     'subsample': sp_uniform(loc=0.2, scale=0.8),
#     'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#     'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#     'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# model = LGBMClassifier(n_estimators=n_estimators,
#                        learning_rate=learning_rate,
#                        boosting_type='gbdt')

In [16]:
trains = [
    train_full[train_full['month'] < 4],
    train_full[train_full['month'] == 5],
    train_full[train_full['month'] == 6]
    #     Dataset(data=X_train.values,
    #             label=y_train[train_full['month'] == 6].values)
]
# X_train = train_full.drop(['phone_id','month','target'], axis=1)
# y_train = train_full['target']

In [17]:
del train, train_full

In [18]:
import random
from random import choice

In [19]:
import random
from tqdm import tqdm
from tqdm import trange

random.seed(SEED)

In [20]:
print("Random search start...")
print("")

n_iterations = 10
eval_dict = {}

f1s = []
recalls = []
params = []
for i in range(0, n_iterations):
    param_dist = {
        'objective': 'binary',
        'metric': 'f1',
        'n_estimators': 100,
        'learning_rate': 0.05,
        'n_jobs': -1,
#         'scale_pos_weight': choice([0.2, 0.6, 0.8]),
        'num_leaves': choice([27, 31, 61, 81, 127, 197, 231, 275, 302]),
        'bagging_fraction': choice([0.5, 0.7, 0.8, 0.9]),
        'min_data': choice([300, 400, 450, 500, 550, 650]),
        'is_unbalance': choice([True, False]),
        'max_bin': choice([3, 5, 10, 12, 18, 20, 22]),
        'boosting_type': choice(['gbdt', 'dart']),
        'bagging_freq': choice([3, 9, 11, 15, 17, 23, 31]),
        'max_depth': choice([3, 4, 5, 6, 7, 9, 11]),
        'feature_fraction': choice([0.5, 0.7, 0.8, 0.9]),
        'lambda_l1': choice([0, 10, 20, 30, 40]), }
    
    tr = trains[0]
    print("Cycle {}...".format(i+1))
    for i in tqdm(range(len(trains)-1)):
        params.append(param_dist)
        val = trains[i+1]
        X_train = tr.drop(['phone_id', 'month', 'target'], axis=1).values
        y_train = tr['target'].values

        X_val = val.drop(['phone_id', 'month', 'target'], axis=1).values
        y_val = val['target'].values

        tr_ds = Dataset(
            data=X_train,
            label=y_train,
                        params={'verbose': -1}, free_raw_data=False
        )
        val_ds = Dataset(
            data=X_val,
            label=y_val,
                        params={'verbose': -1}, free_raw_data=False
        )

        gbm = lgb.train(param_dist,
                        tr_ds,
                        num_boost_round=10,
                        valid_sets=val_ds,
                        early_stopping_rounds=5,
                        verbose_eval=5,
                        evals_result=eval_dict)
        # predicting
        y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        y_pred = (y_pred > 0.5).astype(int)
        f1 = f1_score(y_val, y_pred)
        f1s.append(f1)

        recall = recall_score(y_val, y_pred)
        recalls.append(recall)
        
        tr = pd.concat([tr, val], axis=0)

  0%|          | 0/2 [00:00<?, ?it/s]

Random search start...

Cycle 1...




[LightGBM] [Info] Number of positive: 80423, number of negative: 51836
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 462
[LightGBM] [Info] Number of data points in the train set: 132259, number of used features: 118





ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

In [None]:
pd.DataFrame([params, f1s, recalls])

In [None]:
# from copy import deepcopy
# gbm_auc =gbm

In [None]:
feature_importance = {str(k): v 
                      for k,v in zip(train_columns.drop(['phone_id', 'month', 'target']), gbm.feature_importance())}
sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

In [None]:
# import matplotlib.pyplot as plt

# plt.bar(range(len(feature_importance)), list(feature_importance.values()), align='center')
# plt.xticks(range(len(feature_importance)), list(feature_importance.keys()))

In [None]:
lgb.plot_importance(gbm, max_num_features=10)

In [None]:
del tr, X_train, val, X_val, y_train, y_val

# Holdout evaluation

In [None]:
holdout_extra = holdout.merge(addresses, on='phone_id', how='left')

holdout_orders = fe.collect_orders(holdout_extra)
holdout_features = fe.exract_all(holdout_orders)

In [None]:
holdout_full = holdout[['phone_id','month','target']].merge(
    holdout_features, left_on=['phone_id','month'], right_on=['phone_id','month']
)

In [None]:
del holdout_orders, holdout_features, holdout_extra

In [None]:
for col in (set(train_columns) - set(holdout_full.columns)):
    holdout_full[col] = np.nan

In [None]:
holdout_full = holdout_full[train_columns]

In [None]:
del holdout

In [None]:
holdout_full.head()

In [None]:
X_test = holdout_full.drop(['phone_id','month','target'], axis=1)
y_test = holdout_full['target']

In [None]:
logits = gbm.predict(X_test, num_iteration=gbm.best_iteration)
preds = (logits>0.5).astype(int)

In [None]:
f1_score(y_test, preds)

In [None]:
del X_test, y_test, holdout_full, logits

# Evaluate on Kaggle sample

In [None]:
test = pd.read_csv('./sample_submission.csv', sep=';')

In [None]:
test = test.rename(columns={'Id': 'phone_id'})

In [None]:
test['month'] = 8

In [None]:
test_extra = test.merge(addresses, on='phone_id', how='left')

test_orders = fe.collect_orders(test_extra)
test_features = fe.exract_all(test_orders)

In [None]:
del test_extra, test_orders

In [None]:
test_full = test[['phone_id','month']].merge(
    test_features, left_on=['phone_id','month'], right_on=['phone_id','month'], how='left'
)

In [None]:
test_full.head()

In [None]:
for col in (set(train_columns) - set(train_columns)):
    test_full[col] = np.nan

In [None]:
test_full = test_full[train_columns.drop(['target'])]

In [None]:
test_full = test_full.drop(['phone_id','month'], axis=1)

In [None]:
test_logits = gbm.predict(test_full)
test_preds = (test_logits>0.5).astype(int)

In [None]:
len(test_full), len(test)

In [None]:
submit = pd.read_csv('./sample_submission.csv', sep=';')

In [None]:
submit['Predicted'] = test_preds.astype(bool)

In [None]:
submit

In [None]:
submit.to_csv('submission_v6.csv', index=False)