In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

##### Importing Relevant Packages

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold,StratifiedKFold
import re

In [3]:
# load datasets
train = pd.read_csv("Train.csv")
test  = pd.read_csv("Test.csv")
sample  = pd.read_csv("sample_submission.csv")
var = pd.read_csv('VariableDefinitions.csv')

In [5]:
train.head(2)
train.shape

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,dcf68cc2fb515ccad7d8b9b3bd80ee2a4b270063,SAINT-LOUIS,K > 24 month,17000.0,32.0,18000.0,6000.0,34.0,,97.0,355.0,6.0,,,NO,62,All-net 500F=2000F;5d,35.0,0
1,71c44b5ba328db5c4192a80f7cf8f244d9350ed0,,K > 24 month,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,,2.0,NO,40,"Data: 100 F=40MB,24H",22.0,0


(400000, 19)

##### Feature Engineering

In [6]:
useless = ['user_id', 'MRG']

In [7]:
train.REGION.fillna('REG',inplace=True)
test.REGION.fillna('REG',inplace=True)

In [8]:
train.TOP_PACK.fillna('PACK',inplace=True)
test.TOP_PACK.fillna('PACK',inplace=True)

In [9]:
minus_999999 = ['MONTANT','FREQUENCE_RECH','REVENUE','ARPU_SEGMENT','FREQUENCE','DATA_VOLUME','ON_NET','ORANGE','FREQ_TOP_PACK','TIGO','ZONE1','ZONE2']

In [10]:
for col in minus_999999:
    train[col].fillna(-99999, inplace=True)
    test[col].fillna(-99999, inplace=True)

In [11]:
train.drop(columns=[i for i in useless], inplace=True)
test.drop(columns=[i for i in useless], inplace=True)

In [12]:
data = pd.concat([train,test],axis=0)
data = data.reset_index(drop=True)

In [13]:
def tenure(x):
    'function to encode the Tenure based on the lowest amount of month spent'
    if x == 'K > 24 month':
        return 24
    elif x == 'H 15-18 month':
        return 15
    elif x == 'G 12-15 month':
        return 12
    elif x == 'J 21-24 month':
        return 21
    elif x == 'I 18-21 month':
        return 18
    elif x == 'E 6-9 month':
        return 6
    elif x == 'F 9-12 month':
        return 9
    elif x == 'D 3-6 month':
        return 3
    else:
        pass

In [14]:
data.TENURE = data.TENURE.apply(tenure)

In [15]:
categ = ['REGION','TOP_PACK'] 

In [2]:
len(data.REGION.unique()), len(data.TOP_PACK.unique())

(15, 61)

In [16]:
data = pd.get_dummies(data, prefix = categ, columns = categ)

In [17]:
data.head(3)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,TOP_PACK_VAS(IVR_Radio_Monthly),TOP_PACK_VAS(IVR_Radio_Weekly),TOP_PACK_WIFI_ Family _10MBPS,TOP_PACK_WIFI_ Family _4MBPS,TOP_PACK_WIFI_Family_2MBPS,"TOP_PACK_YMGX 100=1 hour FNF, 24H/1 month",TOP_PACK_Yewouleen_PKG,TOP_PACK_pack_chinguitel_24h,TOP_PACK_pilot_offer5,TOP_PACK_pilot_offer6
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,24,1500.0,3.0,1500.0,500.0,3.0,-99999.0,30.0,30.0,-99999.0,...,0,0,0,0,0,0,0,0,0,0


##### Feature Creation

In [18]:
data['income_per_topup_amount'] = data['REVENUE'] / data['MONTANT']
data['active_per_duration'] =  data['TENURE'] / data['REGULARITY']
data['income_to_amount_ratio'] = data['REVENUE'] / data['MONTANT']
data['income_left_after_top_up'] = data['REVENUE'] - data['MONTANT']
data['tenure_to_topup_frequency'] = data['TENURE'] / data['FREQUENCE_RECH']
data['income_in90days_perRevenue'] = data['ARPU_SEGMENT'] / data['REVENUE'] 

In [19]:
train=data[data.CHURN.notnull()].reset_index(drop=True)
test=data[data.CHURN.isna()].reset_index(drop=True)

In [20]:
y = train.CHURN
train.drop('CHURN', axis=1, inplace=True)
test.drop('CHURN', axis=1, inplace=True)

In [21]:
test.head(2)
train.shape

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,TOP_PACK_Yewouleen_PKG,TOP_PACK_pack_chinguitel_24h,TOP_PACK_pilot_offer5,TOP_PACK_pilot_offer6,income_per_topup_amount,active_per_duration,income_to_amount_ratio,income_left_after_top_up,tenure_to_topup_frequency,income_in90days_perRevenue
0,24,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,0,0,0,0,1.0,24.0,1.0,0.0,-0.00024,1.0
1,24,-99999.0,-99999.0,10.0,3.0,1.0,-99999.0,-99999.0,-99999.0,-99999.0,...,0,0,0,0,-0.0001,12.0,-0.0001,100009.0,-0.00024,0.3


(400000, 148)

In [23]:
c = [i for i in train.columns if train[i].dtype == object]
len(c)

0

In [24]:
train.shape, y.shape, test.shape

((400000, 148), (400000,), (100000, 148))

##### NOTE
I already did feature selection and correlation to determine what features mattered in predicting customer churn for this dataset. So the clust variable below is not something I randomly chose

In [25]:
clust = [
'REVENUE',
'MONTANT',
'FREQUENCE_RECH',
'ARPU_SEGMENT',
'FREQUENCE',
'ON_NET',
'ORANGE',
'REGULARITY',
'FREQ_TOP_PACK',
'REGION_REG',
'TOP_PACK_PACK']

The cell below is quite computationally expensive and should only be run if you're interested in seeing the plots I used in determining the optimal k values, using the elbow method, also most k values i got were 2.

In [26]:
# for j in clust:
#     distortions = []
#     if train[j].dtype == object:
#         dummies = pd.get_dummies(train[j], drop_first=True)
#         for i in range(1, 11):
#             km = KMeans(
#                 n_clusters=i, init='k-means++',
#                 n_init=20, max_iter=300,
#                 tol=1e-04, random_state=0
#             )
#             km.fit(dummies)
#             distortions.append(km.inertia_)

#         # plot
#         print(j, km.inertia_)
#         plt.plot(range(1, 11), distortions, marker='o')
#         plt.xlabel('Number of clusters')
#         plt.ylabel('Distortion')
#         plt.show()
    
#     else:
#         for i in range(1, 11):
#             km = KMeans(
#                 n_clusters=i, init='k-means++',
#                 n_init=20, max_iter=300,
#                 tol=1e-04, random_state=0
#             )
#             km.fit(pd.DataFrame(train[j]))
#             distortions.append(km.inertia_)

#         # plot

#         print(j, km.inertia_)
#         plt.plot(range(1, 11), distortions, marker='o')
#         plt.xlabel('Number of clusters')
#         plt.ylabel('Distortion')
#         plt.show()

In [27]:
kmeans = KMeans(
        n_clusters=2, init='k-means++',
        n_init=20, max_iter=300,
        tol=1e-04, random_state=0
    )

In [28]:
%%capture #To avoid clumsy cell output

for col in clust:
    if train[col].dtype == object:
        dummies = pd.get_dummies(train[col], drop_first=True)
        test_dum = pd.get_dummies(test[col], drop_first=True)
        kmeans.fit(dummies)
        train[col+'_grouped'] = kmeans.labels_
        test[col+'_grouped'] = kmeans.predict(test_dum)
    else:
        kmeans.fit(pd.DataFrame(train[col]))
        train[col+'_grouped'] = kmeans.labels_
        test[col+'_grouped'] = kmeans.predict(pd.DataFrame(test[col]))

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

KMeans(n_clusters=2, n_init=20, random_state=0)

In [29]:
train.shape, test.shape

((400000, 159), (100000, 159))

In [38]:
del data

In [40]:
sum(train.isna().sum())

0

The cell below is quite computationally expensive and should only be run if you're interested in seeing the plots I used in determining the optimal k values, using the elbow method, also most k values i got were 2.

In [3]:
# distortions = []
# for i in range(1, 11):
#     km = KMeans(
#         n_clusters=i, init='k-means++',
#         n_init=20, max_iter=300,
#         tol=1e-04, random_state=0
#     )
#     km.fit(pd.DataFrame(train))
#     distortions.append(km.inertia_)

# # plot
# plt.plot(range(1, 11), distortions, marker='o')
# plt.xlabel('Number of clusters')
# plt.ylabel('Distortion')
# plt.show()

In [42]:
km = KMeans(
        n_clusters=2, init='k-means++',
        n_init=20, max_iter=300,
        tol=1e-04, random_state=0
    )

In [43]:
km.fit(train)

KMeans(n_clusters=2, n_init=20, random_state=0)

In [44]:
train['groups'] = km.labels_
test['groups'] = km.predict(test)

In [45]:
train.shape, y.shape, test.shape

((400000, 160), (400000,), (100000, 160))

In [46]:
train.head(3)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,FREQUENCE_RECH_grouped,ARPU_SEGMENT_grouped,FREQUENCE_grouped,ON_NET_grouped,ORANGE_grouped,REGULARITY_grouped,FREQ_TOP_PACK_grouped,REGION_REG_grouped,TOP_PACK_PACK_grouped,groups
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,1,1,0,0,0,1,0,1,1,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,1,1,0,0,0,1,0,0,1,0
2,24,1500.0,3.0,1500.0,500.0,3.0,-99999.0,30.0,30.0,-99999.0,...,1,1,0,0,0,1,0,1,1,0


##### Local Validation

In [47]:
X_1, X_2, y_1, y_2 = train_test_split(train, y, test_size = 0.01, random_state = 0)

In [48]:
X_3, X_4, y_3, y_4 = train_test_split(X_2, y_2, test_size = 0.1, random_state = 0)

In [49]:
X_3.shape, y_3.shape

((3600, 160), (3600,))

In [None]:
val_model = CatBoostClassifier(random_seed = 10, n_estimators = 1000)

In [None]:
val_model.fit(X_3, y_3)

In [None]:
val_pred = val_model.predict_proba(X_4)[:, 1]

In [80]:
log_loss(y_4, val_pred)

0.2631137325635713

In [52]:
log_loss(y_4, val_pred)

0.26259819218504604

In [66]:
log_loss(y_4, val_pred)

0.2568614664247297

In [None]:
log_loss(y_4, val_pred)

Local CV log_loss score of about 0.255 seems okay, but caution is that it's just a very sample from our dataset.

##### Model Training

- Models I used in making my preddictions are catboost and xgboost

In [50]:
train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x), inplace=True)
test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x), inplace=True)

### I'm renaming the columns because Lightgbm(LGBM) model would give an error saying it can't accept JSON as column names.

In [84]:
train.head(2)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,FREQUENCE_RECH_grouped,ARPU_SEGMENT_grouped,FREQUENCE_grouped,ON_NET_grouped,ORANGE_grouped,REGULARITY_grouped,FREQ_TOP_PACK_grouped,REGION_REG_grouped,TOP_PACK_PACK_grouped,groups
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,1,1,0,0,0,1,0,1,1,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,1,1,0,0,0,1,0,0,1,0


25-KFold Catboost model

In [85]:
train.shape, test.shape, y.shape

((400000, 160), (100000, 160), (400000,))

In [88]:
errcb1=[]
y_pred_totcb1=[]
fold=KFold(n_splits=25)#25
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m1  = CatBoostClassifier(n_estimators=1000,eval_metric='Logloss',random_seed= 10, use_best_model=True)
    m1.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    preds=m1.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb1.append((log_loss(y_test,preds)))
    p1 = m1.predict_proba(test)[:,1]
    y_pred_totcb1.append(p1)
    i+=1
np.mean(errcb1)

1 iter
Learning rate set to 0.137797
0:	learn: 0.5031452	test: 0.5031452	test1: 0.5022872	best: 0.5022872 (0)	total: 172ms	remaining: 2m 51s
100:	learn: 0.2509054	test: 0.2509054	test1: 0.2480670	best: 0.2480670 (100)	total: 16.3s	remaining: 2m 25s
200:	learn: 0.2491595	test: 0.2491595	test1: 0.2477578	best: 0.2477556 (194)	total: 30.8s	remaining: 2m 2s
300:	learn: 0.2477787	test: 0.2477787	test1: 0.2478270	best: 0.2476115 (216)	total: 50.5s	remaining: 1m 57s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2476115411
bestIteration = 216

Shrink model to first 217 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6b3e48>

err:  0.24761154110800296
2 iter
Learning rate set to 0.137797
0:	learn: 0.5115097	test: 0.5115097	test1: 0.5107237	best: 0.5107237 (0)	total: 223ms	remaining: 3m 42s
100:	learn: 0.2510100	test: 0.2510100	test1: 0.2492923	best: 0.2492473 (86)	total: 23.5s	remaining: 3m 29s
200:	learn: 0.2491564	test: 0.2491564	test1: 0.2490761	best: 0.2490456 (195)	total: 42s	remaining: 2m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2490455764
bestIteration = 195

Shrink model to first 196 iterations.


<catboost.core.CatBoostClassifier at 0x1f187c8dd8>

err:  0.24904557642250227
3 iter
Learning rate set to 0.137797
0:	learn: 0.5130667	test: 0.5130667	test1: 0.5122895	best: 0.5122895 (0)	total: 483ms	remaining: 8m 2s
100:	learn: 0.2509320	test: 0.2509320	test1: 0.2469596	best: 0.2469596 (100)	total: 19.5s	remaining: 2m 53s
200:	learn: 0.2492311	test: 0.2492311	test1: 0.2465169	best: 0.2464827 (189)	total: 35.8s	remaining: 2m 22s
300:	learn: 0.2478480	test: 0.2478480	test1: 0.2463364	best: 0.2463364 (300)	total: 54.2s	remaining: 2m 5s
400:	learn: 0.2466516	test: 0.2466516	test1: 0.2464571	best: 0.2463364 (300)	total: 1m 12s	remaining: 1m 47s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2463363593
bestIteration = 300

Shrink model to first 301 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6b3fd0>

err:  0.24633635927157851
4 iter
Learning rate set to 0.137797
0:	learn: 0.5131187	test: 0.5131187	test1: 0.5139142	best: 0.5139142 (0)	total: 176ms	remaining: 2m 55s
100:	learn: 0.2505563	test: 0.2505563	test1: 0.2542533	best: 0.2542533 (100)	total: 17.8s	remaining: 2m 38s
200:	learn: 0.2489606	test: 0.2489606	test1: 0.2538519	best: 0.2538446 (189)	total: 33.6s	remaining: 2m 13s
300:	learn: 0.2476394	test: 0.2476394	test1: 0.2537160	best: 0.2536825 (284)	total: 49.7s	remaining: 1m 55s
400:	learn: 0.2465250	test: 0.2465250	test1: 0.2535437	best: 0.2535437 (400)	total: 1m 5s	remaining: 1m 37s
500:	learn: 0.2455330	test: 0.2455330	test1: 0.2535398	best: 0.2534809 (450)	total: 1m 21s	remaining: 1m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.253480914
bestIteration = 450

Shrink model to first 451 iterations.


<catboost.core.CatBoostClassifier at 0x1f187c8c18>

err:  0.25348091398596706
5 iter
Learning rate set to 0.137797
0:	learn: 0.5133129	test: 0.5133129	test1: 0.5137363	best: 0.5137363 (0)	total: 208ms	remaining: 3m 27s
100:	learn: 0.2505739	test: 0.2505739	test1: 0.2563574	best: 0.2563537 (98)	total: 19.3s	remaining: 2m 52s
200:	learn: 0.2488776	test: 0.2488776	test1: 0.2559505	best: 0.2559362 (197)	total: 35.1s	remaining: 2m 19s
300:	learn: 0.2475273	test: 0.2475273	test1: 0.2558050	best: 0.2557572 (281)	total: 51s	remaining: 1m 58s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2557571766
bestIteration = 281

Shrink model to first 282 iterations.


<catboost.core.CatBoostClassifier at 0x1f18947940>

err:  0.25575717656899866
6 iter
Learning rate set to 0.137797
0:	learn: 0.5135466	test: 0.5135466	test1: 0.5127704	best: 0.5127704 (0)	total: 198ms	remaining: 3m 17s
100:	learn: 0.2509787	test: 0.2509787	test1: 0.2460229	best: 0.2460033 (98)	total: 19.1s	remaining: 2m 49s
200:	learn: 0.2493619	test: 0.2493619	test1: 0.2459142	best: 0.2458956 (197)	total: 35.1s	remaining: 2m 19s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2458955931
bestIteration = 197

Shrink model to first 198 iterations.


<catboost.core.CatBoostClassifier at 0x1f1048dba8>

err:  0.2458955930874792
7 iter
Learning rate set to 0.137797
0:	learn: 0.5132917	test: 0.5132917	test1: 0.5143951	best: 0.5143951 (0)	total: 176ms	remaining: 2m 55s
100:	learn: 0.2505409	test: 0.2505409	test1: 0.2557104	best: 0.2556461 (93)	total: 18s	remaining: 2m 40s
200:	learn: 0.2488383	test: 0.2488383	test1: 0.2556785	best: 0.2556403 (175)	total: 34.2s	remaining: 2m 15s
300:	learn: 0.2473618	test: 0.2473618	test1: 0.2556672	best: 0.2556386 (239)	total: 50.4s	remaining: 1m 57s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2556385857
bestIteration = 239

Shrink model to first 240 iterations.


<catboost.core.CatBoostClassifier at 0x1f18947fd0>

err:  0.25563858570768716
8 iter
Learning rate set to 0.137797
0:	learn: 0.5094512	test: 0.5094512	test1: 0.5087789	best: 0.5087789 (0)	total: 174ms	remaining: 2m 53s
100:	learn: 0.2507894	test: 0.2507894	test1: 0.2499875	best: 0.2499875 (100)	total: 19s	remaining: 2m 48s
200:	learn: 0.2490361	test: 0.2490361	test1: 0.2497383	best: 0.2496771 (189)	total: 35.1s	remaining: 2m 19s
300:	learn: 0.2475565	test: 0.2475565	test1: 0.2496680	best: 0.2496181 (289)	total: 52.7s	remaining: 2m 2s
400:	learn: 0.2462353	test: 0.2462353	test1: 0.2496787	best: 0.2495454 (365)	total: 1m 9s	remaining: 1m 44s
500:	learn: 0.2451712	test: 0.2451712	test1: 0.2496110	best: 0.2495257 (428)	total: 1m 25s	remaining: 1m 25s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2495256961
bestIteration = 428

Shrink model to first 429 iterations.


<catboost.core.CatBoostClassifier at 0x1f10478a20>

err:  0.24952569612057934
9 iter
Learning rate set to 0.137797
0:	learn: 0.5132557	test: 0.5132557	test1: 0.5140930	best: 0.5140930 (0)	total: 189ms	remaining: 3m 8s
100:	learn: 0.2506546	test: 0.2506546	test1: 0.2556966	best: 0.2556932 (98)	total: 19.2s	remaining: 2m 51s
200:	learn: 0.2489330	test: 0.2489330	test1: 0.2557899	best: 0.2556478 (168)	total: 35.4s	remaining: 2m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2556477681
bestIteration = 168

Shrink model to first 169 iterations.


<catboost.core.CatBoostClassifier at 0x1f5f717438>

err:  0.2556477680958079
10 iter
Learning rate set to 0.137797
0:	learn: 0.5133268	test: 0.5133268	test1: 0.5123333	best: 0.5123333 (0)	total: 201ms	remaining: 3m 20s
100:	learn: 0.2507668	test: 0.2507668	test1: 0.2512847	best: 0.2512601 (97)	total: 19.4s	remaining: 2m 52s
200:	learn: 0.2491240	test: 0.2491240	test1: 0.2512322	best: 0.2511846 (151)	total: 35.3s	remaining: 2m 20s
300:	learn: 0.2476102	test: 0.2476102	test1: 0.2512803	best: 0.2511328 (230)	total: 51.5s	remaining: 1m 59s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2511328012
bestIteration = 230

Shrink model to first 231 iterations.


<catboost.core.CatBoostClassifier at 0x1f5cac5ac8>

err:  0.2511328011938243
11 iter
Learning rate set to 0.137797
0:	learn: 0.5130718	test: 0.5130718	test1: 0.5129074	best: 0.5129074 (0)	total: 247ms	remaining: 4m 6s
100:	learn: 0.2506326	test: 0.2506326	test1: 0.2560889	best: 0.2560889 (100)	total: 19.7s	remaining: 2m 55s
200:	learn: 0.2489224	test: 0.2489224	test1: 0.2560855	best: 0.2559513 (117)	total: 35.7s	remaining: 2m 21s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2559513106
bestIteration = 117

Shrink model to first 118 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d610ef0>

err:  0.2559513106095431
12 iter
Learning rate set to 0.137797
0:	learn: 0.5114239	test: 0.5114239	test1: 0.5118505	best: 0.5118505 (0)	total: 174ms	remaining: 2m 53s
100:	learn: 0.2509055	test: 0.2509055	test1: 0.2508098	best: 0.2508044 (99)	total: 18.6s	remaining: 2m 45s
200:	learn: 0.2491408	test: 0.2491408	test1: 0.2505762	best: 0.2505387 (193)	total: 37.6s	remaining: 2m 29s
300:	learn: 0.2477527	test: 0.2477527	test1: 0.2505370	best: 0.2504863 (293)	total: 53.5s	remaining: 2m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2504862574
bestIteration = 293

Shrink model to first 294 iterations.


<catboost.core.CatBoostClassifier at 0x1f5f717c18>

err:  0.25048625735355523
13 iter
Learning rate set to 0.137797
0:	learn: 0.5122607	test: 0.5122607	test1: 0.5117581	best: 0.5117581 (0)	total: 197ms	remaining: 3m 17s
100:	learn: 0.2509078	test: 0.2509078	test1: 0.2520095	best: 0.2520095 (100)	total: 19.7s	remaining: 2m 55s
200:	learn: 0.2490606	test: 0.2490606	test1: 0.2515989	best: 0.2515630 (196)	total: 37.5s	remaining: 2m 28s
300:	learn: 0.2476551	test: 0.2476551	test1: 0.2515176	best: 0.2514897 (234)	total: 57.3s	remaining: 2m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2514896533
bestIteration = 234

Shrink model to first 235 iterations.


<catboost.core.CatBoostClassifier at 0x1f10bbd7f0>

err:  0.2514896533006486
14 iter
Learning rate set to 0.137797
0:	learn: 0.5111059	test: 0.5111059	test1: 0.5118799	best: 0.5118799 (0)	total: 200ms	remaining: 3m 19s
100:	learn: 0.2507049	test: 0.2507049	test1: 0.2542146	best: 0.2542146 (100)	total: 22.2s	remaining: 3m 17s
200:	learn: 0.2490485	test: 0.2490485	test1: 0.2538988	best: 0.2538307 (170)	total: 38.7s	remaining: 2m 33s
300:	learn: 0.2476994	test: 0.2476994	test1: 0.2538292	best: 0.2537272 (285)	total: 54.6s	remaining: 2m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2537271515
bestIteration = 285

Shrink model to first 286 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6106a0>

err:  0.2537271515064176
15 iter
Learning rate set to 0.137797
0:	learn: 0.5090135	test: 0.5090135	test1: 0.5107425	best: 0.5107425 (0)	total: 193ms	remaining: 3m 12s
100:	learn: 0.2506899	test: 0.2506899	test1: 0.2535321	best: 0.2535321 (100)	total: 19.7s	remaining: 2m 55s
200:	learn: 0.2490424	test: 0.2490424	test1: 0.2532310	best: 0.2532174 (196)	total: 38s	remaining: 2m 30s
300:	learn: 0.2476275	test: 0.2476275	test1: 0.2530607	best: 0.2530607 (300)	total: 56.6s	remaining: 2m 11s
400:	learn: 0.2465262	test: 0.2465262	test1: 0.2530376	best: 0.2529656 (324)	total: 1m 12s	remaining: 1m 48s
500:	learn: 0.2453743	test: 0.2453743	test1: 0.2530277	best: 0.2529504 (438)	total: 1m 28s	remaining: 1m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2529503583
bestIteration = 438

Shrink model to first 439 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d696a58>

err:  0.2529503582563414
16 iter
Learning rate set to 0.137797
0:	learn: 0.5090559	test: 0.5090559	test1: 0.5092456	best: 0.5092456 (0)	total: 169ms	remaining: 2m 48s
100:	learn: 0.2506435	test: 0.2506435	test1: 0.2546314	best: 0.2545798 (84)	total: 18s	remaining: 2m 40s
200:	learn: 0.2488932	test: 0.2488932	test1: 0.2543137	best: 0.2543137 (200)	total: 33.9s	remaining: 2m 14s
300:	learn: 0.2475669	test: 0.2475669	test1: 0.2542039	best: 0.2541008 (277)	total: 50.2s	remaining: 1m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2541008488
bestIteration = 277

Shrink model to first 278 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6bc240>

err:  0.2541008487753667
17 iter
Learning rate set to 0.137797
0:	learn: 0.5113113	test: 0.5113113	test1: 0.5115990	best: 0.5115990 (0)	total: 167ms	remaining: 2m 46s
100:	learn: 0.2507571	test: 0.2507571	test1: 0.2529633	best: 0.2529633 (100)	total: 17.7s	remaining: 2m 37s
200:	learn: 0.2490135	test: 0.2490135	test1: 0.2528746	best: 0.2528605 (194)	total: 33.7s	remaining: 2m 14s
300:	learn: 0.2476895	test: 0.2476895	test1: 0.2527951	best: 0.2527932 (299)	total: 49.5s	remaining: 1m 54s
400:	learn: 0.2464095	test: 0.2464095	test1: 0.2527510	best: 0.2526964 (356)	total: 1m 5s	remaining: 1m 37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2526964017
bestIteration = 356

Shrink model to first 357 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d610860>

err:  0.2526964017448457
18 iter
Learning rate set to 0.137797
0:	learn: 0.5090230	test: 0.5090230	test1: 0.5110932	best: 0.5110932 (0)	total: 153ms	remaining: 2m 33s
100:	learn: 0.2503388	test: 0.2503388	test1: 0.2606138	best: 0.2605976 (95)	total: 18s	remaining: 2m 40s
200:	learn: 0.2486351	test: 0.2486351	test1: 0.2607343	best: 0.2605567 (120)	total: 36.4s	remaining: 2m 24s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2605566848
bestIteration = 120

Shrink model to first 121 iterations.


<catboost.core.CatBoostClassifier at 0x1f5f717c18>

err:  0.2605566848058612
19 iter
Learning rate set to 0.137797
0:	learn: 0.5130179	test: 0.5130179	test1: 0.5120365	best: 0.5120365 (0)	total: 192ms	remaining: 3m 12s
100:	learn: 0.2508398	test: 0.2508398	test1: 0.2496026	best: 0.2495990 (99)	total: 19.1s	remaining: 2m 50s
200:	learn: 0.2490564	test: 0.2490564	test1: 0.2491853	best: 0.2491660 (162)	total: 35.2s	remaining: 2m 20s
300:	learn: 0.2477837	test: 0.2477837	test1: 0.2490078	best: 0.2489813 (282)	total: 51s	remaining: 1m 58s
400:	learn: 0.2466172	test: 0.2466172	test1: 0.2490284	best: 0.2489029 (318)	total: 1m 6s	remaining: 1m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2489028637
bestIteration = 318

Shrink model to first 319 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6140f0>

err:  0.24890286370568707
20 iter
Learning rate set to 0.137797
0:	learn: 0.5093805	test: 0.5093805	test1: 0.5102308	best: 0.5102308 (0)	total: 149ms	remaining: 2m 29s
100:	learn: 0.2506172	test: 0.2506172	test1: 0.2547772	best: 0.2547772 (100)	total: 18.4s	remaining: 2m 43s
200:	learn: 0.2489374	test: 0.2489374	test1: 0.2544763	best: 0.2544212 (152)	total: 37s	remaining: 2m 27s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2544212016
bestIteration = 152

Shrink model to first 153 iterations.


<catboost.core.CatBoostClassifier at 0x1f5cad37f0>

err:  0.25442120160913406
21 iter
Learning rate set to 0.137797
0:	learn: 0.5093094	test: 0.5093094	test1: 0.5084263	best: 0.5084263 (0)	total: 183ms	remaining: 3m 2s
100:	learn: 0.2509447	test: 0.2509447	test1: 0.2454882	best: 0.2454724 (99)	total: 19.7s	remaining: 2m 55s
200:	learn: 0.2492752	test: 0.2492752	test1: 0.2453315	best: 0.2452698 (183)	total: 36s	remaining: 2m 23s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2452697748
bestIteration = 183

Shrink model to first 184 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d696a58>

err:  0.24526977478881662
22 iter
Learning rate set to 0.137797
0:	learn: 0.5091195	test: 0.5091195	test1: 0.5084895	best: 0.5084895 (0)	total: 220ms	remaining: 3m 39s
100:	learn: 0.2509062	test: 0.2509062	test1: 0.2476080	best: 0.2475695 (99)	total: 20.9s	remaining: 3m 5s
200:	learn: 0.2492252	test: 0.2492252	test1: 0.2474258	best: 0.2473521 (188)	total: 37.5s	remaining: 2m 29s
300:	learn: 0.2478645	test: 0.2478645	test1: 0.2472439	best: 0.2472439 (300)	total: 55.9s	remaining: 2m 9s
400:	learn: 0.2466609	test: 0.2466609	test1: 0.2473749	best: 0.2471867 (322)	total: 1m 12s	remaining: 1m 47s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2471867407
bestIteration = 322

Shrink model to first 323 iterations.


<catboost.core.CatBoostClassifier at 0x1f589d1470>

err:  0.24718674065748922
23 iter
Learning rate set to 0.137797
0:	learn: 0.5128012	test: 0.5128012	test1: 0.5125624	best: 0.5125624 (0)	total: 162ms	remaining: 2m 41s
100:	learn: 0.2505028	test: 0.2505028	test1: 0.2574160	best: 0.2574160 (100)	total: 17.9s	remaining: 2m 38s
200:	learn: 0.2488369	test: 0.2488369	test1: 0.2572106	best: 0.2571228 (155)	total: 34.2s	remaining: 2m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2571227547
bestIteration = 155

Shrink model to first 156 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d614470>

err:  0.2571227546896069
24 iter
Learning rate set to 0.137797
0:	learn: 0.5125811	test: 0.5125811	test1: 0.5137962	best: 0.5137962 (0)	total: 180ms	remaining: 3m
100:	learn: 0.2506285	test: 0.2506285	test1: 0.2559610	best: 0.2559610 (100)	total: 19.7s	remaining: 2m 55s
200:	learn: 0.2488817	test: 0.2488817	test1: 0.2560270	best: 0.2559447 (103)	total: 37.6s	remaining: 2m 29s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2559447335
bestIteration = 103

Shrink model to first 104 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6ad6d8>

err:  0.2559447334508798
25 iter
Learning rate set to 0.137797
0:	learn: 0.5127269	test: 0.5127269	test1: 0.5108666	best: 0.5108666 (0)	total: 242ms	remaining: 4m 1s
100:	learn: 0.2508848	test: 0.2508848	test1: 0.2483940	best: 0.2483940 (100)	total: 20.9s	remaining: 3m 6s
200:	learn: 0.2493418	test: 0.2493418	test1: 0.2482886	best: 0.2482032 (146)	total: 37s	remaining: 2m 27s
300:	learn: 0.2478151	test: 0.2478151	test1: 0.2481083	best: 0.2480436 (277)	total: 53.6s	remaining: 2m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2480436365
bestIteration = 277

Shrink model to first 278 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d608f60>

err:  0.24804363650294955


0.2519568953327828

In [89]:
cat_sub = sample.copy()

In [91]:
cat_sub.CHURN = np.mean(y_pred_totcb1, axis=0)

In [92]:
cat_sub.to_csv('25KFold_Cat.csv', index = False)

3-StratKFold Catboost model

In [93]:
train.shape, test.shape, y.shape

((400000, 160), (100000, 160), (400000,))

In [63]:
sc = StandardScaler()

In [105]:
errcb2=[]
y_pred_totcb2=[]
fold = StratifiedKFold(n_splits=3, shuffle= True, random_state= 0)#5
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    X_pred = sc.transform(test)
    
    m2 = CatBoostClassifier(n_estimators=500,eval_metric='Logloss',random_seed= 10, use_best_model=True)
    m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=300,verbose=100)
    preds=m2.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb2.append((log_loss(y_test,preds)))
    p2 = m2.predict_proba(X_pred)[:,1]
    y_pred_totcb2.append(p2)
    i+=1
np.mean(errcb2)

1 iter
Learning rate set to 0.170244
0:	learn: 0.4812967	test: 0.4812967	test1: 0.4806269	best: 0.4806269 (0)	total: 117ms	remaining: 58.2s
100:	learn: 0.2494701	test: 0.2494701	test1: 0.2528859	best: 0.2528859 (100)	total: 12.6s	remaining: 49.8s
200:	learn: 0.2471093	test: 0.2471093	test1: 0.2528639	best: 0.2528156 (154)	total: 25s	remaining: 37.1s
300:	learn: 0.2450455	test: 0.2450455	test1: 0.2529669	best: 0.2528156 (154)	total: 35.9s	remaining: 23.8s
400:	learn: 0.2430554	test: 0.2430554	test1: 0.2531818	best: 0.2528156 (154)	total: 47.9s	remaining: 11.8s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2528155906
bestIteration = 154

Shrink model to first 155 iterations.


<catboost.core.CatBoostClassifier at 0x1f103cfb70>

err:  0.2528155906037959
2 iter
Learning rate set to 0.170244
0:	learn: 0.4784894	test: 0.4784894	test1: 0.4789028	best: 0.4789028 (0)	total: 264ms	remaining: 2m 11s
100:	learn: 0.2498448	test: 0.2498448	test1: 0.2522361	best: 0.2522234 (98)	total: 12.9s	remaining: 51.1s
200:	learn: 0.2475281	test: 0.2475281	test1: 0.2521749	best: 0.2521719 (147)	total: 25.5s	remaining: 38s
300:	learn: 0.2456093	test: 0.2456093	test1: 0.2522623	best: 0.2521564 (239)	total: 39.5s	remaining: 26.1s
400:	learn: 0.2437155	test: 0.2437155	test1: 0.2525340	best: 0.2521564 (239)	total: 51.2s	remaining: 12.6s
499:	learn: 0.2421210	test: 0.2421210	test1: 0.2527871	best: 0.2521564 (239)	total: 1m 2s	remaining: 0us

bestTest = 0.2521564189
bestIteration = 239

Shrink model to first 240 iterations.


<catboost.core.CatBoostClassifier at 0x1f10afefd0>

err:  0.2521564188803394
3 iter
Learning rate set to 0.170244
0:	learn: 0.4718606	test: 0.4718606	test1: 0.4721915	best: 0.4721915 (0)	total: 151ms	remaining: 1m 15s
100:	learn: 0.2499391	test: 0.2499391	test1: 0.2522131	best: 0.2522131 (100)	total: 13.1s	remaining: 51.7s
200:	learn: 0.2477287	test: 0.2477287	test1: 0.2520730	best: 0.2520730 (200)	total: 25.3s	remaining: 37.6s
300:	learn: 0.2456626	test: 0.2456626	test1: 0.2522210	best: 0.2520457 (211)	total: 37.1s	remaining: 24.5s
400:	learn: 0.2437305	test: 0.2437305	test1: 0.2524362	best: 0.2520457 (211)	total: 49s	remaining: 12.1s
499:	learn: 0.2420665	test: 0.2420665	test1: 0.2526516	best: 0.2520457 (211)	total: 1m 1s	remaining: 0us

bestTest = 0.2520456985
bestIteration = 211

Shrink model to first 212 iterations.


<catboost.core.CatBoostClassifier at 0x1f0d6ac588>

err:  0.2520456984589125


0.25233923598101593

In [106]:
cat_1_sub = sample.copy()

In [107]:
cat_1_sub.CHURN = np.mean(y_pred_totcb2, axis=0)

In [108]:
#cat_1_sub.to_csv('3STRATKFold_Cat.csv', index = False) # early 100

In [109]:
cat_1_sub.to_csv('3STRATKFold_Cat3scaled.csv', index = False)# early 300

25-KFold Xgbboost

In [92]:
errcb3=[]
y_pred_totcb3=[]
fold=KFold(n_splits=25)#25
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m3  = XGBClassifier(n_estimators=1000,random_seed = 10, use_best_model=True, eval_metric = 'logloss')
    m3.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    preds=m3.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb3.append((log_loss(y_test,preds)))
    p3 = m3.predict_proba(test)[:,1]
    y_pred_totcb3.append(p3)
    i+=1
np.mean(errcb3)

1 iter
[0]	validation_0-logloss:0.52151	validation_1-logloss:0.52123
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24283	validation_1-logloss:0.24865
Stopping. Best iteration:
[53]	validation_0-logloss:0.24612	validation_1-logloss:0.24774



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2477421123352524
2 iter
[0]	validation_0-logloss:0.52149	validation_1-logloss:0.52101
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24201	validation_1-logloss:0.25107
Stopping. Best iteration:
[21]	validation_0-logloss:0.24913	validation_1-logloss:0.25018



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25017494368712506
3 iter
[0]	validation_0-logloss:0.52156	validation_1-logloss:0.52068
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24290	validation_1-logloss:0.24705
Stopping. Best iteration:
[70]	validation_0-logloss:0.24502	validation_1-logloss:0.24694



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.24694448021781137
4 iter
[0]	validation_0-logloss:0.52156	validation_1-logloss:0.52261
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24222	validation_1-logloss:0.25444
Stopping. Best iteration:
[50]	validation_0-logloss:0.24591	validation_1-logloss:0.25396



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2539574174433892
5 iter
[0]	validation_0-logloss:0.52138	validation_1-logloss:0.52232
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24197	validation_1-logloss:0.25692
Stopping. Best iteration:
[63]	validation_0-logloss:0.24472	validation_1-logloss:0.25627



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2562694803016337
6 iter
[0]	validation_0-logloss:0.52166	validation_1-logloss:0.52056
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24284	validation_1-logloss:0.24673
Stopping. Best iteration:
[45]	validation_0-logloss:0.24685	validation_1-logloss:0.24607



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2460720073108737
7 iter
[0]	validation_0-logloss:0.52141	validation_1-logloss:0.52278
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24225	validation_1-logloss:0.25659
Stopping. Best iteration:
[29]	validation_0-logloss:0.24772	validation_1-logloss:0.25584



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2558384589061143
8 iter
[0]	validation_0-logloss:0.52153	validation_1-logloss:0.52077
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24251	validation_1-logloss:0.25110
Stopping. Best iteration:
[33]	validation_0-logloss:0.24756	validation_1-logloss:0.25041



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25041528107444083
9 iter
[0]	validation_0-logloss:0.52153	validation_1-logloss:0.52268
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24212	validation_1-logloss:0.25651
Stopping. Best iteration:
[27]	validation_0-logloss:0.24798	validation_1-logloss:0.25562



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2556176037781206
10 iter
[0]	validation_0-logloss:0.52161	validation_1-logloss:0.52089
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24249	validation_1-logloss:0.25160
Stopping. Best iteration:
[20]	validation_0-logloss:0.24926	validation_1-logloss:0.25100



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25099704589983596
11 iter
[0]	validation_0-logloss:0.52146	validation_1-logloss:0.52209
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24216	validation_1-logloss:0.25754
Stopping. Best iteration:
[32]	validation_0-logloss:0.24735	validation_1-logloss:0.25672



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25671924978490096
12 iter
[0]	validation_0-logloss:0.52154	validation_1-logloss:0.52225
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24232	validation_1-logloss:0.25117
Stopping. Best iteration:
[38]	validation_0-logloss:0.24710	validation_1-logloss:0.25079



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.250789519010606
13 iter
[0]	validation_0-logloss:0.52150	validation_1-logloss:0.52163
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24286	validation_1-logloss:0.25245
Stopping. Best iteration:
[37]	validation_0-logloss:0.24737	validation_1-logloss:0.25183



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2518332332585455
14 iter
[0]	validation_0-logloss:0.52149	validation_1-logloss:0.52247
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24175	validation_1-logloss:0.25468
Stopping. Best iteration:
[48]	validation_0-logloss:0.24573	validation_1-logloss:0.25423



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2542267733505432
15 iter
[0]	validation_0-logloss:0.52153	validation_1-logloss:0.52241
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24224	validation_1-logloss:0.25357
Stopping. Best iteration:
[44]	validation_0-logloss:0.24643	validation_1-logloss:0.25302



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25301954090111123
16 iter
[0]	validation_0-logloss:0.52144	validation_1-logloss:0.52199
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24205	validation_1-logloss:0.25527
Stopping. Best iteration:
[21]	validation_0-logloss:0.24893	validation_1-logloss:0.25475



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25475056091607984
17 iter
[0]	validation_0-logloss:0.52152	validation_1-logloss:0.52217
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24222	validation_1-logloss:0.25441
Stopping. Best iteration:
[26]	validation_0-logloss:0.24813	validation_1-logloss:0.25340



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2534001376785391
18 iter
[0]	validation_0-logloss:0.52137	validation_1-logloss:0.52390
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24177	validation_1-logloss:0.26280
Stopping. Best iteration:
[19]	validation_0-logloss:0.24908	validation_1-logloss:0.26162



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2616204976667068
19 iter
[0]	validation_0-logloss:0.52134	validation_1-logloss:0.52159
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24257	validation_1-logloss:0.25041
Stopping. Best iteration:
[35]	validation_0-logloss:0.24707	validation_1-logloss:0.24968



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.24968241698421342
20 iter
[0]	validation_0-logloss:0.52148	validation_1-logloss:0.52277
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24196	validation_1-logloss:0.25626
Stopping. Best iteration:
[20]	validation_0-logloss:0.24914	validation_1-logloss:0.25532



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2553223284971827
21 iter
[0]	validation_0-logloss:0.52161	validation_1-logloss:0.52046
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24273	validation_1-logloss:0.24692
Stopping. Best iteration:
[32]	validation_0-logloss:0.24799	validation_1-logloss:0.24575



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.24575115844339962
22 iter
[0]	validation_0-logloss:0.52158	validation_1-logloss:0.52060
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24272	validation_1-logloss:0.24808
Stopping. Best iteration:
[25]	validation_0-logloss:0.24863	validation_1-logloss:0.24762



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.247615525015186
23 iter
[0]	validation_0-logloss:0.52127	validation_1-logloss:0.52307
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24184	validation_1-logloss:0.25853
Stopping. Best iteration:
[51]	validation_0-logloss:0.24579	validation_1-logloss:0.25784



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25784438840227925
24 iter
[0]	validation_0-logloss:0.52132	validation_1-logloss:0.52274
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24174	validation_1-logloss:0.25684
Stopping. Best iteration:
[30]	validation_0-logloss:0.24734	validation_1-logloss:0.25625



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2562502689226876
25 iter
[0]	validation_0-logloss:0.52160	validation_1-logloss:0.52038
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24257	validation_1-logloss:0.24899
Stopping. Best iteration:
[54]	validation_0-logloss:0.24593	validation_1-logloss:0.24850



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.24849955794200967


0.2524541595091435

In [93]:
xgb_sub = sample.copy()

In [94]:
xgb_sub.CHURN = np.mean(y_pred_totcb3, axis=0)

In [95]:
xgb_sub.to_csv('25KFold_Xgb.csv', index = False)

3-StratKFold Xgbboost model

In [55]:
train.shape, test.shape, y.shape

((400000, 160), (100000, 160), (400000,))

In [64]:
errcb4=[]
y_pred_totcb4=[]
fold = StratifiedKFold(n_splits=3, shuffle= True, random_state= 0)#5
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    X_pred = sc.transform(test)
    
    m4 = XGBClassifier(n_estimators=1000,eval_metric='logloss',random_seed= 10, use_best_model=True)
    m4.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    preds=m4.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb4.append((log_loss(y_test,preds)))
    p4 = m4.predict_proba(X_pred)[:,1]
    y_pred_totcb4.append(p4)
    i+=1
np.mean(errcb4)

1 iter
[0]	validation_0-logloss:0.52145	validation_1-logloss:0.52172
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.23891	validation_1-logloss:0.25469
Stopping. Best iteration:
[26]	validation_0-logloss:0.24664	validation_1-logloss:0.25354



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25353394341942115
2 iter
[0]	validation_0-logloss:0.52148	validation_1-logloss:0.52206
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.23955	validation_1-logloss:0.25370
Stopping. Best iteration:
[31]	validation_0-logloss:0.24681	validation_1-logloss:0.25280



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.2528070067666714
3 iter
[0]	validation_0-logloss:0.52162	validation_1-logloss:0.52193
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 100 rounds.
[100]	validation_0-logloss:0.24012	validation_1-logloss:0.25353
Stopping. Best iteration:
[28]	validation_0-logloss:0.24707	validation_1-logloss:0.25244



XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, random_seed=10, random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              use_best_model=True, validate_parameters=False, verbosity=None)

err:  0.25243592398959064


0.25292562472522767

In [65]:
xgb_1_sub = sample.copy()

In [66]:
xgb_1_sub.CHURN = np.mean(y_pred_totcb4, axis=0)

In [67]:
xgb_1_sub.to_csv('3STRATKFold_Xgbscaled.csv', index = False)

##### Blending of  predictions

In [103]:
best = pd.read_csv('25KFold_Cat.csv')
sec = pd.read_csv('25KFold_Xgb.csv')

In [104]:
blend = ((sec.CHURN*0.4) + (best.CHURN*0.6))*0.95 + (best.CHURN*0.05)

In [105]:
blend_sub = sample.copy()

In [106]:
blend_sub.CHURN = blend

In [107]:
blend_sub.to_csv('XGB_CAT_blend3.csv', index = False)

In [108]:
best.head(3)

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.783101
1,5335efd940280b82143272275637d1e65d37eadb,0.639718
2,a581f4fa08677c26f83f643248c667e241043086,0.133376


In [109]:
blend_sub.head(3)

Unnamed: 0,user_id,CHURN
0,af900d87e73b7ff6509d2203df4704a98aa5f2a6,0.781986
1,5335efd940280b82143272275637d1e65d37eadb,0.646253
2,a581f4fa08677c26f83f643248c667e241043086,0.127451
