In [1]:
import warnings
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation / FutureWarning notices raised by pandas and the
# model libraries used further down.
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

##### Importing Relevant Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, KFold
import re

In [3]:
from matplotlib import pyplot as plt

In [4]:
# Read the four competition CSV files from the current working directory:
# training data, test data, the submission template and the column glossary.
train, test, sample, var = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Train.csv",
        "Test.csv",
        "sample_submission.csv",
        "VariableDefinitions.csv",
    )
)

In [5]:
train.head(2)
train.shape

Unnamed: 0,user_id,REGION,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,ZONE1,ZONE2,MRG,REGULARITY,TOP_PACK,FREQ_TOP_PACK,CHURN
0,dcf68cc2fb515ccad7d8b9b3bd80ee2a4b270063,SAINT-LOUIS,K > 24 month,17000.0,32.0,18000.0,6000.0,34.0,,97.0,355.0,6.0,,,NO,62,All-net 500F=2000F;5d,35.0,0
1,71c44b5ba328db5c4192a80f7cf8f244d9350ed0,,K > 24 month,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,,2.0,NO,40,"Data: 100 F=40MB,24H",22.0,0


(400000, 19)

In [6]:
test.shape

(100000, 18)

In [7]:
train.REGULARITY.nunique()

62

In [8]:
train.isna().sum() / train.shape[0]

user_id           0.000000
REGION            0.393800
TENURE            0.000000
MONTANT           0.350693
FREQUENCE_RECH    0.350693
REVENUE           0.336657
ARPU_SEGMENT      0.336657
FREQUENCE         0.336657
DATA_VOLUME       0.492135
ON_NET            0.364548
ORANGE            0.415793
TIGO              0.598465
ZONE1             0.920775
ZONE2             0.936218
MRG               0.000000
REGULARITY        0.000000
TOP_PACK          0.418322
FREQ_TOP_PACK     0.418322
CHURN             0.000000
dtype: float64

##### Feature Engineering

In [9]:
useless = ['user_id', 'MRG']

In [10]:
# Give rows with a missing REGION their own placeholder category 'REG'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train.REGION`: that form is chained assignment,
# which pandas warns about and which no longer updates the frame when
# Copy-on-Write is active.
train['REGION'] = train['REGION'].fillna('REG')
test['REGION'] = test['REGION'].fillna('REG')

In [11]:
# Give rows with a missing TOP_PACK their own placeholder category 'PACK'.
# Assigning the filled column back (instead of an in-place fill on the
# `train.TOP_PACK` attribute) avoids chained assignment, which is a silent
# no-op under pandas Copy-on-Write.
train['TOP_PACK'] = train['TOP_PACK'].fillna('PACK')
test['TOP_PACK'] = test['TOP_PACK'].fillna('PACK')

In [12]:
# Numeric columns whose missing values are replaced with a sentinel in the next cell.
# NOTE(review): the variable name suggests -999999, but the fill value used is -99999.
minus_999999 = ['MONTANT','FREQUENCE_RECH','REVENUE','ARPU_SEGMENT','FREQUENCE','DATA_VOLUME','ON_NET','ORANGE','FREQ_TOP_PACK','TIGO','ZONE1','ZONE2']

In [13]:
# Replace missing values in the numeric columns with the sentinel -99999.
# Each filled column is assigned back explicitly: `train[col].fillna(...,
# inplace=True)` operates on a temporary Series (chained assignment) and does
# not reliably modify the frame on current pandas versions.
for col in minus_999999:
    train[col] = train[col].fillna(-99999)
    test[col] = test[col].fillna(-99999)

In [14]:
# Remove the columns listed in `useless` from both frames, in place.
for _frame in (train, test):
    _frame.drop(columns=list(useless), inplace=True)

In [15]:
# Stack train on top of test under a fresh 0..n-1 index so that both receive
# identical preprocessing; rows coming from `test` have no CHURN value.
data = pd.concat([train, test], axis=0, ignore_index=True)

In [16]:
def tenure(x):
    """Encode a TENURE bucket label as the lower bound of its range, in months.

    Parameters
    ----------
    x : object
        A TENURE label such as 'K > 24 month' or 'D 3-6 month'.

    Returns
    -------
    int or None
        The smallest number of months covered by the bucket, or None when
        `x` is not one of the known labels.
    """
    # (label, lower bound in months), checked in order with plain equality.
    known_buckets = (
        ('K > 24 month', 24),
        ('H 15-18 month', 15),
        ('G 12-15 month', 12),
        ('J 21-24 month', 21),
        ('I 18-21 month', 18),
        ('E 6-9 month', 6),
        ('F 9-12 month', 9),
        ('D 3-6 month', 3),
    )
    for label, months in known_buckets:
        if x == label:
            return months
    # Unknown label: fall through with no value.
    return None

In [17]:
# Convert every TENURE label to its numeric lower bound (in months) via `tenure`.
data['TENURE'] = data['TENURE'].map(tenure)

In [18]:
categ = ['REGION','TOP_PACK'] 

In [19]:
len(data.REGION.unique()), len(data.TOP_PACK.unique())

(15, 113)

In [20]:
# One-hot encode the categorical columns listed in `categ`, prefixing each
# indicator column with the name of the column it came from.
# The dtype is pinned so the indicators stay 0/1 integers on every pandas
# version: pandas 2.0+ would otherwise produce boolean columns.
data = pd.get_dummies(data, prefix=categ, columns=categ, dtype='uint8')

In [21]:
data.head(3)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,TOP_PACK_VAS(IVR_Radio_Monthly),TOP_PACK_VAS(IVR_Radio_Weekly),TOP_PACK_WIFI_ Family _10MBPS,TOP_PACK_WIFI_ Family _4MBPS,TOP_PACK_WIFI_Family_2MBPS,"TOP_PACK_YMGX 100=1 hour FNF, 24H/1 month",TOP_PACK_Yewouleen_PKG,TOP_PACK_pack_chinguitel_24h,TOP_PACK_pilot_offer5,TOP_PACK_pilot_offer6
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,0,0,0,0,0,0,0,0,0,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,24,1500.0,3.0,1500.0,500.0,3.0,-99999.0,30.0,30.0,-99999.0,...,0,0,0,0,0,0,0,0,0,0


##### Feature Creation

In [22]:
# Derived ratio / difference features, appended in this order.
# NOTE(review): 'income_per_topup_amount' and 'income_to_amount_ratio' use the
# same formula, so the two columns are identical.
# NOTE(review): the inputs were already filled with the -99999 sentinel, so
# rows with missing values go through this arithmetic like any other number.
data = data.assign(
    income_per_topup_amount=lambda d: d['REVENUE'] / d['MONTANT'],
    active_per_duration=lambda d: d['TENURE'] / d['REGULARITY'],
    income_to_amount_ratio=lambda d: d['REVENUE'] / d['MONTANT'],
    income_left_after_top_up=lambda d: d['REVENUE'] - d['MONTANT'],
    tenure_to_topup_frequency=lambda d: d['TENURE'] / d['FREQUENCE_RECH'],
    income_in90days_perRevenue=lambda d: d['ARPU_SEGMENT'] / d['REVENUE'],
)

In [23]:
# Split the combined frame back apart: rows with a CHURN value are training
# rows, rows where CHURN is missing are test rows. Both get a fresh index.
has_label = data['CHURN'].notna()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

In [24]:
# Pull the target out of the training features, and remove the (all-missing)
# CHURN column from the test features as well.
y = train.pop('CHURN')
test.drop(columns=['CHURN'], inplace=True)

In [25]:
test.head(2)
train.shape

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,TOP_PACK_Yewouleen_PKG,TOP_PACK_pack_chinguitel_24h,TOP_PACK_pilot_offer5,TOP_PACK_pilot_offer6,income_per_topup_amount,active_per_duration,income_to_amount_ratio,income_left_after_top_up,tenure_to_topup_frequency,income_in90days_perRevenue
0,24,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,0,0,0,0,1.0,24.0,1.0,0.0,-0.00024,1.0
1,24,-99999.0,-99999.0,10.0,3.0,1.0,-99999.0,-99999.0,-99999.0,-99999.0,...,0,0,0,0,-0.0001,12.0,-0.0001,100009.0,-0.00024,0.3


(400000, 148)

In [26]:
c = [i for i in train.columns if train[i].dtype == object]
len(c)

0

In [27]:
train.shape, y.shape, test.shape

((400000, 148), (400000,), (100000, 148))

##### NOTE
I had already run feature selection and a correlation analysis to determine which features matter for predicting customer churn on this dataset, so the columns listed in the `clust` variable below were not chosen at random.

In [28]:
train.head(5)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,TOP_PACK_Yewouleen_PKG,TOP_PACK_pack_chinguitel_24h,TOP_PACK_pilot_offer5,TOP_PACK_pilot_offer6,income_per_topup_amount,active_per_duration,income_to_amount_ratio,income_left_after_top_up,tenure_to_topup_frequency,income_in90days_perRevenue
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,0,0,0,0,1.058824,0.387097,1.058824,1000.0,0.75,0.333333
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,0,0,0,0,1.029535,0.6,1.029535,127.0,0.827586,0.333409
2,24,1500.0,3.0,1500.0,500.0,3.0,-99999.0,30.0,30.0,-99999.0,...,0,0,0,0,1.0,0.75,1.0,0.0,8.0,0.333333
3,24,1500.0,3.0,2497.0,832.0,4.0,0.0,159.0,45.0,19.0,...,0,0,0,0,1.664667,1.333333,1.664667,997.0,8.0,0.3332
4,24,-99999.0,-99999.0,498.0,166.0,3.0,1.0,1.0,3.0,-99999.0,...,0,0,0,0,-0.00498,0.48,-0.00498,100497.0,-0.00024,0.333333


In [29]:
# Columns that each receive their own 2-cluster KMeans label, stored as
# '<column>_grouped', in the clustering loop further down.
# 'REGION_REG' and 'TOP_PACK_PACK' are the one-hot indicators of the 'REG' and
# 'PACK' placeholders that were substituted for missing REGION / TOP_PACK.
clust = [
'REVENUE',
'MONTANT',
'FREQUENCE_RECH',
'ARPU_SEGMENT',
'FREQUENCE',
'ON_NET',
'ORANGE',
'REGULARITY',
'FREQ_TOP_PACK',
'REGION_REG',
'TOP_PACK_PACK']

The cell below is computationally expensive and only needs to be run if you want to see the plots I used to determine the optimal k for each column with the elbow method. For most columns, the k value I got was 2.

In [34]:
# for j in clust:
#     distortions = []
#     if train[j].dtype == object: # One-Hot encode data if it is categorical
#         dummies = pd.get_dummies(train[j], drop_first=True)
#         for i in range(1, 11):
#             km = KMeans(
#                 n_clusters=i, init='k-means++',
#                 n_init=20, max_iter=300,
#                 tol=1e-04, random_state=0
#             )
#             km.fit(dummies)
#             distortions.append(km.inertia_)

#         # plot
#         print(j, km.inertia_)
#         plt.plot(range(1, 11), distortions, marker='o')
#         plt.xlabel('Number of clusters')
#         plt.ylabel('Distortion')
#         plt.show()
    
#     else:
#         for i in range(1, 11):
#             km = KMeans(
#                 n_clusters=i, init='k-means++',
#                 n_init=20, max_iter=300,
#                 tol=1e-04, random_state=0
#             )
#             km.fit(pd.DataFrame(train[j]))
#             distortions.append(km.inertia_)

#         # plot

#         print(j, km.inertia_)
#         plt.plot(range(1, 11), distortions, marker='o')
#         plt.xlabel('Number of clusters')
#         plt.ylabel('Distortion')
#         plt.show()

In [35]:
# Two-cluster KMeans estimator; the same object is refit once per column in
# the per-column clustering loop.
kmeans = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=20,        # number of centroid initialisations tried per fit
    max_iter=300,
    tol=1e-04,
    random_state=0,   # fixed seed so cluster labels are reproducible
)

In [36]:
%%capture 
#To avoid clumsy cell output

for col in clust:
    if train[col].dtype == object:
        dummies = pd.get_dummies(train[col], drop_first=True)
        test_dum = pd.get_dummies(test[col], drop_first=True)
        kmeans.fit(dummies)
        train[col+'_grouped'] = kmeans.labels_
        test[col+'_grouped'] = kmeans.predict(test_dum)
    else:
        kmeans.fit(pd.DataFrame(train[col]))
        train[col+'_grouped'] = kmeans.labels_
        test[col+'_grouped'] = kmeans.predict(pd.DataFrame(test[col]))

In [37]:
train.shape, test.shape

((400000, 159), (100000, 159))

In [38]:
del data

In [39]:
sum(train.isna().sum())

0

The cell below is computationally expensive and only needs to be run if you want to see the elbow-method plot I used to determine the optimal k when clustering on the whole training frame. Most of the k values I got were 2.

In [36]:
# distortions = []
# for i in range(1, 11):
#     km = KMeans(
#         n_clusters=i, init='k-means++',
#         n_init=20, max_iter=300,
#         tol=1e-04, random_state=0
#     )
#     km.fit(pd.DataFrame(train))
#     distortions.append(km.inertia_)

# # plot
# plt.plot(range(1, 11), distortions, marker='o')
# plt.xlabel('Number of clusters')
# plt.ylabel('Distortion')
# plt.show()

In [37]:
# Second two-cluster KMeans estimator, configured the same way as `kmeans`;
# this one is fit on the whole training frame rather than a single column.
km = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=20,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)

In [38]:
km.fit(train)

KMeans(n_clusters=2, n_init=20, random_state=0)

In [39]:
# 'groups' is the cluster id from the KMeans fitted on the full training frame:
# train rows take the labels produced by the fit, test rows are assigned with
# predict() against the fitted model.
train['groups'] = km.labels_
test['groups'] = km.predict(test)

In [40]:
train.shape, y.shape, test.shape

((400000, 160), (400000,), (100000, 160))

In [41]:
train.head(3)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,FREQUENCE_RECH_grouped,ARPU_SEGMENT_grouped,FREQUENCE_grouped,ON_NET_grouped,ORANGE_grouped,REGULARITY_grouped,FREQ_TOP_PACK_grouped,REGION_REG_grouped,TOP_PACK_PACK_grouped,groups
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,1,1,0,0,0,1,0,1,1,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,1,1,0,0,0,1,0,0,1,0
2,24,1500.0,3.0,1500.0,500.0,3.0,-99999.0,30.0,30.0,-99999.0,...,1,1,0,0,0,1,0,1,1,0


##### Local Validation

In [42]:
# Set aside 1% of the training rows as X_2 / y_2 for a quick local check.
# X_1 / y_1 (the remaining 99%) are not used by the validation cells shown here.
X_1, X_2, y_1, y_2 = train_test_split(train, y, test_size = 0.01, random_state = 0)

In [43]:
# Split that 1% sample again 90/10: X_3 / y_3 is what the validation model is
# fit on, X_4 / y_4 is the part it is scored on.
X_3, X_4, y_3, y_4 = train_test_split(X_2, y_2, test_size = 0.1, random_state = 0)

In [44]:
X_3.shape, y_3.shape

((3600, 160), (3600,))

In [45]:
val_model = CatBoostClassifier(random_seed = 10, n_estimators = 1000)

In [47]:
%%capture 
#In the real sense we want to see how the model's doing but I did this to avoid long training lines on github

val_model.fit(X_3, y_3)

In [48]:
val_pred = val_model.predict_proba(X_4)[:, 1]

In [49]:
log_loss(y_4, val_pred)

0.2440104222463054

A local log_loss score of about 0.244010 seems okay, but note that it was computed on a very small sample of the dataset.

##### Model Training

- The models I used to make my predictions are CatBoost and XGBoost

In [50]:
# Strip every character that is not a letter, digit or underscore from the
# column names of both frames. This is done because the LightGBM (LGBM) model
# would otherwise raise an error saying it can't accept JSON characters in
# column names.
train.columns = train.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)
test.columns = test.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

In [51]:
train.head(2)

Unnamed: 0,TENURE,MONTANT,FREQUENCE_RECH,REVENUE,ARPU_SEGMENT,FREQUENCE,DATA_VOLUME,ON_NET,ORANGE,TIGO,...,FREQUENCE_RECH_grouped,ARPU_SEGMENT_grouped,FREQUENCE_grouped,ON_NET_grouped,ORANGE_grouped,REGULARITY_grouped,FREQ_TOP_PACK_grouped,REGION_REG_grouped,TOP_PACK_PACK_grouped,groups
0,24,17000.0,32.0,18000.0,6000.0,34.0,-99999.0,97.0,355.0,6.0,...,1,1,0,0,0,1,0,1,1,0
1,24,4300.0,29.0,4427.0,1476.0,37.0,1764.0,8.0,3.0,0.0,...,1,1,0,0,0,1,0,0,1,0


25-KFold Catboost model

In [52]:
train.shape, test.shape, y.shape

((400000, 160), (100000, 160), (400000,))

In [53]:
%%capture
#In the real sense we want to see how the model's doing but I did this to avoid long training lines on github

errcb1=[]
y_pred_totcb1=[]
fold=KFold(n_splits=25)#25
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m1  = CatBoostClassifier(n_estimators=1000,eval_metric='Logloss',random_seed= 10, use_best_model=True)
    m1.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    preds=m1.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb1.append((log_loss(y_test,preds)))
    p1 = m1.predict_proba(test)[:,1]
    y_pred_totcb1.append(p1)
    i+=1
np.mean(errcb1)

In [54]:
cat_sub = sample.copy()

In [55]:
# Final CatBoost prediction: element-wise mean of the per-fold test probabilities.
cat_sub['CHURN'] = np.mean(y_pred_totcb1, axis=0)

In [92]:
cat_sub.to_csv('25KFold_Cat.csv', index = False)

25-KFold XGBoost model

In [56]:
%%capture
# Usually we want to see how the model's doing but I did this to avoid long training lines on github

errcb2=[]
y_pred_totcb2=[]
fold=KFold(n_splits=25)#25
i=1
for train_index, test_index in fold.split(train,y):
    print(str(i) + ' iter')
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    m2  = XGBClassifier(n_estimators=1000,random_seed = 10, use_best_model=True, eval_metric = 'logloss')
    m2.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    preds=m2.predict_proba(X_test)[:,1]
    print("err: ",log_loss(y_test,preds))
    errcb2.append((log_loss(y_test,preds)))
    p2 = m2.predict_proba(test)[:,1]
    y_pred_totcb2.append(p2)
    i+=1
np.mean(errcb2)

In [57]:
xgb_sub = sample.copy()

In [58]:
# Final XGBoost prediction: element-wise mean of the per-fold test probabilities.
xgb_sub['CHURN'] = np.mean(y_pred_totcb2, axis=0)

In [95]:
xgb_sub.to_csv('25KFold_Xgb.csv', index = False)

###### Blending of predictions

In [62]:
# Weighted average of the two submissions: 40% XGBoost, 60% CatBoost.
blend = (xgb_sub.CHURN*0.4) + (cat_sub.CHURN*0.6)

In [63]:
blend_sub = sample.copy()

In [64]:
blend_sub.CHURN = blend

In [65]:
blend_sub.to_csv('XGB_CAT.csv', index = False)

Using this blend will get a log_loss score of approximately 0.246675

#### Submission of 0.246643

In [67]:
# Two-stage blend: a 33/67 XGBoost/CatBoost mix is weighted 0.95 and blended
# with another 0.05 of CatBoost. The effective weights work out to roughly
# 0.3135 * XGBoost + 0.6865 * CatBoost.
reblend = ((xgb_sub.CHURN*0.33) + (cat_sub.CHURN*0.67))*0.95 + (cat_sub.CHURN*0.05)

In [66]:
reblend_sub = sample.copy()

In [68]:
reblend_sub.CHURN = reblend

In [69]:
reblend_sub.to_csv('XGB_CAT1.csv', index = False)