In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cat

from sklearn.model_selection import StratifiedKFold

In [None]:
# Label encoding for the `region` column: each region name maps to its
# 1-based position in the list below (the codes carry no ordinal meaning).
reg_dic = {
    region_name: code
    for code, region_name in enumerate(
        [
            'SEDHIOU',
            'KEDOUGOU',
            'DIOURBEL',
            'ZIGUINCHOR',
            'KAOLACK',
            'MATAM',
            'DAKAR',
            'THIES',
            'LOUGA',
            'TAMBACOUNDA',
            'FATICK',
            'SAINT-LOUIS',
            'KOLDA',
            'KAFFRINE',
        ],
        start=1,
    )
}
# Label encoding for the `tenure` column: each tenure bucket maps to its
# 1-based position in the list below (the order is not chronological).
tenure_dic = {
    tenure_label: code
    for code, tenure_label in enumerate(
        [
            'G 12-15 month',
            'I 18-21 month',
            'H 15-18 month',
            'F 9-12 month',
            'J 21-24 month',
            'K > 24 month',
            'E 6-9 month',
            'D 3-6 month',
        ],
        start=1,
    )
}

In [None]:
def preparing(df, is_train=True):
    """Turn a raw Train/Test table into the engineered feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Column names are lower-cased before use, so the headers
        may be in any case. The frame passed in is not modified.
    is_train : bool, default True
        When True, the per-``top_pack`` medians computed from ``df`` are
        merged back into the returned frame. When False they are only
        returned, so the caller can merge medians taken from another frame.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The engineered frame, and the table of per-``top_pack`` medians
        computed from ``df`` (one ``<col>_median_top_pack`` column per
        aggregated column, plus ``top_pack``).
    """
    # Rename on a copy so the caller's frame keeps its original headers.
    df = df.rename(columns=str.lower).drop(columns=['mrg'])

    # label encoding; `map` yields numeric codes directly (values missing
    # from the dictionaries become NaN) instead of relying on `replace`
    # downcasting an object column after substitution.
    df["region"] = df["region"].map(reg_dic)
    df["tenure"] = df["tenure"].map(tenure_dic)

    # count columns with NaN values by row
    df['is_null'] = df.isna().sum(axis=1)

    df['balance'] = df['montant'] - df['revenue']

    # interactions of columns with NaN values: for every pair of columns
    # (ordered alphabetically) count how many of the two are missing.
    null_cols = ['region', 'montant', 'frequence_rech', 'revenue', 'data_volume',
                 'on_net', 'zone1', 'zone2', 'freq_top_pack']
    # Compute the missing-value indicators once; cast to int so that adding
    # two indicators counts them instead of OR-ing booleans.
    null_flags = df[null_cols].isna().astype(int)
    for col in null_cols:
        for col2 in null_cols:
            if col < col2:
                df[f"{col}_{col2}_is_null"] = null_flags[col] + null_flags[col2]

    # montant and revenue are linearly dependent, so we can fill NaN's in one column with value in another column
    df['montant'] = df['montant'].fillna(df['revenue'])
    df['revenue'] = df['revenue'].fillna(df['montant'])
    df['frequence'] = df['frequence'].fillna(df['frequence_rech'])
    df['frequence_rech'] = df['frequence_rech'].fillna(df['frequence'])

    # identifier of the fact that revenue is divisible by 10
    # (NaN % 10 is NaN, which compares unequal to 0 and therefore yields 0)
    df['rev_good'] = df['revenue'].mod(10).eq(0).astype(int)

    # sums of columns with activities (calls and data_volume);
    # `sum` skips NaN, so rows with no activity get 0 rather than NaN
    df['other_cnt_calls'] = df[['tigo', 'zone1', 'zone2', 'orange']].sum(axis=1)
    df['cnt_calls'] = df[['on_net', 'tigo', 'zone1', 'zone2', 'orange']].sum(axis=1)
    df['cnt_data_voice'] = df['data_volume'].fillna(0) / 20 + df['cnt_calls']

    df['on_net_part'] = df['on_net'].fillna(0).div(df['cnt_calls'], fill_value=0)

    # aggregation of some features by top_pack (medians are taken after the
    # NaN filling above and before the log transform below)
    median_cols = ['data_volume', 'montant', 'frequence_rech', 'revenue', 'frequence',
                   'on_net', 'regularity']
    agg_by_top_pack = (
        df.groupby('top_pack')[median_cols]
        .median()
        .add_suffix('_median_top_pack')
        .reset_index()
    )
    if is_train:
        df = df.merge(agg_by_top_pack, on='top_pack', how='left')

    # drop useless columns
    df = df.drop(columns=[
        'freq_top_pack_zone1_is_null', 'data_volume_frequence_rech_is_null',
        'frequence_rech_revenue_is_null', 'on_net_zone2_is_null', 'on_net_zone1_is_null',
        'frequence_rech_zone1_is_null', 'freq_top_pack_zone2_is_null',
        'freq_top_pack_montant_is_null', 'freq_top_pack_frequence_rech_is_null',
        'frequence_rech_zone2_is_null', 'arpu_segment',
    ])

    # log1p-transform the listed columns in one assignment
    log_cols = ['data_volume', 'on_net', 'orange', 'freq_top_pack', 'montant', 'revenue',
                'frequence', 'frequence_rech', 'cnt_calls', 'other_cnt_calls', 'cnt_data_voice']
    df[log_cols] = np.log1p(df[log_cols])

    return df, agg_by_top_pack

In [None]:
# Engineer the training features and keep the per-top_pack medians so the
# same values can be merged into the test set later.
train_raw = pd.read_csv('Train.csv')
df, agg_by_top_pack = preparing(train_raw)

# Target column used by every model below.
y = df["churn"]

# One splitter with a fixed seed, reused by each training loop.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
# Model input columns, grouped by origin. The concatenation order below is the
# column order handed to the models.
_base_features = [
    'region', 'tenure', 'montant', 'frequence_rech', 'revenue', 'frequence',
    'data_volume', 'on_net', 'orange', 'tigo', 'zone1', 'zone2', 'regularity',
    'freq_top_pack',
]
_row_level_features = ['is_null', 'balance']
# Pairwise missing-value counts that were kept.
_pair_null_features = [
    'region_revenue_is_null',
    'region_zone1_is_null',
    'region_zone2_is_null',
    'montant_region_is_null',
    'montant_revenue_is_null',
    'montant_on_net_is_null',
    'montant_zone1_is_null',
    'montant_zone2_is_null',
    'frequence_rech_region_is_null',
    'frequence_rech_montant_is_null',
    'frequence_rech_on_net_is_null',
    'revenue_zone1_is_null',
    'revenue_zone2_is_null',
    'data_volume_region_is_null',
    'data_volume_montant_is_null',
    'data_volume_revenue_is_null',
    'data_volume_on_net_is_null',
    'data_volume_zone1_is_null',
    'data_volume_zone2_is_null',
    'data_volume_freq_top_pack_is_null',
    'on_net_region_is_null',
    'on_net_revenue_is_null',
    'zone1_zone2_is_null',
    'freq_top_pack_region_is_null',
    'freq_top_pack_revenue_is_null',
    'freq_top_pack_on_net_is_null',
]
_activity_features = [
    'rev_good', 'other_cnt_calls', 'cnt_calls', 'cnt_data_voice', 'on_net_part',
]
_top_pack_median_features = [
    'data_volume_median_top_pack',
    'montant_median_top_pack',
    'frequence_rech_median_top_pack',
    'revenue_median_top_pack',
    'frequence_median_top_pack',
    'on_net_median_top_pack',
    'regularity_median_top_pack',
]

features = (
    _base_features
    + _row_level_features
    + _pair_null_features
    + _activity_features
    + _top_pack_median_features
)

## LightGBM

In [None]:
# Keyword arguments passed to lgb.LGBMClassifier for every fold.
lgb_params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    n_estimators=1300,
    colsample_bytree=0.7,
    subsample=0.8,
    subsample_freq=5,
    num_leaves=128,
    random_state=123,
    min_child_samples=20,
    lambda_l1=5,
    lambda_l2=5,
)

In [None]:
# Train one LightGBM classifier per cross-validation fold and keep them all
# for averaging at prediction time.
models_lgb = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(df, y)):
    print(f'Fold {fold_n}')
    X_train, X_valid = df.iloc[train_index], df.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model = lgb.LGBMClassifier(**lgb_params)
    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of `fit` were
    # removed in LightGBM 4.0 and raise TypeError there.
    model.fit(
        X_train[features], y_train,
        eval_set=[(X_train[features], y_train), (X_valid[features], y_valid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=90),
            lgb.log_evaluation(period=50),
        ],
    )

    models_lgb.append(model)

## CatBoost

In [None]:
# Training parameters handed to cat.train for every fold.
params_cat = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    bootstrap_type='Bernoulli',
    max_depth=7,
    learning_rate=0.04,
    random_state=1234,
    num_boost_round=1800,
    subsample=0.9,
    border_count=512,
    l2_leaf_reg=5,
)

In [None]:
# Train one CatBoost model per cross-validation fold (same splitter as the
# LightGBM loop) and collect them for averaging at prediction time.
cat_models = []
for fold_n, (train_index, valid_index) in enumerate(folds.split(df, y)):
    print(f'Fold {fold_n}')

    # Restrict to the model features first, then slice out the fold rows.
    X_train = df[features].iloc[train_index]
    X_valid = df[features].iloc[valid_index]
    y_train = y.iloc[train_index]
    y_valid = y.iloc[valid_index]

    train_dataset = cat.Pool(X_train, y_train)
    val_dataset = cat.Pool(X_valid, y_valid)

    # Both pools are passed as evaluation sets so the metric is reported for
    # training and validation data every 50 iterations.
    model = cat.train(
        pool=train_dataset,
        params=params_cat,
        eval_set=[train_dataset, val_dataset],
        early_stopping_rounds=90,
        verbose_eval=50,
    )

    cat_models.append(model)

## Final prediction

In [None]:
# Engineer the test features; the medians computed from the test set itself
# are discarded in favour of the ones derived from the training data.
test_raw = pd.read_csv('Test.csv')
test, _ = preparing(test_raw, is_train=False)
test = test.merge(agg_by_top_pack, how='left', on='top_pack')  # aggregations by top_pack from train dataset

# Start both prediction columns at zero.
test = test.assign(lgb=0, cat=0)

In [None]:
# Average the fold models' outputs. Assigning the mean directly (instead of
# accumulating with `+=` and dividing by a hard-coded 5) keeps this cell
# idempotent on re-run and correct for any number of folds.
test["lgb"] = np.mean(
    [m.predict_proba(test[features])[:, 1] for m in models_lgb], axis=0
)
# NOTE(review): these CatBoost outputs are passed through `sigmoid` in a later
# cell, so they are presumably raw scores rather than probabilities — confirm.
test["cat"] = np.mean(
    [m.predict(test[features]) for m in cat_models], axis=0
)

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of ``x``, element-wise."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

# Squash the averaged CatBoost score through the logistic function, in place.
test['cat'] = test['cat'].pipe(sigmoid)

In [None]:
# Blend the two model families with equal weights and write the submission file.
test["CHURN"] = 0.5 * test["cat"] + 0.5 * test["lgb"]
submission = test[["user_id", "CHURN"]]
submission.to_csv('cat_lgb_5_5.csv', index=None)