## Predict test dataset

### GBMs + TomekLinks is our choice

In [1]:
import pandas as pd

train_df = pd.read_csv('./all_train.csv')
train_df.columns

Index(['id', 'cons_12m', 'cons_gas_12m', 'cons_last_month',
       'cons_last_month.1', 'forecast_cons_12m', 'forecast_cons_year',
       'forecast_discount_energy', 'forecast_meter_rent_12m',
       'forecast_price_energy_p1', 'forecast_price_energy_p2',
       'forecast_price_pow_p1', 'has_gas', 'imp_cons', 'margin_gross_pow_ele',
       'margin_net_pow_ele', 'nb_prod_act', 'net_margin', 'num_years_antig',
       'pow_max', 'channel0', 'channel1', 'channel2', 'channel3', 'channel4',
       'channel5', 'channel6', 'date_activ', 'date_end', 'date_modif_prod',
       'date_renewal', 'origin0', 'origin1', 'origin2', 'origin3', 'origin4',
       'origin5', 'price_p1_var', 'price_p2_var', 'price_p3_var',
       'price_p1_fix', 'price_p2_fix', 'price_p3_fix', 'churn'],
      dtype='object')

In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import confusion_matrix
import pandas as pd

train_df = pd.read_csv('./all_train.csv')
train_df = train_df.drop(['cons_last_month.1'], axis=1)

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_gbm = train_df.drop(['id'], axis=1)

X = train_gbm.drop(['churn'], axis=1).as_matrix()
y = train_gbm['churn'].as_matrix()

def oversample(X, y, seed):
    tl = TomekLinks()
    X_re, y_re = tl.fit_sample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_re, y_re, test_size=0.3, random_state=seed)
    #X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    #X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    gbm = GradientBoostingClassifier(n_estimators=100, \
                                     learning_rate=0.2, \
                                     max_depth=2, \
                                     min_samples_leaf=1)
    #gbm.fit(X_train_re, y_train_re)
    #y_pred = gbm.predict_proba(X_test_re)
    gbm.fit(X_train, y_train)
    y_pred = gbm.predict_proba(X_test)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]
    
    y_train_pred = gbm.predict_proba(X_train)
    y_train_positive = [x[1] for x in y_train_pred]
    y_train_binary = [0 if x[0] > x[1] else 1 for x in y_train_pred]

    print ("Seed: ", seed)
    auroc_gbm = roc_auc_score(y_test, y_positive)
    brier_gbm = brier_score_loss(y_test, y_positive)
    confusion = confusion_matrix(y_test, y_binary).ravel()
    
    #auroc_gbm = roc_auc_score(y_test_re, y_positive)
    #brier_gbm = brier_score_loss(y_test_re, y_positive)
    #confusion = confusion_matrix(y_test_re, y_binary).ravel()
    
    auroc_gbm_t = roc_auc_score(y_train, y_train_positive)
    brier_gbm_t = brier_score_loss(y_train, y_train_positive)
    confusion_t = confusion_matrix(y_train, y_train_binary).ravel()
    
    
    print ("AUROC score test/train: ", auroc_gbm, '/', auroc_gbm_t)
    print ("Brier loss test/train: ", brier_gbm, '/', brier_gbm_t)
    print ("[tn, fp, fn, tp] test/train: ", confusion, '/', confusion_t)
    return gbm
    
gbms = []
for i in range (5):
    gbms.append(oversample(X, y, i))

Seed:  0
AUROC score test/train:  0.7011243960545943 / 0.7671988956541802
Brier loss test/train:  0.08286915252757507 / 0.0806272106290579
[tn, fp, fn, tp] test/train:  [4077    6  433    7] / [9471    0 1037   43]
Seed:  1
AUROC score test/train:  0.6614384958388898 / 0.7754891824579926
Brier loss test/train:  0.08866151920317585 / 0.078152420396888
[tn, fp, fn, tp] test/train:  [4047   14  456    6] / [9493    0 1014   44]
Seed:  2
AUROC score test/train:  0.6852731774330681 / 0.7676016166946577
Brier loss test/train:  0.08498756292626564 / 0.08005418314464756
[tn, fp, fn, tp] test/train:  [4071    6  442    4] / [9477    0 1031   43]
Seed:  3
AUROC score test/train:  0.670845058479997 / 0.7799518480218162
Brier loss test/train:  0.08934021512556156 / 0.07734221884402628
[tn, fp, fn, tp] test/train:  [4046   12  461    4] / [9496    0 1009   46]
Seed:  4
AUROC score test/train:  0.6866511460512406 / 0.7770389490010817
Brier loss test/train:  0.08239523145415585 / 0.07998333078435317


In [12]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import confusion_matrix
import pandas as pd

train_df = pd.read_csv('./all_train.csv')
train_df = train_df.drop(['cons_last_month.1'], axis=1)

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_gbm = train_df.drop(['id'], axis=1)

X = train_gbm.drop(['churn'], axis=1).as_matrix()
y = train_gbm['churn'].as_matrix()

def oversample(X, y, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    #X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    gbm = GradientBoostingClassifier(n_estimators=200, \
                                     learning_rate=0.2, \
                                     max_depth=3, \
                                     min_samples_leaf=1)
    #gbm.fit(X_train_re, y_train_re)
    #y_pred = gbm.predict_proba(X_test_re)
    gbm.fit(X_train_re, y_train_re)
    y_pred = gbm.predict_proba(X_test)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]
    
    y_train_pred = gbm.predict_proba(X_train_re)
    y_train_positive = [x[1] for x in y_train_pred]
    y_train_binary = [0 if x[0] > x[1] else 1 for x in y_train_pred]

    print ("Seed: ", seed)
    auroc_gbm = roc_auc_score(y_test, y_positive)
    brier_gbm = brier_score_loss(y_test, y_positive)
    confusion = confusion_matrix(y_test, y_binary).ravel()
    
    #auroc_gbm = roc_auc_score(y_test_re, y_positive)
    #brier_gbm = brier_score_loss(y_test_re, y_positive)
    #confusion = confusion_matrix(y_test_re, y_binary).ravel()
    
    auroc_gbm_t = roc_auc_score(y_train_re, y_train_positive)
    brier_gbm_t = brier_score_loss(y_train_re, y_train_positive)
    confusion_t = confusion_matrix(y_train_re, y_train_binary).ravel()
    
    
    print ("AUROC score test/train: ", auroc_gbm, '/', auroc_gbm_t)
    print ("Brier loss test/train: ", brier_gbm, '/', brier_gbm_t)
    print ("[tn, fp, fn, tp] test/train: ", confusion, '/', confusion_t)
    return gbm
    
gbms = []
for i in range (5):
    gbms.append(oversample(X, y, i))

Seed:  0
AUROC score test/train:  0.6578354869385243 / 0.9811468705527123
Brier loss test/train:  0.08856345258518972 / 0.04581888391345607
[tn, fp, fn, tp] test/train:  [2791   41  274   29] / [11254    68  1117 10205]
Seed:  1
AUROC score test/train:  0.6922265562331937 / 0.981333787202677
Brier loss test/train:  0.08400012580079003 / 0.04615288946121129
[tn, fp, fn, tp] test/train:  [2799   42  257   37] / [11239    74  1137 10176]
Seed:  2
AUROC score test/train:  0.6920888465410614 / 0.9799001159972154
Brier loss test/train:  0.07981473574970342 / 0.04787652984528908
[tn, fp, fn, tp] test/train:  [2823   40  229   43] / [11217    74  1201 10090]
Seed:  3
AUROC score test/train:  0.7027364788596123 / 0.981237965031585
Brier loss test/train:  0.08597355014066746 / 0.046168228434464664
[tn, fp, fn, tp] test/train:  [2791   37  273   34] / [11269    57  1167 10159]
Seed:  4
AUROC score test/train:  0.6595377307627954 / 0.9813758102126078
Brier loss test/train:  0.08927387782022077 / 0

### Preprocess test data

In [126]:
test_raw = pd.read_csv('./ml_case_data/ml_case_test_data.csv')
id_raw = list(test_raw.id.unique())
X_test = pd.read_csv('./all_test.csv')
test_id = list(X_test.id)

id_todo = [x for x in id_raw if x not in test_id]
test_todo = test_raw[test_raw.id.isin(id_todo)] \
                    .drop(['activity_new', 'campaign_disc_ele', 'date_first_activ', 'forecast_base_bill_ele', 'forecast_base_bill_year', 'forecast_bill_12m', 'forecast_cons'], axis=1) \
                    .fillna('nodata')

In [127]:
test_hist = pd.read_csv('./ml_case_data/ml_case_test_hist_data.csv')
test_hist_todo = test_hist[test_hist.id.isin(id_todo)]

In [137]:
train_df = pd.read_csv('./all_train.csv')

user = pd.read_csv('./preprocessed1_train.csv')
user_test = pd.read_csv('./preprocessed1_test.csv')
origin_up_ = list(user_test.origin_up.unique())
origin_up = list(set(list(user.origin_up.unique()) + origin_up_ ))
origins_cols = ['origin'+str(i) for i in range(len(origin_up))]

channel = list(user.channel_sales.unique())
chan_cols = ['channel'+str(i) for i in range(len(channel))]

from datetime import datetime
start_date = '2000-07-25'
FMT = '%Y-%m-%d'

X_test = pd.read_csv('./all_test.csv')
X_test = X_test.drop(['cons_last_month.1'], axis=1)
raw_cols = list(X_test.columns)

for i in id_todo:
    tmp_df = test_todo[test_todo.id == i]
    
    # channel_sales
    if tmp_df['channel_sales'].values[0] != 'nodata':
        tmp_cha = [0] * len(channel)
        try:
            tmp_cha[channel.index(tmp_df['channel_sales'].values[0])] = 1
        except ValueError as e:
            pass
    else:
        tmp_cha = [0] * len(channel)
    for idx in range(len(chan_cols)):
        tmp_df[chan_cols[idx]] = [tmp_cha[idx]] 
    tmp_df = tmp_df.drop(['channel_sales'], axis=1)    
    
    # date_activ and date_end
    tmp_df['date_activ'] = [(datetime.strptime(tmp_df['date_activ'].values[0], FMT) - datetime.strptime(start_date, FMT)).days]
    tmp_df['date_end'] = [(datetime.strptime(tmp_df['date_end'].values[0], FMT) - datetime.strptime(start_date, FMT)).days]
    
    # date_modif_prod
    if tmp_df['date_modif_prod'].values[0] != 'nodata':
        tmp_df['date_modif_prod'] = [(datetime.strptime(tmp_df['date_modif_prod'].values[0], FMT) - datetime.strptime(start_date, FMT)).days]
    else:
        tmp_df['date_modif_prod'] = [train_df.date_modif_prod.mean()]
    
    # date_renewal
    if tmp_df['date_renewal'].values[0] != 'nodata':
        tmp_df['date_renewal'] = [(datetime.strptime(tmp_df['date_renewal'].values[0], FMT) - datetime.strptime(start_date, FMT)).days]
    else:
        tmp_df['date_renewal'] = [train_df.date_renewal.mean()]
    
    # origin_up
    if tmp_df['origin_up'].values[0] != 'nodata':
        tmp_origin = [0] * len(origin_up)
        tmp_origin[origin_up.index(tmp_df['origin_up'].values[0])] = 1
    else:
        tmp_origin = [0] * len(origin_up)
    for idx in range(len(origins_cols)):
        tmp_df[origins_cols[idx]] = [tmp_origin[idx]]
    tmp_df = tmp_df.drop(['origin_up'], axis=1)
    
        
    # has_gas
    if tmp_df['has_gas'].values[0] == 't':
        tmp_df['has_gas'] = [1]
    else:
        tmp_df['has_gas'] = [0]
        
    # price
    tmp_df['price_p1_var'] = [test_hist_todo[test_hist_todo.id == i]['price_p1_var'].mean()]
    tmp_df['price_p2_var'] = [test_hist_todo[test_hist_todo.id == i]['price_p2_var'].mean()]
    tmp_df['price_p3_var'] = [test_hist_todo[test_hist_todo.id == i]['price_p3_var'].mean()]
    tmp_df['price_p1_fix'] = [test_hist_todo[test_hist_todo.id == i]['price_p1_fix'].mean()]
    tmp_df['price_p2_fix'] = [test_hist_todo[test_hist_todo.id == i]['price_p2_fix'].mean()]
    tmp_df['price_p3_fix'] = [test_hist_todo[test_hist_todo.id == i]['price_p3_fix'].mean()]
    
    # else
    cols_todo = ['forecast_discount_energy', 'forecast_price_energy_p1', 'forecast_price_energy_p2', 'forecast_price_pow_p1', 'margin_gross_pow_ele', 'margin_net_pow_ele', 'net_margin','pow_max']
    for col in cols_todo:
        if tmp_df[col].values[0] == 'nodata':
            tmp_df[col] = [train_df[col].mean()]
    
    tmp_df = tmp_df[raw_cols]
    X_test = X_test.append(tmp_df, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [138]:
X_test.to_csv('./all_test_modified.csv')

### Train final model and make predictions

In [2]:
train_df = pd.read_csv('./all_train.csv')
train_gbm = train_df.drop(['id', 'cons_last_month.1'], axis=1)
X_train = train_gbm.drop(['churn'], axis=1).as_matrix()
y_train = train_gbm['churn'].as_matrix()
X_resampled, y_resampled = TomekLinks().fit_sample(X_train, y_train)
gbm = GradientBoostingClassifier(n_estimators=150, \
                                 learning_rate=0.2, \
                                 max_depth=3, \
                                 min_samples_leaf=1)
gbm.fit(X_resampled, y_resampled)

# Feed test data into our model
X_test = pd.read_csv('./all_test_modified.csv')
test_id = list(X_test.id)
X_test_df = X_test.drop(['id', 'Unnamed: 0'], axis=1).as_matrix()
y_test_pred = gbm.predict(X_test_df).tolist()
y_test_pred_prob = [x[1] for x in gbm.predict_proba(X_test_df)]

In [3]:
len(X_test_df)

4024

In [6]:
result_cols = ['id', 'Churn_prediction', 'Churn_probability']
results = [[test_id[idx], y_test_pred[idx], y_test_pred_prob[idx]] for idx in range(len(test_id))]
results.sort(key=lambda x: x[2], reverse=True)

with open ('../../ml_case_data/ml_case_test_output.csv', 'w') as f:
    f.write(','.join(result_cols))
    for line in results:
        f.write('\n')
        f.write(','.join([str(x) for x in line]))

### Apply discount to churn clients

In [7]:
# apply 20% discount to price columns
X_disc = X_test.drop(['Unnamed: 0'], axis=1)
disc_cols = ['price_p1_var', 'price_p2_var', 'price_p3_var', 'price_p1_fix', 'price_p2_fix', 'price_p3_fix']
churn_id = [x[0] for x in results if x[1] == 1]
for i in churn_id:
    for col in disc_cols:
        # dunno why this does not work
        # X_disc[X_disc.id == i][col] = X_disc[X_disc.id == i][col].values[0] * 0.8
        X_disc.loc[X_disc.id == i, col] = X_disc[X_disc.id == i][col].values[0] * 0.8

# use our trained gbm model to predict
X_disc_arr = X_disc.drop(['id'], axis=1).as_matrix()
y_disc_pred = gbm.predict(X_disc_arr).tolist()
y_disc_pred_prob = [x[1] for x in gbm.predict_proba(X_disc_arr)]

results_disc = [[list(X_disc.id)[idx], y_disc_pred[idx], y_disc_pred_prob[idx]] for idx in range(len(list(X_disc.id)))]
results_disc.sort(key=lambda x: x[2], reverse=True)

In [8]:
# How many customers will not churn after the discount?
churn_id = [x[0] for x in results if x[1] == 1]
nochurn_id = [x[0] for x in results if x[1] == 0]
prob_dict = {x[0]:x[2] for x in results}

churn_disc_id = [x[0] for x in results_disc if x[1] == 1]
nochurn_disc_id = [x[0] for x in results_disc if x[1] == 0]
disc_prob_dict = {x[0]:x[2] for x in results_disc}

# customer who turns from churn to nochurn and, nochurn to churn
churn2no_id = [x for x in churn_id if x in nochurn_disc_id]
no2churn_id = [x for x in nochurn_id if x in churn_disc_id]

# churn probabilities of churn customer before and after
churn_prob = []
churn_disc_prob = []
for i in churn_id:
    churn_prob.append(prob_dict[i])
    churn_disc_prob.append(disc_prob_dict[i])

In [9]:
import numpy as np

prob_diff = [churn_prob[idx] - churn_disc_prob[idx] for idx in range(len(churn_prob))]
prob_diff = np.array(prob_diff)

print ('Number of Churn customers: ', len(churn_id))
print ('Churn to Nochurn: ', len(churn2no_id))
print ('Nochurn to Churn: ', len(no2churn_id))
print ('Mean churn probability of Churn customer: ', np.array(churn_prob).mean())
print ('Mean churn probability of Churn customer (discount): ', np.array(churn_disc_prob).mean())
print ('Mean churn probability: ', np.array(list(prob_dict.values())).mean())
print ('Mean churn probability (discount): ', np.array(list(disc_prob_dict.values())).mean())

Number of Churn customers:  38
Churn to Nochurn:  18
Nochurn to Churn:  0
Mean churn probability of Churn customer:  0.6328473199728
Mean churn probability of Churn customer (discount):  0.5467513802574293
Mean churn probability:  0.09519933325798939
Mean churn probability (discount):  0.09438630003006093


In [10]:
prob_diff

array([ 0.91291811,  0.48873494, -0.03590337, -0.03590337, -0.03121291,
        0.00478556,  0.00823381,  0.18131058,  0.14686485, -0.07432545,
        0.25811355, -0.22360476, -0.06857641,  0.22821947, -0.20310275,
       -0.03764857,  0.18573855,  0.57257907,  0.16384057, -0.27008472,
       -0.38433694, -0.28583721,  0.23829282, -0.09658989,  0.18366542,
       -0.15812541,  0.19269748,  0.14798905,  0.42985412, -0.08741774,
        0.05007548,  0.23190981,  0.16366733, -0.02945533,  0.25329494,
       -0.10044093,  0.2053094 ,  0.14611655])