In [5]:
import feather
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import plot_importance

  from numpy.core.umath_tests import inner1d


# Defining helper functions

In [2]:
def load_data(fname, label, labels_path='labels.csv'):
    """Load a feature file and attach one label column, joined on user_id_hash.

    Parameters
    ----------
    fname : str
        Path of the feather file holding the features (must contain a
        `user_id_hash` column).
    label : str
        Name of the label column to read from the labels CSV.
    labels_path : str, optional
        Path of the labels CSV. Defaults to 'labels.csv', the file the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The features left-joined with `label`. Users that have no row in the
        labels file get label 0; any row that still contains a missing value
        after that is dropped.
    """
    features = feather.read_dataframe(fname)
    features['user_id_hash'] = features['user_id_hash'].astype('category')
    labels = pd.read_csv(labels_path, usecols=['user_id_hash', label])
    merged = features.merge(labels, how='left', on='user_id_hash')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `merged[label]`: the in-place form operates on a temporary Series and is a
    # no-op under pandas Copy-on-Write, which would let dropna() below discard
    # every unlabeled user.
    merged[label] = merged[label].fillna(0)
    return merged.dropna(axis=0)

In [3]:
def stratified_data_split(feature_columns, label_column:str, data=None):
    """Split a frame into train and validation parts, stratified on the label.

    Stratified sampling keeps the (highly skewed) label distribution the same
    in both parts.

    Parameters
    ----------
    feature_columns : list-like of str
        Names of the feature columns (a list or a pandas Index).
    label_column : str
        Name of the label column; also used as the stratification key.
    data : pandas.DataFrame, optional
        Frame to split. When omitted, the notebook-global `df` is used, which
        is what this helper always did before the parameter existed.

    Returns
    -------
    list
        Whatever `train_test_split` returns for (features, labels): the
        notebook unpacks it as X_train, X_val, y_train, y_val.
    """
    if data is None:
        # Backward-compatible fallback to the frame loaded earlier in the notebook.
        data = df
    target = data[label_column]
    return train_test_split(data[feature_columns],
                            target,
                            stratify=target,
                            random_state=1)

# Model for 14 days period

In [35]:
# Training frame for the 14-day horizon: features joined with the `label_14` column.
df = load_data('Data/features_before_14_w_events.feather', 'label_14')

In [36]:
# Last five rows, transposed so that each user is a column and each feature a row.
df.tail().transpose()

Unnamed: 0,620326,620327,620328,620329,620330
user_id_hash,fc17cb069b3f1c91feb326ebd48d8ccc6cc9e8892a632d...,fc836b0fb93fb19124abeadf4d7cfae0deb8b0523303cd...,fe6295a8142f4799d1c122e7855f682e9d4670aa25555f...,feae65e92d95c4610997c27ef1b943bd92b3b6c4a064d1...,ff191d081c4d4629082688bb95a7afd6a2ac6911ded427...
num_session,3,20,5,2,104
last_active,15,3,31,0,0
receive_message,0,3,1,0,6
accept_message,0,1,0,0,1
life_time_value,0,31.437,0,0,195.832
num_purchase,0,9,0,0,124
avg_purchase,0,3.493,0,0,1.57929
num_event,29,585,50,5,2781
event_per_session,9.66667,29.25,10,2.5,26.7404


__Splitting train and validation set with stratified sampling__

In [37]:
# Feature columns: everything between the first column (user_id_hash) and the
# last one (the label column that load_data merges in).
df.columns[1:-1] # features

Index(['num_session', 'last_active', 'receive_message', 'accept_message',
       'life_time_value', 'num_purchase', 'avg_purchase', 'num_event',
       'event_per_session', 'last_date_event_0', 'last_date_event_1',
       'last_date_event_10', 'last_date_event_11', 'last_date_event_14',
       'last_date_event_3', 'last_date_event_32', 'last_date_event_4',
       'last_date_event_40', 'last_date_event_41', 'last_date_event_42',
       'last_date_event_43', 'last_date_event_44', 'last_date_event_45',
       'last_date_event_47', 'last_date_event_48', 'last_date_event_49',
       'last_date_event_5', 'last_date_event_50', 'last_date_event_51',
       'last_date_event_52', 'last_date_event_54', 'last_date_event_55',
       'last_date_event_56', 'last_date_event_57', 'last_date_event_58',
       'last_date_event_59', 'last_date_event_6', 'last_date_event_60',
       'last_date_event_61', 'last_date_event_63', 'last_date_event_64',
       'last_date_event_7', 'last_purchase', 'last_date_eve

In [38]:
# Hold out a stratified validation set for the 14-day label.
X_train, X_val, y_train, y_val = stratified_data_split(
    label_column='label_14',
    feature_columns=df.columns[1:-1],
)

In [39]:
# Mean of the label in each split; near-equal values confirm the stratification worked.
tuple(labels.mean() for labels in (y_train, y_val))

(0.00985066029300502, 0.009852788506799584)

In [40]:
# Depth-limited random forest for the 14-day label. `rf_model` is whatever fit()
# returns and is the handle used for all later predictions.
rf = RandomForestClassifier(random_state=10, max_depth=9)
rf_model = rf.fit(X_train, y_train)

# Class-probability matrices for the training and validation splits.
pred_train, pred_val = (rf_model.predict_proba(split) for split in (X_train, X_val))

In [41]:
# Validation ROC AUC, scored on the second probability column of predict_proba.
roc_auc_score(y_val, pred_val[:, 1])

0.9700209272356837

A validation AUC of about 0.97 is good enough for a first baseline, so we use this model to generate our first set of predictions.

In [43]:
# Feature table used for scoring: the feature file is read as-is, with no label
# column joined onto it.
X_all = feather.read_dataframe('./Data/features_w_events.feather')

In [96]:
# Score every user with the 14-day forest. The feature columns are selected by the
# names used in training (X_train.columns) rather than by position, so the cell
# still works when re-run after the prediction column has been appended to X_all,
# and the column order always matches the order the model was fitted on.
pred_all = rf_model.predict_proba(X_all[X_train.columns])
# Column 1 of predict_proba is the probability of the second class in rf_model.classes_.
X_all['user_purchase_binary_14_days'] = pred_all[:, 1]

__Saving results__

In [104]:
# Start from the user ids of the sample submission and attach the 14-day scores.
submission = pd.read_csv('sample_submission_2.csv', usecols=['user_id_hash'])
scores_14 = X_all[['user_id_hash', 'user_purchase_binary_14_days']]
submission = submission.merge(scores_14, on='user_id_hash', how='left')

In [105]:
# Peek at the first five rows of the submission frame.
submission.head(5)

Unnamed: 0,user_id_hash,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.000229
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.000789
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.000216
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.000532
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.00016


# Model for 7 day period

In [106]:
# Raw 7-day feature file, loaded without labels for a quick look; `df` is rebound
# to the labelled training frame by the load_data call further down.
df = feather.read_dataframe('Data/features_before_7.feather')

In [107]:
# First five rows, transposed so that each user is a column and each feature a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
user_id_hash,002e447aed33ed4c51a68743cc293ef2148058b6a6239e...,005c9d79cf18efc6c8b5fa767964b1def9b8e2f8abd23f...,0061b4d30e8a9935b2ebeec954ff509b4f0cb500cd725c...,00fbbf507c7d3c2f259cd3329d241c29d35712e2d4699f...,0107a0017873efc2dea9a9155832363ceacf6fe97bd428...
num_session,3,2,1,3,1
last_active,33,49,61,58,46
receive_message,0,0,0,0,0
accept_message,0,0,0,0,0
life_time_value,0,0,0,0,0
num_purchase,0,0,0,0,0
avg_purchase,0,0,0,0,0
num_event,119,24,34,47,14
event_per_session,39.6667,12,34,15.6667,14


In [108]:
# Build the 7-day training frame (features joined with `label_7`) ...
df = load_data('Data/features_before_7.feather', 'label_7')

# ... and hold out a stratified validation set from it.
X_train, X_val, y_train, y_val = stratified_data_split(
    label_column='label_7',
    feature_columns=df.columns[1:-1],
)

In [109]:
# Random forest for the 7-day label; this rebinds `rf` and `rf_model` from the
# 14-day section.
rf = RandomForestClassifier(max_features=8, max_depth=7, random_state=0)
rf_model = rf.fit(X_train, y_train)

# Class-probability matrices for the training and validation splits.
pred_train, pred_val = (rf_model.predict_proba(split) for split in (X_train, X_val))

# Validation ROC AUC, scored on the second probability column.
roc_auc_score(y_val, pred_val[:, 1])

0.9685642588237585

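With a validation AUC of 0.9686, the 7-day random forest performs about the same as the 14-day one (0.9700) rather than better. A shorter horizon should in principle be easier to predict, since the nearer the future, the more certainty we have about it, but the two forests use different hyperparameters, so the scores are not directly comparable.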

In [114]:
# Score every user with the 7-day forest. The feature columns are selected by the
# names used in training (X_train.columns) instead of the positional slice [1:-1],
# which was only correct while X_all carried exactly one extra trailing column
# (the 14-day prediction) and broke as soon as this cell was run a second time.
pred_all = rf_model.predict_proba(X_all[X_train.columns])
# Column 1 of predict_proba is the probability of the second class in rf_model.classes_.
X_all['user_purchase_binary_7_days'] = pred_all[:, 1]

__Saving results__

In [115]:
# Attach the 7-day scores next to the 14-day ones already in `submission`.
scores_7 = X_all[['user_id_hash', 'user_purchase_binary_7_days']]
submission = submission.merge(scores_7, on='user_id_hash', how='left')

In [116]:
# Peek at the first five rows, now with both prediction columns.
submission.head()

Unnamed: 0,user_id_hash,user_purchase_binary_14_days,user_purchase_binary_7_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.000229,6.4e-05
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.000789,0.000344
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.000216,6.9e-05
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.000532,0.000298
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.00016,6.3e-05


In [117]:
# Users from the sample submission without a prediction get a constant fallback score.
# NOTE(review): 0.0023675038 is unexplained here - presumably a baseline purchase
# rate; confirm where it comes from.
submission = submission.fillna(value=0.0023675038)

In [118]:
# Write the random-forest submission to disk without the positional index.
submission.to_csv(path_or_buf='submission4.csv', index=False)

# Features Importance

In [44]:
# Per-feature importance scores of whichever forest `rf_model` currently refers to.
importance = rf_model.feature_importances_

In [56]:
# Pair each feature column with its importance and rank from most to least important.
ranking = sorted(zip(df.columns[1:-1], importance), key=lambda pair: pair[1], reverse=True)
for name, weight in ranking:
    print(f"{name}: \t{weight}")

# Feature names only, in ranked order.
impt_features = [name for name, _ in ranking]

life_time_value: 	0.20226815144389124
last_date_event_5: 	0.10247384890081726
last_date_event_1: 	0.1006658543753062
num_purchase: 	0.09596257295780874
last_purchase: 	0.07436985359681196
last_date_event_7: 	0.06842299694631708
avg_purchase: 	0.0647769437070023
last_active: 	0.046028320771466125
last_date_event_45: 	0.04488585367432333
last_date_event_3: 	0.025756780177117234
last_date_event_42: 	0.024277872919565735
num_session: 	0.023527966104820327
num_event: 	0.020339309082653627
last_date_event_40: 	0.016068843939104356
event_per_session: 	0.01239877399469268
last_date_event_6: 	0.011645161577498222
last_date_event_44: 	0.009982103346948632
last_date_event_4: 	0.009480222773823772
last_date_event_14: 	0.008915079819776312
last_date_event_0: 	0.008365882902563208
last_date_event_9: 	0.0074225214015416945
last_date_event_41: 	0.0070957016457405524
receive_message: 	0.005005683328121162
last_date_event_11: 	0.0049199826117361355
last_date_event_43: 	0.0025181635270544603
accept_messa

In [58]:
# Keep the top of the ranking by slicing off the 28 lowest-scoring features.
# NOTE(review): 28 is a hand-picked cutoff tied to the current number of
# features - confirm it is still appropriate if the feature set changes.
selected_features = impt_features[:-28]
selected_features

['life_time_value',
 'last_date_event_5',
 'last_date_event_1',
 'num_purchase',
 'last_purchase',
 'last_date_event_7',
 'avg_purchase',
 'last_active',
 'last_date_event_45',
 'last_date_event_3',
 'last_date_event_42',
 'num_session',
 'num_event',
 'last_date_event_40',
 'event_per_session',
 'last_date_event_6']

# Phase 2: XGB

## For 14 days

XGBoost is very robust to overfitting; in our experiments it actually achieves better performance when given more features.

In [6]:
# Rebuild the 14-day training frame ...
df = load_data('Data/features_before_14_w_events.feather', 'label_14')

# ... and the stratified train/validation split used by the booster.
X_train, X_val, y_train, y_val = stratified_data_split(
    label_column='label_14',
    feature_columns=df.columns[1:-1],
)

In [8]:
# Gradient-boosted trees for the 14-day label. The estimator gets its own name so
# that the `xgb` module alias (imported in the notebook's import cell) is not
# overwritten by a classifier instance.
xgb_clf = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
gbm = xgb_clf.fit(X_train, y_train)

In [9]:
# Class-probability matrices of the 14-day booster for both splits.
gbm_pred_train, gbm_pred_val = (gbm.predict_proba(split) for split in (X_train, X_val))

# Validation ROC AUC, scored on the second probability column.
roc_auc_score(y_val, gbm_pred_val[:, 1])

0.9715235907252906

In [12]:
# Scoring table: index by user id, then keep exactly the columns the model was
# trained on, in training order.
X_all = feather.read_dataframe('Data/features_w_events.feather')
X_all = X_all.set_index('user_id_hash')[X_train.columns]

In [13]:
# Class probabilities of the 14-day booster for every row of X_all. Stored under
# its own name because `gbm` and `X_all` are rebound in the 7-day section below.
pred_all_14 = gbm.predict_proba(X_all)


## For 7 days

In [23]:
# Build the 7-day training frame ...
df = load_data('Data/features_before_7_w_events.feather', 'label_7')

# ... and the stratified train/validation split used by the booster.
X_train, X_val, y_train, y_val = stratified_data_split(
    label_column='label_7',
    feature_columns=df.columns[1:-1],
)

In [24]:
# Gradient-boosted trees for the 7-day label. The estimator gets its own name so
# that the `xgb` module alias (imported in the notebook's import cell) is not
# overwritten by a classifier instance. This rebinds `gbm` from the 14-day section.
xgb_clf = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5, nthread=-1)
gbm = xgb_clf.fit(X_train, y_train)

In [25]:
# Class-probability matrices of the 7-day booster for both splits.
gbm_pred_train, gbm_pred_val = (gbm.predict_proba(split) for split in (X_train, X_val))

# Validation ROC AUC, scored on the second probability column.
roc_auc_score(y_val, gbm_pred_val[:, 1])

0.9783239194055413

In [33]:
# Re-read the scoring table, index it by user id and keep the 7-day training columns.
X_all = feather.read_dataframe('Data/features_w_events.feather')
X_all = X_all.set_index('user_id_hash')[X_train.columns]

# Class probabilities of the 7-day booster for every row of X_all.
pred_all_7 = gbm.predict_proba(X_all)


In [35]:
# Append both boosters' second-class probabilities as new columns of the scoring table.
X_all = X_all.assign(
    user_purchase_binary_14_days=pred_all_14[:, 1],
    user_purchase_binary_7_days=pred_all_7[:, 1],
)

## Saving prediction

In [36]:
# Take the user ids of the sample submission as the index, join the two prediction
# columns onto them and give users without a prediction a constant fallback score.
# NOTE(review): 0.0023675038 is unexplained here - presumably a baseline purchase
# rate; confirm where it comes from.
submission = (
    pd.read_csv('sample_submission_2.csv', usecols=['user_id_hash'], index_col='user_id_hash')
    .join(X_all[['user_purchase_binary_7_days', 'user_purchase_binary_14_days']], how='left')
    .fillna(0.0023675038)
)

# The index (user_id_hash) is written as the first CSV column.
submission.to_csv('submission5.csv')

In [None]:
# Plot the feature importances of `gbm`, which at this point is the 7-day booster.
# `plot_importance` comes from xgboost and is imported in the notebook's import
# cell; the trailing semicolon hides the Axes repr.
plot_importance(gbm);