In [1]:
import feather
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

  from numpy.core.umath_tests import inner1d


# Defining helper functions

In [2]:
def load_data(fname, label, labels_fname='labels.csv'):
    """Load a feature table and attach one label column to it.

    The features are read from a feather file and left-joined with the labels
    CSV on ``user_id_hash``.

    Parameters
    ----------
    fname : str
        Path of the feather file holding the features; it must contain a
        ``user_id_hash`` column.
    label : str
        Name of the label column to read from the labels file.
    labels_fname : str, default 'labels.csv'
        Path of the CSV file holding ``user_id_hash`` and the label columns.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by the ``label`` column. Users without
        a row in the labels file get label 0; rows that still contain a
        missing value afterwards are dropped.
    """
    df = feather.read_dataframe(fname)
    df['user_id_hash'] = df['user_id_hash'].astype('category')
    labels = pd.read_csv(labels_fname, usecols=['user_id_hash', label])
    df = df.merge(labels, how='left', on='user_id_hash')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the column selection: the in-place form operates on an intermediate
    # object, warns in recent pandas and does nothing under Copy-on-Write,
    # in which case the dropna below would discard every unlabeled user.
    df[label] = df[label].fillna(0)
    df = df.dropna(axis=0)
    return df

In [3]:
def stratified_data_split(feature_columns:list, label_column:str, data=None, random_state=None):
    """Split a frame into train and validation parts, stratified on the label.

    Stratified sampling is used because the label distribution is highly
    skewed.

    Parameters
    ----------
    feature_columns : list
        Column labels to use as features (any indexer accepted by
        ``DataFrame.__getitem__`` works, e.g. a pandas ``Index``).
    label_column : str
        Name of the label column; also used as the stratification key.
    data : pandas.DataFrame, optional
        Frame to split. When omitted, the notebook-level ``df`` is used so
        that existing calls keep working; passing it explicitly avoids
        depending on whichever frame ``df`` happens to be bound to.
    random_state : int, optional
        Forwarded to ``train_test_split``. Leave as ``None`` for the original
        unseeded behaviour, or pass an integer for a reproducible split.

    Returns
    -------
    list
        ``[X_train, X_val, y_train, y_val]`` as returned by
        ``train_test_split``.
    """
    if data is None:
        # Backward-compatible fallback to the global frame.
        data = df
    return train_test_split(data[feature_columns],
                            data[label_column],
                            stratify=data[label_column],
                            random_state=random_state)

# Model for the 14-day period

In [4]:
# Features for the 14-day window, joined with the `label_14` column.
df = load_data('Data/features_before_14.feather', 'label_14')

In [5]:
# Last five rows, transposed so each user is a column.
df.tail().transpose()

Unnamed: 0,620326,620327,620328,620329,620330
user_id_hash,fc17cb069b3f1c91feb326ebd48d8ccc6cc9e8892a632d...,fc836b0fb93fb19124abeadf4d7cfae0deb8b0523303cd...,fe6295a8142f4799d1c122e7855f682e9d4670aa25555f...,feae65e92d95c4610997c27ef1b943bd92b3b6c4a064d1...,ff191d081c4d4629082688bb95a7afd6a2ac6911ded427...
num_session,3,20,5,2,104
last_active,15,3,31,0,0
receive_message,0,3,1,0,6
accept_message,0,1,0,0,1
life_time_value,0,31.437,0,0,195.832
num_purchase,0,9,0,0,124
avg_purchase,0,3.493,0,0,1.57929
num_event,29,585,50,5,2781
event_per_session,9.66667,29.25,10,2.5,26.7404


__Splitting train and validation set with stratified sampling__

In [6]:
# Feature columns: everything between the first and the last column.
df.iloc[:, 1:-1].columns

Index(['num_session', 'last_active', 'receive_message', 'accept_message',
       'life_time_value', 'num_purchase', 'avg_purchase', 'num_event',
       'event_per_session'],
      dtype='object')

In [7]:
# Stratify on `label_14`; features are all columns except the first and last.
X_train, X_val, y_train, y_val = stratified_data_split(df.columns[1:-1], 'label_14')

In [8]:
# Depth-limited random forest with a fixed seed, fitted on the training split.
rf = RandomForestClassifier(random_state=0, max_depth=8)
rf_model = rf.fit(X_train, y_train)

# Predicted probabilities for both splits.
pred_train, pred_val = (rf_model.predict_proba(part) for part in (X_train, X_val))

In [9]:
# Validation ROC AUC, scored on column 1 of the probability matrix.
roc_auc_score(y_val, pred_val[:, 1])

0.9579528673918533

As we're quite satisfied with this performance, we will use this model as our baseline and make our first prediction with it.

In [10]:
# Score every user with exactly the columns the model was trained on.
# Selecting by name (rather than the positional slice df.columns[1:-1]) keeps
# this cell correct when it is re-run after a prediction column has been
# appended to `df`, which would otherwise push the label into the slice.
pred_all = rf_model.predict_proba(df[X_train.columns])

In [11]:
# Keep column 1 of the predicted probabilities as the 14-day purchase score.
df = df.assign(user_purchase_binary_14_days=pred_all[:, 1])

__Saving results__

In [12]:
# Start from the ids listed in the sample submission and attach the 14-day
# score; ids without a match in `df` are left as NaN by the left join.
submission = pd.read_csv('sample_submission_2.csv')
submission = pd.merge(submission[['user_id_hash']],
                      df[['user_id_hash', 'user_purchase_binary_14_days']],
                      on='user_id_hash', how='left')

# Model for the 7-day period

In [13]:
# Raw 7-day feature table, loaded here only for the preview that follows.
# NOTE: this rebinds `df` (previously the 14-day frame); `df` is rebound again
# by the later load_data(...) call for the 7-day model.
df = feather.read_dataframe('Data/features_before_7.feather')

In [14]:
# First five rows, transposed so each user is a column.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
user_id_hash,002e447aed33ed4c51a68743cc293ef2148058b6a6239e...,005c9d79cf18efc6c8b5fa767964b1def9b8e2f8abd23f...,0061b4d30e8a9935b2ebeec954ff509b4f0cb500cd725c...,00fbbf507c7d3c2f259cd3329d241c29d35712e2d4699f...,0107a0017873efc2dea9a9155832363ceacf6fe97bd428...
num_session,3,2,1,3,1
last_active,33,49,61,58,46
receive_message,0,0,0,0,0
accept_message,0,0,0,0,0
life_time_value,0,0,0,0,0
num_purchase,0,0,0,0,0
avg_purchase,0,0,0,0,0
num_event,119,24,34,47,14
event_per_session,39.6667,12,34,15.6667,14


In [15]:
# Features for the 7-day window, joined with the `label_7` column.
df = load_data('Data/features_before_7.feather', 'label_7')

# Stratify on `label_7`; features are all columns except the first and last.
X_train, X_val, y_train, y_val = stratified_data_split(df.columns[1:-1], 'label_7')

In [16]:
# Same model configuration as the 14-day section, refitted on the 7-day split.
rf = RandomForestClassifier(random_state=0, max_depth=8)
rf_model = rf.fit(X_train, y_train)

# Predicted probabilities for both splits.
pred_train, pred_val = (rf_model.predict_proba(part) for part in (X_train, X_val))

# Validation ROC AUC, scored on column 1 of the probability matrix.
roc_auc_score(y_val, pred_val[:, 1])

0.9680182985079531

The model for the 7-day time window performs better, which is not surprising: the nearer the future we are predicting, the more certain we can be about it.

In [17]:
# Score every user with exactly the columns the model was trained on.
# Selecting by name (rather than the positional slice df.columns[1:-1]) keeps
# this cell re-runnable: once the prediction column below exists, the
# positional slice would also pick up `label_7` and the call would no longer
# match the fitted model.
pred_all = rf_model.predict_proba(df[X_train.columns])
# Keep column 1 of the predicted probabilities as the 7-day purchase score.
df['user_purchase_binary_7_days'] = pred_all[:, 1]

__Saving results__

In [18]:
# Attach the 7-day score next to the 14-day one; unmatched ids stay NaN.
submission = pd.merge(submission,
                      df[['user_id_hash', 'user_purchase_binary_7_days']],
                      on='user_id_hash', how='left')

In [19]:
# Peek at the first five submission rows.
submission.head()

Unnamed: 0,user_id_hash,user_purchase_binary_14_days,user_purchase_binary_7_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.000271,9e-05
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.001026,0.000139
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.000504,0.000142
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.006815,0.000758
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.000194,8.8e-05


In [20]:
# Ids that had no match in either merge have no prediction; give them a
# constant score.
# NOTE(review): 0.002 is unexplained here - presumably a rough base purchase
# rate; confirm and consider naming it as a constant.
submission = submission.fillna(0.002)

In [21]:
# Write the final predictions without the row index.
submission.to_csv(path_or_buf='submission2.csv', index=False)

# Feature Importance

In [22]:
# `rf_model` was last fitted in the 7-day section, so these importances
# describe the 7-day model only.
importance = rf_model.feature_importances_

In [23]:
# Pair each importance with the column the model was actually trained on.
# `df` has gained a prediction column since the fit, so df.columns[1:-1] now
# also contains `label_7`; it only lined up before because zip() silently
# stops at the shorter input.
ranked = sorted(zip(X_train.columns, importance), key=lambda pair: pair[1], reverse=True)
for ft, score in ranked:
    print(f"{ft}: \t{score}")

last_active: 	0.41751337972687946
life_time_value: 	0.2218124052169821
num_purchase: 	0.09457659725295062
avg_purchase: 	0.08052729317126536
num_event: 	0.07667720409905306
num_session: 	0.04273161798313839
event_per_session: 	0.038988030277003136
receive_message: 	0.018665915405842965
accept_message: 	0.008507556866884966


`last_active` is the most important feature in the model. Other features describing how active users are - how often they engage with the app - also contribute, with `num_event` ranking fifth and `num_session` ranking sixth.

Features related to user purchase behavior are highly important, all of which rank only after `last_active`. It's interesting that `num_purchase` is not as important as `life_time_value` although we are predicting a binary outcome: whether a user will make a purchase or not.

It is also interesting that the number of messages sent to the user (`receive_message`) is a more important feature than how responsive he or she is to those messages (`accept_message`). Maybe this is because we don't know exactly how this app defines 'Accept', or whether the action is relevant to only a few kinds of messages.