In [301]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import iplot, plot, init_notebook_mode
from config import credentials
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from datetime import datetime  
from datetime import timedelta  

# Switch plotly into offline notebook mode before any iplot() call further down.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the installed plotly version.
init_notebook_mode(connected=True)

In [3]:
# Load the two take-home CSVs: the login/engagement log and the user table.
user_engagement = pd.read_csv('takehome_user_engagement.csv')
# latin-1 is passed explicitly for the users file — presumably it contains
# characters that are not valid UTF-8 (e.g. in names); confirm before changing.
users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [194]:
# Preview the first 10 user rows, including the raw name/email columns that
# are dropped again before modelling.
users.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0


In [6]:
# Preview the engagement log: each row carries a time_stamp, a user_id and a visited flag.
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [8]:
# Parse the login timestamps from strings into pandas datetimes so that they
# can be subtracted from one another later on.
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])

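The code below creates a mapping between user id and the "adopted_user" label. A user is labelled adopted when any three consecutive logins span at most seven days (measured as the whole-day difference between the first and the third of those logins); users with fewer than three logins are labelled not adopted.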

In [236]:
MIN_LOGINS = 3    # how many logins must fall inside the window
WINDOW_DAYS = 7   # largest allowed whole-day span between the first and last of them


def _is_adopted(timestamps, min_logins=MIN_LOGINS, window_days=WINDOW_DAYS):
    """Tell whether one user's logins satisfy the adoption criterion.

    Parameters
    ----------
    timestamps : pandas.Series of datetimes
        Login times of a single user, in any order.
    min_logins : int
        Number of logins that must fall inside the window.
    window_days : int
        Maximum whole-day difference between the first and the last of
        those ``min_logins`` logins.

    Returns
    -------
    bool
        True if some run of ``min_logins`` chronologically consecutive
        logins spans at most ``window_days`` days, else False.
    """
    # Sort explicitly: on unordered input the differences below could turn
    # negative and would then pass the "<= window_days" check by accident.
    ordered = timestamps.sort_values()
    if len(ordered) < min_logins:
        return False

    # Span between every login and the one (min_logins - 1) positions before
    # it, i.e. the time covered by each run of `min_logins` logins.
    spans = ordered.diff(periods=min_logins - 1).dropna()
    return bool((spans.dt.days <= window_days).any())


# Map user_id -> adopted flag. groupby splits the log once, instead of
# re-filtering the whole frame for every single user.
adopted = {
    user: _is_adopted(logins)
    for user, logins in user_engagement.groupby('user_id')['time_stamp']
}

In [309]:
# Start the feature matrix from the user table without the free-text identity
# columns. `drop` returns a new frame, so `users` itself stays untouched.
X = users.drop(['name', 'email'], axis=1)

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 750.1+ KB


Missing values in `last_session_creation_time` will be filled in with the creation time. If a user has a missing `last_session_creation_time`, it's because they created an account but never logged back in. In this case, we can simply treat the account creation time as the one and only time they logged in.

Before we can impute the missing values from the `creation_time` column, we first need to convert both columns into unix timestamps so they have the same format.

In [310]:
# Express creation_time as Unix seconds (float) so that it uses the same scale
# as last_session_creation_time. The source is the untouched `users` frame
# rather than `X`, so re-running this cell cannot re-parse values that were
# already converted to floats.
X['creation_time'] = pd.to_datetime(users['creation_time']).astype(np.int64) / 1e9

# Users without a recorded session get their account-creation time instead.
X['last_session_creation_time'] = X['last_session_creation_time'].fillna(X['creation_time'])

To address the missing `invited_by_user_id` values, I'm going to simply fill them with the user's own `object_id`. This makes sense since if no one else invited them, they "invited themselves".

In [311]:
# A user nobody invited is recorded as having been invited by themselves.
X['invited_by_user_id'] = X['invited_by_user_id'].fillna(X['object_id'])

In [312]:
# Re-inspect the frame to confirm that no column has missing values left
# after the two imputation steps.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null float64
creation_source               12000 non-null object
last_session_creation_time    12000 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null float64
dtypes: float64(3), int64(4), object(1)
memory usage: 750.1+ KB


Next, we turn `creation_source` into a dummy variable.

In [313]:
# No `columns` argument is given, so the columns to encode are picked by dtype;
# at this point `creation_source` is the only object-dtype column left in X.
X = pd.get_dummies(X)

The binary label is generated from the adopted mapping we created earlier. If no engagement data exists for a user, they are automatically classified as False or NOT adopted, since they don't meet the criteria.

In [314]:
# Look up every user's adopted flag. Ids that are absent from the mapping come
# back as NaN and are counted as not adopted before the cast to 0/1.
y = (
    X['object_id']
    .map(adopted)
    .fillna(False)
    .astype(int)
)

In [315]:
# Move object_id out of the columns and into the index: it stays available as
# a row label but is no longer one of the feature columns.
X = X.set_index('object_id')
X.head()

Unnamed: 0_level_0,creation_time,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1398139000.0,1398139000.0,1,0,11,10803.0,1,0,0,0,0
2,1384487000.0,1396238000.0,0,0,1,316.0,0,1,0,0,0
3,1363735000.0,1363735000.0,0,0,94,1525.0,0,1,0,0,0
4,1369124000.0,1369210000.0,0,0,1,5151.0,1,0,0,0,0
5,1358418000.0,1358850000.0,0,0,193,5240.0,1,0,0,0,0


In [316]:
# Spot-check the first few 0/1 labels.
y.head()

0    0
1    1
2    0
3    0
4    0
Name: object_id, dtype: int32

## Model

In [317]:
# Hold out 33% of the users for evaluation. Stratifying on the label keeps the
# share of adopted users the same in both parts; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

Check proportions are retained across split.

In [318]:
# Share of adopted users in the training labels (the mean of a 0/1 series).
y_train.mean()

0.13805970149253732

In [319]:
# Share of adopted users in the held-out labels (the mean of a 0/1 series).
y_test.mean()

0.13787878787878788

Train model

In [320]:
# Fit a gradient-boosted tree classifier; apart from the number of parallel
# jobs every hyperparameter is left at the library default.
# NOTE(review): n_jobs=15 looks tuned to the author's machine — confirm it
# suits the host this notebook is run on.
model = XGBClassifier(n_jobs=15)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=15, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [321]:
# Score the fitted model on the very data it was trained on.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_accuracy_pct = 100.0 * train_accuracy
print("Train Accuracy: %.4f%%" % train_accuracy_pct)

Train Accuracy: 97.7985%


In [322]:
# Score the fitted model on the held-out users it has not seen during training.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_accuracy_pct = 100.0 * test_accuracy
print("Test Accuracy: %.4f%%" % test_accuracy_pct)

Test Accuracy: 96.4646%


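The model had a 97.8% training accuracy and 96.5% test accuracy. Because only 13.8% of users are adopted, a trivial model that always predicts "not adopted" would already score about 86.2%, so the model's gain is roughly ten percentage points over that baseline — still a strong result.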

In [323]:
# Pair every feature column with the importance the fitted model assigned to
# it and rank the features from most to least important. The frame is built in
# a single constructor call: growing it row by row with DataFrame.append is
# quadratic, and that method no longer exists in pandas 2.0+.
feat_importances = (
    pd.DataFrame({
        'feature': list(X_train.columns),
        'importance': model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

In [325]:
# Display the ranked importance table.
feat_importances

Unnamed: 0,feature,importance
0,creation_time,0.466142
1,last_session_creation_time,0.412598
2,org_id,0.053543
3,invited_by_user_id,0.048819
4,opted_in_to_mailing_list,0.006299
5,creation_source_SIGNUP_GOOGLE_AUTH,0.006299
6,creation_source_GUEST_INVITE,0.00315
7,creation_source_PERSONAL_PROJECTS,0.00315
8,enabled_for_marketing_drip,0.0
9,creation_source_ORG_INVITE,0.0


The feature importances are shown above. The creation times were by far the most important features. Let's look at the difference in creation time distributions between adopted and not adopted users.

In [329]:
# Attach the 0/1 adopted label to the original user table so that the raw
# (un-imputed) columns can be compared by label in the plots.
users['adopted'] = (
    users['object_id']
    .map(adopted)
    .fillna(False)
    .astype(int)
)

In [328]:
# One box per label, showing the raw account-creation times from `users`.
data = [go.Box(name='adopted', y=users[users.adopted==1].creation_time),
        go.Box(name='not adopted', y=users[users.adopted==0].creation_time)]

# Title and axis label so the figure can be read without the surrounding text.
layout = go.Layout(title='Account creation time by adoption status',
                   yaxis=dict(title='creation_time'))
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='overlaid histogram')

The box plot above shows the distributions have a lot of overlap. The not adopted user distribution is shifted up further in time. More adopted users created accounts early on.

In [330]:
# The raw column holds Unix seconds; convert it to datetimes so the axis shows
# calendar dates instead of ten-digit numbers. Missing values become NaT and
# are dropped, so only users with a recorded session are plotted.
last_session = pd.to_datetime(users.last_session_creation_time, unit='s')

data = [go.Box(name='adopted', y=last_session[users.adopted==1].dropna()),
        go.Box(name='not adopted', y=last_session[users.adopted==0].dropna())]

# Title and axis label so the figure can be read without the surrounding text.
layout = go.Layout(title='Last session time by adoption status',
                   yaxis=dict(title='last_session_creation_time'))
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='overlaid histogram')

`last_session_creation_time` shows very different distributions. The plotted data is the original data, before I imputed missing values for this column. The vast majority of adopted users had logged in recently, while for most non-adopted users the last login lies much further in the past.