# Relax Inc. Take Home Challenge

In this project, we use two datasets (user profiles and user engagement logs) to identify which factors predict future user adoption.

In [114]:
# import libraries
import pandas as pd
import numpy as np
from datetime import timedelta as dt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
%matplotlib inline

In [93]:
# Load the users table. The file is decoded as ISO-8859-1 instead of the default
# UTF-8 (presumably because some names contain non-ASCII characters — TODO confirm).
df_user = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1")
# Preview the first rows
df_user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [94]:
# Count missing values per column
df_user.isna().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

In [95]:
# Parse both time columns into datetimes:
# - creation_time is a 'YYYY-MM-DD HH:MM:SS' string
# - last_session_creation_time is a Unix epoch expressed in seconds
df_user['creation_time'] = pd.to_datetime(
    df_user['creation_time'],
    format='%Y-%m-%d %H:%M:%S',
)
df_user['last_session_creation_time'] = pd.to_datetime(
    df_user['last_session_creation_time'],
    unit='s',
)

In [96]:
# New feature activity_time: whole days elapsed between account creation
# and the last session (NaN when no last session was recorded).
# NOTE(review): this is derived from the last login, so it may act as a proxy
# for the adoption label — confirm it is a legitimate predictor.
df_user['activity_time'] = (
    df_user['last_session_creation_time'] - df_user['creation_time']
).dt.days

In [97]:
# Remove columns that are not used as features: free-text identity fields and
# the two raw timestamps that activity_time was computed from
dropped_cols = [
    'creation_time',
    'name',
    'email',
    'last_session_creation_time',
]
df_user = df_user.drop(columns=dropped_cols)
df_user.head()

Unnamed: 0,object_id,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,activity_time
0,1,GUEST_INVITE,1,0,11,10803.0,0.0
1,2,ORG_INVITE,0,0,1,316.0,136.0
2,3,ORG_INVITE,0,0,94,1525.0,0.0
3,4,GUEST_INVITE,0,0,1,5151.0,1.0
4,5,GUEST_INVITE,0,0,193,5240.0,5.0


In [98]:
# Inspect dtypes and non-null counts of the remaining columns
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_source             12000 non-null  object 
 2   opted_in_to_mailing_list    12000 non-null  int64  
 3   enabled_for_marketing_drip  12000 non-null  int64  
 4   org_id                      12000 non-null  int64  
 5   invited_by_user_id          6417 non-null   float64
 6   activity_time               8823 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 656.4+ KB


In [99]:
# remove invited_by_user_id since almost half of its values are missing
df_user = df_user.drop(columns='invited_by_user_id')
# drop users without a recorded last session, i.e. rows whose activity_time is NaN.
# Restricting dropna to that column keeps the code in line with the stated intent
# (a NaN appearing in another column would otherwise silently remove rows too),
# and reassigning instead of inplace=True avoids mutating the frame in place.
df_user = df_user.dropna(subset=['activity_time'])
df_user.head()

Unnamed: 0,object_id,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,activity_time
0,1,GUEST_INVITE,1,0,11,0.0
1,2,ORG_INVITE,0,0,1,136.0
2,3,ORG_INVITE,0,0,94,0.0
3,4,GUEST_INVITE,0,0,1,1.0
4,5,GUEST_INVITE,0,0,193,5.0


In [100]:
# Load the engagement log (columns: time_stamp, user_id, visited)
df_engagement = pd.read_csv('takehome_user_engagement.csv')
# Preview the first rows
df_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [101]:
# convert time_stamp strings into datetime values
df_engagement['time_stamp'] = pd.to_datetime(df_engagement['time_stamp'], format='%Y-%m-%d %H:%M:%S')
# time_stamp is kept as a regular column (not the index): the rolling count
# selects it through on='time_stamp'

In [102]:
# create a function that returns the rolling count over a window period
def rolling_count(df, window):
    """Count `visited` entries in a trailing time window ending at each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column `time_stamp` and a column `visited`.
    window : str or offset
        Window length handed to `DataFrame.rolling`, e.g. '7D'.

    Returns
    -------
    pandas.Series
        For every row, the number of non-null `visited` values whose
        `time_stamp` falls inside the window ending at that row. The Series
        carries the index labels of `df`, so it aligns on assignment.

    Notes
    -----
    Rows are counted, not distinct calendar days — this presumably relies on
    the log holding at most one row per user per day (TODO confirm).
    """
    # Time-based rolling windows require a monotonic `on` column and raise
    # ValueError otherwise. A stable sort keeps the original order of equal
    # timestamps, so input that is already sorted yields the same result.
    ordered = df.sort_values('time_stamp', kind='mergesort')
    return ordered.rolling(window, on='time_stamp')['visited'].count()
# New feature visited_in_7days: for each user separately, the number of visits
# in the 7-day window ending at each log entry
visits_by_user = df_engagement.groupby('user_id', as_index=False, group_keys=False)
df_engagement['visited_in_7days'] = visits_by_user.apply(rolling_count, window='7D')
df_engagement.head()


Unnamed: 0,time_stamp,user_id,visited,visited_in_7days
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [103]:
# Adopted users: at least 3 visits within some 7-day window.
# Keep only the first qualifying log entry of each user.
reached_three_visits = df_engagement['visited_in_7days'] >= 3
df_adopted = (
    df_engagement
    .loc[reached_three_visits]
    .drop_duplicates('user_id', keep='first')
)

In [104]:
# Preview the first qualifying log entry kept for each adopted user
df_adopted.head()

Unnamed: 0,time_stamp,user_id,visited,visited_in_7days
9,2014-02-09 03:45:04,2,1,3.0
27,2013-02-19 22:08:03,10,1,3.0
312,2014-03-13 11:46:38,20,1,3.0
331,2014-03-23 06:29:09,33,1,3.0
354,2012-12-26 19:05:07,42,1,3.0


In [105]:
# df_adopted holds exactly one row per adopted user, so its length is the
# number of adopted users; engaged users are the distinct ids in the log
num_adopted = len(df_adopted)
num_engaged = df_engagement['user_id'].nunique()
print('There were {} adopted users out of {} engaged users'.format(num_adopted, num_engaged))

There were 1602 adopted users out of 8823 engaged users


In [106]:
# Ids of all adopted users
adopted_users_ids = df_adopted['user_id'].tolist()
# Target column adopted_user: 1 when the user's object_id is among the adopted
# ids, 0 otherwise
is_adopted = df_user['object_id'].isin(adopted_users_ids)
df_user['adopted_user'] = is_adopted.astype(int)

In [107]:
# Check that the adopted_user flag was added
df_user.head()

Unnamed: 0,object_id,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,activity_time,adopted_user
0,1,GUEST_INVITE,1,0,11,0.0,0
1,2,ORG_INVITE,0,0,1,136.0,1
2,3,ORG_INVITE,0,0,94,0.0,0
3,4,GUEST_INVITE,0,0,1,1.0,0
4,5,GUEST_INVITE,0,0,193,5.0,0


In [108]:
# object_id was only needed to build the adopted_user flag; remove it so it is
# not used as a feature
df_user = df_user.drop(columns='object_id')
df_user.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,activity_time,adopted_user
0,GUEST_INVITE,1,0,11,0.0,0
1,ORG_INVITE,0,0,1,136.0,1
2,ORG_INVITE,0,0,94,0.0,0
3,GUEST_INVITE,0,0,1,1.0,0
4,GUEST_INVITE,0,0,193,5.0,0


In [119]:
# Class balance of the target (number of non-adopted vs. adopted users)
df_user['adopted_user'].value_counts()

0    7221
1    1602
Name: adopted_user, dtype: int64

## Predictive Modelling

In [111]:
# One-hot encode creation_source; the first category is dropped and serves as
# the baseline level
creation_source = pd.get_dummies(df_user['creation_source'], drop_first=True)
# Replace the original categorical column with its dummy columns
df_model = pd.concat(
    [df_user.drop(columns='creation_source'), creation_source],
    axis=1,
)
# NOTE(review): org_id is an identifier but is passed to the model as a plain
# number — confirm this is intended.
# features
X = df_model.drop(columns='adopted_user')
# target
y = df_model['adopted_user']

In [112]:
# Seed for every step that involves randomness
SEED = 42
# Hold out 30% of the rows as a test set and train on the remaining 70%
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=SEED,
)

In [113]:
# use random forest model with balanced class_weights for imbalance correction in the dataset
# (seeded with the shared SEED constant rather than a repeated literal)
clf = RandomForestClassifier(random_state=SEED, class_weight='balanced')
# use CV to tune number of trees
params = {'n_estimators': [10, 50, 100, 200]}
# scoring is spelled out: accuracy is what GridSearchCV falls back to for a
# classifier when no scoring is given, so the search result is unchanged
model = GridSearchCV(clf, params, scoring='accuracy')
model.fit(X_train, y_train)
print('Best parameters were', model.best_params_)
# best_score_ is the mean cross-validated accuracy of the best candidate,
# not an average-precision score
print('Best mean cross-validated accuracy was', model.best_score_)

Best parameters were {'n_estimators': 50}
Best average precision was 0.9587120527232944


In [116]:
# Model evaluation: refit a forest with the selected number of trees on the
# training set and score it on the held-out test set
model = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=SEED)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of test set was', test_accuracy)
# Per-class precision, recall and F1
report = classification_report(y_test, y_pred)
print(report)

Accuracy of test set was 0.9610880241783151
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2152
           1       0.91      0.87      0.89       495

    accuracy                           0.96      2647
   macro avg       0.94      0.93      0.93      2647
weighted avg       0.96      0.96      0.96      2647



In [118]:
# Feature importances of the fitted forest, labelled by feature name and
# listed from most to least important
importance = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(importance)

activity_time                 0.902810
org_id                        0.079568
opted_in_to_mailing_list      0.003574
enabled_for_marketing_drip    0.003272
ORG_INVITE                    0.003077
SIGNUP                        0.003066
SIGNUP_GOOGLE_AUTH            0.002728
PERSONAL_PROJECTS             0.001904
dtype: float64
