Relax Inc. makes productivity and project management software that is popular with both individuals and teams. It was founded by former Facebook employees.

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import  warnings
warnings.simplefilter('ignore')

import datetime
import math
import datetime as dt

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix,roc_curve,roc_auc_score, f1_score, accuracy_score
from sklearn.metrics import recall_score, precision_score
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeClassifier

#### Loading the First Dataset

In [30]:
users = pd.read_csv("../joshm/takehome_users.csv", encoding='latin-1')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [31]:
users.shape

(12000, 10)

In [32]:
users.loc[users['object_id'] == 1693]

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
1692,1693,2012-05-31 21:58:33,Faulkner Hayden,HaydenFaulkner@gmail.com,SIGNUP_GOOGLE_AUTH,1399932000.0,0,1,50,


#### Loading the Second Dataset

In [33]:
user_engagement = pd.read_csv(".\\takehome_user_engagement.csv", encoding='latin-1', parse_dates=True)
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [34]:
user_engagement.shape

(207917, 3)

In [35]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


### Data Wrangling

#### checking the missing values

In [36]:
users.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

There are some missing values in the 'last_session_creation_time' column and 'invited_by_user_id' columns.

#### Imputing missing values with 'zero'.

In [37]:
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(0)

In [38]:
#now set that to datetime
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit='s')

In [39]:
users['last_session_creation_time'] = users['last_session_creation_time'].fillna(users['creation_time'])

In [40]:
user_engagement.isnull().sum()

time_stamp    0
user_id       0
visited       0
dtype: int64

It was observed that there are no null values in the users_engagement dataset.

#### Adopted user:

A user who has logged into the product on three separate days in at least one seven­day period, identifying which factors predict future user adoption .

In [41]:
import datetime

#set the time_stamp to datetime and the set it as the index
user_engagement.time_stamp = pd.to_datetime(user_engagement.time_stamp)
user_engagement = user_engagement.set_index('time_stamp', drop= True)

In [42]:
from datetime import timedelta

def label_adopted(x):    
    "takes a users input and returns whether or not they have been active within any 7-day period"
    df_temp = user_engagement.loc[user_engagement['user_id'] == x] #select out rows of this user
    df_temp = df_temp.resample('D').mean().dropna() #resample to show if active in a day. .mean() is just of 1
    adopted = 0
    for i in range(len(df_temp)-2): #loop over active days till the second to last day
        if df_temp.index[i + 2] - df_temp.index[i] <= timedelta(days=7): # difference between every 1st and 3rd day
            adopted = 1
            break
        else:
            adopted = 0
    return adopted

In [43]:
#apply to user df to label users as adopted=true 
users['adopted_user'] = users['object_id'].apply(label_adopted)

In [44]:
print(sum(users['adopted_user']))
print(sum(users.adopted_user)/len(users.adopted_user))

1656
0.138


In [45]:
print("Total number of engaged users: {}".format(user_engagement.user_id.nunique()))

Total number of engaged users: 8823


In [46]:
type(user_engagement)


pandas.core.frame.DataFrame

In [47]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,0


In [48]:
adopted = users[users["adopted_user"] == 1]
not_adopted = users[users["adopted_user"] == 0]

In [49]:
#X=adopted
#y=not_adopted

In [50]:
print("adopted : ", X.shape)
print("not_adopted : ", y.shape)

adopted :  (12000, 4)
not_adopted :  (12000,)


In [51]:
users['email_domain'] = [x.split('@')[1] for x in users.email] 
top3_domains = users.email_domain.value_counts().index[:3] # top 3 domains
print("Top 3 domains: {}".format(top3_domains))
users['email_domain'] = [x if x in top3_domains else "other" for x in users.email_domain]

Top 3 domains: Index(['gmail.com', 'yahoo.com', 'jourrapide.com'], dtype='object')


In [61]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,email_domain
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,0,yahoo.com
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,1,other
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,0,other
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,0,yahoo.com
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,0,yahoo.com


### Feature Encoding

In [62]:
categorical_feature_mask = X.dtypes==np.object
# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()

In [63]:
from sklearn.preprocessing import LabelEncoder

no_need_cols = ['object_id', 'creation_time', 'name', 'email', 'last_session_creation_time', 
                    'org_id', 'invited_by_user_id', 'adopted_user', 'creation_source']
                
y = users.adopted_user
X = users.drop(no_need_cols, axis=1)

In [64]:
X_encoded_getdummies = pd.get_dummies(X, columns = categorical_cols, prefix_sep='_', drop_first=True) 

In [67]:
X = X_encoded_getdummies
y = users.adopted_user

### Train-Test Split

In [69]:
from sklearn.model_selection import train_test_split

X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Random Forest Model

In [70]:
from sklearn.ensemble import RandomForestClassifier

#train and test classifier
rf_clf = RandomForestClassifier()

rf_clf.fit(X_train, X_test)

rf_clf.score(y_train, y_test)

0.8603535353535353