In [89]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import datetime

pd.options.mode.chained_assignment = None

In [90]:
takehome_users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [91]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [92]:
engagement = pd.read_csv('takehome_user_engagement.csv')

In [93]:
engagement.head(20)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [94]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [95]:
# convert time_stamp column into datetime type
engagement.time_stamp = pd.to_datetime(engagement.time_stamp)

## Create "adopted" label for each user

In [96]:
# In order to determine users who were labeled as adopted we will sort the data by user_id. 
engagement.sort_values(by='user_id', inplace=True)
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
14,2014-03-31 03:45:04,2,1
13,2014-03-13 03:45:04,2,1
12,2014-03-09 03:45:04,2,1
11,2014-02-16 03:45:04,2,1


In [97]:
# Next we will find all the users who have logged in at least three separate times 
total_visits = engagement.groupby('user_id').sum().reset_index()
at_least_3_logins = total_visits.user_id[total_visits.visited >= 3] 

In [98]:
# We now go back to the original engagement dataframe and acquire all login times of users who have logged in at least 3 times 
potential_adopted_users = engagement[engagement.user_id.isin(at_least_3_logins)]
potential_adopted_users.sort_values(by=['user_id', 'time_stamp'], inplace=True)
potential_adopted_users.head()

Unnamed: 0,time_stamp,user_id,visited
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1


In [99]:
# group by user_id
grouped_users = potential_adopted_users.groupby('user_id')

In [100]:
adopted_users = []

# loop through users 
for user_id in at_least_3_logins:
    current_user_logins = grouped_users.get_group(user_id)
    time_stamps = current_user_logins.time_stamp
    
    # calculate time differences between every 3 logins
    three_day_differences = [login_3-login_1 for login_1, login_3 in zip(time_stamps[:-1], time_stamps[2:])]
    
    # check if any of the three day differences are less than or equal to true
    adopted = any([time_diff <= datetime.timedelta(days=7) for time_diff in three_day_differences])
    
    if adopted: 
        adopted_users.append(user_id)

In [101]:
# create a boolean column in takehome_users dataframe of whether the user was adopted 
takehome_users['adopted'] = takehome_users.object_id.isin(adopted_users)

In [102]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


## Feature Engineering/Data Wrangling

In [103]:
# create column for year each user signed up 
takehome_users['creation_year'] = pd.DatetimeIndex(takehome_users.creation_time).year

# create column for month each user signed up
takehome_users['creation_month'] = pd.DatetimeIndex(takehome_users.creation_time).month

In [104]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_year,creation_month
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False,2014,4
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True,2013,11
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False,2013,3
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False,2013,5
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False,2013,1


In [106]:
one_hot = pd.get_dummies(takehome_users.creation_source)
takehome_users = takehome_users.drop('creation_source', axis = 1)
takehome_users = takehome_users.join(one_hot)

In [107]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,creation_year,creation_month,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,1398139000.0,1,0,11,10803.0,False,2014,4,1,0,0,0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,1396238000.0,0,0,1,316.0,True,2013,11,0,1,0,0,0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,1363735000.0,0,0,94,1525.0,False,2013,3,0,1,0,0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,1369210000.0,0,0,1,5151.0,False,2013,5,1,0,0,0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,1358850000.0,0,0,193,5240.0,False,2013,1,1,0,0,0,0


### Imbalance

In [108]:
takehome_users.adopted.value_counts()

False    10344
True      1656
Name: adopted, dtype: int64

Note that there is an imbalance within the dataset, however we will use logsitic regression as a predictor. This imbalance will only affect the intercept of the curve, so choosing a correct threshold will negate the effect of this imbalance. 

## Model Building

In [178]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix

# Declare training and test sets 
excluded_features = ['object_id', 
                     'creation_time',
                     'name',
                     'email', 
                     'last_session_creation_time', 
                     'invited_by_user_id',
                     'adopted']

feature_cols = [col for col in takehome_users.columns if col not in excluded_features]

X = takehome_users[feature_cols]
y = takehome_users.adopted

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [180]:
# train classifier
clf = LogisticRegressionCV(random_state=101, solver='lbfgs', cv=5, class_weight="balanced").fit(X_train, y_train)

pred = clf.predict(X_test)

# View confusion matrix and classification report
print(confusion_matrix(y_test, pred))
print()
print(classification_report(y_test, pred))

[[2062 1343]
 [ 265  290]]

              precision    recall  f1-score   support

       False       0.89      0.61      0.72      3405
        True       0.18      0.52      0.27       555

    accuracy                           0.59      3960
   macro avg       0.53      0.56      0.49      3960
weighted avg       0.79      0.59      0.66      3960



In [190]:
for coef, feat in zip(os_clf.coef_[0], feature_cols): 
    print(f'{feat:<27}: {coef:>7.4f}')

opted_in_to_mailing_list   : -0.4648
enabled_for_marketing_drip : -0.4758
org_id                     :  0.0015
creation_year              :  0.0016
creation_month             :  0.0152
GUEST_INVITE               : -4.0593
ORG_INVITE                 : -4.1272
PERSONAL_PROJECTS          : -5.1937
SIGNUP                     : -4.3056
SIGNUP_GOOGLE_AUTH         : -4.2773


From the logistic regresion coefficients, the most important features dictating user adoption are the time the account was created and which organization the user is part of. Additionally, users who created accounts through personal projects were less likely to be adopted than users who created their account through other means. 