In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import json
from datetime import datetime, timedelta
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
sns.set()

In [2]:
#Load users information data from takehome_users.csv and view its first 5 rows.
# Examine data types and missing value columns with the .info method

user_df = pd.read_csv('takehome_users.csv', encoding='latin-1') # would not load with the default utf-8 encoding
print(user_df.info())
user_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB
None


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


The `takehome_user.csv` file would not load normally into pandas due to the unicode standard which is by default `utf-8`. Needed to change the encoding to `latin-1` for the dataset to load into pandas without throwing an error

In [3]:
# Convert the creation_time which is string and last_session_creation_time is which unix time stamp
# to datetime format
# Examine the converted time using .info()
# Peek the top enteries of the dataset to examine the conversions to datetime

user_df.creation_time = pd.to_datetime(user_df.creation_time)
user_df.last_session_creation_time = pd.to_datetime(user_df.last_session_creation_time, unit='s')
print(user_df.info())
user_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null datetime64[ns]
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 937.6+ KB
None


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0


From the `.info` and `.head` methods it is evident that the columns creation_time and last_session_creation_time have been converted to the datetime objects.

In [4]:
# Load the takehome_user_engagement.csv file and examine the first five rows
# Use .info() method to get details about the entire dataset.

user_engagement_df = pd.read_csv('takehome_user_engagement.csv')
print(user_engagement_df.info())
user_engagement_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB
None


Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


The `time_stamp` column is represented as a string object and not a datetime object. For easier manipulation of the data to get more insight, time_stamp column is converted to datetime object.

In [5]:
# convert time_stamp to datetime object from string and examine the dataset with the .info and .head methods

user_engagement_df.time_stamp = pd.to_datetime(user_engagement_df.time_stamp)
print(user_engagement_df.info())
user_engagement_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB
None


Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


To get the complete picture of the users and their engagement with the website we will need to check data from the two files.
For uniformity we rename the column `object_id` to `user_id` and `time_stamp` to `daily_login_time_stamp`

In [6]:
# Rename the columns indicated

user_data = user_df.rename(index=str, columns={'object_id':'user_id'})
user_engagement_data = user_engagement_df.rename(index=str, columns={'time_stamp':'daily_login_time_stamp'})

# print list of column names for both files.

print(user_data.columns)
print('\n\n',user_engagement_data.columns)

Index(['user_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id'],
      dtype='object')


 Index(['daily_login_time_stamp', 'user_id', 'visited'], dtype='object')


In [7]:
# make the daily_login_time_stamp into a datetimeindex for easy manipulation

user_engagement_data_dti = user_engagement_data.set_index(pd.DatetimeIndex(user_engagement_data.daily_login_time_stamp))
del user_engagement_data_dti['daily_login_time_stamp']
user_engagement_data_dti.head()

Unnamed: 0_level_0,user_id,visited
daily_login_time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


### User Adoption

An "adopted user" as a user who has logged into the product on **three separate days in at least one seven day period**

We considered a week as a 7 day period and resampled the login times on weekly aggregates

In [8]:
# Group each user by the number of times the logged-in in a 7day period
user_engagement_data_dti_7d=user_engagement_data_dti.groupby('user_id')['visited'].resample('W').count().unstack().fillna(0)
user_engagement_data_dti_7d.head()

daily_login_time_stamp,2012-06-03 00:00:00,2012-06-10 00:00:00,2012-06-17 00:00:00,2012-06-24 00:00:00,2012-07-01 00:00:00,2012-07-08 00:00:00,2012-07-15 00:00:00,2012-07-22 00:00:00,2012-07-29 00:00:00,2012-08-05 00:00:00,...,2014-04-06 00:00:00,2014-04-13 00:00:00,2014-04-20 00:00:00,2014-04-27 00:00:00,2014-05-04 00:00:00,2014-05-11 00:00:00,2014-05-18 00:00:00,2014-05-25 00:00:00,2014-06-01 00:00:00,2014-06-08 00:00:00
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Filter users who logged in 3 times or more within any Week (7days)

user_3logins_7days = user_engagement_data_dti_7d[user_engagement_data_dti_7d.values >=3].drop_duplicates()
print(user_3logins_7days.info())
user_3logins_7days.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1445 entries, 2 to 11988
Columns: 106 entries, 2012-06-03 to 2014-06-08
dtypes: float64(106)
memory usage: 1.2 MB
None


daily_login_time_stamp,2012-06-03 00:00:00,2012-06-10 00:00:00,2012-06-17 00:00:00,2012-06-24 00:00:00,2012-07-01 00:00:00,2012-07-08 00:00:00,2012-07-15 00:00:00,2012-07-22 00:00:00,2012-07-29 00:00:00,2012-08-05 00:00:00,...,2014-04-06 00:00:00,2014-04-13 00:00:00,2014-04-20 00:00:00,2014-04-27 00:00:00,2014-05-04 00:00:00,2014-05-11 00:00:00,2014-05-18 00:00:00,2014-05-25 00:00:00,2014-06-01 00:00:00,2014-06-08 00:00:00
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,5.0,7.0,4.0,5.0,7.0,7.0,7.0,7.0,2.0
20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,2.0,1.0,2.0,1.0,0.0,2.0,2.0,2.0,0.0
42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.0,3.0,5.0,3.0,5.0,2.0,3.0,0.0,0.0


**Users with 3 logins with any week are considered adopted users**

In [11]:
# List of user_ids of adopted users
user_3logins_7days.index

Int64Index([    2,    10,    20,    33,    42,    43,    53,    63,    69,
               74,
            ...
            11957, 11958, 11959, 11961, 11964, 11965, 11967, 11969, 11975,
            11988],
           dtype='int64', name='user_id', length=1445)

In [17]:
2 in user_3logins_7days.index

True

In [18]:
# Add a column called adopted_user to user_data where 1 is for a user whose user_id is found in the list
# of adopted users.

for idx, row in user_data.iterrows():
    if row.user_id in user_3logins_7days.index:
        user_data.loc[idx,'adopted_user'] = 1
    else:
        user_data.loc[idx,'adopted_user'] = 0

print(user_data.info())
user_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 12000 entries, 0 to 11999
Data columns (total 11 columns):
user_id                       12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null datetime64[ns]
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_user                  12000 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(5), object(3)
memory usage: 1.4+ MB
None


Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,0


In [22]:
# Filter relevant numerical and categorical columns and one hot encode for predictive analysis

user_data_ml = user_data[['creation_source','opted_in_to_mailing_list','enabled_for_marketing_drip','org_id','adopted_user' ]]
user_data_ml = pd.get_dummies(user_data_ml)
print(user_data_ml.info())
user_data_ml.head()

<class 'pandas.core.frame.DataFrame'>
Index: 12000 entries, 0 to 11999
Data columns (total 9 columns):
opted_in_to_mailing_list              12000 non-null int64
enabled_for_marketing_drip            12000 non-null int64
org_id                                12000 non-null int64
adopted_user                          12000 non-null int64
creation_source_GUEST_INVITE          12000 non-null uint8
creation_source_ORG_INVITE            12000 non-null uint8
creation_source_PERSONAL_PROJECTS     12000 non-null uint8
creation_source_SIGNUP                12000 non-null uint8
creation_source_SIGNUP_GOOGLE_AUTH    12000 non-null uint8
dtypes: int64(4), uint8(5)
memory usage: 847.3+ KB
None


Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,adopted_user,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,0,11,0,1,0,0,0,0
1,0,0,1,1,0,1,0,0,0
2,0,0,94,0,0,1,0,0,0
3,0,0,1,0,1,0,0,0,0
4,0,0,193,0,1,0,0,0,0


In [23]:
# Prepare dataset to be used in models
X = user_data_ml.drop(['adopted_user'], axis=1).values
x_columns = user_data_ml.drop(['adopted_user'], axis=1).columns
y =  user_data_ml.adopted_user.values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

RF=RandomForestClassifier(n_estimators=100)

kfold = KFold(n_splits=10, random_state=42)
cv_results = cross_val_score(RF, X_train, y_train, cv=kfold, scoring='accuracy')

RF.fit(X_train,y_train)

y_pred = RF.predict(X_test)

print('Mean of Cross validation results is',cv_results.mean())
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

Mean of Cross validation results is 0.8359523809523811
[[2963  206]
 [ 398   33]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      3169
           1       0.14      0.08      0.10       431

   micro avg       0.83      0.83      0.83      3600
   macro avg       0.51      0.51      0.50      3600
weighted avg       0.79      0.83      0.81      3600



In [25]:
# Zip important features with respective columns
features = sorted(zip(RF.feature_importances_,x_columns), key=lambda x:x[0], reverse=True)
features_dict = {}
for i, j in features:
    features_dict[j] = [i]
    
features_df = pd.DataFrame(features_dict, index=['importance'])
features_df.T

Unnamed: 0,importance
org_id,0.958975
opted_in_to_mailing_list,0.0101
enabled_for_marketing_drip,0.008828
creation_source_PERSONAL_PROJECTS,0.008437
creation_source_GUEST_INVITE,0.003936
creation_source_ORG_INVITE,0.003811
creation_source_SIGNUP_GOOGLE_AUTH,0.003062
creation_source_SIGNUP,0.00285


The main factors for predicting future user adoption are the `org_id` primarily and to a lesser extent, if the user opted into a `mailing list` and `enabled marketing drip`