# Relax Inc. Challenge

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

import warnings
warnings.filterwarnings("ignore")

In [2]:
users = pd.read_csv('takehome_users.csv', encoding = 'latin-1', parse_dates = True)
user_engagement = pd.read_csv('takehome_user_engagement.csv', parse_dates = True)

In [3]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/2014 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,11/15/2013 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,3/19/2013 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,5/21/2013 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,1/17/2013 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [4]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [5]:
users = users.rename(columns = {'object_id':'user_id'})

In [6]:
users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,4/22/2014 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,11/15/2013 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,3/19/2013 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,5/21/2013 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,1/17/2013 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [7]:
import datetime

user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])
user_engagement = user_engagement.set_index('time_stamp', drop = True)

In [8]:
user_engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


In [9]:
from datetime import timedelta

def adopted_user(x):
    df_temp = user_engagement.loc[user_engagement['user_id'] == x] 
    df_temp = df_temp.resample('D').mean().dropna()
    adopted = 0
    for i in range(len(df_temp) - 2):
        if df_temp.index[i+2] - df_temp.index[i] <= timedelta(days = 7):
            adopted = 1
            break
        else:
            adopted = 0
    return adopted

In [10]:
users['adopted_user'] = users['user_id'].apply(adopted_user)

In [11]:
users.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,4/22/2014 3:53,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,11/15/2013 3:45,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,3/19/2013 23:14,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,5/21/2013 8:09,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,1/17/2013 10:14,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [12]:
print('Active users: ', sum(users['adopted_user']))
print(' ')
print('Percentage that are active: ', (sum(users['adopted_user'])/len(users['adopted_user'])*100))

Active users:  1656
 
Percentage that are active:  13.8


## Data Wrangling

In [13]:
#Set all date columns to datetime
users['creation_time'] = pd.to_datetime(users['creation_time'])
users['last_session_creation_time'] = users['last_session_creation_time'].map(lambda data:
                                                                              datetime.datetime.fromtimestamp(int(data)).strftime('%Y-%m-%d %H:%M:%S'),
                                                                              na_action = 'ignore')
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'])

In [14]:
users['usage_length'] = users['last_session_creation_time'] - users['creation_time']
users['usage_length'] = users['usage_length'].dt.days

In [15]:
users['last_session_creation_time'] = users['last_session_creation_time'].fillna(0)
users['usage_length'] = users['usage_length'].fillna(0)

In [16]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     12000 non-null  int64         
 1   creation_time               12000 non-null  datetime64[ns]
 2   name                        12000 non-null  object        
 3   email                       12000 non-null  object        
 4   creation_source             12000 non-null  object        
 5   last_session_creation_time  12000 non-null  object        
 6   opted_in_to_mailing_list    12000 non-null  int64         
 7   enabled_for_marketing_drip  12000 non-null  int64         
 8   org_id                      12000 non-null  int64         
 9   invited_by_user_id          6417 non-null   float64       
 10  adopted_user                12000 non-null  int64         
 11  usage_length                12000 non-null  float64   

In [17]:
users['invited_by_user_id'] = users['invited_by_user_id'].fillna(0)

## Feature Engineering

In [18]:
user_features = users.iloc[:, 4:]
user_features = user_features.drop('last_session_creation_time', axis=1)

In [19]:
user_features.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,usage_length
0,GUEST_INVITE,1,0,11,10803.0,0,0.0
1,ORG_INVITE,0,0,1,316.0,1,136.0
2,ORG_INVITE,0,0,94,1525.0,0,0.0
3,GUEST_INVITE,0,0,1,5151.0,0,1.0
4,GUEST_INVITE,0,0,193,5240.0,0,5.0


In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
creation_source_labels = le.fit_transform(users['creation_source'])
user_features.creation_source = creation_source_labels

org_id_labels = le.fit_transform(users['org_id'])
user_features.org_id = org_id_labels

invited_labels = le.fit_transform(users['invited_by_user_id'])
user_features.invited_by_user_id = invited_labels

In [21]:
user_features.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,usage_length
0,0,1,0,11,2325,0,0.0
1,1,0,0,1,56,1,136.0
2,1,0,0,94,298,0,0.0
3,0,0,0,1,1104,0,1.0
4,0,0,0,193,1127,0,5.0


In [22]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   creation_source             12000 non-null  int32  
 1   opted_in_to_mailing_list    12000 non-null  int64  
 2   enabled_for_marketing_drip  12000 non-null  int64  
 3   org_id                      12000 non-null  int64  
 4   invited_by_user_id          12000 non-null  int64  
 5   adopted_user                12000 non-null  int64  
 6   usage_length                12000 non-null  float64
dtypes: float64(1), int32(1), int64(5)
memory usage: 609.5 KB


## Predictive Modeling

In [31]:
from sklearn.model_selection import train_test_split

# Test data
X = user_features.drop('adopted_user', axis=1)
y = user_features['adopted_user']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [33]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(class_weight = 'balanced_subsample')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [38]:
from sklearn.metrics import classification_report, confusion_matrix

print('Score: ', rf.score(X_test, y_test))
print(' ')
print('Classification report: ')
print(classification_report(y_test,y_pred))
print(' ')
print('Confusion matrix: ')
print(confusion_matrix(y_test,y_pred))

Score:  0.9737373737373738
 
Classification report: 
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3407
           1       0.93      0.88      0.90       553

    accuracy                           0.97      3960
   macro avg       0.95      0.94      0.94      3960
weighted avg       0.97      0.97      0.97      3960

 
Confusion matrix: 
[[3368   39]
 [  65  488]]


In [45]:
importance_features = pd.DataFrame(zip(list(X.columns),rf.feature_importances_),columns=['feature','importance'])
importance_features.sort_values(by='importance',ascending=False)

Unnamed: 0,feature,importance
5,usage_length,0.905629
3,org_id,0.051308
4,invited_by_user_id,0.025861
0,creation_source,0.010942
1,opted_in_to_mailing_list,0.003328
2,enabled_for_marketing_drip,0.002932


From this model, it shows that the usage length seems to be the most important feature when analyzing if a user will become an adopted user