In [1]:
import pandas as pd
import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('takehome_users.csv', parse_dates=[1, 5], encoding='ansi')
engagement = pd.read_csv('takehome_user_engagement.csv', parse_dates=[0])

In [3]:
df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398138810,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396237504,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363734892,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210168,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358849660,0,0,193,5240.0


In [4]:
df['last_session_creation_time'] = pd.to_datetime(df['last_session_creation_time'], unit='s')
df['email_domain'] = [re.split('[@.]', x)[1] for x in df['email']]

In [5]:
df.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id                   float64
email_domain                          object
dtype: object

In [6]:
engagement.set_index('time_stamp', inplace=True)

In [7]:
engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


In [8]:
print(len(df['object_id'].unique()))
print(len(engagement['user_id'].unique()))

12000
8823


In [9]:
user_ids = engagement['user_id'].unique()
possible_adopted_users = user_ids[engagement.groupby('user_id')['visited'].sum() >= 3]

In [10]:
len(possible_adopted_users)

2248

In [11]:
df['adopted'] = False
for user in possible_adopted_users:
    adopted = (engagement[engagement['user_id']==user].resample('d').sum()['visited'].rolling(7).sum() >= 3).any()
    df.loc[df['object_id']==user, 'adopted'] = adopted

In [12]:
df['adopted'].sum()

1597

In [13]:
df = df.drop(['creation_time', 'name', 'email', 'last_session_creation_time'], axis=1).set_index('object_id')
df['creation_source'] = df['creation_source'].astype('category')
df['email_domain'] = df['email_domain'].astype('category')

In [14]:
df.head()

Unnamed: 0_level_0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,email_domain,adopted
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,GUEST_INVITE,1,0,11,10803.0,yahoo,False
2,ORG_INVITE,0,0,1,316.0,gustr,True
3,ORG_INVITE,0,0,94,1525.0,gustr,False
4,GUEST_INVITE,0,0,1,5151.0,yahoo,False
5,GUEST_INVITE,0,0,193,5240.0,yahoo,False


In [15]:
df.profile_report()



Only 6 email domains account for 90% of values. I will group the rest in 'other'

In [16]:
email_levels = ['gmail', 'yahoo', 'jourrapide', 'cuvox', 'gustr', 'hotmail', 'other']
df['email_domain'].cat.set_categories(email_levels, inplace=True)
df['email_domain'].fillna('other', inplace=True)

In [17]:
df['email_domain'].value_counts()

gmail         3562
yahoo         2447
jourrapide    1259
cuvox         1202
other         1186
gustr         1179
hotmail       1165
Name: email_domain, dtype: int64

In [18]:
df['invited_by_user_id'].fillna(0, inplace=True)

In [19]:
X = df.iloc[:, :-1]
X = pd.get_dummies(X, drop_first=True)
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
y_pred = lr.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[2594  406]
 [   0    0]]
              precision    recall  f1-score   support

       False       1.00      0.86      0.93      3000
        True       0.00      0.00      0.00         0

   micro avg       0.86      0.86      0.86      3000
   macro avg       0.50      0.43      0.46      3000
weighted avg       1.00      0.86      0.93      3000



In [21]:
y_pred = rf.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[2419  380]
 [ 175   26]]
              precision    recall  f1-score   support

       False       0.93      0.86      0.90      2799
        True       0.06      0.13      0.09       201

   micro avg       0.81      0.81      0.81      3000
   macro avg       0.50      0.50      0.49      3000
weighted avg       0.87      0.81      0.84      3000



In [24]:
from imblearn.over_sampling import SMOTE

In [25]:
X = df.iloc[:, :-1]
X = pd.get_dummies(X, drop_first=True)
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
y_pred = lr.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[ 533   74]
 [2061  332]]
              precision    recall  f1-score   support

       False       0.21      0.88      0.33       607
        True       0.82      0.14      0.24      2393

   micro avg       0.29      0.29      0.29      3000
   macro avg       0.51      0.51      0.29      3000
weighted avg       0.69      0.29      0.26      3000



In [27]:
y_pred = rf.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[2376  372]
 [ 218   34]]
              precision    recall  f1-score   support

       False       0.92      0.86      0.89      2748
        True       0.08      0.13      0.10       252

   micro avg       0.80      0.80      0.80      3000
   macro avg       0.50      0.50      0.50      3000
weighted avg       0.85      0.80      0.82      3000



In [28]:
df.drop('email_domain', inplace=True, axis=1)

In [29]:
X = df.iloc[:, :-1]
X = pd.get_dummies(X, drop_first=True)
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_train, y_train)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
y_pred = lr.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[ 534   74]
 [2060  332]]
              precision    recall  f1-score   support

       False       0.21      0.88      0.33       608
        True       0.82      0.14      0.24      2392

   micro avg       0.29      0.29      0.29      3000
   macro avg       0.51      0.51      0.29      3000
weighted avg       0.69      0.29      0.26      3000



In [31]:
y_pred = rf.predict(X_test)
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

[[2286  360]
 [ 308   46]]
              precision    recall  f1-score   support

       False       0.88      0.86      0.87      2646
        True       0.11      0.13      0.12       354

   micro avg       0.78      0.78      0.78      3000
   macro avg       0.50      0.50      0.50      3000
weighted avg       0.79      0.78      0.78      3000



In [32]:
pd.DataFrame(lr.coef_.T, index=X.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
org_id,0.0007436234
creation_source_SIGNUP_GOOGLE_AUTH,1.480408e-06
creation_source_SIGNUP,5.133507e-07
opted_in_to_mailing_list,3.043591e-07
enabled_for_marketing_drip,1.810119e-07
creation_source_ORG_INVITE,-9.853868e-07
creation_source_PERSONAL_PROJECTS,-3.588379e-06
invited_by_user_id,-7.409716e-06


In [33]:
pd.DataFrame(rf.feature_importances_, index=X.columns).sort_values(by=0, ascending=False)

Unnamed: 0,0
org_id,0.347366
invited_by_user_id,0.183357
opted_in_to_mailing_list,0.13027
creation_source_ORG_INVITE,0.106648
enabled_for_marketing_drip,0.077373
creation_source_PERSONAL_PROJECTS,0.074802
creation_source_SIGNUP,0.040862
creation_source_SIGNUP_GOOGLE_AUTH,0.039322
