In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine

In [3]:
# Read in data tables
engagement = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv',encoding="latin-1")

In [13]:
# Check general form of tables
engagement.head(20)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [8]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [11]:
print(f'{len(users)} users logged in a total of {len(engagement)} times.')

12000 users logged in a total of 207917 times.


In [22]:
# Let's try completing this one with SQL instead.
engine = create_engine('sqlite:///sqlengine.db')

In [23]:
engagement.to_sql('engagement',engine)
users.to_sql('users',engine)

12000

In [24]:
# Every login counts for only one visit
pd.read_sql('SELECT DISTINCT visited FROM engagement',engine)

Unnamed: 0,visited
0,1


In [26]:
# This shows that not every user ever logs in on the engagement table
pd.read_sql('SELECT DISTINCT user_id FROM engagement LIMIT 10', engine)

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,10
8,11
9,13


In [31]:
pd.read_sql('SELECT user_id, SUM(visited) AS visits FROM engagement GROUP BY user_id HAVING visits >= 3',engine)

Unnamed: 0,user_id,visits
0,2,14
1,10,284
2,20,7
3,33,18
4,42,342
...,...,...
2243,11975,216
2244,11980,8
2245,11981,4
2246,11988,30


In [None]:
pd.read_sql('CREATE TABLE active AS SELECT user_id, SUM(visited) AS "visits" FROM engagement GROUP BY user_id HAVING visits >= 3',engine)

In [36]:
pd.read_sql('SELECT * FROM active LIMIT 10',engine)

Unnamed: 0,user_id,visits
0,2,14
1,10,284
2,20,7
3,33,18
4,42,342
5,43,8
6,50,17
7,53,9
8,59,3
9,60,8


In [71]:
# Total logins by only users who have logged in more than 3 times
total_active_logins = pd.read_sql('SELECT COUNT(*) FROM engagement WHERE user_id IN (SELECT user_id FROM active)',engine).values[0][0]
total_active_logins

201002

In [72]:
# Total users who have logged in more than 3 times
total_active_users = pd.read_sql('SELECT COUNT(DISTINCT(user_id)) FROM active', engine).values[0][0]
total_active_users

2248

In [73]:
# Total logins by all users
total_logins = pd.read_sql('SELECT COUNT(*) FROM engagement',engine).values[0][0]
total_logins

207917

In [75]:
# Total number of users
total_users = pd.read_sql('SELECT COUNT(DISTINCT object_id) FROM users', engine).values[0][0]
total_users

12000

In [84]:
# Number of users who never logged in
inactive_users = pd.read_sql('SELECT COUNT(*) FROM users WHERE object_id NOT IN (SELECT user_id FROM engagement)',engine).values[0][0]
inactive_users

3177

In [97]:
print(f"To sum up, of the {total_logins} total logins by {total_users} total users, {total_active_logins} \
({round(total_active_logins/total_logins*100)}% of logins)\nwere done by the {total_active_users} \
users who had logged in more than 3 times (just {round(total_active_users/total_users*100)}% of users). \n\
In fact, {inactive_users} users ({round(inactive_users/total_users*100)}% of users who created an account) \
never logged in at all!")

To sum up, of the 207917 total logins by 12000 total users, 201002 (97% of logins)
were done by the 2248 users who had logged in more than 3 times (just 19% of users). 
In fact, 3177 users (26% of users who created an account) never logged in at all!


In [102]:
print(f"The average number of logins per user with an account is {round(total_logins/total_users,1)}.")
print(f"The average number of logins per user who logged in at least once is {round(total_logins/(total_users - inactive_users),1)}.")
print(f"The average number of logins per user who logged in at least 3 times is {round(total_active_logins/total_active_users,1)}.")

The average number of logins per user with an account is 17.3.
The average number of logins per user who logged in at least once is 23.6.
The average number of logins per user who logged in at least 3 times is 89.4.


In [103]:
pd.read_sql('SELECT * FROM engagement LIMIT 5', engine)

Unnamed: 0,index,time_stamp,user_id,visited
0,0,2014-04-22 03:53:30,1,1
1,1,2013-11-15 03:45:04,2,1
2,2,2013-11-29 03:45:04,2,1
3,3,2013-12-09 03:45:04,2,1
4,4,2013-12-25 03:45:04,2,1


In [None]:
# Indexing to make the query run faster
pd.read_sql('CREATE INDEX idx_time_stamp ON engagement(time_stamp)', engine)
pd.read_sql('CREATE INDEX idx_user_id ON engagement(user_id)', engine)

In [137]:
# Find logins within one week of each other (for only users who logged in at least 3 times)
query = '''SELECT x.user_id,
x.time_stamp AS first_stamp,
y.time_stamp AS second_stamp
FROM engagement as x 
JOIN engagement as y
ON x.user_id = y.user_id
WHERE x.user_id IN 
(SELECT user_id FROM active)
AND x.time_stamp < y.time_stamp
AND y.time_stamp <= DATE(x.time_stamp, '7 days')'''
with_timestamps = pd.read_sql(query, engine)

In [139]:
with_timestamps.to_sql('twotimes', engine)

702974

In [140]:
with_timestamps.head()

Unnamed: 0,user_id,first_stamp,second_stamp
0,2,2013-12-25 03:45:04,2013-12-31 03:45:04
1,2,2014-02-03 03:45:04,2014-02-08 03:45:04
2,2,2014-02-03 03:45:04,2014-02-09 03:45:04
3,2,2014-02-08 03:45:04,2014-02-09 03:45:04
4,2,2014-02-08 03:45:04,2014-02-13 03:45:04


In [142]:
query = '''SELECT x.user_id, 
x.first_stamp, 
x.second_stamp, 
y.second_stamp AS third_stamp
FROM twotimes AS x
JOIN twotimes AS y
ON x.user_id = y.user_id
WHERE x.second_stamp < y.second_stamp
AND y.second_stamp < DATE(x.first_stamp, '7 days')'''
three_weekly = pd.read_sql(query, engine)

In [144]:
three_weekly.to_sql('adopted',engine)

6629928

In [147]:
pd.read_sql('SELECT COUNT(DISTINCT user_id) FROM adopted',engine)

Unnamed: 0,COUNT(DISTINCT user_id)
0,1602


In [148]:
three_weekly.head()

Unnamed: 0,user_id,first_stamp,second_stamp,third_stamp
0,2,2014-02-03 03:45:04,2014-02-08 03:45:04,2014-02-09 03:45:04
1,2,2014-02-03 03:45:04,2014-02-08 03:45:04,2014-02-09 03:45:04
2,2,2014-02-08 03:45:04,2014-02-09 03:45:04,2014-02-13 03:45:04
3,2,2014-02-08 03:45:04,2014-02-09 03:45:04,2014-02-13 03:45:04
4,10,2013-02-14 22:08:03,2013-02-17 22:08:03,2013-02-19 22:08:03


In [149]:
three_weekly.tail()

Unnamed: 0,user_id,first_stamp,second_stamp,third_stamp
6629923,11988,2014-05-24 11:04:47,2014-05-26 11:04:47,2014-05-27 11:04:47
6629924,11988,2014-05-24 11:04:47,2014-05-26 11:04:47,2014-05-27 11:04:47
6629925,11988,2014-05-24 11:04:47,2014-05-26 11:04:47,2014-05-27 11:04:47
6629926,11988,2014-05-26 11:04:47,2014-05-27 11:04:47,2014-06-01 11:04:47
6629927,11988,2014-05-26 11:04:47,2014-05-27 11:04:47,2014-06-01 11:04:47


In [None]:
# Creates a column in the users table to indicate whether they are an adopted user
query = '''ALTER TABLE users 
ADD COLUMN adopted INTEGER'''
pd.read_sql(query,engine)

In [153]:
pd.read_sql('''SELECT * FROM users LIMIT 5''', engine)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,


In [156]:
query = '''UPDATE users
SET adopted = CASE
WHEN object_id IN 
(SELECT DISTINCT user_id FROM adopted)
THEN 1
ELSE 0
END'''

In [None]:
pd.read_sql(query, engine)

In [159]:
pd.read_sql('SELECT * FROM users LIMIT 5', engine)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [161]:
# Double check that it worked
pd.read_sql('SELECT * FROM users WHERE adopted = 1 LIMIT 5', engine)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
1,9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0,1
2,19,20,2014-03-06 11:46:38,Helms Mikayla,lqyvjilf@uhzdq.com,SIGNUP,1401364000.0,0,0,58,,1
3,32,33,2014-03-11 06:29:09,Araujo José,JoseMartinsAraujo@cuvox.de,GUEST_INVITE,1401518000.0,0,0,401,79.0,1
4,41,42,2012-11-11 19:05:07,Pinto Giovanna,GiovannaCunhaPinto@cuvox.de,SIGNUP,1401045000.0,1,0,235,,1


In [163]:
pd.read_sql('SELECT DISTINCT(user_id) FROM adopted LIMIT 5', engine)

Unnamed: 0,user_id
0,2
1,10
2,20
3,33
4,42


In [165]:
full_users = pd.read_sql('SELECT * FROM users', engine)

In [167]:
# Check for null values
full_users.isna().sum()

index                            0
object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
adopted                          0
dtype: int64

In [169]:
full_users.dtypes

index                           int64
object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
adopted                         int64
dtype: object

In [177]:
# Change last_session_creation_time from a float into a datetime
full_users['last_session_creation_time'] = pd.to_datetime(full_users.last_session_creation_time*1e9)

In [182]:
full_users['invited_by_user_id'] = full_users['invited_by_user_id'].fillna(0).astype('int64')

In [184]:
full_users.head(10)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,0
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,1
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,0
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,0
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,0
5,5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,2013-12-19 03:37:06,0,0,197,11241,0
6,6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,2012-12-20 13:24:32,0,1,37,0,0
7,7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,NaT,1,1,74,0,0
8,8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,NaT,0,0,302,0,0
9,9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,2014-06-03 22:08:03,1,1,318,4143,1


In [187]:
full_users['invited'] = full_users['invited_by_user_id'] > 0

In [200]:
full_users['creation_time'] = pd.to_datetime(full_users['creation_time'])

In [205]:
full_users['time_active'] = full_users['last_session_creation_time'] - full_users['creation_time']

In [206]:
full_users.head(10)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,invited,time_active
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,0,True,0 days
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,1,True,136 days
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,0,True,0 days
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,0,True,1 days
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,0,True,5 days
5,5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,2013-12-19 03:37:06,0,0,197,11241,0,True,2 days
6,6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,2012-12-20 13:24:32,0,1,37,0,0,False,4 days
7,7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,NaT,1,1,74,0,0,False,NaT
8,8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,NaT,0,0,302,0,0,False,NaT
9,9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,2014-06-03 22:08:03,1,1,318,4143,1,True,503 days


In [208]:
full_users.dtypes

index                                   int64
object_id                               int64
creation_time                  datetime64[ns]
name                                   object
email                                  object
creation_source                        object
last_session_creation_time     datetime64[ns]
opted_in_to_mailing_list                int64
enabled_for_marketing_drip              int64
org_id                                  int64
invited_by_user_id                      int64
adopted                                 int64
invited                                  bool
time_active                   timedelta64[ns]
dtype: object

In [227]:
full_users['days_active'] = full_users['time_active'].dt.days.fillna(-1).astype('int64')

In [228]:
full_users.dtypes

index                                   int64
object_id                               int64
creation_time                  datetime64[ns]
name                                   object
email                                  object
creation_source                        object
last_session_creation_time     datetime64[ns]
opted_in_to_mailing_list                int64
enabled_for_marketing_drip              int64
org_id                                  int64
invited_by_user_id                      int64
adopted                                 int64
invited                                  bool
time_active                   timedelta64[ns]
days_active                             int64
dtype: object

In [229]:
full_users.head(10)

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,invited,time_active,days_active
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,0,True,0 days,0
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,1,True,136 days,136
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,0,True,0 days,0
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,0,True,1 days,1
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,0,True,5 days,5
5,5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,2013-12-19 03:37:06,0,0,197,11241,0,True,2 days,2
6,6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,2012-12-20 13:24:32,0,1,37,0,0,False,4 days,4
7,7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,NaT,1,1,74,0,0,False,NaT,-1
8,8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,NaT,0,0,302,0,0,False,NaT,-1
9,9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,2014-06-03 22:08:03,1,1,318,4143,1,True,503 days,503


In [231]:
pd.get_dummies(full_users['creation_source']).sum()

GUEST_INVITE          2163
ORG_INVITE            4254
PERSONAL_PROJECTS     2111
SIGNUP                2087
SIGNUP_GOOGLE_AUTH    1385
dtype: int64

In [238]:
# Oops! The "invited" column I made out of the invited_by_user_id column turns out to just be 
# a combination of GUEST_INVITE and ORG_INVITE, and does not add new information! 
pd.get_dummies(full_users[full_users['invited']]['creation_source']).sum()

GUEST_INVITE    2163
ORG_INVITE      4254
dtype: int64

In [241]:
full_users = full_users.join(pd.get_dummies(full_users['creation_source'],prefix='source',drop_first=True))

In [242]:
full_users.columns

Index(['index', 'object_id', 'creation_time', 'name', 'email',
       'creation_source', 'last_session_creation_time',
       'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',
       'invited_by_user_id', 'adopted', 'invited', 'time_active',
       'days_active', 'source_ORG_INVITE', 'source_PERSONAL_PROJECTS',
       'source_SIGNUP', 'source_SIGNUP_GOOGLE_AUTH'],
      dtype='object')

In [269]:
# If we include this one, the model will likely treat it as if they're ordered,
# when really they're categorical. And there's far too many to One-Hot Encode. 
# We could encode the top organizations and call the other miscellaneous. 
# I've chosen every organization over 90 users, because that's the first 11 organizations in order.
org_counts = full_users['org_id'].value_counts()
top_orgs = (org_counts[org_counts > 90])
top_orgs

org_id
0     319
1     233
2     201
3     168
4     159
6     138
5     128
9     124
7     119
10    104
8      97
Name: count, dtype: int64

In [271]:
print(f"The top 11 organizations have {top_orgs.sum()} users, which is \
{round(top_orgs.sum()/len(full_users)*100)}% of users")

The top 11 organizations have 1790 users, which is 15% of users


In [275]:
full_users['top_org'] = full_users.apply(lambda x: x['org_id'] in top_orgs.index, axis=1)

In [276]:
full_users.head()

Unnamed: 0,index,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,invited,time_active,days_active,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH,top_org
0,0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803,0,True,0 days,0,False,False,False,False,False
1,1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316,1,True,136 days,136,True,False,False,False,True
2,2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525,0,True,0 days,0,True,False,False,False,False
3,3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151,0,True,1 days,1,False,False,False,False,True
4,4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240,0,True,5 days,5,False,False,False,False,False


In [277]:
predictive_columns = ['creation_time', 
       'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 
       'days_active', 'top_org', 'source_ORG_INVITE', 'source_PERSONAL_PROJECTS',
       'source_SIGNUP', 'source_SIGNUP_GOOGLE_AUTH'] 

In [278]:
predictive_columns

['creation_time',
 'opted_in_to_mailing_list',
 'enabled_for_marketing_drip',
 'days_active',
 'top_org',
 'source_ORG_INVITE',
 'source_PERSONAL_PROJECTS',
 'source_SIGNUP',
 'source_SIGNUP_GOOGLE_AUTH']

In [279]:
X = full_users[predictive_columns]
X.head()

Unnamed: 0,creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,days_active,top_org,source_ORG_INVITE,source_PERSONAL_PROJECTS,source_SIGNUP,source_SIGNUP_GOOGLE_AUTH
0,2014-04-22 03:53:30,1,0,0,False,False,False,False,False
1,2013-11-15 03:45:04,0,0,136,True,True,False,False,False
2,2013-03-19 23:14:52,0,0,0,False,True,False,False,False
3,2013-05-21 08:09:28,0,0,1,True,False,False,False,False
4,2013-01-17 10:14:20,0,0,5,False,False,False,False,False


In [248]:
y = full_users['adopted']

In [253]:
from sklearn.model_selection import train_test_split

In [280]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [282]:
len(X_train)

9600