# Relax Inc 'Take Home Challenge' Solution

In [1]:
from collections import Counter
import numpy as np 
import pandas as pd

### Import and Join Dataframes

In [2]:
user_eng_df = pd.read_csv('takehome_user_engagement.csv')
users_df = pd.read_csv('takehome_users.csv', encoding='latin-1') #would throw error with no encoding

# Full outer join of the 2 tables
data_df = user_eng_df.merge(users_df, left_on='user_id', right_on='object_id', how='outer')

# Remove duplicate/unnecessary columns. 'visited' only has 1 for an entry, trivial column.
del data_df['object_id'], data_df['visited']

### Preprocessing

In [3]:
# If a user is not invited by another user, we give this column a value of 0
data_df.loc[np.isnan(data_df['invited_by_user_id']), 'invited_by_user_id'] = 0

# Drow rows with mostly NaN values
data_df = data_df.dropna()

# Correcting Data Types
data_df['user_id'] = data_df['user_id'].astype(int)
data_df['invited_by_user_id'] = data_df['invited_by_user_id'].astype(int)
data_df['creation_time'] = pd.to_datetime(data_df['creation_time'], format='%Y-%m-%d %H:%M:%S')
data_df['time_stamp'] = pd.to_datetime(data_df['time_stamp'], format='%Y-%m-%d %H:%M:%S')

### 'Adopted User' Label

 - identify  which  factors  predict  future  user adoption .

In [4]:
# Initially set all users as not 'adopted' (0 is false, 1 is true).
data_df['adopted_user'] = 0

# Data of users and how many times each of them have logged in. 
num_logins = data_df['name'].value_counts().values

# Names of users who logged in 3 or more times
valid_users = data_df['name'].value_counts().index[0:len(num_logins[num_logins>=3])]

# Setting a variable to aggregate over
data_df['counter'] = 1
user_labels = []

for idx, user in enumerate(valid_users): 
    user_df = data_df[data_df.name==user]
    # Maximum number of logins in any week for each user
    max_logins = user_df.set_index("time_stamp").resample("7D").sum().dropna()['counter'].max()
    
    if max_logins>=3:
        user_labels.append(1)
    else:
        user_labels.append(0)

# Extracting names of users who are considered 'adopted', and the total num of adopted users
adopted_user_names = valid_users[np.where(np.array(user_labels)==1)]
num_adopted_users = len(adopted_user_names)

# Deleting aggregation counter
del data_df['counter']

# Giving the 'adopted users' their appropriate labels
data_df.loc[data_df['name'].isin(adopted_user_names), 'adopted_user'] = 1

# Do the same for the users dataframe
users_df['adopted_user'] = 0
users_df.loc[users_df['name'].isin(adopted_user_names), 'adopted_user'] = 1

In [5]:
# Deleting a bunch of useless data
del data_df['time_stamp'], data_df['name'], data_df['creation_time']
del data_df['last_session_creation_time']

del users_df['creation_time'], users_df['last_session_creation_time']

### Model Preprocessing

In [6]:
users_df.loc[np.isnan(users_df['invited_by_user_id']), 'invited_by_user_id'] = 0
users_df['invited_by_user_id'] = users_df['invited_by_user_id'].astype(int)

# One hot encode categorical variables
users_df = users_df.join(pd.get_dummies(users_df.creation_source))

# Set the index as the user IDs.
users_df = users_df.set_index(users_df.object_id)

In [9]:
print(users_df)

                          name                                email  \
object_id                                                             
1               Clausen August             AugustCClausen@yahoo.com   
2                Poole Matthew               MatthewPoole@gustr.com   
3            Bottrill Mitchell           MitchellBottrill@gustr.com   
4              Clausen Nicklas            NicklasSClausen@yahoo.com   
5                    Raw Grace                   GraceRaw@yahoo.com   
6                Cunha Eduardo        EduardoPereiraCunha@yahoo.com   
7                 Sewell Tyler           TylerSewell@jourrapide.com   
8            Hamilton Danielle           DanielleHamilton@yahoo.com   
9                   Amsel Paul                PaulAmsel@hotmail.com   
10                Santos Carla        CarlaFerreiraSantos@gustr.com   
11              Paulsen Malthe             MaltheAPaulsen@gustr.com   
12             Mathiesen Lærke            LaerkeLMathiesen@cuvox.de   
13    

In [8]:
del users_df['creation_source'], users_df['object_id']