# Relax Data Science Challenge

In [3]:
# Attempting to load the files with a different encoding
takehome_users = pd.read_csv(users_file_path, encoding='latin1')
takehome_user_engagement = pd.read_csv(engagement_file_path, encoding='latin1')

# Displaying the first few rows of each dataset to ensure they loaded correctly
takehome_users.head(), takehome_user_engagement.head()


(   object_id        creation_time               name  \
 0          1  2014-04-22 03:53:30     Clausen August   
 1          2  2013-11-15 03:45:04      Poole Matthew   
 2          3  2013-03-19 23:14:52  Bottrill Mitchell   
 3          4  2013-05-21 08:09:28    Clausen Nicklas   
 4          5  2013-01-17 10:14:20          Raw Grace   
 
                         email creation_source  last_session_creation_time  \
 0    AugustCClausen@yahoo.com    GUEST_INVITE                1.398139e+09   
 1      MatthewPoole@gustr.com      ORG_INVITE                1.396238e+09   
 2  MitchellBottrill@gustr.com      ORG_INVITE                1.363735e+09   
 3   NicklasSClausen@yahoo.com    GUEST_INVITE                1.369210e+09   
 4          GraceRaw@yahoo.com    GUEST_INVITE                1.358850e+09   
 
    opted_in_to_mailing_list  enabled_for_marketing_drip  org_id  \
 0                         1                           0      11   
 1                         0                      

In [4]:
# Convert timestamps to datetime for better manipulation
takehome_users['creation_time'] = pd.to_datetime(takehome_users['creation_time'])
takehome_users['last_session_creation_time'] = pd.to_datetime(
    takehome_users['last_session_creation_time'], unit='s', errors='coerce'
)
takehome_user_engagement['time_stamp'] = pd.to_datetime(takehome_user_engagement['time_stamp'])

# Defining "adopted users" based on the condition: login on 3 separate days within a 7-day window
user_engagement = takehome_user_engagement.groupby('user_id')['time_stamp'].apply(list)

def is_adopted_user(timestamps):
    # Ensuring timestamps are sorted
    timestamps = sorted(timestamps)
    # Checking for 3 logins within any 7-day period
    for i in range(len(timestamps) - 2):
        if (timestamps[i + 2] - timestamps[i]).days <= 7:
            return 1
    return 0

adopted_user_flags = user_engagement.apply(is_adopted_user).reset_index()
adopted_user_flags.columns = ['user_id', 'is_adopted_user']

# Mergeing the adoption flag with the user dataset
takehome_users = takehome_users.merge(adopted_user_flags, left_on='object_id', right_on='user_id', how='left')
takehome_users['is_adopted_user'].fillna(0, inplace=True)

# Displaying a quick summary of the dataset with the new column
takehome_users.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  takehome_users['is_adopted_user'].fillna(0, inplace=True)


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,is_adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,2014-04-22 03:53:30,1,0,11,10803.0,1.0,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,2014-03-31 03:45:04,0,0,1,316.0,2.0,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,2013-03-19 23:14:52,0,0,94,1525.0,3.0,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,2013-05-22 08:09:28,0,0,1,5151.0,4.0,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,2013-01-22 10:14:20,0,0,193,5240.0,5.0,0.0


In [6]:
# Total number of users and adopted users
total_users = takehome_users['object_id'].nunique()
adopted_users = takehome_users['is_adopted_user'].sum()

# Adoption rates by creation source
creation_source_adoption = takehome_users.groupby('creation_source')['is_adopted_user'].mean()

# Specific adoption rates for ORG_INVITE and SIGNUP
org_invite_adoption_rate = creation_source_adoption.get('ORG_INVITE', 0) * 100
signup_adoption_rate = creation_source_adoption.get('SIGNUP', 0) * 100

# Checking if opting into mailing lists or marketing drip affects adoption rates
mailing_list_adoption = takehome_users.groupby('opted_in_to_mailing_list')['is_adopted_user'].mean()
marketing_drip_adoption = takehome_users.groupby('enabled_for_marketing_drip')['is_adopted_user'].mean()

# Checking if being invited correlates positively with adoption
invited_user_adoption = takehome_users['invited_by_user_id'].notna().mean()
invited_adoption_rate = takehome_users[takehome_users['invited_by_user_id'].notna()]['is_adopted_user'].mean()
non_invited_adoption_rate = takehome_users[takehome_users['invited_by_user_id'].isna()]['is_adopted_user'].mean()

{
    "total_users": total_users,
    "adopted_users": adopted_users,
    "adoption_rate": adopted_users / total_users * 100,
    "org_invite_adoption_rate": org_invite_adoption_rate,
    "signup_adoption_rate": signup_adoption_rate,
    "mailing_list_adoption_rates": mailing_list_adoption.to_dict(),
    "marketing_drip_adoption_rates": marketing_drip_adoption.to_dict(),
    "invited_adoption_rate": invited_adoption_rate,
    "non_invited_adoption_rate": non_invited_adoption_rate,
    "invitation_effect": invited_adoption_rate - non_invited_adoption_rate
}


{'total_users': 12000,
 'adopted_users': 1656.0,
 'adoption_rate': 13.8,
 'org_invite_adoption_rate': 13.493182886694877,
 'signup_adoption_rate': 14.470531863919502,
 'mailing_list_adoption_rates': {0: 0.1362425049966689, 1: 0.1432865731462926},
 'marketing_drip_adoption_rates': {0: 0.13704937304075235,
  1: 0.14341517857142858},
 'invited_adoption_rate': 0.14695340501792115,
 'non_invited_adoption_rate': 0.1277091169622067,
 'invitation_effect': 0.01924428805571446}

The main objective was to identify factors that predict user adoption, defined as users who log in on three separate days within a seven-day window.

Methodology:

Data Cleaning & Preprocessing:
There where timestamps in both datasets for a tiime-series analysis.
the adoption flag was merged with the user dataset.   

Defining "Adopted User":
Marked users as "adopted" if they logged in on three separate days within any seven-day period using sliding window logic. 



Key Findings:
Users invited through ORG_INVITE were more likely to be adopted.

Users who were invited by another user had an adoption rate of 14.70%, compared to 12.77% for those who were not invited.
Being invited correlates positively with adoption, with a 1.92% higher adoption rate for invited users.


Future Considerations:


Predictive Modeling:
Build a logistic regression or decision tree model to quantify the impact of each factor on adoption.
Retention Analysis:
Assess long-term engagement trends among adopted users for actionable insights.
