In [1]:
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
%run functions.ipynb
configurate_display()

# Introduction

Predicction: if an user is going to adopt after the trial.

Identity: user (field: user_id)

Snapshot: third trial day* (field: days_since_trial_start)

Target: user becomes customer (field: customer)  


*Goal: "ideally in the first days of trial" It will be explained later.

# Variables

In [3]:
snapshot = 3
period_init = pd.to_datetime('20201201', format='%Y%m%d')
period_end = pd.to_datetime('20201231', format='%Y%m%d')

# Data reading

In [4]:
df_onboarding = read_file_tsv("tables/users_onboarding_paywall.tsv")
df_onboarding.shape

(42239, 20)

In [5]:
df_activities = read_file_tsv("tables/activities_per_day.tsv")
df_activities.shape

(109764, 23)

# Data cleaning

In [6]:
#Unique users on onboarding. 
'''There are repeated users in onboarding table. I couldn't find any pattern in a reduced time so I decided to ignore them 
because there are less than 500 cases.'''

df_onboarding.drop_duplicates(subset='user_id', keep = False, inplace = True)

In [7]:
#*Prediction "ideally in the first days of trial"
'''
Because the goal say: ideally in the first days of trial, I decided to train with snapshots close to the first trial day.

Analyzing the data, exists activities which days_since_trial_start field is unknown so that samples will be dropped 
because I can find a commun sense or logical reason. Ideally should be understand the reasons and use that data if we are 
sure is not future information.

I will analice which trial day has the best trade-off between min trial day and max user activity (more samples).

The following table shows that if I choose activities where days_since_trial_start are in [0, 1, 2] I can use 91% of the samples. 
I think that is a good final quantity for this demo.
Ideally, this decision may consider the usage of the model and the stakeholders opinion.
'''

first_activity = df_activities.groupby(['user_id'])['days_since_trial_start'].min().to_frame().reset_index()
first_activity.groupby('days_since_trial_start').count().reset_index()

Unnamed: 0,days_since_trial_start,user_id
0,0,22279
1,1,823
2,2,679
3,3,610
4,4,599
5,5,569
6,6,573


In [8]:
df_activities = df_activities[df_activities.days_since_trial_start.isin({0,1,2})]

In [9]:
#Samples without enough maduration: they need at least 3 days of activity after the start of the trial.
df_onboarding = df_onboarding[str_to_dt(df_onboarding, 'subscription_at') <= (period_end - np.timedelta64(snapshot,'D'))]

In [10]:
#Parse data
df_onboarding['onboarding_home_at_dt'] = str_to_dt(df_onboarding, 'onboarding_home_at')
df_onboarding['subscription_at_dt'] = str_to_dt(df_onboarding, 'subscription_at')

# Feature building

### Onboarding features

I assume that data of the onboarding table is obtained before the trial starts. Except: customer

In [11]:
#Subscription delay hypothesis: if an user think more on the subscription that could be related with no adoption.
df_onboarding['min_btw_onboarding_home_and_subscripition'] = df_onboarding['subscription_at_dt'] - df_onboarding['onboarding_home_at_dt']
to_minutes(df_onboarding, 'min_btw_onboarding_home_and_subscripition')

In [12]:
#Seasonality hypothesis: if the trial is during the salary period the users could afford the subscription.
df_onboarding['subscription_seasonality'] = df_onboarding['subscription_at_dt'].dt.day

In [13]:
#Continuous version hypothesis: the higher version number, the later mobile model.
split_version = df_onboarding['os_version'].str.split('.')
df_onboarding['os_version_cont'] = split_version.str[:2].str.join('.')
df_onboarding['os_version_cont'] = pd.to_numeric(df_onboarding.os_version_cont, errors='coerce')

In [14]:
#Simplified timezone hypothesis: too many branches of the data could make a categorical feature unusable.
split_timezone = df_onboarding['timezone'].str.split('/')
df_onboarding['timezone_continent'] = split_timezone.str[0]

In [15]:
df_onboarding_features = df_onboarding[['user_id', 'os_name', 'os_version_cont', 'country', 'timezone_continent', 'locale', 
                                        'source', 'level', 'age', 'signup_provider', 'price', 'currency', 
                                        'payment_platform', 'min_btw_onboarding_home_and_subscripition', 
                                        'subscription_seasonality']]

### Activities features

In [16]:
#I will consider the data per day because disconnections are not necessary for the quantification of the activities.
df_activities_group_per_day = df_activities.groupby(['user_id', 'days_since_trial_start']).agg(
    n_activities_game = pd.NamedAgg(column = "n_activities_game", aggfunc= "sum"),
    n_activities_video = pd.NamedAgg(column = "n_activities_video", aggfunc= "sum"),
    n_activities_traceable = pd.NamedAgg(column = "n_activities_traceable", aggfunc= "sum"),
    n_activities_audiobook = pd.NamedAgg(column = "n_activities_audiobook", aggfunc= "sum"),
    n_activities_completed_game = pd.NamedAgg(column = "n_activities_completed_game", aggfunc= "sum"),
    n_activities_completed_video = pd.NamedAgg(column = "n_activities_completed_video", aggfunc= "sum"),
    n_activities_completed_traceable = pd.NamedAgg(column = "n_activities_completed_traceable", aggfunc= "sum"),
    n_activities_completed_audiobook = pd.NamedAgg(column = "n_activities_completed_audiobook", aggfunc= "sum"),

    duration_game = pd.NamedAgg(column = "total_duration_game", aggfunc= "sum"),
    duration_video = pd.NamedAgg(column = "total_duration_video", aggfunc= "sum"),
    duration_traceable = pd.NamedAgg(column = "total_duration_traceable", aggfunc= "sum"),
    duration_audiobook = pd.NamedAgg(column = "total_duration_audiobook", aggfunc= "sum"),
    
    qty_sessions_per_day = pd.NamedAgg(column = "n_sessions", aggfunc= "count"),
    
    
).reset_index()

#Total per day
df_activities_group_per_day['n_activities_tot'] = df_activities_group_per_day['n_activities_game'] + df_activities_group_per_day['n_activities_video'] + df_activities_group_per_day['n_activities_traceable'] + df_activities_group_per_day['n_activities_audiobook']
df_activities_group_per_day['n_activities_completed_tot'] = df_activities_group_per_day['n_activities_completed_game'] + df_activities_group_per_day['n_activities_completed_video'] + df_activities_group_per_day['n_activities_completed_traceable'] + df_activities_group_per_day['n_activities_completed_audiobook']
df_activities_group_per_day['duration_tot'] = df_activities_group_per_day['duration_game'] + df_activities_group_per_day['duration_video'] + df_activities_group_per_day['duration_traceable'] + df_activities_group_per_day['duration_audiobook']


In [17]:
df_activities_features = df_activities_group_per_day.groupby('user_id').agg(
    #Total
    n_activities_game_tot = pd.NamedAgg(column = "n_activities_game", aggfunc= "sum"),
    n_activities_video_tot = pd.NamedAgg(column = "n_activities_video", aggfunc= "sum"),
    n_activities_traceable_tot = pd.NamedAgg(column = "n_activities_traceable", aggfunc= "sum"),
    n_activities_audiobook_tot = pd.NamedAgg(column = "n_activities_audiobook", aggfunc= "sum"),
    n_activities_completed_game_tot = pd.NamedAgg(column = "n_activities_completed_game", aggfunc= "sum"),
    n_activities_completed_video_tot = pd.NamedAgg(column = "n_activities_completed_video", aggfunc= "sum"),
    n_activities_completed_traceable_tot = pd.NamedAgg(column = "n_activities_completed_traceable", aggfunc= "sum"),
    n_activities_completed_audiobook_tot = pd.NamedAgg(column = "n_activities_completed_audiobook", aggfunc= "sum"),
    
    duration_game_tot = pd.NamedAgg(column = "duration_game", aggfunc= "sum"),
    duration_video_tot = pd.NamedAgg(column = "duration_video", aggfunc= "sum"),
    duration_traceable_tot = pd.NamedAgg(column = "duration_traceable", aggfunc= "sum"),
    duration_audiobook_tot = pd.NamedAgg(column = "duration_audiobook", aggfunc= "sum"),
    
    qty_sessions_per_day_tot = pd.NamedAgg(column = "qty_sessions_per_day", aggfunc= "sum"),
    
    #MIN
    n_activities_game_min = pd.NamedAgg(column = "n_activities_game", aggfunc= "min"),
    n_activities_video_min = pd.NamedAgg(column = "n_activities_video", aggfunc= "min"),
    n_activities_traceable_min = pd.NamedAgg(column = "n_activities_traceable", aggfunc= "min"),
    n_activities_audiobook_min = pd.NamedAgg(column = "n_activities_audiobook", aggfunc= "min"),
    n_activities_completed_game_min = pd.NamedAgg(column = "n_activities_completed_game", aggfunc= "min"),
    n_activities_completed_video_min = pd.NamedAgg(column = "n_activities_completed_video", aggfunc= "min"),
    n_activities_completed_traceable_min = pd.NamedAgg(column = "n_activities_completed_traceable", aggfunc= "min"),
    n_activities_completed_audiobook_min = pd.NamedAgg(column = "n_activities_completed_audiobook", aggfunc= "min"),
    
    duration_game_min = pd.NamedAgg(column = "duration_game", aggfunc= "min"),
    duration_video_min = pd.NamedAgg(column = "duration_video", aggfunc= "min"),
    duration_traceable_min = pd.NamedAgg(column = "duration_traceable", aggfunc= "min"),
    duration_audiobook_min = pd.NamedAgg(column = "duration_audiobook", aggfunc= "min"),
    
    qty_sessions_per_day_min = pd.NamedAgg(column = "qty_sessions_per_day", aggfunc= "min"),
    
    #MAX
    n_activities_game_max = pd.NamedAgg(column = "n_activities_game", aggfunc= "max"),
    n_activities_video_max = pd.NamedAgg(column = "n_activities_video", aggfunc= "max"),
    n_activities_traceable_max = pd.NamedAgg(column = "n_activities_traceable", aggfunc= "max"),
    n_activities_audiobook_max = pd.NamedAgg(column = "n_activities_audiobook", aggfunc= "max"),
    n_activities_completed_game_max = pd.NamedAgg(column = "n_activities_completed_game", aggfunc= "max"),
    n_activities_completed_video_max = pd.NamedAgg(column = "n_activities_completed_video", aggfunc= "max"),
    n_activities_completed_traceable_max = pd.NamedAgg(column = "n_activities_completed_traceable", aggfunc= "max"),
    n_activities_completed_audiobook_max = pd.NamedAgg(column = "n_activities_completed_audiobook", aggfunc= "max"),
    
    duration_game_max = pd.NamedAgg(column = "duration_game", aggfunc= "max"),
    duration_video_max = pd.NamedAgg(column = "duration_video", aggfunc= "max"),
    duration_traceable_max = pd.NamedAgg(column = "duration_traceable", aggfunc= "max"),
    duration_audiobook_max = pd.NamedAgg(column = "duration_audiobook", aggfunc= "max"),
    
    qty_sessions_per_day_max = pd.NamedAgg(column = "qty_sessions_per_day", aggfunc= "max"),
    
    #AVG
    n_activities_game_avg = pd.NamedAgg(column = "n_activities_game", aggfunc= np.mean),
    n_activities_video_avg = pd.NamedAgg(column = "n_activities_video", aggfunc= np.mean),
    n_activities_traceable_avg = pd.NamedAgg(column = "n_activities_traceable", aggfunc= np.mean),
    n_activities_audiobook_avg = pd.NamedAgg(column = "n_activities_audiobook", aggfunc= np.mean),
    n_activities_completed_game_avg = pd.NamedAgg(column = "n_activities_completed_game", aggfunc= np.mean),
    n_activities_completed_video_avg = pd.NamedAgg(column = "n_activities_completed_video", aggfunc= np.mean),
    n_activities_completed_traceable_avg = pd.NamedAgg(column = "n_activities_completed_traceable", aggfunc= np.mean),
    n_activities_completed_audiobook_avg = pd.NamedAgg(column = "n_activities_completed_audiobook", aggfunc= np.mean),
    
    duration_game_avg = pd.NamedAgg(column = "duration_game", aggfunc= np.mean),
    duration_video_avg = pd.NamedAgg(column = "duration_video", aggfunc= np.mean),
    duration_traceable_avg = pd.NamedAgg(column = "duration_traceable", aggfunc= np.mean),
    duration_audiobook_avg = pd.NamedAgg(column = "duration_audiobook", aggfunc= np.mean),
    
    qty_sessions_per_day_avg = pd.NamedAgg(column = "qty_sessions_per_day", aggfunc= np.mean),
    
    #Qty days
    connected_day_min = pd.NamedAgg(column = "days_since_trial_start", aggfunc= "min"),
    connected_day_max = pd.NamedAgg(column = "days_since_trial_start", aggfunc= "max"),
    connected_days_qty = pd.NamedAgg(column = "days_since_trial_start", aggfunc= "sum"),

    
).reset_index()

In [18]:
#ACTIVITIES PORCENTAJES
#Total
df_activities_features['n_activities_tot'] = df_activities_features['n_activities_game_tot'] + df_activities_features['n_activities_video_tot'] + df_activities_features['n_activities_traceable_tot'] + df_activities_features['n_activities_audiobook_tot']
df_activities_features['n_activities_completed_tot'] = df_activities_features['n_activities_completed_game_tot'] + df_activities_features['n_activities_completed_video_tot'] + df_activities_features['n_activities_completed_traceable_tot'] + df_activities_features['n_activities_completed_audiobook_tot']

#Percentage of complete activities
df_activities_features['n_activities_completed_porc'] = df_activities_features['n_activities_completed_tot']/df_activities_features['n_activities_tot']

df_activities_features['n_activities_completed_game_porc'] = df_activities_features['n_activities_completed_game_tot']/ df_activities_features['n_activities_game_tot']
df_activities_features['n_activities_completed_video_porc'] = df_activities_features['n_activities_completed_video_tot']/ df_activities_features['n_activities_video_tot']
df_activities_features['n_activities_completed_traceable_porc'] = df_activities_features['n_activities_completed_traceable_tot']/ df_activities_features['n_activities_traceable_tot']
df_activities_features['n_activities_completed_audiobook_porc'] = df_activities_features['n_activities_completed_audiobook_tot']/ df_activities_features['n_activities_audiobook_tot']

#Share type of activity
df_activities_features['n_activities_completed_game_share'] = df_activities_features['n_activities_completed_game_tot']/ df_activities_features['n_activities_completed_tot']
df_activities_features['n_activities_completed_video_share'] = df_activities_features['n_activities_completed_video_tot']/ df_activities_features['n_activities_completed_tot']
df_activities_features['n_activities_completed_traceable_share'] = df_activities_features['n_activities_completed_traceable_tot']/ df_activities_features['n_activities_completed_tot']
df_activities_features['n_activities_completed_audiobook_share'] = df_activities_features['n_activities_completed_audiobook_tot']/ df_activities_features['n_activities_completed_tot']

df_activities_features['n_activities_game_share'] = df_activities_features['n_activities_game_tot']/ df_activities_features['n_activities_tot']
df_activities_features['n_activities_video_share'] = df_activities_features['n_activities_video_tot']/ df_activities_features['n_activities_tot']
df_activities_features['n_activities_traceable_share'] = df_activities_features['n_activities_traceable_tot']/ df_activities_features['n_activities_tot']
df_activities_features['n_activities_audiobook_share'] = df_activities_features['n_activities_audiobook_tot']/ df_activities_features['n_activities_tot']

In [19]:
#DURATION PERCENTAGES

#Total
df_activities_features['duration_tot'] = df_activities_features['duration_game_tot'] + df_activities_features['duration_video_tot'] + df_activities_features['duration_traceable_tot'] + df_activities_features['duration_audiobook_tot']

#Share type of activity
df_activities_features['duration_game_share'] = df_activities_features['duration_game_tot']/ df_activities_features['duration_tot']
df_activities_features['duration_video_share'] = df_activities_features['duration_video_tot']/ df_activities_features['duration_tot']
df_activities_features['duration_traceable_share'] = df_activities_features['duration_traceable_tot']/ df_activities_features['duration_tot']
df_activities_features['duration_audiobook_share'] = df_activities_features['duration_audiobook_tot']/ df_activities_features['duration_tot']

In [20]:
#TREND hypothesis: if the activity increase that implicate more engage
df_activities_features = pd.merge(df_activities_features, df_activities_group_per_day[['user_id', 'days_since_trial_start', 'n_activities_tot', 'n_activities_completed_tot', 'duration_tot']], how='left', left_on=['user_id', 'connected_day_min'], right_on=['user_id', 'days_since_trial_start'], suffixes = ('','_first_day'))
df_activities_features = pd.merge(df_activities_features, df_activities_group_per_day[['user_id', 'days_since_trial_start', 'n_activities_tot', 'n_activities_completed_tot', 'duration_tot']], how='left', left_on=['user_id', 'connected_day_max'], right_on=['user_id', 'days_since_trial_start'], suffixes = ('','_last_day'))
df_activities_features.rename(columns = {'days_since_trial_start':'days_since_trial_start_first_day'}, inplace = True)


In [21]:
df_activities_features.loc[(df_activities_features['n_activities_tot_first_day'] == 0) & (df_activities_features['n_activities_tot_last_day'] != 0), 'n_activities_trend'] = 999
df_activities_features.loc[(df_activities_features['n_activities_tot_first_day'] == 0) & (df_activities_features['n_activities_tot_last_day'] == 0), 'n_activities_trend'] = 1
df_activities_features.loc[df_activities_features['n_activities_tot_first_day'] != 0, 'n_activities_trend'] = df_activities_features['n_activities_tot_last_day']/df_activities_features['n_activities_tot_first_day']

df_activities_features.loc[(df_activities_features['n_activities_completed_tot_first_day'] == 0) & (df_activities_features['n_activities_completed_tot_last_day'] != 0), 'n_activities_completed_trend'] = 999
df_activities_features.loc[(df_activities_features['n_activities_completed_tot_first_day'] == 0) & (df_activities_features['n_activities_completed_tot_last_day'] == 0), 'n_activities_completed_trend'] = 1
df_activities_features.loc[df_activities_features['n_activities_completed_tot_first_day'] != 0, 'n_activities_completed_trend'] = df_activities_features['n_activities_completed_tot_last_day']/df_activities_features['n_activities_completed_tot_first_day']

df_activities_features.loc[(df_activities_features['duration_tot_first_day'] == 0) & (df_activities_features['duration_tot_last_day'] != 0), 'duration_trend'] = 999
df_activities_features.loc[(df_activities_features['duration_tot_first_day'] == 0) & (df_activities_features['duration_tot_last_day'] == 0), 'duration_trend'] = 1
df_activities_features.loc[df_activities_features['duration_tot_first_day'] != 0, 'duration_trend'] = df_activities_features['duration_tot_last_day']/df_activities_features['duration_tot_first_day']


# Features union

In [22]:
df_features = pd.merge(df_onboarding_features, df_activities_features, how='left', left_on=['user_id'], right_on=['user_id'])

# Target

In [23]:
df = pd.merge(df_features, df_onboarding[['user_id', 'customer']], how='left', left_on=['user_id'], right_on=['user_id'])
df.rename(columns = {'customer': 'target'}, inplace = True)
df.shape

(36025, 102)

# Fill Nan

I fill nan with -999 to easily identify Nan cases.
It could be improved by analysing each feature and assigning the best replace for each.

In [24]:
#Fill numerical features
features_float_values = {'age' : -999, 'price' : -999, 'os_version_cont' : -999, 'min_btw_onboarding_home_and_subscripition' : -999, 'n_activities_game_tot' : -999, 'n_activities_video_tot' : -999, 'n_activities_traceable_tot' : -999, 'n_activities_audiobook_tot' : -999, 'n_activities_completed_game_tot' : -999, 'n_activities_completed_video_tot' : -999, 'n_activities_completed_traceable_tot' : -999, 'n_activities_completed_audiobook_tot' : -999, 'duration_game_tot' : -999, 'duration_video_tot' : -999, 'duration_traceable_tot' : -999, 'duration_audiobook_tot' : -999, 'qty_sessions_per_day_tot' : -999, 'n_activities_game_min' : -999, 'n_activities_video_min' : -999, 'n_activities_traceable_min' : -999, 'n_activities_audiobook_min' : -999, 'n_activities_completed_game_min' : -999, 'n_activities_completed_video_min' : -999, 'n_activities_completed_traceable_min' : -999, 'n_activities_completed_audiobook_min' : -999, 'duration_game_min' : -999, 'duration_video_min' : -999, 'duration_traceable_min' : -999, 'duration_audiobook_min' : -999, 'qty_sessions_per_day_min' : -999, 'n_activities_game_max' : -999, 'n_activities_video_max' : -999, 'n_activities_traceable_max' : -999, 'n_activities_audiobook_max' : -999, 'n_activities_completed_game_max' : -999, 'n_activities_completed_video_max' : -999, 'n_activities_completed_traceable_max' : -999, 'n_activities_completed_audiobook_max' : -999, 'duration_game_max' : -999, 'duration_video_max' : -999, 'duration_traceable_max' : -999, 'duration_audiobook_max' : -999, 'qty_sessions_per_day_max' : -999, 'n_activities_game_avg' : -999, 'n_activities_video_avg' : -999, 'n_activities_traceable_avg' : -999, 'n_activities_audiobook_avg' : -999, 'n_activities_completed_game_avg' : -999, 'n_activities_completed_video_avg' : -999, 'n_activities_completed_traceable_avg' : -999, 'n_activities_completed_audiobook_avg' : -999, 'duration_game_avg' : -999, 'duration_video_avg' : -999, 'duration_traceable_avg' : -999, 'duration_audiobook_avg' : -999, 'qty_sessions_per_day_avg' : -999, 'connected_day_min' : -999, 'connected_day_max' : -999, 'connected_days_qty' : -999, 'n_activities_tot' : -999, 'n_activities_completed_tot' : -999, 'n_activities_completed_porc' : -999, 'n_activities_completed_game_porc' : -999, 'n_activities_completed_video_porc' : -999, 'n_activities_completed_traceable_porc' : -999, 'n_activities_completed_audiobook_porc' : -999, 'n_activities_completed_game_share' : -999, 'n_activities_completed_video_share' : -999, 'n_activities_completed_traceable_share' : -999, 'n_activities_completed_audiobook_share' : -999, 'n_activities_game_share' : -999, 'n_activities_video_share' : -999, 'n_activities_traceable_share' : -999, 'n_activities_audiobook_share' : -999, 'duration_tot' : -999, 'duration_game_share' : -999, 'duration_video_share' : -999, 'duration_traceable_share' : -999, 'duration_audiobook_share' : -999, 'days_since_trial_start_first_day' : -999, 'n_activities_tot_first_day' : -999, 'n_activities_completed_tot_first_day' : -999, 'duration_tot_first_day' : -999, 'days_since_trial_start_last_day' : -999, 'n_activities_tot_last_day' : -999, 'n_activities_completed_tot_last_day' : -999, 'duration_tot_last_day' : -999, 'n_activities_trend' : -999, 'n_activities_completed_trend' : -999, 'duration_trend' : -999, 'subscription_seasonality' : -999}
df = df.fillna(value = features_float_values)

In [25]:
#Fill categorical features
features_categorical_values = {'os_name' : 'no_value', 'country' : 'no_value', 'timezone_continent' : 'no_value', 'locale' : 'no_value', 'source' : 'no_value', 'level' : 'no_value', 'signup_provider' : 'no_value', 'currency' : 'no_value', 'payment_platform' : 'no_value'}
df = df.fillna(value = features_categorical_values)

In [26]:
#Convert categorical features to dummies
categorical_feature = get_categorical_features()
df[categorical_feature] = df[categorical_feature].astype('category')
df_dummy = pd.get_dummies(df[['user_id'] + categorical_feature], columns=categorical_feature)
df = pd.merge(df, df_dummy, how='left', left_on=['user_id'], right_on=['user_id'])

In [27]:
df.head()

Unnamed: 0,user_id,os_name,os_version_cont,country,timezone_continent,locale,source,level,age,signup_provider,price,currency,payment_platform,min_btw_onboarding_home_and_subscripition,subscription_seasonality,n_activities_game_tot,n_activities_video_tot,n_activities_traceable_tot,n_activities_audiobook_tot,n_activities_completed_game_tot,n_activities_completed_video_tot,n_activities_completed_traceable_tot,n_activities_completed_audiobook_tot,duration_game_tot,duration_video_tot,duration_traceable_tot,duration_audiobook_tot,qty_sessions_per_day_tot,n_activities_game_min,n_activities_video_min,n_activities_traceable_min,n_activities_audiobook_min,n_activities_completed_game_min,n_activities_completed_video_min,n_activities_completed_traceable_min,n_activities_completed_audiobook_min,duration_game_min,duration_video_min,duration_traceable_min,duration_audiobook_min,qty_sessions_per_day_min,n_activities_game_max,n_activities_video_max,n_activities_traceable_max,n_activities_audiobook_max,n_activities_completed_game_max,n_activities_completed_video_max,n_activities_completed_traceable_max,n_activities_completed_audiobook_max,duration_game_max,duration_video_max,duration_traceable_max,duration_audiobook_max,qty_sessions_per_day_max,n_activities_game_avg,n_activities_video_avg,n_activities_traceable_avg,n_activities_audiobook_avg,n_activities_completed_game_avg,n_activities_completed_video_avg,n_activities_completed_traceable_avg,n_activities_completed_audiobook_avg,duration_game_avg,duration_video_avg,duration_traceable_avg,duration_audiobook_avg,qty_sessions_per_day_avg,connected_day_min,connected_day_max,connected_days_qty,n_activities_tot,n_activities_completed_tot,n_activities_completed_porc,n_activities_completed_game_porc,n_activities_completed_video_porc,n_activities_completed_traceable_porc,n_activities_completed_audiobook_porc,n_activities_completed_game_share,n_activities_completed_video_share,n_activities_completed_traceable_share,n_activities_completed_audiobook_share,n_activities_game_share,n_activities_video_share,n_activities_traceable_share,n_activities_audiobook_share,duration_tot,duration_game_share,duration_video_share,duration_traceable_share,duration_audiobook_share,days_since_trial_start_first_day,n_activities_tot_first_day,n_activities_completed_tot_first_day,duration_tot_first_day,days_since_trial_start_last_day,n_activities_tot_last_day,n_activities_completed_tot_last_day,duration_tot_last_day,n_activities_trend,n_activities_completed_trend,duration_trend,target,os_name_Android,os_name_iOS,os_name_iPhone OS,os_name_no_value,country_ad,country_ae,country_af,country_ag,country_ai,country_al,country_am,country_ao,country_ar,country_at,country_au,country_aw,country_az,country_ba,country_bb,country_bd,country_be,country_bg,country_bh,country_bj,country_bm,country_bn,country_bo,country_br,country_bs,country_bt,country_bw,country_by,country_bz,country_ca,country_cg,country_ch,country_ci,country_cl,country_cm,country_cn,country_co,country_cr,country_cv,country_cw,country_cy,country_cz,country_de,country_dk,country_dm,country_do,country_dz,country_ec,country_ee,country_eg,country_es,country_et,country_fi,country_fo,country_fr,country_ga,country_gb,country_gd,country_ge,country_gf,country_gh,country_gi,country_gl,country_gm,country_gp,country_gr,country_gt,country_gu,country_gy,country_hk,country_hn,country_hr,country_ht,country_hu,country_id,country_ie,country_il,country_in,country_iq,country_ir,country_is,country_it,country_je,country_jm,country_jo,country_jp,country_ke,country_kg,country_kh,country_kn,country_kr,country_kw,country_ky,country_kz,country_la,country_lb,country_lc,country_li,country_lk,country_lr,country_ls,country_lt,country_lu,country_lv,country_ly,country_ma,country_mc,country_md,country_me,country_mg,country_mk,country_ml,country_mm,country_mn,country_mo,country_mp,country_mq,country_mr,country_mt,country_mu,country_mv,country_mw,country_mx,country_my,country_mz,country_na,country_nc,country_ng,country_ni,country_nl,country_no,country_no_value,country_np,country_nz,country_om,country_pa,country_pe,country_pf,country_pg,country_ph,country_pk,country_pl,country_pr,country_ps,country_pt,country_pw,country_py,country_qa,country_re,country_ro,country_rs,country_ru,country_rw,country_sa,country_sb,country_sc,country_sd,country_se,country_sg,country_si,country_sk,country_sn,country_so,country_sv,country_sz,country_tc,country_tg,country_th,country_tj,country_tn,country_to,country_tr,country_tt,country_tw,country_tz,country_ua,country_ug,country_us,country_uy,country_uz,country_ve,country_vg,country_vi,country_vn,country_ws,country_xk,country_ye,country_yt,country_za,country_zm,country_zw,timezone_continent_Africa,timezone_continent_America,timezone_continent_Antarctica,timezone_continent_Asia,timezone_continent_Atlantic,timezone_continent_Australia,timezone_continent_Etc,timezone_continent_Europe,timezone_continent_GMT,timezone_continent_Indian,timezone_continent_Pacific,timezone_continent_US,timezone_continent_no_value,locale_af,locale_ar,locale_az,locale_bg,locale_ceb,locale_cs,locale_da,locale_de,locale_el,locale_en,locale_es,locale_fa,locale_fi,locale_fr,locale_he,locale_hr,locale_ht,locale_hu,locale_id,locale_is,locale_it,locale_ja,locale_kk,locale_km,locale_ko,locale_lt,locale_lv,locale_mi,locale_mk,locale_ms,locale_my,locale_nl,locale_no,locale_no_value,locale_pl,locale_pt,locale_ro,locale_ru,locale_sk,locale_sl,locale_sq,locale_sr,locale_sv,locale_th,locale_tn,locale_tr,locale_uk,locale_vi,locale_yue,locale_zh-hans,locale_zh-hant,source_launcher,source_no_value,source_parents,source_postonboarding,source_stickeralbum,source_upsell_download_modal,level_advanced,level_beginner,level_medium,level_no_value,signup_provider_apple,signup_provider_email,signup_provider_facebook,signup_provider_fb,signup_provider_no_value,currency_AED,currency_AUD,currency_BDT,currency_BGN,currency_BOB,currency_BRL,currency_CAD,currency_CHF,currency_CLP,currency_CNY,currency_COP,currency_CRC,currency_CZK,currency_DKK,currency_DZD,currency_EGP,currency_EUR,currency_GBP,currency_GEL,currency_GHS,currency_HKD,currency_HRK,currency_HUF,currency_IDR,currency_ILS,currency_INR,currency_JOD,currency_JPY,currency_KES,currency_KRW,currency_KZT,currency_LBP,currency_LKR,currency_MAD,currency_MXN,currency_MYR,currency_NGN,currency_NOK,currency_NZD,currency_PEN,currency_PHP,currency_PKR,currency_PLN,currency_PYG,currency_QAR,currency_RON,currency_RSD,currency_RUB,currency_SAR,currency_SEK,currency_SGD,currency_THB,currency_TRY,currency_TWD,currency_TZS,currency_UAH,currency_USD,currency_VND,currency_ZAR,payment_platform_apple,payment_platform_google
0,23662686,iOS,13.7,th,Asia,th,postonboarding,beginner,1.0,email,639.0,THB,apple,2.0,26,6.0,10.0,13.0,0.0,5.0,3.0,12.0,0.0,598.047,349.238,397.252,0.0,2.0,6.0,10.0,13.0,0.0,5.0,3.0,12.0,0.0,598.047,349.238,397.252,0.0,2.0,6.0,10.0,13.0,0.0,5.0,3.0,12.0,0.0,598.047,349.238,397.252,0.0,2.0,6.0,10.0,13.0,0.0,5.0,3.0,12.0,0.0,598.047,349.238,397.252,0.0,2.0,0.0,0.0,0.0,29.0,20.0,0.689655,0.833333,0.3,0.923077,-999.0,0.25,0.15,0.6,0.0,0.206897,0.344828,0.448276,0.0,1344.537,0.444798,0.259746,0.295456,0.0,0.0,29.0,20.0,1344.537,0.0,29.0,20.0,1344.537,1.0,1.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1,23059290,iOS,14.0,us,America,en,launcher,beginner,4.0,facebook,14.99,USD,apple,56033.0,27,39.0,17.0,6.0,0.0,28.0,12.0,6.0,0.0,7212.975,1066.776,464.655,0.0,6.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,122.139,0.0,0.0,0.0,1.0,29.0,12.0,3.0,0.0,19.0,7.0,3.0,0.0,6058.176,668.796,393.134,0.0,3.0,13.0,5.666667,2.0,0.0,9.333333,4.0,2.0,0.0,2404.325,355.592,154.885,0.0,2.0,0.0,2.0,3.0,62.0,46.0,0.741935,0.717949,0.705882,1.0,-999.0,0.608696,0.26087,0.130435,0.0,0.629032,0.274194,0.096774,0.0,8744.406,0.824867,0.121995,0.053137,0.0,0.0,44.0,29.0,7120.106,2.0,2.0,2.0,122.139,0.045455,0.068966,0.017154,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2,13286083,Android,10.0,gb,Europe,en,parents,beginner,1.0,email,12.99,GBP,apple,300799.0,27,24.0,94.0,35.0,1.0,1.0,15.0,21.0,0.0,52298.055,2721.486,4032.32,4.991,9.0,5.0,15.0,5.0,0.0,0.0,3.0,2.0,0.0,13.457,391.26,50.01,0.0,2.0,10.0,47.0,20.0,1.0,1.0,6.0,14.0,0.0,52245.954,1168.188,3507.944,4.991,4.0,8.0,31.333333,11.666667,0.333333,0.333333,5.0,7.0,0.0,17432.685,907.162,1344.106667,1.663667,3.0,0.0,2.0,3.0,154.0,37.0,0.24026,0.041667,0.159574,0.6,0.0,0.027027,0.405405,0.567568,0.0,0.155844,0.61039,0.227273,0.006494,59056.852,0.885554,0.046082,0.068279,8.5e-05,0.0,45.0,18.0,53111.58,2.0,57.0,8.0,1225.505,1.266667,0.444444,0.023074,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,23646868,iOS,11.2,id,Asia,id,postonboarding,beginner,2.0,email,209000.0,IDR,apple,1.0,25,58.0,26.0,3.0,0.0,39.0,4.0,2.0,0.0,5421.151,1182.451,74.274,0.0,5.0,25.0,11.0,1.0,0.0,15.0,1.0,1.0,0.0,2673.351,356.078,30.784,0.0,2.0,33.0,15.0,2.0,0.0,24.0,3.0,1.0,0.0,2747.8,826.373,43.49,0.0,3.0,29.0,13.0,1.5,0.0,19.5,2.0,1.0,0.0,2710.5755,591.2255,37.137,0.0,2.5,0.0,1.0,1.0,87.0,45.0,0.517241,0.672414,0.153846,0.666667,-999.0,0.866667,0.088889,0.044444,0.0,0.666667,0.298851,0.034483,0.0,6677.876,0.811808,0.17707,0.011122,0.0,0.0,41.0,19.0,3530.508,1.0,46.0,26.0,3147.368,1.121951,1.368421,0.891477,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,23675055,Android,9.0,fr,Europe,ru,postonboarding,no_value,10.0,email,14.99,EUR,google,6.0,26,28.0,17.0,17.0,3.0,22.0,1.0,17.0,0.0,3159.312,775.629,271.38,21.691,3.0,7.0,6.0,5.0,1.0,5.0,0.0,5.0,0.0,447.763,307.285,73.715,3.38,1.0,21.0,11.0,12.0,2.0,17.0,1.0,12.0,0.0,2711.549,468.344,197.665,18.311,2.0,14.0,8.5,8.5,1.5,11.0,0.5,8.5,0.0,1579.656,387.8145,135.69,10.8455,1.5,0.0,1.0,1.0,65.0,40.0,0.615385,0.785714,0.058824,1.0,0.0,0.55,0.025,0.425,0.0,0.430769,0.261538,0.261538,0.046154,4228.012,0.747233,0.18345,0.064186,0.00513,0.0,46.0,29.0,3234.81,1.0,19.0,11.0,993.202,0.413043,0.37931,0.307036,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [28]:
df.shape

(36025, 437)

# Save dataset

In [29]:
write_file_csv(df, 'dataset')