In [1]:
import pandas as pd
import numpy as np
import feather

Events:
* A log of what users do in the app.
* There are many event types; the purchase event has index '8'.

## read in the events.csv

In [2]:
# Explicit per-column dtypes. 'event' stays a generic object column because
# some event ids are not plain integers (e.g. ".m5100869650219008").
datatype = {
    'session_id': np.uint64,
    'event': 'object',
    'event_timestamp': str,
    'event_value': str,
    'user_id_hash': 'category',
}

# Load the raw event log, then discard the app_id column.
events = pd.read_csv('events.csv', dtype=datatype)
events = events.drop(columns='app_id')

# Preview the first five rows.
events.head()

Unnamed: 0,session_id,event,event_timestamp,event_value,user_id_hash
0,5558845121177764917,45,1542215397132,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
1,5558845121177764917,45,1542215484895,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
2,7689508378645584666,.m5100869650219008,1541124410372,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
3,2201961907282901522,4,1543713091129,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...
4,2201961907282901522,6,1543713093116,0.0,9943447915df3a45fd6720a026af905b6da6b56a37701b...


In [3]:
# Summarise column dtypes and the frame's memory footprint. memory_usage="deep"
# makes pandas inspect the object (string) columns instead of estimating them.
events.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111946597 entries, 0 to 111946596
Data columns (total 5 columns):
session_id         uint64
event              object
event_timestamp    object
event_value        object
user_id_hash       category
dtypes: category(1), object(3), uint64(1)
memory usage: 21.2 GB


## convert the timestamps and split the events by cutoff date

In [4]:
# event_timestamp was read in as strings; interpret the values as epoch
# milliseconds and store them as a timezone-aware (UTC) datetime column.
events['timestamp'] = pd.to_datetime(
    events.event_timestamp,
    unit='ms',
    utc=True,
)

In [5]:
# Time span covered by the event log: earliest and latest event timestamps.
min_time = events.timestamp.min()
max_time = events.timestamp.max()

In [11]:
# Report the time range of the log; print() stringifies the timestamps itself.
print("the lastest data in events is: ", max_time, sep="")
print("the earliest data in events is: ", min_time, sep="")

the lastest data in events is: 2018-12-14 23:59:59.977000+00:00
the earliest data in events is: 2018-10-01 07:00:04.526000+00:00


In [6]:
from datetime import timedelta

In [7]:
# Split point for the 7-day label window: one week (7 days) before the last event.
cut_off_7 = max_time - timedelta(weeks=1)

In [8]:
# Split point for the 14-day label window: two weeks (14 days) before the last event.
cut_off_14 = max_time - timedelta(weeks=2)

In [12]:
# Show both split points so they can be checked against the time range above.
print("the split point on 7 days in events is: ", cut_off_7, sep="")
print("the split point on 14 days in events is: ", cut_off_14, sep="")

the split point on 7 days in events is: 2018-12-07 23:59:59.977000+00:00
the split point on 14 days in events is: 2018-11-30 23:59:59.977000+00:00


#### write the split data to feather files

In [9]:
# Events strictly after the 7-day cutoff, i.e. the final 7 days of the log.
event_7 = events.loc[events['timestamp'].gt(cut_off_7)]
feather.write_dataframe(event_7, 'lable_7.feather')

In [10]:
# Events strictly after the 14-day cutoff, i.e. the final 14 days of the log.
event_14 = events.loc[events['timestamp'].gt(cut_off_14)]
feather.write_dataframe(event_14, 'lable_14.feather')

In [15]:
# Complement of event_7: everything at or before the 7-day cutoff.
event_7_b = events.loc[events['timestamp'].le(cut_off_7)]
feather.write_dataframe(event_7_b, 'lable_7_b.feather')

In [17]:
# Complement of event_14: everything at or before the 14-day cutoff.
event_14_b = events.loc[events['timestamp'].le(cut_off_14)]
feather.write_dataframe(event_14_b, 'lable_14_b.feather')

#### check the row counts after the split

In [21]:
# The two 7-day partitions must account for every row of the original frame.
assert event_7.shape[0] + event_7_b.shape[0] == events.shape[0]

In [22]:
# The two 14-day partitions must account for every row of the original frame.
assert event_14.shape[0] + event_14_b.shape[0] == events.shape[0]

## compute the labels

In [24]:
# Frequency of each event type within the last 7 days (purchase is event "8").
event_7['event'].value_counts()

45                              3749365
63                               311509
47                               270225
1                                267815
5                                222263
6                                207271
57                               206945
4                                187849
55                               177183
40                               153653
41                               110903
14                               103303
3                                 99137
42                                90610
50                                77536
7                                 68704
64                                61818
54                                43270
.a5008055943430144                22306
49                                14715
0                                 14148
8                                 12074
48                                10139
.m5295687445250048                10035
56                                 6678


In [25]:
# Frequency of each event type within the last 14 days (purchase is event "8").
event_14['event'].value_counts()

45                              8950574
1                                665852
5                                533066
6                                498105
4                                455059
40                               370631
63                               343904
47                               303149
41                               270684
57                               268814
3                                245383
14                               224488
42                               219093
55                               193261
7                                160886
50                                88038
64                                67807
54                                50647
.a5008055943430144                44016
0                                 39979
8                                 31009
.m5295687445250048                28179
49                                17175
48                                11621
56                                 7903


In [26]:
# Distinct users with at least one purchase event ("8") in each window,
# returned as one-column frames with a fresh 0..n-1 index.
purchase_7 = (
    event_7.loc[event_7['event'] == "8", 'user_id_hash']
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
purchase_14 = (
    event_14.loc[event_14['event'] == "8", 'user_id_hash']
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)

In [27]:
# Each frame holds one row per purchasing user, so its length is the user count.
print("the number of unique users who purchased in last 7 days: ", len(purchase_7), sep="")
print("the number of unique users who purchased in last 14 days: ", len(purchase_14), sep="")

the number of unique users who purchased in last 7 days: 3286
the number of unique users who purchased in last 14 days: 6126


#### check that everyone who purchased in the last 7 days also has a purchase label for the last 14 days

In [28]:
# Every user who purchased in the last 7 days must also appear among the
# 14-day purchasers, because the 7-day window lies inside the 14-day window.
# The earlier form, assert(np.where(...)), could never fail: single-argument
# np.where returns a tuple of index arrays, and a non-empty tuple is always
# truthy even when the arrays inside it are empty or not.
assert purchase_7.user_id_hash.isin(purchase_14.user_id_hash).all(), \
    "some 7-day purchasers are missing from the 14-day purchasers"

In [29]:
# Mark every listed purchaser with a positive label. Assigning the scalar 1.0
# broadcasts to however many rows each frame has, so the cell no longer depends
# on row counts (3286 / 6126) copied from one particular run; the column is
# still float64, exactly as np.ones produced.
purchase_7['label_7'] = 1.0
purchase_14['label_14'] = 1.0

In [30]:
# Left-join the 7-day labels onto the 14-day purchasers by user id: every
# 14-day purchaser is kept, and label_7 is NaN where there was no 7-day purchase.
labels = purchase_14.set_index('user_id_hash').join(
    purchase_7.set_index('user_id_hash'),
    how='left',
)

In [31]:
# Users without a purchase in the last 7 days have NaN in label_7 after the
# join; store them as 0. The user_id_hash index is written out as a CSV column.
labels.fillna(0).to_csv('labels.csv')

In [33]:
# Read the saved labels back to verify the file. Note the capitalised name:
# `Labels` is the CSV round-trip (user_id_hash as a regular column, NaNs filled
# with 0), while `labels` is the in-memory join result.
Labels = pd.read_csv('labels.csv')

In [34]:
# Preview only the first rows rather than rendering the whole frame.
Labels.head(10)

Unnamed: 0,user_id_hash,label_14,label_7
0,280a7625f66e60896e558eaeee6b3eb0419c488eed15ef...,1.0,0.0
1,0e02d992cae31bf3cc09708fc70db2cf935288bf715aa7...,1.0,0.0
2,f885fd72310deaa1b7591bb3af277e632c364d36bdda5e...,1.0,0.0
3,5fb8780d18dfc69e2c968508d624828ecff7b504bf2719...,1.0,1.0
4,75ce9e8bf0517bc7dec67b715f3caf38871135c5cd462b...,1.0,0.0
5,965ab5f1063f10193ab6361fdaa395065a2b9b1c4cc36b...,1.0,1.0
6,7ab6d7a12a41aa35981369dafd3fba4d4fbc2ffaac6fae...,1.0,1.0
7,b1c50668d9790138db22cc23cbf3fde97269d1b5680c90...,1.0,1.0
8,e45e7f7c4a4fe840a7efd93dd3450b5b36d05bb25df4a0...,1.0,1.0
9,684543899327eb0472e13706880bd4e20298d55d9d6834...,1.0,1.0


In [36]:
# Distinct users overall versus those active in each recent window.
print("number of unique users in events.csv is: ", len(events['user_id_hash'].unique()), sep="")
print("number of unique users in lastest 7 days is: ", len(event_7['user_id_hash'].unique()), sep="")
print("number of unique users in lastest 14 days is: ", len(event_14['user_id_hash'].unique()), sep="")

number of unique users in events.csv is: 621001
number of unique users in lastest 7 days is: 49328
number of unique users in lastest 14 days is: 89291
