In [1]:
import pandas as pd

Trabajo en el siguiente rango de días:

In [2]:
# Bounds of the analysis window; the chunk filter further down keeps rows with
# firstDay <= date < lastDay.
firstDay = pd.Timestamp(year=2019, month=4, day=18)
lastDay = pd.Timestamp(year=2019, month=4, day=21)

Los datos son los siguientes:

In [3]:
# Data directory (with trailing slash) and the path of the events CSV inside it.
location = 'data/'
eventsFile = '{}events.csv'.format(location)

Cargo los datos de los primeros 3 días (del 18 al 20 de abril de 2019 inclusive):

In [4]:
# Column -> dtype mapping handed to pd.read_csv. Its keys are also used as the
# list of columns to read (usecols), so only the columns named here are loaded.
dtype = dict(
    # read as text; converted to datetime by parse_dates in the read_csv call
    date='str',
    event_id='int8',
    ref_type='category',
    ref_hash='int64',
    application_id='int64',
    attributed='bool',
    device_os_version='float64',
    device_brand='float64',
    device_model='float64',
    device_city='float64',
    session_user_agent='float64',
    trans_id='str',
    user_agent='float64',
    event_uuid='str',
    carrier='float64',
    kind='float64',
    device_os='category',
    wifi='bool',
    connection_type='category',
    ip_address='int64',
    device_language='category',
)

In [5]:
# Number of CSV rows read per chunk when streaming the events file.
chunksize = 50 * 1000

In [6]:
# Stream the events file in pieces of `chunksize` rows instead of loading it whole.
# Only the columns listed in `dtype` are read, and 'date' is parsed to datetime.
eventsIterator = pd.read_csv(
    eventsFile,
    chunksize=chunksize,
    usecols=dtype.keys(),
    dtype=dtype,
    parse_dates=['date'],
)

In [7]:
# Keep only the rows whose timestamp falls inside [firstDay, lastDay).
# The matching slice of every chunk is collected in a list and concatenated once
# at the end: calling pd.concat inside the loop copies all previously accumulated
# rows again on each iteration, which makes the load quadratic in the row count.
eventsInInterval = []
for chunk in eventsIterator:
    chunkIsInDateInterval = (chunk['date'] >= firstDay) & (chunk['date'] < lastDay)
    eventsInInterval.append(chunk[chunkIsInDateInterval])

# pd.concat raises on an empty list, so fall back to an empty frame when the
# iterator produced no chunks at all.
events = pd.concat(eventsInInterval, sort=False) if eventsInInterval else pd.DataFrame()

In [17]:
# Seconds elapsed between the start of the window (firstDay) and each event.
events['seconds'] = (events['date'] - firstDay).dt.total_seconds()
# 1-based day number of each event inside the window. It is derived from the
# calendar-date difference rather than from the day-of-month, so the numbering
# stays correct when the window crosses a month boundary (day-of-month
# subtraction would turn May 1st minus April 30th into a negative day).
events['day'] = (events['date'].dt.normalize() - firstDay.normalize()).dt.days + 1

In [18]:
# Expose 'ref_hash' under the name 'device_id', the key every feature below is grouped by.
events = events.rename({'ref_hash': 'device_id'}, axis='columns')

In [19]:
# Preview the first five filtered events.
events.head(n=5)

Unnamed: 0,date,event_id,ref_type,device_id,application_id,attributed,device_os_version,device_brand,device_model,device_city,...,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language,seconds,day
0,2019-04-20 01:42:49.120,0,1891515180541284343,5857744372586891366,210,False,,,4.318294e+18,,...,5b506964-5f47-4b28-a8c2-8a92d6c23379,,5.882882e+18,,False,,7544543351571901618,3.3013777759777e+18,178969.12,3
1,2019-04-20 01:42:49.340,1,1891515180541284343,7642521036780133571,210,False,,,,,...,f1fb9d15-1a7b-4116-8d3b-c4c403e197e2,,4.017674e+18,,False,,6949523255335024165,,178969.34,3
2,2019-04-20 01:42:49.365,1,1891515180541284343,2548841562898283198,210,False,,,,,...,c85a0b15-a5d7-472e-8116-6bfa3db19687,,4.017674e+18,,False,,6428537280982666957,,178969.365,3
3,2019-04-20 01:42:51.438,2,1891515180541284343,609402887625919085,210,False,,,,,...,f4aa0a97-2de6-4f22-95c6-1b3150112cb9,,6.168309e+18,,False,,7607371352198017145,,178971.438,3
4,2019-04-20 01:42:51.838,1,1891515180541284343,9114651763556439823,210,False,,,,,...,08e2f7f7-875f-4aa0-b337-b9b87b0d83ea,,4.017674e+18,,False,,2901772839007473756,,178971.838,3


In [20]:
# Print the frame summary: row count, per-column dtypes and memory usage.
events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2261451 entries, 0 to 7744308
Data columns (total 23 columns):
date                  datetime64[ns]
event_id              int8
ref_type              object
device_id             int64
application_id        int64
attributed            bool
device_os_version     float64
device_brand          float64
device_model          float64
device_city           float64
session_user_agent    float64
trans_id              object
user_agent            float64
event_uuid            object
carrier               float64
kind                  float64
device_os             object
wifi                  bool
connection_type       object
ip_address            int64
device_language       object
seconds               float64
day                   int64
dtypes: bool(2), datetime64[ns](1), float64(9), int64(4), int8(1), object(6)
memory usage: 368.8+ MB


## Búsqueda de features

In [22]:
# One row per distinct device seen in the window, in order of first appearance;
# each feature computed below is left-merged onto this frame.
features = pd.DataFrame({'device_id': events['device_id'].unique()})

In [23]:
# Preview the first five device ids.
features.head(n=5)

Unnamed: 0,device_id
0,5857744372586891366
1,7642521036780133571
2,2548841562898283198
3,609402887625919085
4,9114651763556439823


In [24]:
# eventsCount: number of events recorded for each device in the window.
eventsCount = (
    events.groupby('device_id')
    .size()
    .reset_index(name='eventsCount')
)
# The merge key is left implicit (the shared 'device_id' column).
features = features.merge(eventsCount, how='left')

In [25]:
# eventsMostFreqDay: the day number on which each device produced the most events
# (first entry of value_counts, which is ordered by descending frequency).
eventsMostFreqDay = (
    events.groupby('device_id')['day']
    .agg(lambda days: days.value_counts().index[0])
    .reset_index(name='eventsMostFreqDay')
)
features = features.merge(eventsMostFreqDay, how='left')

In [26]:
# eventsMeanInterval: mean gap, in seconds, between consecutive events of a device.
# The first gap of every device is measured from the start of the window
# (shift(1) leaves a NaN that is filled with 0), so a device with a single event
# gets that event's offset as its mean interval.
#
# The 'seconds' column is selected *before* aggregating so the function receives
# one plain Series per device. Indexing x['seconds'] inside a DataFrame-level agg
# only works on pandas versions that fall back to passing the whole group frame;
# with per-column aggregation it raises KeyError.
eventsMeanInterval = (
    events[['device_id', 'seconds']]
    .sort_values(by='seconds')
    .groupby(by='device_id')['seconds']
    .agg(lambda seconds: (seconds - seconds.shift(1).fillna(0)).mean())
)
eventsMeanInterval = eventsMeanInterval.reset_index().rename(columns={'seconds': 'eventsMeanInterval'})
features = features.merge(eventsMeanInterval, how='left')

In [27]:
# eventsAproxFreq: reciprocal of the mean interval, i.e. an approximate
# events-per-second rate for each device.
features['eventsAproxFreq'] = features['eventsMeanInterval'].pow(-1)

In [28]:
# eventsStdDevInterval: standard deviation, in seconds, of the gaps between
# consecutive events of a device. As with the mean interval, the first gap is
# measured from the start of the window (the NaN left by shift(1) is filled with 0).
# A device with a single event has only one gap, so its standard deviation is NaN.
#
# The 'seconds' column is selected *before* aggregating so the function receives
# one plain Series per device. Indexing x['seconds'] inside a DataFrame-level agg
# only works on pandas versions that fall back to passing the whole group frame;
# with per-column aggregation it raises KeyError.
eventsStdDevInterval = (
    events[['device_id', 'seconds']]
    .sort_values(by='seconds')
    .groupby(by='device_id')['seconds']
    .agg(lambda seconds: (seconds - seconds.shift(1).fillna(0)).std())
)
eventsStdDevInterval = eventsStdDevInterval.reset_index().rename(columns={'seconds': 'eventsStdDevInterval'})
features = features.merge(eventsStdDevInterval, how='left')

In [30]:
# eventsLast: offset in seconds (from the start of the window) of each device's
# latest event.
eventsLast = (
    events.groupby('device_id')['seconds']
    .max()
    .reset_index(name='eventsLast')
)
features = features.merge(eventsLast, how='left')

In [31]:
# Preview the first five rows of the assembled feature table.
features.head(n=5)

Unnamed: 0,device_id,eventsCount,eventsMostFreqDay,eventsMeanInterval,eventsAproxFreq,eventsStdDevInterval,eventsLast
0,5857744372586891366,3,3,59656.373333,1.7e-05,103326.483103,178969.12
1,7642521036780133571,95,3,1888.432316,0.00053,12160.106713,179401.07
2,2548841562898283198,163,1,1455.97784,0.000687,10525.927194,237324.388
3,609402887625919085,141,3,1311.918482,0.000762,15052.515942,184980.506
4,9114651763556439823,60,3,3004.316733,0.000333,22188.844414,180259.004


In [32]:
# Persist the per-device features next to the input data. The path is built from
# `location` (the data directory configured at the top of the notebook) instead of
# repeating the 'data/' literal, so moving the data directory needs a single change.
features.to_csv(location + 'eventsFeaturesFirstThreeDays.csv', index=False)