In [1]:
import pandas as pd

Trabajo en el siguiente rango de días:

In [2]:
# Bounds of the analysis window; the loading loop below keeps
# rows with firstDay <= date < lastDay.
firstDay = pd.Timestamp(year=2019, month=4, day=24)
lastDay = pd.Timestamp(year=2019, month=4, day=27)

Los datos son los siguientes:

In [3]:
# Directory holding the input data, and the events CSV inside it.
location = 'data/'
eventsFile = '{}events.csv'.format(location)

Cargo los datos de los primeros 3 días

In [4]:
# Column -> dtype mapping handed to pd.read_csv. Its keys also serve as the
# list of columns to load (see the `usecols` argument of the read_csv call).
dtype = dict(
    # Read as text here; converted to datetimes via `parse_dates` when loading.
    date='str',
    event_id='int8',
    ref_type='category',
    ref_hash='int64',
    application_id='int64',
    attributed='bool',
    # Declared as float64; the displayed sample shows these columns holding
    # large numeric codes with missing entries.
    device_os_version='float64',
    device_brand='float64',
    device_model='float64',
    device_city='float64',
    session_user_agent='float64',
    trans_id='str',
    user_agent='float64',
    event_uuid='str',
    carrier='float64',
    kind='float64',
    device_os='category',
    wifi='bool',
    connection_type='category',
    ip_address='int64',
    device_language='category',
)

In [5]:
# Number of CSV rows read per chunk when streaming the events file.
chunksize = 50 * 1000

In [6]:
# Open the events CSV as a chunked reader: nothing is loaded until it is
# iterated. Only the columns declared in `dtype` are read, and `date` is
# parsed into datetimes.
eventsIterator = pd.read_csv(
    eventsFile,
    usecols=list(dtype),
    dtype=dtype,
    parse_dates=['date'],
    chunksize=chunksize,
)

In [7]:
# Keep only the rows whose timestamp falls inside [firstDay, lastDay).
# The matching slice of every chunk is collected in a list and concatenated
# once at the end: concatenating onto the accumulated frame inside the loop
# re-copies every row gathered so far on each iteration.
chunksInInterval = []
for chunk in eventsIterator:
    chunkIsInDateInterval = (chunk['date'] >= firstDay) & (chunk['date'] < lastDay)
    chunksInInterval.append(chunk[chunkIsInDateInterval])

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all.
events = pd.concat(chunksInInterval, sort=False) if chunksInInterval else pd.DataFrame()

In [8]:
# Seconds elapsed between the start of the window (firstDay) and each event.
events['seconds'] = (events['date'] - firstDay).dt.total_seconds()
# 1-based day number inside the window. It is derived from the whole-day
# difference between calendar dates rather than from day-of-month values, so
# it stays correct when the window crosses a month boundary.
events['day'] = (events['date'].dt.normalize() - firstDay.normalize()).dt.days + 1

In [9]:
# From here on the `ref_hash` column is referred to as `device_id`.
events = events.rename({'ref_hash': 'device_id'}, axis='columns')

In [10]:
# Preview the first rows of the filtered events.
events.head(n=5)

Unnamed: 0,date,event_id,ref_type,device_id,application_id,attributed,device_os_version,device_brand,device_model,device_city,...,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language,seconds,day
3015,2019-04-25 21:25:34.650,1,1891515180541284343,809939361959643854,210,False,,,,,...,eb50de22-8c0f-42e3-85c2-fb6c54782bd6,,4.017674e+18,,False,,151684593053252001,,163534.65,2
3016,2019-04-25 21:25:33.165,1,1891515180541284343,2704332589081852700,210,False,,,,,...,fc0e2e76-475d-4d60-bc0e-86070792a44b,,4.017674e+18,,False,,6918006307204159217,,163533.165,2
3017,2019-04-25 21:25:33.097,1,1891515180541284343,2704332589081852700,210,False,,,,,...,1113aebd-1ac9-4583-a8d0-9da011013ec9,,4.017674e+18,,False,,6918006307204159217,,163533.097,2
3018,2019-04-25 21:25:34.227,0,1891515180541284343,6376777580200607439,210,False,,,6.871161e+18,,...,2a348381-848c-4fe4-bd83-48e6c1f8b4d2,,5.882882e+18,,False,,8378906526277633862,3.3013777759777e+18,163534.227,2
3019,2019-04-25 21:17:30.501,1,1891515180541284343,2602532777370559745,210,False,,,,,...,69b4057e-78f3-42c0-988b-9ad6d67613e7,,4.017674e+18,,False,,7090887066466907036,,163050.501,2


In [11]:
# Print the column dtypes, number of entries and memory usage of `events`.
events.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2980427 entries, 3015 to 7742221
Data columns (total 23 columns):
date                  datetime64[ns]
event_id              int8
ref_type              object
device_id             int64
application_id        int64
attributed            bool
device_os_version     float64
device_brand          float64
device_model          float64
device_city           float64
session_user_agent    float64
trans_id              object
user_agent            float64
event_uuid            object
carrier               float64
kind                  float64
device_os             object
wifi                  bool
connection_type       object
ip_address            int64
device_language       object
seconds               float64
day                   int64
dtypes: bool(2), datetime64[ns](1), float64(9), int64(4), int8(1), object(6)
memory usage: 486.0+ MB


## Búsqueda de features

In [12]:
# One row per distinct device id found in `events`; the feature columns
# computed in the following cells are merged onto this frame.
features = pd.DataFrame({'device_id': events['device_id'].unique()})

In [13]:
# Preview the first device ids.
features.head(n=5)

Unnamed: 0,device_id
0,809939361959643854
1,2704332589081852700
2,6376777580200607439
3,2602532777370559745
4,1392327325375134465


In [14]:
# Number of events recorded for each device.
eventsCount = (
    events
    .groupby('device_id')
    .size()
    .reset_index(name='eventsCount')
)
features = features.merge(eventsCount, how='left')

In [15]:
def _mostFrequentValue(values):
    """Return the first entry of `values.value_counts()`, i.e. the most frequent value."""
    return values.value_counts().index[0]


# Day of the window (1-based) on which each device generated the most events.
eventsMostFreqDay = (
    events
    .groupby('device_id')['day']
    .agg(_mostFrequentValue)
    .reset_index(name='eventsMostFreqDay')
)
features = features.merge(eventsMostFreqDay, how='left')

In [16]:
def _meanEventGap(seconds):
    """Mean gap, in seconds, between consecutive events of a single device.

    `seconds` must already be sorted in ascending order. The first gap is
    measured from 0 (the start of the window), so a device with one event
    still gets a value.
    """
    return (seconds - seconds.shift(1).fillna(0)).mean()


# Aggregate the `seconds` column itself. Indexing `x['seconds']` inside the
# aggregation function only works if it receives the whole group DataFrame;
# recent pandas versions pass each column as a Series, where that lookup
# raises KeyError.
eventsMeanInterval = (
    events[['device_id', 'seconds']]
    .sort_values(by='seconds')
    .groupby(by='device_id')['seconds']
    .agg(_meanEventGap)
    .reset_index(name='eventsMeanInterval')
)
features = features.merge(eventsMeanInterval, how='left')

In [17]:
# Approximate event frequency: the reciprocal of the mean interval between events.
features = features.assign(
    eventsAproxFreq=lambda frame: frame['eventsMeanInterval'] ** -1
)

In [18]:
def _stdEventGap(seconds):
    """Sample standard deviation of the gaps between consecutive events of a single device.

    `seconds` must already be sorted in ascending order. The first gap is
    measured from 0 (the start of the window). With a single event the
    result is NaN.
    """
    return (seconds - seconds.shift(1).fillna(0)).std()


# Aggregate the `seconds` column itself. Indexing `x['seconds']` inside the
# aggregation function only works if it receives the whole group DataFrame;
# recent pandas versions pass each column as a Series, where that lookup
# raises KeyError.
eventsStdDevInterval = (
    events[['device_id', 'seconds']]
    .sort_values(by='seconds')
    .groupby(by='device_id')['seconds']
    .agg(_stdEventGap)
    .reset_index(name='eventsStdDevInterval')
)
features = features.merge(eventsStdDevInterval, how='left')

In [19]:
# Time of each device's last event, in seconds since the start of the window.
eventsLast = (
    events
    .groupby('device_id')['seconds']
    .max()
    .reset_index(name='eventsLast')
)
features = features.merge(eventsLast, how='left')

In [20]:
# Preview the assembled per-device features.
features.head(n=5)

Unnamed: 0,device_id,eventsCount,eventsMostFreqDay,eventsMeanInterval,eventsAproxFreq,eventsStdDevInterval,eventsLast
0,809939361959643854,40,2,4096.5292,0.000244,25839.686089,163861.168
1,2704332589081852700,546,3,472.258037,0.002117,5202.05722,257852.888
2,6376777580200607439,459,3,555.999065,0.001799,7486.065919,255203.571
3,2602532777370559745,62,2,4115.70321,0.000243,14779.361038,255173.599
4,1392327325375134465,94,2,1735.642904,0.000576,16660.849179,163150.433


In [21]:
# Write the per-device feature table to disk without the positional index.
featuresFile = 'data/eventsFeatures7.csv'
features.to_csv(featuresFile, index=False)