In [1]:
import pandas as pd

Trabajo en el siguiente rango de dias:

In [2]:
firstDay = pd.Timestamp(2019, 4, 24)
lastDay = pd.Timestamp(2019, 4, 27)

Los datos son los siguientes

In [3]:
location = 'data/'
auctionsFile = location + 'auctions.csv'

Cargo los datos de las auctions en el rango de dias indicado

In [4]:
dtype = {
    'device_id': 'int64',
    'date': 'str',
    'ref_type_id': 'category',
    'source_id': 'category',
}

In [5]:
chunksize = 50000

In [6]:
auctionsIterator = pd.read_csv(auctionsFile, usecols=dtype.keys(), dtype=dtype, parse_dates=['date'], chunksize=chunksize)

In [7]:
auctions = pd.DataFrame()
for chunk in auctionsIterator:
    chunkIsInDateInterval = (chunk['date'] >= firstDay) & (chunk['date'] < lastDay)
    auctions = pd.concat([auctions, chunk[chunkIsInDateInterval]], sort = False)

In [8]:
auctions['day'] = auctions['date'].dt.day - firstDay.day + 1
auctions['hour'] = auctions['date'].dt.hour + auctions['date'].dt.minute / 60 + auctions['date'].dt.second / (60**2)
auctions['seconds'] = (auctions['date'] - firstDay).dt.total_seconds()

In [9]:
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,day,hour,seconds
853132,2019-04-26 23:52:29.135354,1384623003476985820,1,7,3,23.874722,258749.135354
853133,2019-04-26 23:52:39.367477,3714738743084512188,1,7,3,23.8775,258759.367477
853134,2019-04-26 23:52:54.714361,5697386557321863111,1,7,3,23.881667,258774.714361
853135,2019-04-26 23:53:13.729835,5583037045722622336,1,7,3,23.886944,258793.729835
853136,2019-04-26 23:53:48.577115,6383034009915294411,1,7,3,23.896667,258828.577115


In [10]:
auctions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15836843 entries, 853132 to 46624682
Data columns (total 7 columns):
date           datetime64[ns]
device_id      int64
ref_type_id    category
source_id      object
day            int64
hour           float64
seconds        float64
dtypes: category(1), datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 860.9+ MB


## Busqueda de features

In [11]:
features = pd.DataFrame(auctions['device_id'].unique(), columns=['device_id'])

In [12]:
features.head()

Unnamed: 0,device_id
0,1384623003476985820
1,3714738743084512188
2,5697386557321863111
3,5583037045722622336
4,6383034009915294411


In [13]:
auctionsCount = auctions[['device_id']].groupby(by=['device_id']).size()
auctionsCount = auctionsCount.reset_index().rename(columns={0: 'auctionsCount'})
features = features.merge(auctionsCount, how='left')

In [14]:
auctionsMostFreqDay = auctions[['device_id', 'day']].groupby(by=['device_id']).agg(lambda x: x.value_counts().index[0])
auctionsMostFreqDay = auctionsMostFreqDay.reset_index().rename(columns={'day': 'auctionsMostFreqDay'})
features = features.merge(auctionsMostFreqDay, how='left')

In [15]:
auctionsMeanInterval = auctions[['device_id', 'seconds']].sort_values(by='seconds').groupby(by='device_id').agg(lambda x: (x['seconds'] - x['seconds'].shift(1).fillna(0)).mean())
auctionsMeanInterval = auctionsMeanInterval.reset_index().rename(columns={'seconds': 'auctionsMeanInterval'})
features = features.merge(auctionsMeanInterval, how='left')

In [16]:
features['auctionsAproxFreq'] = features['auctionsMeanInterval']**-1

In [17]:
auctionsStdDevInterval = auctions[['device_id', 'seconds']].sort_values(by='seconds').groupby(by='device_id').agg(lambda x: (x['seconds'] - x['seconds'].shift(1).fillna(0)).std())
auctionsStdDevInterval = auctionsStdDevInterval.reset_index().rename(columns={'seconds': 'auctionsStdDevInterval'})
features = features.merge(auctionsStdDevInterval, how='left')

In [18]:
auctionsLast = auctions[['device_id', 'seconds']].groupby(by='device_id').max()
auctionsLast = auctionsLast.reset_index().rename(columns={'seconds': 'auctionsLast'})
features = features.merge(auctionsLast, how='left')

In [19]:
features.head()

Unnamed: 0,device_id,auctionsCount,auctionsMostFreqDay,auctionsMeanInterval,auctionsAproxFreq,auctionsStdDevInterval,auctionsLast
0,1384623003476985820,1,3,258749.135354,4e-06,,258749.135354
1,3714738743084512188,303,3,854.076045,0.001171,4126.474758,258785.04163
2,5697386557321863111,6,3,43129.11906,2.3e-05,71127.649481,258774.714361
3,5583037045722622336,35,3,7394.106567,0.000135,12428.320871,258793.729835
4,6383034009915294411,68,1,3806.302605,0.000263,11689.377607,258828.577115


In [20]:
features.to_csv('data/auctionsFeatures7.csv', index=False)