In [1]:
import pandas as pd

Trabajo en el siguiente rango de dias:

In [2]:
firstDay = pd.Timestamp(2019, 4, 18)
lastDay = pd.Timestamp(2019, 4, 21)

Los datos son los siguientes

In [3]:
location = 'data/'
auctionsFile = location + 'auctions.csv'

Cargo los datos de las auctions en el rango de dias indicado

In [4]:
dtype = {
    'device_id': 'int64',
    'date': 'str',
    'ref_type_id': 'category',
    'source_id': 'category',
}

In [5]:
chunksize = 50000

In [6]:
auctionsIterator = pd.read_csv(auctionsFile, usecols=dtype.keys(), dtype=dtype, parse_dates=['date'], chunksize=chunksize)

In [7]:
auctions = pd.DataFrame()
for chunk in auctionsIterator:
    chunkIsInDateInterval = (chunk['date'] >= firstDay) & (chunk['date'] < lastDay)
    auctions = pd.concat([auctions, chunk[chunkIsInDateInterval]], sort = False)

In [8]:
auctions['day'] = auctions['date'].dt.day - firstDay.day + 1
auctions['hour'] = auctions['date'].dt.hour + auctions['date'].dt.minute / 60 + auctions['date'].dt.second / (60**2)
auctions['seconds'] = (auctions['date'] - firstDay).dt.total_seconds()

In [9]:
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id,day,hour,seconds
221726,2019-04-20 23:57:27.912838,1109595589636746168,7,0,3,23.9575,259047.912838
221727,2019-04-20 23:57:28.381114,5896614299191635403,1,0,3,23.957778,259048.381114
221728,2019-04-20 23:57:28.515423,4172466725848941608,1,0,3,23.957778,259048.515423
221729,2019-04-20 23:57:28.700884,2616279795187318849,7,0,3,23.957778,259048.700884
221730,2019-04-20 23:57:28.868312,8034952072073026056,1,0,3,23.957778,259048.868312


In [10]:
auctions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15644775 entries, 221726 to 47409527
Data columns (total 7 columns):
date           datetime64[ns]
device_id      int64
ref_type_id    category
source_id      object
day            int64
hour           float64
seconds        float64
dtypes: category(1), datetime64[ns](1), float64(2), int64(2), object(1)
memory usage: 850.4+ MB


## Busqueda de features

In [11]:
features = pd.DataFrame(auctions['device_id'].unique(), columns=['device_id'])

In [12]:
features.head()

Unnamed: 0,device_id
0,1109595589636746168
1,5896614299191635403
2,4172466725848941608
3,2616279795187318849
4,8034952072073026056


In [13]:
auctionsMeanHour = auctions[['device_id', 'hour']].groupby(by=['device_id']).mean()
auctionsMeanHour = auctionsMeanHour.reset_index().rename(columns={'hour': 'auctionsMeanHour'})
features = features.merge(auctionsMeanHour, how='left')

In [14]:
auctionsStdDevHour = auctions[['device_id', 'hour']].groupby(by=['device_id']).std()
auctionsStdDevHour = auctionsStdDevHour.reset_index().rename(columns={'hour': 'auctionsStdDevHour'})
features = features.merge(auctionsStdDevHour, how='left')

In [15]:
auctionsCount = auctions[['device_id']].groupby(by=['device_id']).size()
auctionsCount = auctionsCount.reset_index().rename(columns={0: 'auctionsCount'})
features = features.merge(auctionsCount, how='left')

In [16]:
auctionsMostFreqDay = auctions[['device_id', 'day']].groupby(by=['device_id']).agg(lambda x: x.value_counts().index[0])
auctionsMostFreqDay = auctionsMostFreqDay.reset_index().rename(columns={'day': 'auctionsMostFreqDay'})
features = features.merge(auctionsMostFreqDay, how='left')

In [17]:
auctionsMeanInstant = auctions[['device_id', 'seconds']].groupby(by=['device_id']).mean()
auctionsMeanInstant = auctionsMeanInstant.reset_index().rename(columns={'seconds': 'auctionsMeanInstant'})
features = features.merge(auctionsMeanInstant, how='left')

In [18]:
auctionsStdDevInstant = auctions[['device_id', 'seconds']].groupby(by=['device_id']).std()
auctionsStdDevInstant = auctionsStdDevInstant.reset_index().rename(columns={'seconds': 'auctionsStdDevInstant'})
features = features.merge(auctionsStdDevInstant, how='left')

In [19]:
import numpy as np

def topSeconds (dataFrame):
    hist, bin_edges = np.histogram(dataFrame['seconds'], bins=20)
    k = np.argmax(hist)
    return (bin_edges[k] + bin_edges[k+1]) / 2

In [20]:
auctionsTopSecond = auctions[['device_id', 'seconds']].groupby(by='device_id').agg(topSeconds)
auctionsTopSecond = auctionsTopSecond.reset_index().rename(columns={'seconds': 'auctionsTopSecond'})
features = features.merge(auctionsTopSecond, how='left')

In [21]:
auctionsMeanInterval = auctions[['device_id', 'seconds']].sort_values(by='seconds').groupby(by='device_id').agg(lambda x: (x['seconds'] - x['seconds'].shift(1).fillna(0)).mean())
auctionsMeanInterval = auctionsMeanInterval.reset_index().rename(columns={'seconds': 'auctionsMeanInterval'})
features = features.merge(auctionsMeanInterval, how='left')

In [22]:
features['auctionsAproxFreq'] = features['auctionsMeanInterval']**-1

In [23]:
auctionsStdDevInterval = auctions[['device_id', 'seconds']].sort_values(by='seconds').groupby(by='device_id').agg(lambda x: (x['seconds'] - x['seconds'].shift(1).fillna(0)).std())
auctionsStdDevInterval = auctionsStdDevInterval.reset_index().rename(columns={'seconds': 'auctionsStdDevInterval'})
features = features.merge(auctionsStdDevInterval, how='left')

In [24]:
auctionsLast = auctions[['device_id', 'seconds']].groupby(by='device_id').max()
auctionsLast = auctionsLast.reset_index().rename(columns={'seconds': 'auctionsLast'})
features = features.merge(auctionsLast, how='left')

In [25]:
features.head()

Unnamed: 0,device_id,auctionsMeanHour,auctionsStdDevHour,auctionsCount,auctionsMostFreqDay,auctionsMeanInstant,auctionsStdDevInstant,auctionsTopSecond,auctionsMeanInterval,auctionsAproxFreq,auctionsStdDevInterval,auctionsLast
0,1109595589636746168,20.344831,5.77598,128,3,217691.904097,78348.14561,252711.397215,2024.923399,0.000494,16024.489017,259190.195048
1,5896614299191635403,22.110936,5.441725,46,3,252399.878976,19590.204861,257059.673527,5631.657083,0.000178,28188.818505,259056.225835
2,4172466725848941608,21.253534,5.540618,252,3,227027.493094,65290.937857,253271.814659,1028.44949,0.000972,6203.952111,259169.271544
3,2616279795187318849,16.78558,7.204283,191,1,128734.373984,72254.139672,155144.580698,1356.284096,0.000737,7480.476963,259050.262283
4,8034952072073026056,9.002714,6.712023,166,2,117769.315541,59901.94988,98475.225948,1560.535351,0.000641,5658.585669,259048.868312


In [26]:
features.to_csv('data/auctionsFeaturesFirstThreeDays.csv', index=False)