In [9]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [10]:
# Column types requested for the raw activation log: `device` and
# `device_activated` are loaded as categoricals; `time` is declared as a
# string dtype and is also listed in `parse_dates`.
# NOTE(review): `device_activated` is later aggregated with `.any()`; confirm
# that a categorical column behaves as intended there on the installed pandas.
dtypes = {'time': 'str', 'device': 'category', 'device_activated': 'category'}
parse_dates = ['time']
# Pass the `parse_dates` list defined above instead of repeating the literal,
# so the set of date columns is declared in exactly one place.
train_test = pd.read_csv('data/raw/device_activations.csv', dtype=dtypes, parse_dates=parse_dates)

In [11]:
def is_activated(times):
    """Aggregation helper: tell whether any value in ``times`` is truthy.

    Parameters
    ----------
    times
        Any object exposing an ``.any()`` method (for example a pandas
        Series of activation flags).

    Returns
    -------
    Whatever ``times.any()`` returns.
    """
    any_active = times.any()
    return any_active

# Step 1 - wide format: one column per device, indexed by `time`, each cell
# aggregated with `is_activated`. Cells left as NaN by the pivot are set to
# False.
train_test = (
    train_test
    .pivot_table(values='device_activated',
                 index=['time'],
                 columns=['device'],
                 aggfunc=is_activated)
    .fillna(False)
)
print(train_test.head(5))

# Step 2 - resample into 1-hour bins, combining the rows of each bin with
# `is_activated`.
train_test = train_test.resample('1H').apply(is_activated)
print(train_test.head(5))

# Step 3 - back to long format: stack the device columns into a single
# `device` column and name the stacked values column `activated`.
train_test = (
    train_test
    .stack()
    .reset_index()
    .rename(columns={0: 'activated'})
)
train_test.head(21)


device               device_1  device_2  device_3  device_4  device_5  \
time                                                                    
2016-07-01 04:23:32     False     False     False     False     False   
2016-07-01 06:52:57     False      True     False     False     False   
2016-07-01 06:53:00     False      True     False     False     False   
2016-07-01 06:56:41     False      True     False     False     False   
2016-07-01 07:00:01     False     False     False     False     False   

device               device_6  device_7  
time                                     
2016-07-01 04:23:32      True     False  
2016-07-01 06:52:57     False     False  
2016-07-01 06:53:00     False     False  
2016-07-01 06:56:41     False     False  
2016-07-01 07:00:01      True     False  
device               device_1  device_2  device_3  device_4  device_5  \
time                                                                    
2016-07-01 04:00:00     False     False     Fals

Unnamed: 0,time,device,activated
0,2016-07-01 04:00:00,device_1,False
1,2016-07-01 04:00:00,device_2,False
2,2016-07-01 04:00:00,device_3,False
3,2016-07-01 04:00:00,device_4,False
4,2016-07-01 04:00:00,device_5,False
5,2016-07-01 04:00:00,device_6,True
6,2016-07-01 04:00:00,device_7,False
7,2016-07-01 05:00:00,device_1,False
8,2016-07-01 05:00:00,device_2,False
9,2016-07-01 05:00:00,device_3,False


In [12]:
# Derive calendar features from the `time` column: day of month, day of week
# and hour of day, appended as new columns in that order.
train_test = train_test.assign(
    Day=train_test['time'].dt.day,
    DayOfWeek=train_test['time'].dt.dayofweek,
    Hour=train_test['time'].dt.hour,
)

In [5]:
#train_test.head(21)

In [6]:
# Report how many distinct values each column holds, one line per column.
for column in ['time', 'device', 'activated', 'Day', 'DayOfWeek', 'Hour']:
    print(column + " unique values: ", len(train_test[column].unique()))

time unique values:  1478
device unique values:  7
activated unique values:  2
Day unique values:  31
DayOfWeek unique values:  7
Hour unique values:  24


In [7]:
# Keep only the model columns, in their final order. Selecting this explicit
# column list already leaves `time` out, so the separate in-place
# `drop('time', ...)` was redundant; it also raised KeyError when this cell
# was run a second time, because `time` was already gone by then.
train_test = train_test[['Day','DayOfWeek','Hour','device','activated']]

# Hold out 5% of the rows as the test set. Stratifying on `activated` keeps
# the True/False ratio as close as possible in both parts, and the fixed
# `random_state` makes the split repeatable.
train, test = train_test_split(train_test, test_size=0.05, random_state=42, stratify=train_test['activated'])

# Persist both parts without writing the positional index as a column.
train.to_csv('data/processed/train.csv', index=False)
test.to_csv('data/processed/test.csv', index=False)

In [8]:
# Sanity check: show the distinct values present in the target column.
train_test.loc[:, 'activated'].unique()

array([False,  True])