In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import io
tqdm.pandas()

In [2]:
prefix = 'slim_'
colnames=['timestamp', 'entity_id', 'entity_value', 'activity_annotation'] 
df = pd.read_csv("./data/data", names=colnames, sep="\t")

In [3]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', utc=True)
df['timedelta_from_last_event'] = df['timestamp'].diff().dt.seconds
df['timedelta_from_last_event'] = df['timedelta_from_last_event'].fillna(0)

In [4]:
df

Unnamed: 0,timestamp,entity_id,entity_value,activity_annotation,timedelta_from_last_event
0,2009-06-10 00:00:00.024668+00:00,T003,19,,0.0
1,2009-06-10 00:00:46.069471+00:00,T005,18.5,,46.0
2,2009-06-10 00:00:47.047655+00:00,T003,18.5,,0.0
3,2009-06-10 00:01:17.070215+00:00,T005,18,,30.0
4,2009-06-10 00:01:18.036049+00:00,T004,19.5,,0.0
...,...,...,...,...,...
726529,2009-08-05 23:44:43.054933+00:00,T001,24,,16.0
726530,2009-08-05 23:44:59.058871+00:00,T001,23.5,,16.0
726531,2009-08-05 23:45:15.047153+00:00,T001,24,,15.0
726532,2009-08-05 23:50:00.062322+00:00,T001,23.5,,285.0


In [5]:
activities_df = df[~df['activity_annotation'].isnull()]
activities_df

Unnamed: 0,timestamp,entity_id,entity_value,activity_annotation,timedelta_from_last_event
270,2009-06-10 03:20:59.087874+00:00,M006,ON,Night wandering begin,14.0
293,2009-06-10 03:25:24.070558+00:00,M012,OFF,Night wandering end,0.0
342,2009-06-10 03:45:16.046068+00:00,M009,ON,Bed to toilet begin,326.0
379,2009-06-10 03:49:29.073763+00:00,M005,OFF,Bed to toilet end,1.0
380,2009-06-10 03:54:23.058206+00:00,M002,ON,Night wandering begin,293.0
...,...,...,...,...,...
726050,2009-08-05 20:20:34.081274+00:00,M005,OFF,R2 sleep end,0.0
726226,2009-08-05 20:41:40.049991+00:00,M006,ON,R1 sleep begin,1.0
726267,2009-08-05 20:44:54.000313+00:00,M007,OFF,R1 sleep end,1.0
726404,2009-08-05 23:30:02.031666+00:00,M005,ON,Night wandering begin,86.0


since there's no "switch" kind of events, i will construct those events for making full fram predictions. 

this is done by introducing activity entities for each of the event type:

Bed to toilet (30)
Breakfast (48)
R1 sleep (50)
R1 wake (53)
R1 work in office (46)
Dinner (42)
Laundry (10)
Leave home (69)
Lunch (37)
Night wandering (67)
R2 sleep (52)
R2 take medicine (44)
R2 wake (52)

In [6]:
def map_activity_name_to_type(act_name:str) -> str:
    return act_name.replace(' begin', '').replace(' end', '')

In [7]:
# find unique activities
activity_entity_names = df['activity_annotation'].dropna().unique().tolist()
# remove the begin/end remarks, they will be come entity values
activity_entity_type = [map_activity_name_to_type(entity_name) for entity_name in activity_entity_names]
# create an activity entity_id to name mapping
activity_entity_map = d = {s:f'A{str(i).zfill(3)}' for i, s in enumerate(activity_entity_type, 1)}
print(activity_entity_map)

{'Night wandering': 'A002', 'Bed to toilet': 'A004', 'R1 wake': 'A006', 'R2 wake': 'A008', 'R2 take medicine': 'A010', 'Breakfast': 'A012', 'Leave home': 'A014', 'Lunch': 'A016', 'Dinner': 'A018', 'R2 sleep': 'A020', 'R1 sleep': 'A022', 'R1 work in office': 'A024', 'Laundry': 'A026'}


In [8]:
_tmp_act_df = pd.DataFrame()
_tmp_act_df['entity_id'] = activities_df['activity_annotation'].map(lambda act_name: activity_entity_map[map_activity_name_to_type(act_name)])
_tmp_act_df['timestamp'] = activities_df['timestamp']
_tmp_act_df['timedelta_from_last_event'] = activities_df['timedelta_from_last_event']
_tmp_act_df['entity_value'] = activities_df['activity_annotation'].map(lambda act_name: 'begin' if act_name.endswith('begin') else 'end')

In [9]:
_tmp_act_df

Unnamed: 0,entity_id,timestamp,timedelta_from_last_event,entity_value
270,A002,2009-06-10 03:20:59.087874+00:00,14.0,begin
293,A002,2009-06-10 03:25:24.070558+00:00,0.0,end
342,A004,2009-06-10 03:45:16.046068+00:00,326.0,begin
379,A004,2009-06-10 03:49:29.073763+00:00,1.0,end
380,A002,2009-06-10 03:54:23.058206+00:00,293.0,begin
...,...,...,...,...
726050,A020,2009-08-05 20:20:34.081274+00:00,0.0,end
726226,A022,2009-08-05 20:41:40.049991+00:00,1.0,begin
726267,A022,2009-08-05 20:44:54.000313+00:00,1.0,end
726404,A002,2009-08-05 23:30:02.031666+00:00,86.0,begin


In [10]:
merged_act_sensor_df = pd.concat([df.drop('activity_annotation', axis=1), _tmp_act_df], ignore_index=True)
merged_act_sensor_df['entity_id'] = merged_act_sensor_df['entity_id'].astype(str)
merged_act_sensor_df.sort_values(by='timestamp')
# factorize the categorical features
temp_sensor_mask = merged_act_sensor_df['entity_id'].str.startswith('T')
merged_act_sensor_df['sensor_change'] = ~merged_act_sensor_df['entity_id'].str.startswith('A')
mapped_categories, state_dict = pd.factorize(merged_act_sensor_df['entity_value'][~temp_sensor_mask])
mapped_entities, entity_dict = pd.factorize(merged_act_sensor_df['entity_id'])
merged_act_sensor_df.loc[~temp_sensor_mask, 'entity_value'] = mapped_categories
merged_act_sensor_df['entity_id'] = mapped_entities
merged_act_sensor_df['entity_value'] = merged_act_sensor_df['entity_value'].astype(str)


In [11]:
print(f'total {len(state_dict)}: {state_dict}')
print(f'total {len(entity_dict)}: {entity_dict}')
merged_act_sensor_df.head(10)

total 4: Index(['ON', 'OFF', 'begin', 'end'], dtype='object')
total 45: Index(['T003', 'T005', 'T004', 'T001', 'T002', 'M005', 'M006', 'M002', 'M009',
       'M011', 'M012', 'M022', 'M008', 'M007', 'M003', 'M010', 'M014', 'M015',
       'M023', 'M001', 'M024', 'M021', 'M016', 'M018', 'M020', 'M019', 'M013',
       'M025', 'M027', 'M017', 'M004', 'M026', 'A002', 'A004', 'A006', 'A008',
       'A010', 'A012', 'A014', 'A016', 'A018', 'A020', 'A022', 'A024', 'A026'],
      dtype='object')


Unnamed: 0,timestamp,entity_id,entity_value,timedelta_from_last_event,sensor_change
0,2009-06-10 00:00:00.024668+00:00,0,19.0,0.0,True
1,2009-06-10 00:00:46.069471+00:00,1,18.5,46.0,True
2,2009-06-10 00:00:47.047655+00:00,0,18.5,0.0,True
3,2009-06-10 00:01:17.070215+00:00,1,18.0,30.0,True
4,2009-06-10 00:01:18.036049+00:00,2,19.5,0.0,True
5,2009-06-10 00:01:48.008924+00:00,2,20.0,29.0,True
6,2009-06-10 00:01:49.034019+00:00,0,19.0,1.0,True
7,2009-06-10 00:02:04.007968+00:00,2,19.5,14.0,True
8,2009-06-10 00:02:04.033741+00:00,0,18.5,0.0,True
9,2009-06-10 00:02:20.028116+00:00,2,20.0,15.0,True


In [12]:
# save the dataframe
merged_act_sensor_df.to_parquet(f'./data/{prefix}merged_act_sensor_df.parquet')

In [13]:
# unique entities
unique_entities = merged_act_sensor_df['entity_id'].unique()

print(f'{len(unique_entities)} unique entities: {unique_entities}')

45 unique entities: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44]


In [14]:
first_values = initial_values = merged_act_sensor_df.groupby('entity_id').first().sort_values(by='timestamp').reset_index()
first_values['timedelta_from_last_event'] = 0
first_values

Unnamed: 0,entity_id,timestamp,entity_value,timedelta_from_last_event,sensor_change
0,0,2009-06-10 00:00:00.024668+00:00,19.0,0,True
1,1,2009-06-10 00:00:46.069471+00:00,18.5,0,True
2,2,2009-06-10 00:01:18.036049+00:00,19.5,0,True
3,3,2009-06-10 00:41:04.052911+00:00,21.5,0,True
4,4,2009-06-10 00:41:35.035429+00:00,21.5,0,True
5,5,2009-06-10 01:28:39.066357+00:00,0.0,0,True
6,32,2009-06-10 03:20:59.087874+00:00,2.0,0,False
7,6,2009-06-10 03:20:59.087874+00:00,0.0,0,True
8,7,2009-06-10 03:21:01.038931+00:00,0.0,0,True
9,8,2009-06-10 03:21:08.033939+00:00,0.0,0,True


In [15]:
# create default values for each sensor before they receive their first update
# for temp sensors, the init values are their first values
# for Movement sensors, the init values are opposite of their first values, which is OFF
# for Act type, the init values are opposite of their first values, which is end
def map_init_value(x):
    try:
        float(x)
        return x
    except ValueError:
        if x == str(state_dict.get_loc('ON')):
            return str(state_dict.get_loc('OFF'))
        elif x == str(state_dict.get_loc('begin')):
            return str(state_dict.get_loc('end'))
    return x

initial_frame = pd.DataFrame()
initial_frame['entity_id'] = first_values['entity_id']
initial_frame['entity_value'] = first_values['entity_value'].map(map_init_value)
initial_frame


Unnamed: 0,entity_id,entity_value
0,0,19.0
1,1,18.5
2,2,19.5
3,3,21.5
4,4,21.5
5,5,0.0
6,32,2.0
7,6,0.0
8,7,0.0
9,8,0.0


# create snapshot data frames

In [16]:
# Pivot the dataframe so each entity_id becomes a column, and entity_value are the values
pivot_df = merged_act_sensor_df.pivot(index='timestamp', columns='entity_id', values='entity_value')
# Reset the index to have 'timestamp' as a column again
snapshot_df = pivot_df.reset_index()
# Prepend the initial states to the snapshot df
initial_state = initial_frame.set_index('entity_id').T
initial_state['timestamp'] = merged_act_sensor_df['timestamp'].min() - pd.Timedelta(seconds=1)  # Assume initial timestamp before first event

# prefix the dummy initial state to the dataset, offset by 1 second before the actual head of the events
snapshot_df = pd.concat([initial_state, snapshot_df], ignore_index=True)
# Forward-fill to propagate the last known state for each entity over time
snapshot_df = snapshot_df.ffill()

snapshot_df = snapshot_df.merge(merged_act_sensor_df, on='timestamp')
snapshot_df.rename(columns={'entity_id': 'changed_entity_id', 'entity_value': 'changed_entity_value'}, inplace=True)
# snapshot_df['sensor_change'] = ~snapshot_df['changed_entity_id'].str.startswith('A')
snapshot_df['second'] = snapshot_df['timestamp'].dt.second
snapshot_df['minute'] = snapshot_df['timestamp'].dt.minute
snapshot_df['hour'] = snapshot_df['timestamp'].dt.hour
snapshot_df['dayofweek'] = snapshot_df['timestamp'].dt.day
snapshot_df['weekofmonth'] = (snapshot_df['timestamp'].dt.day - 1) // 7 + 1
snapshot_df['monthofyear'] = snapshot_df['timestamp'].dt.month
# drop the first row as it's dummy initial values
snapshot_df = snapshot_df.drop(index=0)

In [17]:
snapshot_df[~snapshot_df['sensor_change']].head(10)

Unnamed: 0,0,1,2,3,4,5,32,6,7,8,...,changed_entity_id,changed_entity_value,timedelta_from_last_event,sensor_change,second,minute,hour,dayofweek,weekofmonth,monthofyear
271,16.5,16.0,17.5,21.0,20.5,1,2,0,0,0,...,32,2,14.0,False,59,20,3,10,2,6
295,16.5,16.0,17.5,21.0,20.5,1,3,1,1,1,...,32,3,0.0,False,24,25,3,10,2,6
345,16.5,15.5,17.0,20.5,20.0,1,3,1,1,0,...,33,2,326.0,False,16,45,3,10,2,6
383,16.5,15.5,17.0,20.5,20.0,1,3,1,1,1,...,33,3,1.0,False,29,49,3,10,2,6
385,16.5,15.5,17.0,20.5,20.0,1,2,1,0,1,...,32,2,293.0,False,23,54,3,10,2,6
445,16.5,15.5,17.0,20.5,20.0,1,3,1,1,1,...,32,3,0.0,False,45,58,3,10,2,6
635,15.5,14.5,16.0,19.5,19.0,1,3,1,1,0,...,34,2,192.0,False,0,46,5,10,2,6
716,15.5,15.0,16.0,19.5,19.0,1,3,1,1,1,...,34,3,1.0,False,39,51,5,10,2,6
837,16.0,15.5,16.0,19.5,19.5,1,3,1,1,0,...,35,2,5.0,False,10,59,5,10,2,6
1133,17.0,16.0,16.5,19.5,19.5,1,3,1,1,1,...,35,3,3.0,False,30,18,6,10,2,6


In [18]:
snapshot_df.to_parquet(f'./data/{prefix}snapshot_dataset.parquet')

  table = self.api.Table.from_pandas(df, **from_pandas_kwargs)


# create sequence dataframe

In [19]:
import io
import torch
from pandas import Timedelta

buffer = io.BytesIO()
# Helper function to convert NumPy arrays to bytes
def tensor_to_bytes(single_tensor):
    buffer.seek(0)
    buffer.truncate(0)
    torch.save(single_tensor, buffer)
    return buffer.getvalue()

def create_fullframe_sequences(snapshot_df, output_file_prefix, row_limit:int = None, window_size:Timedelta=Timedelta(minutes=5), chunk_size:int=256):
    # Define the window size (T minutes)
    window_size = pd.Timedelta(minutes=5)
    # Filter the dataframe to start T minutes after the first timestamp
    start_time = snapshot_df['timestamp'].min() + window_size
    filtered_df = snapshot_df[snapshot_df['timestamp'] >= start_time].copy()
    
    # Create an empty list to hold the rows (for batch processing)
    rows = []
    
    # Keep track of file count to generate unique file names
    file_count = 1

    row_processed = 0
    
    # Iterate through each row in the filtered_df
    for i, row in tqdm(filtered_df.sample(frac=1).iterrows()):
        end_time = row['timestamp']
        if end_time < start_time: continue
        start_time_window = end_time - window_size
        
        # Get the past T minutes of snapshots (keep multi-dimensional structure)
        sequence = snapshot_df[(snapshot_df['timestamp'] > start_time_window) & 
                               (snapshot_df['timestamp'] <= end_time)]
        # it's important to remove the last row because that's the activity event to prevent leakage 
        sequence = sequence.iloc[:-1,]
        # adding a new column to provide time relationship to the action event 
        sequence = sequence.copy()
        event_time = row['timestamp']
        sequence['time_from'] = (event_time - sequence['timestamp']).dt.seconds
        # Convert the sequence tensor to bytes
        # slim down
        columns_to_drop = ['second', 'minute', 'hour', 'dayofweek', 'weekofmonth', 'monthofyear','timestamp','changed_entity_id','changed_entity_value','sensor_change','timedelta_from_last_event']
        single_sequence = np.array(sequence.drop(columns=columns_to_drop).values)
        single_sequence_tensor = torch.tensor(single_sequence.astype(np.float32))
        sequence_bytes = tensor_to_bytes(single_sequence_tensor)
        
        # Add the sequence (as bytes) and the timestamp to the list of rows
        rows.append({
            'timestamp': end_time,
            'sequence': sequence_bytes,  # Storing as bytes
            'changed_entity_id': row['changed_entity_id'],
            'changed_entity_value': row['changed_entity_value'],
            'sensor_change': row['sensor_change'],
            'month': row['monthofyear'],
            'week': row['weekofmonth'],
            'day': row['dayofweek'],
            'hour': row['hour'],
            'min': row['minute'],
            'secs_from_last': row['timedelta_from_last_event'],
        })
        
        # Write in chunks to Parquet to avoid holding too much in memory
        if len(rows) >= chunk_size:
            # Convert rows to a DataFrame
            df_chunk = pd.DataFrame(rows)
            
            # Save the chunk to a separate Parquet file
            df_chunk.to_parquet(f'{output_file_prefix}{prefix}{file_count}.parquet', index=False)
            
            # Increment the file counter
            file_count += 1
            
            # Clear the rows list for the next chunk
            rows = []

        # limit row count check
        row_processed += 1
        if row_limit and row_processed == row_limit: break
    
    # Write any remaining rows after the loop
    if rows:
        df_chunk = pd.DataFrame(rows)
        df_chunk.to_parquet(f'{output_file_prefix}{prefix}{file_count}.parquet', index=False)
    
    return row_processed

In [20]:
activity_df = snapshot_df[~snapshot_df['sensor_change']]
activity_output_file_prefix = './data/training_act/training_data_chunk_slim_non_activity_'
procssed_activity_sequences_count = create_fullframe_sequences(activity_df, output_file_prefix=activity_output_file_prefix, row_limit=None)
print(f'Generated {procssed_activity_sequences_count} sequences')

1198it [00:04, 289.72it/s]

Generated 1198 sequences





In [21]:
non_activity_df = snapshot_df[snapshot_df['sensor_change']]
non_activity_output_file_prefix = './data/training_act/training_data_chunk_slim_non_activity_'
non_activity_sequence_count = create_fullframe_sequences(non_activity_df, output_file_prefix=non_activity_output_file_prefix, row_limit=procssed_activity_sequences_count*2)
print(f'Generated {non_activity_sequence_count} sequences')

2395it [00:21, 112.15it/s]


Generated 2396 sequences


In [22]:
def bytes_to_tensor(tensor_bytes):
    buffer = io.BytesIO(tensor_bytes)
    loaded_tensor = torch.load(buffer, weights_only=True)
    return loaded_tensor

In [23]:
df_chunk = pd.read_parquet(f'{activity_output_file_prefix}{prefix}1.parquet')
df_chunk.sample(10)

Unnamed: 0,timestamp,sequence,changed_entity_id,changed_entity_value,sensor_change,month,week,day,hour,min,secs_from_last
70,2009-06-19 07:21:12.087558+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,1,18,True,6,3,19,7,21,1.0
1,2009-07-11 11:40:24.035866+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,10,1,True,7,2,11,11,40,0.0
161,2009-07-09 14:01:45.093631+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,9,0,True,7,2,9,14,1,0.0
110,2009-08-02 11:43:07.098496+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,16,0,True,8,1,2,11,43,3.0
154,2009-07-02 17:57:55.034701+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,5,0,True,7,1,2,17,57,0.0
76,2009-07-04 08:01:11.076922+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,20,1,True,7,1,4,8,1,1.0
248,2009-06-15 06:44:17.092995+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,8,1,True,6,3,15,6,44,2.0
118,2009-07-20 20:39:49.099365+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,9,0,True,7,3,20,20,39,0.0
138,2009-07-08 21:31:21.099856+00:00,"b""PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...",14,1,True,7,2,8,21,31,1.0
85,2009-08-03 23:27:49.049683+00:00,"b""PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...",3,24,True,8,1,3,23,27,127.0


In [24]:
df_chunk = pd.read_parquet(f'{non_activity_output_file_prefix}{prefix}1.parquet')
df_chunk.sample(10)

Unnamed: 0,timestamp,sequence,changed_entity_id,changed_entity_value,sensor_change,month,week,day,hour,min,secs_from_last
250,2009-06-22 15:20:18.046903+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,5,0,True,6,4,22,15,20,0.0
44,2009-07-24 20:12:12.015157+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,5,1,True,7,4,24,20,12,1.0
58,2009-07-11 20:56:22.017388+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,13,1,True,7,2,11,20,56,1.0
162,2009-07-23 13:24:34.035141+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,26,0,True,7,4,23,13,24,7.0
67,2009-07-27 14:44:59.043898+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,9,1,True,7,4,27,14,44,2.0
207,2009-06-23 18:18:24.093368+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,14,0,True,6,4,23,18,18,4.0
61,2009-06-22 16:40:52.031519+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,20,0,True,6,4,22,16,40,0.0
73,2009-06-26 17:38:25.000573+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,5,1,True,6,4,26,17,38,2.0
184,2009-06-22 07:25:31.099032+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,7,0,True,6,4,22,7,25,0.0
115,2009-08-03 11:17:21.007389+00:00,b'PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x...,18,0,True,8,1,3,11,17,0.0


In [25]:
bytes_to_tensor(df_chunk.iloc[0]['sequence'])

tensor([[ 22.,  23.,  22.,  ...,   3.,   3., 296.],
        [ 22.,  23.,  22.,  ...,   3.,   3., 294.],
        [ 22.,  23.,  22.,  ...,   3.,   3., 292.],
        ...,
        [ 22.,  23.,  22.,  ...,   3.,   3.,   5.],
        [ 22.,  23.,  22.,  ...,   3.,   3.,   4.],
        [ 22.,  23.,  22.,  ...,   3.,   3.,   0.]])