# Create Flat Toy Log

## Setup 

In [1]:
import pandas as pd
import sys
sys.path.append('..')
import pandas as pd
import json
from datetime import datetime, timedelta
from pathlib import Path
from src.oced.time_objects import TimeObject
from src.oced.oced_data_query import OCEDDataQuery

## Load Data 

In [2]:
# Load the participant's OCED-mHealth export through the project's query helper.
dataQuery = OCEDDataQuery()
# Plain string literal: the file name contains no placeholders, so no f-string is needed.
oced_data_file = "player_107631_oced_data_time_bouts_notifications_stress_location.json"
# NOTE(review): how load_json resolves this relative name (working directory vs. a
# dedicated data folder) is decided inside OCEDDataQuery, which is not visible here.
data_dict = dataQuery.load_json(oced_data_file)

In [3]:
# Analyze the schema of the loaded data.
# No argument is passed, so the analysis presumably runs on whatever the preceding
# load_json call stored on dataQuery -- TODO confirm in OCEDDataQuery.
# The recorded output below is a text report listing each object type with its
# attributes and relationships; the return value is kept in schema_analysis.
schema_analysis = dataQuery.analyze_schema()


OCED Data Schema Analysis Report

Object Types and Their Attributes:
----------------------------------------

day (count: 28):
  Attributes:
    - date:
      Type: str
      Count: 28
      Example values: 2025-04-07, 2025-04-27, 2025-05-10
    - day_of_week:
      Type: str
      Count: 28
      Example values: Tuesday, Wednesday, Saturday
  Relationships:
    - belongs_to

intervention (count: 1):
  Attributes:
    - end_date:
      Type: str
      Count: 1
      Example values: 2025-06-09T00:00:00
    - goal:
      Type: str
      Count: 1
      Example values: Illustrate the use of process mining in mHealth applications
    - start_date:
      Type: str
      Count: 1
      Example values: 2025-05-09T00:00:00
  Relationships:
    - participant

location_segment (count: 339):
  Attributes:
    - end_time:
      Type: str
      Count: 339
      Example values: 2025-05-23 19:57:29, 2025-05-23 15:51:28, 2025-05-12 12:25:48
    - location_type:
      Type: str
      Count: 339
      

## Query Event Data 

Get mood behavior events (self-reported valence, arousal, and stress)

In [4]:
# Self-reported mood events as a flat table.
mood_df = dataQuery.get_mood_events_2D(data_dict)

# Report location coverage only when the query actually returned that column.
if 'location' in mood_df.columns:
    located_events = mood_df['location'].notna().sum()
    print("Location data available for", located_events, "mood events")

# Last expression of the cell: preview of the first rows.
mood_df.head()

Location data available for 52 mood events


Unnamed: 0,timestamp,valence,arousal,stress,location,occurred_on
75,2025-03-20 15:39:28.698,7,5,6,,
74,2025-03-21 11:12:32.086,5,5,5,,
73,2025-03-24 12:16:29.428,5,5,5,,
72,2025-03-25 00:21:36.166,6,3,3,,
71,2025-03-25 03:22:13.885,6,5,3,,


Get physical activity bout behavior events

In [5]:
# Physical-activity bout events; the recorded output shows START and END rows per bout.
pa_bouts_df = dataQuery.get_physical_activity_bout_events(data_dict)
if pa_bouts_df is not None:
    print(f"Found {len(pa_bouts_df)} PA bouts")
    if 'bout_type' in pa_bouts_df.columns:
        print("Bout types:", pa_bouts_df['bout_type'].unique())
    if 'lifecycle' in pa_bouts_df.columns:
        print("Lifecycle values:", pa_bouts_df['lifecycle'].unique())
    # This query returns its location in a column named 'location' (the location
    # query uses 'location_type'). Checking only 'location_type' never matched, so
    # the location summary was silently skipped. Accept either column name.
    for location_column in ('location', 'location_type'):
        if location_column in pa_bouts_df.columns:
            print("Location types:", pa_bouts_df[location_column].unique())
# Bare expression on purpose: it renders the (truncated) frame and, unlike a method
# call, does not raise if the query returned None.
pa_bouts_df

Found 264 PA bouts
Bout types: ['MODERATE-VIGOROUS_PA' 'LIGHT_PA']
Lifecycle values: ['START' 'END']


Unnamed: 0,timestamp,lifecycle,bout_type,location,occurred_on
0,2025-05-10T10:18:01.735000,START,MODERATE-VIGOROUS_PA,invalid,
1,2025-05-10T10:21:21.695000,END,MODERATE-VIGOROUS_PA,invalid,
2,2025-05-10T12:22:41.735000,START,MODERATE-VIGOROUS_PA,other,
3,2025-05-10T12:26:01.695000,END,MODERATE-VIGOROUS_PA,other,
4,2025-05-10T16:06:31.735000,START,MODERATE-VIGOROUS_PA,gym,
...,...,...,...,...,...
259,2025-05-24T13:20:06.695000,END,LIGHT_PA,home,
260,2025-05-24T13:23:46.735000,START,LIGHT_PA,home,
261,2025-05-24T13:26:11.695000,END,LIGHT_PA,home,
262,2025-05-24T15:33:56.735000,START,LIGHT_PA,invalid,


Get notification behavior events

In [6]:
# Notification events; the recorded output shows one row per RECEIVED / READ action.
notification_df = dataQuery.get_notification_events(data_dict)
if notification_df is not None:
    print(f"Found {len(notification_df)} notification events")
    if 'notification_type' in notification_df.columns:
        print("Notification types:", notification_df['notification_type'].unique())
    # The returned frame carries the notification state in 'action' and has no
    # 'notification_type' column, so summarise 'action' as well; otherwise no
    # per-type summary is ever printed for this source.
    if 'action' in notification_df.columns:
        print("Notification actions:", notification_df['action'].unique())
    if 'occurred_on' in notification_df.columns:
        print(f"Events with occurred_on relationship: {notification_df['occurred_on'].notna().sum()}")
notification_df.head()

Found 158 notification events
Events with occurred_on relationship: 0


Unnamed: 0,timestamp,action,location,occurred_on
157,2025-05-09 08:51:37.242,RECEIVED,home,
156,2025-05-09 08:51:55.800,READ,home,
155,2025-05-09 11:55:02.267,RECEIVED,other,
154,2025-05-09 11:55:11.964,READ,other,
153,2025-05-09 15:00:37.186,RECEIVED,other,


Get location behavior events

In [7]:
# Location enter/exit events as a flat table.
location_df = dataQuery.get_location_behavior_events(data_dict)
if location_df is not None:
    print(f"Found {len(location_df)} location events")
    # Print the distinct values of each categorical column the query returned.
    for column, label in (
        ('lifecycle', "Lifecycle values:"),
        ('location_type', "Location types:"),
    ):
        if column in location_df.columns:
            print(label, location_df[column].unique())

location_df.head()

Total number of behavior events found: 1176

Detailed analysis of first location event:
1. Event type: location_event
2. Time: 2025-05-23T00:00:29
3. All available keys: ['id', 'behaviorEventType', 'time', 'behaviorEventTypeAttributes', 'relationships']
4. behaviorEventTypeAttributes: [{'name': 'lifecycle', 'value': 'Entering'}, {'name': 'location_type', 'value': 'invalid'}]
5. relationships: [{'type': 'object', 'id': '7a5de350-fc9c-471a-8d9a-2ae128539237', 'qualifier': 'derived_from'}, {'type': 'object', 'id': 'd360ac20-326e-4f6f-8113-baa47ef536ef', 'qualifier': 'performed_by'}, {'type': 'object', 'id': '653667bf-f307-4c40-8692-80556139a71e', 'qualifier': 'occurred_on'}]

Full event structure:
{
  "id": "115fd3ce-49ef-4aca-90f6-7f6789b144ad",
  "behaviorEventType": "location_event",
  "time": "2025-05-23T00:00:29",
  "behaviorEventTypeAttributes": [
    {
      "name": "lifecycle",
      "value": "Entering"
    },
    {
      "name": "location_type",
      "value": "invalid"
    }
  ]

Unnamed: 0,timestamp,lifecycle,location_type,occurred_on
14,2025-05-09 08:51:05,Entering,home,
15,2025-05-09 09:17:04,Exiting,home,
16,2025-05-09 09:17:04,Entering,other,
17,2025-05-09 09:49:06,Exiting,other,
18,2025-05-09 09:49:06,Entering,home,


In [8]:
# Release the raw nested export now that the four flat frames have been built from it.
import gc 
# Unbinds the name only; re-running this cell without re-running the load cell
# raises NameError because data_dict no longer exists.
del data_dict
# Force a collection pass. As the last expression of the cell, its return value
# is what shows up as the cell output.
gc.collect()

0

In [9]:
import pandas as pd

# Dump each per-source frame to its own JSON file (one record per row, pretty-printed).
# These files are written before the timestamp normalisation further down, so they
# keep whatever timestamp representation each query returned.
for out_name, frame in (
    ('toy_moods.json', mood_df),
    ('toy_pa_bouts.json', pa_bouts_df),
    ('toy_notifications.json', notification_df),
    ('toy_location.json', location_df),
):
    frame.to_json(out_name, orient='records', indent=4)

## Build Event Log 

In [10]:
# Give every source frame a datetime-typed 'timestamp' column so the frames can be
# merged and ordered on it. Frames without that column are left untouched.
for df in (mood_df, pa_bouts_df, notification_df, location_df):
    if 'timestamp' not in df.columns:
        continue
    # Parsing is applied even when the column is already datetime; that is a no-op.
    df['timestamp'] = pd.to_datetime(df['timestamp'])

In [11]:
# Add event_type column to each dataframe to identify the source
mood_df['event_type'] = 'self-report'
pa_bouts_df['event_type'] = 'physical_activity'
notification_df['event_type'] = 'notification'
location_df['event_type'] = 'location_change'
mood_df.head()

Unnamed: 0,timestamp,valence,arousal,stress,location,occurred_on,event_type
75,2025-03-20 15:39:28.698,7,5,6,,,self-report
74,2025-03-21 11:12:32.086,5,5,5,,,self-report
73,2025-03-24 12:16:29.428,5,5,5,,,self-report
72,2025-03-25 00:21:36.166,6,3,3,,,self-report
71,2025-03-25 03:22:13.885,6,5,3,,,self-report


In [12]:
# Stack the four per-source frames into one log. Columns that a source does not
# have are filled with NaN by concat.
event_log = pd.concat([mood_df, pa_bouts_df, notification_df, location_df], ignore_index=True)
# Order chronologically with a *stable* sort: several events share a timestamp
# (e.g. "Exiting" one location and "Entering" the next are logged at the same
# second), and the default quicksort does not guarantee the relative order of
# such ties. kind='stable' keeps them in their concat order, and
# ignore_index=True renumbers the rows 0..n-1 in chronological order.
event_log = event_log.sort_values('timestamp', kind='stable', ignore_index=True)

In [13]:
# Make sure the merged 'timestamp' column is datetime-typed. The per-source frames
# were already parsed before concatenation, so no epoch unit applies here:
# unit='ms' is only meaningful for numeric millisecond values and would raise on
# string input, so it is not passed.
event_log['timestamp'] = pd.to_datetime(event_log['timestamp'])

In [14]:
# Add day column (date only, formatted as a 'YYYY-MM-DD' string).
event_log['day'] = event_log['timestamp'].dt.strftime('%Y-%m-%d')
# 'occurred_on' is not carried into the flat log. errors='ignore' keeps this cell
# re-runnable: without it, a second run raises KeyError because the column is
# already gone.
event_log = event_log.drop(columns='occurred_on', errors='ignore')

In [15]:
# Qualitative stress label derived from the numeric 'stress' score.
def _stress_label(score):
    """Return 'high' for a score of 5 or more, 'low' below that, None when missing."""
    # Rows without a stress score (e.g. rows that came from the other event
    # sources) must stay unlabelled rather than fall into 'low'.
    if pd.isna(score):
        return None
    return 'high' if score >= 5 else 'low'

event_log['stress_qual'] = event_log['stress'].apply(_stress_label)

In [16]:
def get_detailed_event_type(row):
    """Build the fine-grained activity label for one event-log row.

    The label depends on the row's ``event_type``:

    * ``physical_activity`` -> ``<lifecycle>_<bout_type>``
    * ``location_change``   -> ``<lifecycle>_<location_type>_location``
    * ``self-report``       -> ``stress_report_<stress_qual>``
    * ``notification``      -> ``notification_<action>``

    Args:
        row: Record indexable by column name (e.g. a DataFrame row or a dict).

    Returns:
        The label string, or None when the event type is not one of the four
        listed above or when a field needed for the label is missing (NaN/None).
    """
    source = row['event_type']

    if source == 'physical_activity':
        # Both parts are required; a half-filled row gets no label.
        if pd.isna(row['lifecycle']) or pd.isna(row['bout_type']):
            return None
        return row['lifecycle'] + "_" + str(row['bout_type'])

    if source == 'location_change':
        if pd.isna(row['lifecycle']) or pd.isna(row['location_type']):
            return None
        return row['lifecycle'] + "_" + str(row['location_type']) + "_location"

    if source == 'self-report':
        if pd.isna(row['stress_qual']):
            return None
        return "stress_report_" + str(row['stress_qual'])

    if source == 'notification':
        if pd.isna(row['action']):
            return None
        return "notification_" + str(row['action'])

    # Unknown source: no detailed label.
    return None

# Label every row: the function is called once per row and its result stored
# in the new 'detailed_event_type' column.
event_log['detailed_event_type'] = event_log.apply(get_detailed_event_type, axis='columns')

# Sanity check: coarse and detailed type side by side for the first ten events.
print(event_log.loc[:, ['event_type', 'detailed_event_type']].head(10))

    event_type detailed_event_type
0  self-report  stress_report_high
1  self-report  stress_report_high
2  self-report  stress_report_high
3  self-report   stress_report_low
4  self-report   stress_report_low
5  self-report  stress_report_high
6  self-report  stress_report_high
7  self-report   stress_report_low
8  self-report  stress_report_high
9  self-report   stress_report_low


In [17]:
# Quick profile of the unified log: size, per-source counts, schema, and a preview.
total_events = len(event_log)
print(f"Total number of events: {total_events}")

print("\nEvent types distribution:")
events_per_type = event_log['event_type'].value_counts()
print(events_per_type)

print("\nColumns in the event log:")
column_names = event_log.columns.tolist()
print(column_names)

print("\nFirst few rows of the event log:")
# display() (rather than print) so the preview renders as a rich table.
display(event_log.head())

Total number of events: 1176

Event types distribution:
event_type
location_change      678
physical_activity    264
notification         158
self-report           76
Name: count, dtype: int64

Columns in the event log:
['timestamp', 'valence', 'arousal', 'stress', 'location', 'event_type', 'lifecycle', 'bout_type', 'action', 'location_type', 'day', 'stress_qual', 'detailed_event_type']

First few rows of the event log:


Unnamed: 0,timestamp,valence,arousal,stress,location,event_type,lifecycle,bout_type,action,location_type,day,stress_qual,detailed_event_type
0,2025-03-20 15:39:28.698,7.0,5.0,6.0,,self-report,,,,,2025-03-20,high,stress_report_high
1,2025-03-21 11:12:32.086,5.0,5.0,5.0,,self-report,,,,,2025-03-21,high,stress_report_high
2,2025-03-24 12:16:29.428,5.0,5.0,5.0,,self-report,,,,,2025-03-24,high,stress_report_high
3,2025-03-25 00:21:36.166,6.0,3.0,3.0,,self-report,,,,,2025-03-25,low,stress_report_low
4,2025-03-25 03:22:13.885,6.0,5.0,3.0,,self-report,,,,,2025-03-25,low,stress_report_low


In [18]:
# Persist the unified event log twice: as pretty-printed JSON records and as a
# CSV without the index column. Both files are written relative to the current
# working directory.
event_log.to_json(path_or_buf='2d_toy_event_log.json', orient='records', indent=4)
event_log.to_csv(path_or_buf='2d_toy_event_log.csv', index=False)