# Load libraries

In [356]:
import pandas as pd
from pandasql import sqldf
import numpy as np
import math
import os
import sys

# Load configuration

In [357]:
# Participant whose records are extracted from every workbook below
USER = 'participant13'
# USER = f'participant{sys.argv[1]}'  # alternative: participant number from the command line

# Collection (dataset folder under BASE_DATA_PATH) to read from and write to
# COLLECTION = '2-person'
# COLLECTION = '10-person'
COLLECTION = '3-person'

# Root folder that contains one sub-folder per collection
BASE_DATA_PATH = '/workspaces/data'

# Target interval of the combined dataset, e.g. 15min or 15s.
#   * TARGET_FREQ_AS_INT: number of units per interval (15|1)
#   * TARGET_FREQ_UNIT1:  unit spelled as a pandas frequency alias (min|s)
#   * TARGET_FREQ_UNIT2:  unit spelled as a numpy timedelta64 code (m|s)
# Both unit spellings must describe the same unit.
TARGET_FREQ_AS_INT = 15
TARGET_FREQ_UNIT1 = 'min'
TARGET_FREQ_UNIT2 = 'm'

# Interval strings assembled from the pieces above:
# TARGET_FREQ is passed to pandas date_range/resample further down.
TARGET_FREQ = str(TARGET_FREQ_AS_INT) + TARGET_FREQ_UNIT1
TARGET_FREQ2 = str(TARGET_FREQ_AS_INT) + TARGET_FREQ_UNIT2

# Symptom flags that appear, under the same raw names, in both the
# wearing-off sheet and the drug-intake sheet.
_SYMPTOM_FIELDS = (
  "tremors", "slowdown", "moodchange", "rigidity", "pain",
  "impairment_hands", "slow_thoughts", "anxiety", "muscle_spasm",
)

# Raw column name -> final name, applied after the wearing-off join.
# Symptom flags receive a "wo_" prefix.
WEARING_OFF_COLUMNS = {
  "Timestamp": "timestamp",
  "Wearing Off": "wearing_off",
  "started_at": "wo_start",
  "finished_at": "wo_end",
  **{field: f"wo_{field}" for field in _SYMPTOM_FIELDS},
}

# Raw column name -> final name, applied after the drug-intake join.
# Symptom flags receive a "drug_intake_" prefix.
DRUG_INTAKE_COLUMNS = {
  "started_at": "drug_intake_start",
  "finished_at": "drug_intake_end",
  **{field: f"drug_intake_{field}" for field in _SYMPTOM_FIELDS},
}

# Load dataset

In [358]:
# Read the 'heart_rate' sheet indexed by Timestamp, then keep only USER's rows
heart_rate = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/garmin.xlsx',
  sheet_name='heart_rate',
  index_col='Timestamp',
)
heart_rate = heart_rate.query(f'participant == "{USER}"')

In [359]:
# Read the 'steps' sheet indexed by Timestamp, then keep only USER's rows
steps = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/garmin.xlsx',
  sheet_name='steps',
  index_col='Timestamp',
)
steps = steps.query(f'participant == "{USER}"')

In [360]:
# Read the 'stress' sheet indexed by Timestamp, then keep only USER's rows
stress = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/garmin.xlsx',
  sheet_name='stress',
  index_col='Timestamp',
)
stress = stress.query(f'participant == "{USER}"')

In [361]:
# Read the 'sleep' sheet indexed by Calendar Date, then keep only USER's rows
sleep = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/garmin.xlsx',
  sheet_name='sleep',
  index_col='Calendar Date',
)
sleep = sleep.query(f'participant == "{USER}"')

In [362]:
# Read the reported wearing-off periods indexed by wearing_off_id
wearing_off_periods = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/wearing_off_periods.xlsx',
  sheet_name='wearing_off_periods',
  index_col='wearing_off_id',
)

# Keep only USER's rows; the participant column is then redundant
wearing_off_periods = wearing_off_periods.query(f'participant == "{USER}"')
wearing_off_periods = wearing_off_periods.drop(columns=['participant'])

In [363]:
# Read the reported drug intakes (with symptoms) indexed by drug_intake_id
drug_intake_with_symptoms = pd.read_excel(
  f'{BASE_DATA_PATH}/{COLLECTION}/drug_intake_with_symptoms.xlsx',
  sheet_name='drug_intake_with_symptoms',
  index_col='drug_intake_id',
)

# Keep only USER's rows; the participant column is then redundant
drug_intake_with_symptoms = drug_intake_with_symptoms.query(
  f'participant == "{USER}"')
drug_intake_with_symptoms = drug_intake_with_symptoms.drop(
  columns=['participant'])

# Resampling

There are two steps in resampling the dataset.
1. Complete the reference dataframe with respect to the collection period and the dataset's native interval.
2. Resample to desired interval e.g., 15-minute, 3-minute.


## Complete reference dataframe
* Ensures equal spacing from the collection period's start and end date.
* Fills missing values according to how Garmin reports missing values.

### Heart rate
* Fill missing values with -1, as per Garmin's documentation for missing values before resampling
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [364]:
# Interval at which the heart-rate readings are regularised
heart_rate_freq = '15s'

# Evenly spaced timestamps from the first to the last heart-rate record
heart_rate_grid = pd.date_range(
  start=heart_rate.index.min(),
  end=heart_rate.index.max(),
  freq=heart_rate_freq,
  name='Timestamp'
)
reference = pd.DataFrame(index=heart_rate_grid)

# Bucket the raw readings onto the same interval:
# average the heart rate, keep the first participant label per bucket
heart_rate_binned = heart_rate.resample(heart_rate_freq).agg({
  'heart_rate': 'mean',
  'participant': 'first'
})

# Left-join onto the grid so every slot exists, then mark empty slots:
# -1 for the reading, USER for the participant label
heart_rate = reference.merge(
  heart_rate_binned, on='Timestamp', how='left'
).fillna({
  'heart_rate': -1,
  'participant': USER
})

### Steps
* Fill missing values with -1 to standardize with other Garmin datasets
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [365]:
# Interval at which the step counts are regularised
steps_freq = '15min'

# Evenly spaced timestamps from the first to the last step record
steps_grid = pd.date_range(
  start=steps.index.min(),
  end=steps.index.max(),
  freq=steps_freq,
  name='Timestamp'
)
reference = pd.DataFrame(index=steps_grid)

# Bucket the raw records onto the same interval:
# average the steps, keep the first participant label per bucket
steps_binned = steps.resample(steps_freq).agg({
  'steps': 'mean',
  'participant': 'first'
})

# Left-join onto the grid so every slot exists, then mark empty slots:
# -1 for the step count, USER for the participant label
steps = reference.merge(
  steps_binned, on='Timestamp', how='left'
).fillna({
  'steps': -1,
  'participant': USER
})

### Stress
* Fill missing values with -1, as per Garmin's documentation for missing values before resampling
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [366]:
# Garmin's stress interval
stress_freq = '3min'

# Create reference dataframe based on the start date and end date.
reference = pd.DataFrame(
  index=pd.date_range(
    start=stress.index.min(),
    end=stress.index.max(),
    freq=stress_freq,
    name='Timestamp'
  )
)

# Resample by dataset's interval
# Merge reference with stress data
# Fill missing values based on dataset's handling of missing values
#   NOTE: the fill key must match the aggregated column name, 'stress_score'.
#   The previous key 'stress' matched no column, so fillna silently skipped it
#   and missing stress scores stayed NaN instead of becoming -1.
stress = reference.merge(
  stress.resample(stress_freq).agg({
    'stress_score': 'mean',
    'stress_interpretation': 'first',
    'participant': 'first'
  }), on='Timestamp', how='left'
).fillna({
  'stress_score': -1,
  'stress_interpretation': 'Not enough data',
  'participant': USER
})

### Sleep

In [367]:
# Duration of each sleep record in minutes:
# (End Time - Start Time) divided by a one-minute timedelta
sleep['duration'] = (sleep['End Time'] - sleep['Start Time']
                     ) / np.timedelta64(1, 'm')

# Reshape to one row per Calendar Date and one column per Sleep Type,
# summing the minutes spent in each type on that date
sleep = sleep.pivot_table(
    index='Calendar Date',
    columns='Sleep Type',
    values='duration',
    aggfunc='sum'
)

# Round-trip through records to get a plain frame re-indexed by Calendar Date,
# then treat a sleep type that is absent on a date as 0 minutes
sleep = pd.DataFrame(sleep.to_records()).set_index('Calendar Date').fillna(0)

# Convert the index to datetimes and rename it to 'Timestamp'
# so it shares its index name with the other Garmin frames
sleep.index = pd.to_datetime(sleep.index)
sleep.index.name = 'Timestamp'

# Derived sleep features (all in minutes unless noted)
#   nonrem_total      = deep + light
#   total             = nonrem_total + rem
#   nonrem_percentage = nonrem_total / total            (ratio)
#   sleep_efficiency  = total / (total + awake)         (ratio)
# NOTE(review): assumes the 'deep', 'light', 'rem' and 'awake' columns all
#   exist after the pivot; a missing sleep type would raise KeyError here.
sleep['nonrem_total'] = (sleep['deep'] + sleep['light'])
sleep['total'] = (sleep['nonrem_total'] + sleep['rem'])
sleep['nonrem_percentage'] = sleep['nonrem_total'] / sleep['total']
sleep['sleep_efficiency'] = sleep['total'] / (sleep['total'] + sleep['awake'])

# Ignore the unmeasurable sleep classification when it is present
if 'unmeasurable' in sleep.columns:
  sleep.drop(columns='unmeasurable', inplace=True)

### Combine Garmin dataset

Resample to desired interval e.g., 15-minute, 3-minute.

Fill missing values due to resampling:
* Fill using previous known value.
* `ffill()` does this step.

In [368]:
# The four Garmin frames whose index ranges bound the combined dataset
garmin_frames = [heart_rate, steps, stress, sleep]

# Earliest first timestamp and latest last timestamp across the four frames
start = min(frame.index.min() for frame in garmin_frames)
end = max(frame.index.max() for frame in garmin_frames)

# Print the overall bounds next to each frame's own bounds for a visual check
print("Check min and max value from datasets")
print(f"min date: {start}")
print(
  f'Heart rate: {heart_rate.index.min()}\n',
  f'Steps: {steps.index.min()}\n',
  f'Stress: {stress.index.min()}\n',
  f'Sleep: {sleep.index.min()}\n',
)

print(f"max date: {end}")
print(
  f'Heart rate: {heart_rate.index.max()}\n',
  f'Steps: {steps.index.max()}\n',
  f'Stress: {stress.index.max()}\n',
  f'Sleep: {sleep.index.max()}\n',
)

Check min and max value from datasets
min date: 2022-12-01 00:00:00
Heart rate: 2022-12-01 00:00:00
 Steps: 2022-12-01 00:00:00
 Stress: 2022-12-01 00:00:00
 Sleep: 2022-12-01 00:00:00

max date: 2023-01-30 23:59:45
Heart rate: 2023-01-30 23:59:45
 Steps: 2023-01-30 23:45:00
 Stress: 2023-01-30 23:57:00
 Sleep: 2023-01-30 00:00:00



In [369]:
# Evenly spaced target-frequency timestamps over the whole collection period
reference = pd.DataFrame(
  index=pd.date_range(
    start=start,
    end=end,
    freq=TARGET_FREQ,
    name='Timestamp'
  )
)

# Attach the signals one by one. Each signal is resampled to the target
# frequency using the mean of its column, left-joined on Timestamp, and the
# running result is forward-filled after every join so gaps take the previous
# known value.
#   heart rate: 15sec -> target frequency (e.g., 15min)
#   steps:      15min -> target frequency
#   stress:     3min  -> target frequency
garmin_data = reference
for signal_frame, signal_column in (
  (heart_rate, 'heart_rate'),
  (steps, 'steps'),
  (stress, 'stress_score'),
):
  signal_resampled = signal_frame.resample(TARGET_FREQ).agg({
    signal_column: 'mean',
  })
  garmin_data = garmin_data.merge(
    signal_resampled, on='Timestamp', how='left'
  ).ffill()

# Sleep (1d) keeps all of its columns: resample with the mean, join, forward-fill
sleep_resampled = sleep.resample(TARGET_FREQ).mean()
garmin_data = garmin_data.merge(
  sleep_resampled, on='Timestamp', how='left'
).ffill()

display(garmin_data)

Unnamed: 0_level_0,heart_rate,steps,stress_score,awake,deep,light,rem,nonrem_total,total,nonrem_percentage,sleep_efficiency
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-12-01 00:00:00,74.500000,0.0,24.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,0.991150
2022-12-01 00:15:00,76.266667,0.0,14.4,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,0.991150
2022-12-01 00:30:00,74.166667,0.0,12.2,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,0.991150
2022-12-01 00:45:00,70.966667,0.0,13.8,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,0.991150
2022-12-01 01:00:00,70.733333,0.0,17.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,0.991150
...,...,...,...,...,...,...,...,...,...,...,...
2023-01-30 22:45:00,86.450000,0.0,41.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,0.993763
2023-01-30 23:00:00,77.516667,0.0,25.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,0.993763
2023-01-30 23:15:00,76.933333,0.0,22.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,0.993763
2023-01-30 23:30:00,82.283333,9.0,20.2,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,0.993763


# Combine datasets

## Match wearing-off periods with Garmin data

In [370]:
def pysqldf(q):
  """Run the SQL query `q` through pandasql, resolving names via globals()."""
  return sqldf(q, globals())


# Left-join every Garmin row to a wearing-off period whose
# [started_at, finished_at] range contains the row's timestamp.
# 'Wearing Off' is 1 when a period matched and 0 otherwise.
cond_join = '''
  select 
    garmin.*,
    wearing_off_periods.*,
    case
      when wearing_off_periods.[started_at] is not null THEN 1
    else 0
    end as 'Wearing Off'
  from garmin_data as garmin
  left join wearing_off_periods
  on garmin.[Timestamp] BETWEEN
    wearing_off_periods.[started_at] AND wearing_off_periods.[finished_at]
'''

# Run the join, apply the wearing-off column names, keep the first row per
# timestamp, and index the result by timestamp
combined_data = (
  pysqldf(cond_join)
  .rename(columns=WEARING_OFF_COLUMNS)
  .drop_duplicates(subset='timestamp')
  .set_index('timestamp')
)

# Convert the index values to datetimes (DateTimeIndex)
combined_data.index = pd.to_datetime(combined_data.index)

## Match drug intake time to combined data

In [371]:
# Length of the matching window that follows each drug intake:
# one target interval, e.g. 15 minutes
intake_buffer = np.timedelta64(TARGET_FREQ_AS_INT, TARGET_FREQ_UNIT2)

# End of the window = reported intake time + buffer.
# Only an "after" buffer is added; no "before" buffer is computed.
drug_intake_with_symptoms['started_at_after_buffer'] = (
  drug_intake_with_symptoms['started_at'] + intake_buffer
)

drug_intake_with_symptoms

Unnamed: 0_level_0,started_at,pain,tremors,anxiety,rigidity,slowdown,slow_thoughts,impairment_hands,moodchange,muscle_spasm,started_at_after_buffer
drug_intake_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
229,2022-12-19 13:55:21,0,1,0,0,0,1,0,0,0,2022-12-19 14:10:21
294,2023-01-13 12:13:06,0,0,0,0,0,0,0,0,0,2023-01-13 12:28:06
314,2023-01-14 15:17:33,0,0,0,0,0,0,0,0,0,2023-01-14 15:32:33
463,2022-12-01 07:00:00,0,0,0,0,0,0,0,0,0,2022-12-01 07:15:00
464,2022-12-02 07:00:00,0,0,0,0,0,0,0,0,0,2022-12-02 07:15:00
...,...,...,...,...,...,...,...,...,...,...,...
701,2023-01-27 17:00:00,0,0,0,0,0,0,0,0,0,2023-01-27 17:15:00
701,2023-01-27 20:00:00,0,0,0,0,0,0,0,0,0,2023-01-27 20:15:00
701,2023-01-27 20:00:00,0,0,0,0,0,0,0,0,0,2023-01-27 20:15:00
701,2023-01-27 20:00:00,0,0,0,0,0,0,0,0,0,2023-01-27 20:15:00


In [372]:
# NOTE: `pysqldf` is already defined in the wearing-off matching cell above;
# the identical second definition that used to live here was removed.

# Left-join every combined row to a drug intake whose
# [started_at, started_at_after_buffer] window contains the row's timestamp.
# 'drug_intake' is 1 when an intake matched and 0 otherwise.
cond_join = '''
  select 
    combined_data.*,
    drug_intake_with_symptoms.*,
    case
      when drug_intake_with_symptoms.[started_at] is not null THEN 1
    else 0
    end as 'drug_intake'
  from combined_data
  left join drug_intake_with_symptoms
  on combined_data.[timestamp] BETWEEN
  drug_intake_with_symptoms.[started_at] AND
    drug_intake_with_symptoms.[started_at_after_buffer]
'''

# Merge combined data with drug intake times
# Then, update columns
combined_data = pysqldf(cond_join).rename(columns=DRUG_INTAKE_COLUMNS)

# Drop duplicates based on timestamp (keeps the first matched row)
combined_data.drop_duplicates(subset='timestamp', inplace=True)

# The buffer column was only needed for the join condition
combined_data.drop(
  columns=['started_at_after_buffer'],
  inplace=True)

# Set timestamp as index, of type DateTimeIndex
combined_data.set_index('timestamp', inplace=True)
combined_data.index = pd.to_datetime(combined_data.index)
combined_data

Unnamed: 0_level_0,heart_rate,steps,stress_score,awake,deep,light,rem,nonrem_total,total,nonrem_percentage,...,drug_intake_pain,drug_intake_tremors,drug_intake_anxiety,drug_intake_rigidity,drug_intake_slowdown,drug_intake_slow_thoughts,drug_intake_impairment_hands,drug_intake_moodchange,drug_intake_muscle_spasm,drug_intake
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-01 00:00:00,74.500000,0.0,24.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,,,,,,,,,,0
2022-12-01 00:15:00,76.266667,0.0,14.4,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,,,,,,,,,,0
2022-12-01 00:30:00,74.166667,0.0,12.2,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,,,,,,,,,,0
2022-12-01 00:45:00,70.966667,0.0,13.8,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,,,,,,,,,,0
2022-12-01 01:00:00,70.733333,0.0,17.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-30 22:45:00,86.450000,0.0,41.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,...,,,,,,,,,,0
2023-01-30 23:00:00,77.516667,0.0,25.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,...,,,,,,,,,,0
2023-01-30 23:15:00,76.933333,0.0,22.0,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,...,,,,,,,,,,0
2023-01-30 23:30:00,82.283333,9.0,20.2,3.0,81.0,288.0,109.0,369.0,478.0,0.771967,...,,,,,,,,,,0


## Generate final symptoms

Initial symptoms + Symptoms after drug intake

In [373]:
# Symptom columns reported with a wearing-off period
wo_symptoms = ['wo_pain', 'wo_tremors', 'wo_anxiety', 'wo_rigidity',
               'wo_slowdown', 'wo_slow_thoughts', 'wo_impairment_hands',
               'wo_moodchange', 'wo_muscle_spasm']

# Symptom columns reported with a drug intake (same order as wo_symptoms)
drug_intake_symptoms = ['drug_intake_pain', 'drug_intake_tremors',
                        'drug_intake_anxiety', 'drug_intake_rigidity',
                        'drug_intake_slowdown', 'drug_intake_slow_thoughts',
                        'drug_intake_impairment_hands',
                        'drug_intake_moodchange', 'drug_intake_muscle_spasm']

# Output column names (same order), plus the "any symptom" flag at the end
symptoms = ['pain', 'tremors', 'anxiety', 'rigidity', 'slowdown', 'slow_thoughts',
            'impairment_hands', 'moodchange', 'muscle_spasm', 'wearing_off_post_meds']


def generate_final_symptoms(row):
  """Merge wearing-off and drug-intake symptom flags for one record.

  For each symptom pair the drug-intake value wins when it is present;
  otherwise the wearing-off value is used, and 0 when both are missing.
  A value counts as missing when it is None or NaN. (Previously a None
  drug-intake value raised TypeError inside math.isnan.)

  Args:
    row: mapping/Series indexable by every name in `wo_symptoms` and
      `drug_intake_symptoms`.

  Returns:
    pd.Series of len(wo_symptoms) + 1 values: one per symptom, followed by
    1 if the symptom values sum to at least 1, else 0.
  """
  values = []
  for wo_symptom, drug_intake_symptom in zip(wo_symptoms, drug_intake_symptoms):
    post_intake_value = row[drug_intake_symptom]
    if not pd.isna(post_intake_value):
      # A value reported at drug intake overrides the wearing-off value
      values.append(post_intake_value)
      continue

    initial_value = row[wo_symptom]
    values.append(0 if pd.isna(initial_value) else initial_value)

  # Final flag: at least one symptom is present
  values.append(1 if sum(values) >= 1 else 0)
  return pd.Series(values)

In [374]:
# Build the merged symptom values row by row
final_symptoms = combined_data.apply(generate_final_symptoms, axis=1)

# Label the positional output with the final symptom names
final_symptoms.columns = symptoms

# Attach the final symptom columns to the combined dataset
combined_data = combined_data.join(final_symptoms)

## Generate final wearing-off

In [375]:
def combine_wearing_offs(n):
  """Return 1 when `n` is greater than 0, otherwise 0."""
  return 1 if n > 0 else 0


# Sum of the reported wearing-off flag and the post-medication flag per record
wearing_off_votes = (
  combined_data.wearing_off + combined_data.wearing_off_post_meds
)

# A record is a final wearing-off when either flag is set (sum > 0)
combined_data["final_wearing_off"] = wearing_off_votes.apply(
  combine_wearing_offs
).values

In [376]:
# Show the first five records, then the full list of column names
display(combined_data.head(5), combined_data.columns)

Unnamed: 0_level_0,heart_rate,steps,stress_score,awake,deep,light,rem,nonrem_total,total,nonrem_percentage,...,tremors,anxiety,rigidity,slowdown,slow_thoughts,impairment_hands,moodchange,muscle_spasm,wearing_off_post_meds,final_wearing_off
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-01 00:00:00,74.5,0.0,24.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2022-12-01 00:15:00,76.266667,0.0,14.4,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2022-12-01 00:30:00,74.166667,0.0,12.2,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2022-12-01 00:45:00,70.966667,0.0,13.8,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2022-12-01 01:00:00,70.733333,0.0,17.0,4.0,97.0,233.0,118.0,330.0,448.0,0.736607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


Index(['heart_rate', 'steps', 'stress_score', 'awake', 'deep', 'light', 'rem',
       'nonrem_total', 'total', 'nonrem_percentage', 'sleep_efficiency',
       'wearing_off_id', 'wo_start', 'wo_end', 'wo_pain', 'wo_tremors',
       'wo_anxiety', 'wo_rigidity', 'wo_slowdown', 'wo_slow_thoughts',
       'wo_impairment_hands', 'wo_moodchange', 'wo_muscle_spasm',
       'wearing_off', 'drug_intake_id', 'drug_intake_start',
       'drug_intake_pain', 'drug_intake_tremors', 'drug_intake_anxiety',
       'drug_intake_rigidity', 'drug_intake_slowdown',
       'drug_intake_slow_thoughts', 'drug_intake_impairment_hands',
       'drug_intake_moodchange', 'drug_intake_muscle_spasm', 'drug_intake',
       'pain', 'tremors', 'anxiety', 'rigidity', 'slowdown', 'slow_thoughts',
       'impairment_hands', 'moodchange', 'muscle_spasm',
       'wearing_off_post_meds', 'final_wearing_off'],
      dtype='object')

# Post-Processing

## Compute wearing-off duration

* Compute wearing-off duration
* Compute wearing-off duration until the next reported wearing-off (currently disabled; that cell is commented out)

In [377]:
# Compute wearing-off duration
combined_data['wo_duration'] = (
  pd.to_datetime(combined_data.index) -
  pd.to_datetime(combined_data['wo_start'])
) / np.timedelta64(1, TARGET_FREQ_UNIT2)
combined_data['wo_duration'] = combined_data['wo_duration'].fillna(0)

combined_data.loc[:, ['final_wearing_off',
                      'wo_duration', 'wo_start']].iloc[120:145]

Unnamed: 0_level_0,final_wearing_off,wo_duration,wo_start
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-12-02 06:00:00,1,0.0,2022-12-02 06:00:00.000000
2022-12-02 06:15:00,1,15.0,2022-12-02 06:00:00.000000
2022-12-02 06:30:00,1,30.0,2022-12-02 06:00:00.000000
2022-12-02 06:45:00,1,45.0,2022-12-02 06:00:00.000000
2022-12-02 07:00:00,0,0.0,
2022-12-02 07:15:00,0,0.0,
2022-12-02 07:30:00,0,0.0,
2022-12-02 07:45:00,0,0.0,
2022-12-02 08:00:00,0,0.0,
2022-12-02 08:15:00,0,0.0,


In [378]:
# # Compute wearing-off duration until the next reported wearing-off
# combined_data['wo_duration'] = (
#   pd.to_datetime(combined_data.index) -
#   pd.to_datetime(combined_data['wo_start'])
# ) / np.timedelta64(1, TARGET_FREQ_UNIT2)

# gid = combined_data['wo_duration'].notnull().cumsum()
# dg = combined_data.groupby(gid)
# base = dg['wo_duration'].transform('last')
# combined_data['wo_duration'] = (base + (dg.cumcount()) * TARGET_FREQ_AS_INT)

# combined_data.loc[:, ['final_wearing_off',
#                       'wo_duration', 'wo_start']].iloc[120:145]

## Compute time from last drug taken

Find the difference between the **reference timestamp** & **drug intake start**

In [379]:
# Compute time from last drug intake
combined_data['time_from_last_drug_taken'] = (
  pd.to_datetime(combined_data.index) -
  pd.to_datetime(combined_data['drug_intake_start'])
) / np.timedelta64(1, TARGET_FREQ_UNIT2)

# Then, fill records after drug intake report by adding target frequency
gid = combined_data['time_from_last_drug_taken'].notnull().cumsum()
dg = combined_data.groupby(gid)
base = dg['time_from_last_drug_taken'].transform('last')
combined_data['time_from_last_drug_taken'] = (
  base + (dg.cumcount()) * TARGET_FREQ_AS_INT)

if combined_data['time_from_last_drug_taken'].isna().any():
  combined_data['time_from_last_drug_taken'] = combined_data['time_from_last_drug_taken'].fillna(
    0)

combined_data.loc[:, ['drug_intake_id', 'drug_intake_start',
                      'time_from_last_drug_taken']].iloc[40:70]

Unnamed: 0_level_0,drug_intake_id,drug_intake_start,time_from_last_drug_taken
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-12-01 10:00:00,,,180.0
2022-12-01 10:15:00,,,195.0
2022-12-01 10:30:00,,,210.0
2022-12-01 10:45:00,,,225.0
2022-12-01 11:00:00,,,240.0
2022-12-01 11:15:00,,,255.0
2022-12-01 11:30:00,,,270.0
2022-12-01 11:45:00,,,285.0
2022-12-01 12:00:00,,,300.0
2022-12-01 12:15:00,,,315.0


## Include hour & day of the week


In [380]:
# Calendar features read from the DateTimeIndex:
# hour of the day and day of the week
combined_data = combined_data.assign(
  timestamp_hour=combined_data.index.hour,
  timestamp_dayofweek=combined_data.index.dayofweek,
)

## Encode hour-features as cyclical features
Include sine(hour) and cosine(hour) as features.

In [381]:
# Fix timestamp format
date_time = pd.to_datetime(combined_data.index, format='%d.%m.%Y %H:%M:%S')

# Convert to timestamp
timestamp_s = date_time.map(pd.Timestamp.timestamp)

# Get seconds per day
day = 24 * 60 * 60
# Get seconds per year
year = 365.2425 * day

# Get sine(), cosine() for hour-feature
combined_data['timestamp_hour_sin'] = np.sin(timestamp_s * (2 * np.pi / day))
combined_data['timestamp_hour_cos'] = np.cos(timestamp_s * (2 * np.pi / day))

# Save combined & processed dataset

In [382]:
# Choose features
# Garmin features
features = ['heart_rate', 'steps', 'stress_score',
            'awake', 'deep', 'light', 'rem',
            'nonrem_total', 'total', 'nonrem_percentage', 'sleep_efficiency']

# FonLog features
features += ['time_from_last_drug_taken', 'wo_duration']

# Additional features
features += ['timestamp_hour', 'timestamp_dayofweek',
             'timestamp_hour_sin', 'timestamp_hour_cos']

# wearing_off | wearing_off_post_meds | final_wearing_off
TARGET_COLUMN = 'final_wearing_off'
features.append(TARGET_COLUMN)

# columns = ['timestamp'] + features + ['participant']

In [383]:
# Select features to include
for_saving = combined_data.loc[:, features].copy()

# Rename target column
for_saving.rename(columns={
  TARGET_COLUMN: 'wearing_off'
}, inplace=True)

# Add back the participant column
for_saving['participant'] = USER

# Save combined and processed dataset to file
#  If the Excel file exists, append to the existing file
#  If the Excel file does not exist, create a new file
for_saving_file = f'{BASE_DATA_PATH}/{COLLECTION}/combined_data.xlsx'
if os.path.exists(for_saving_file):
  with pd.ExcelWriter(for_saving_file, mode='a', if_sheet_exists='overlay') as writer:
    for_saving.reset_index().to_excel(
      writer, sheet_name='combined_dataset',
      startrow=writer.sheets['combined_dataset'].max_row,
      header=None, index=False
    )
else:
  with pd.ExcelWriter(for_saving_file) as writer:
    for_saving.to_excel(writer, sheet_name='combined_dataset')

In [384]:
# Echo the participant that this run processed
USER

'participant13'