# Load libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandasql import sqldf
import seaborn
import math
import os
import json
from IPython.display import Javascript

# Configurations & Constants

In [None]:
# Key into participant_dictionary selecting whose data this run processes.
user = 'participant13' # e.g. 'participant1'; must be a key of participant_dictionary.json
# Numeric part of the resampling step used for the combined dataset.
target_freq_as_int = 15 # 15|1
# Unit suffix combined with target_freq_as_int to form `target_freq` (used as a pandas frequency).
target_freq_unit1 = 'min' # min|s
# Unit passed to np.timedelta64 when converting elapsed time to a number.
target_freq_unit2 = 'm' # m|s
# NOTE(review): not referenced in the cells visible here — confirm it is still needed.
dataset_type = '' # '' | time_series_

In [None]:
# participant information
# Read the dictionary inside a context manager so the file handle is closed
# promptly, and decode explicitly as UTF-8 instead of the locale default.
with open('./data/participant_dictionary.json', encoding='utf-8') as participant_file:
    participant_dictionary = json.load(participant_file)

# Frequency strings derived from the configured number and units.
target_freq = f'{target_freq_as_int}{target_freq_unit1}'
target_freq2 = f'{target_freq_as_int}{target_freq_unit2}'

# Identifiers and collection window of the selected participant.
customer_id = participant_dictionary[user]['fonlog_id']
user_id = participant_dictionary[user]['fonlog_user_id']
start_date = participant_dictionary[user]['start_date']
end_date_plus_one = participant_dictionary[user]['end_date_plus_one']
end_date_plus_two = participant_dictionary[user]['end_date_plus_two']

# Colour names keyed by box-plot element.
# NOTE(review): not referenced in the cells visible here.
color = {
    "boxes": "Blue",
    "whiskers": "Black",
    "medians": "Red",
    "caps": "Gray"
}

# Rename map applied to the Garmin x wearing-off join result:
# source column name -> short snake_case name used downstream.
wo_columns = {
    "Timestamp": "timestamp",
    "Heart Rate (in Beats per minute)": "heart_rate",
    "Stress Score": "stress_score",
    "Stress Interpretation": "stress_level",
    "Number of Steps": "steps",
    "Wearing Off": "wearing_off",
    "started_at": "wo_start",
    "finished_at": "wo_end",
    "Tremors": "wo_tremors",
    "Slowing down of movement": "wo_slowdown",
    "Change in mood or depression": "wo_moodchange",
    "Rigidity of muscles": "wo_rigidity",
    "Sharp pain or prolonged dull pain": "wo_pain",
    "Impairment of complex movements of the hand and fingers": "wo_impairment_hands",
    "Difficulty integrating thoughts or slowing down of thought": "wo_slow_thoughts",
    "Anxiety or panic attacks": "wo_anxiety",
    "Muscle spasm": "wo_muscle_spasm",
    "activity_target.activity_id": "report_id"
}

# Rename map for drug-intake report columns.
# NOTE(review): several symptom labels map to a different symptom than in
# wo_columns (e.g. "Tremors" -> "drug_intake_slowdown"). Confirm whether this
# is intentional or a mislabelling before relying on these names.
drug_intake_columns = {
    "started_at": "drug_intake_start",
    "finished_at": "drug_intake_end",
    "Sharp pain or prolonged dull pain": "drug_intake_tremors",
    "Tremors": "drug_intake_slowdown",
    "Anxiety or panic attacks": "drug_intake_moodchange",
    "Rigidity of muscles": "drug_intake_rigidity",
    "Slowing down of movement": "drug_intake_pain",
    "Difficulty integrating thoughts or slowing down of thought": "drug_intake_impairment_hands",
    "Impairment of complex movements of the hand and fingers": "drug_intake_slow_thoughts",
    "Change in mood or depression": "drug_intake_anxiety",
    "Muscle spasm": "drug_intake_muscle_spasm"
}

# Japanese symptom label -> English symptom label.
# The keys are matched against data, so they must stay byte-identical.
symptoms_dictionary = {
    "ふるえる": "Tremors",
    "動作が遅くなる": "Slowing down of movement",
    "気分が変化する または おちこむ": "Change in mood or depression",
    "体のどこかがこわばる": "Rigidity of muscles",
    "するどい痛み または 長ぐ続ぐこぶい痛みがある": "Sharp pain or prolonged dull pain",
    "手先の細かい作業が うまくできない": "Impairment of complex movements of the hand and fingers",
    "思考がまとまらない または 頭の回転がおそい": "Difficulty integrating thoughts or slowing down of thought",
    "不安になる または パニック状態になる": "Anxiety or panic attacks",
    "筋肉がひきつる": "Muscle spasm"
}

# FonLog

## Process wearing-off dataset

In [None]:
# import deepl 
# translator = deepl.Translator(auth_key=os.environ["DEEPL_AUTH_KEY"])  # never commit API keys; read them from the environment

# def translate(df):
#     result = translator.translate_text(df, target_lang="EN-GB")
#     return [ r.text for r in result ]

In [None]:
# Columns of the raw FonLog export that later cells rely on, mapped to the
# name they carry afterwards. Only the three patient columns are renamed;
# the dotted FonLog names are otherwise kept because later cells query them.
# (An earlier revision shortened more names and machine-translated the
# *.name columns with DeepL; both steps are disabled.)
important_columns = {
    "activity_target.activity_id":           "activity_target.activity_id",
    "activity_type.activity_type_group_id":  "activity_type.activity_type_group_id",
    "activity_type_group.name":              "activity_type_group.name",
    "activity.activity_type_id":             "activity.activity_type_id",
    "record_type.activity_type_id":          "record_type.activity_type_id",
    "activity_type.name":                    "activity_type.name",
    "record_type.name":                      "record_type.name",
    "record_type_id":                        "record_type_id",
    "value":                                 "value",
    "activity.started_at":                   "activity.started_at",
    "activity.finished_at":                  "activity.finished_at",
    "activity_target.user_id":               "activity_target.user_id",
    "activity.user_id":                      "activity.user_id",
    "activity_target.customer_id":           "activity_target.customer_id",
    "customer.name":                         "patient_name",
    "customer.birthdate":                    "patient_birthdate",
    "customer.sex":                          "patient_sex"
}

# Read the export, keep exactly the columns above (in that order), rename.
fonlog_data = (
    pd.read_csv('./data/fonlog/records 2023 2 - 10.csv')
    .reindex(columns=important_columns)
    .rename(columns=important_columns)
)

# Persist the trimmed table for the next cell to reload.
fonlog_data.to_excel('./data/fonlog/records.xlsx', sheet_name="records", index=False)

In [None]:
# Activity type of the wearing-off report to extract:
#   17 -> nurse-kitakyu
#   12 -> nurse-kitakyu patient
#    2 -> wearing-off's for noelact
activity_type_id = 17

# Reload the trimmed records written by the previous cell.
fonlog_data = pd.read_excel(
    './data/fonlog/records.xlsx', sheet_name='records', engine='openpyxl'
)

# Both activity timestamps must be real datetimes for the date filters below.
for _time_column in ('activity.started_at', 'activity.finished_at'):
    fonlog_data[_time_column] = pd.to_datetime(fonlog_data[_time_column])

In [None]:
# Successive row filters, applied in this order:
# activity group, activity, patient, collection window.
wearing_off_filters = (
  '`activity_type_group.name` == "Wearing-Off by Nurse"',
  f'`record_type.activity_type_id` == {activity_type_id}',
  f"`activity_target.customer_id` == {customer_id} or `activity_target.user_id` == {user_id}",
  f'`activity.started_at` >= "{start_date}" and `activity.started_at` < "{end_date_plus_two}"',
)

filtered_fonlog_data = fonlog_data
for _condition in wearing_off_filters:
  filtered_fonlog_data = filtered_fonlog_data.query(_condition)
# Detach the result from fonlog_data.
filtered_fonlog_data = filtered_fonlog_data.copy()
# ).groupby([fonlog_data['activity.started_at'].dt.date]).count()
# add if you want to get count by date

In [None]:
from datetime import datetime as dt
import ast

def convert_list_of_string_periods(started_at, string_of_periods):
  """Expand a stringified list of "HH:MM - HH:MM" periods into datetimes.

  Args:
    started_at: Datetime-like object; its ``.date()`` is the calendar day
      given to the start and the end of every period.
    string_of_periods: Text of a Python literal list of period strings,
      e.g. ``"['10:00 - 11:00', '14:30 - 15:00']"``.

  Returns:
    A list of ``[start, end]`` datetime pairs, one per period. When the text
    is not a valid literal (for example ``'[null]'``) or evaluates to
    ``None``, a single ``[nan, nan]`` pair is returned instead.
  """
  # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
  empty_result = [[np.nan, np.nan]]

  try:
    list_of_string = ast.literal_eval(string_of_periods)
  except (ValueError, TypeError, SyntaxError):
    # literal_eval raises SyntaxError (not ValueError) for truncated text.
    return empty_result

  if list_of_string is None:
    return empty_result

  # Take the day once, without overwriting the caller's `started_at`.
  report_date = started_at.date()
  results = []
  for period in list_of_string:
    bounds = period.split(' - ')
    period_start = dt.combine(report_date, dt.strptime(bounds[0], "%H:%M").time())
    period_end = dt.combine(report_date, dt.strptime(bounds[1], "%H:%M").time())
    results.append([period_start, period_end])

  return results

In [None]:
# Expand every wearing-off report into one row per reported period.
# The frames are collected first and concatenated once; concatenating inside
# the loop re-copies all earlier rows on every iteration.
wearing_off_frames = [
  pd.DataFrame(
    convert_list_of_string_periods(v['activity.started_at'], v['value']),
    columns=['started_at', 'finished_at']
  ).assign(
    wearing_off = v['activity_target.activity_id']
  )
  for i, v in filtered_fonlog_data.loc[
    :, ['activity_target.activity_id', 'value', 'activity.started_at']
  ].dropna().iterrows()
]

if wearing_off_frames:
  wearing_off_periods = pd.concat(wearing_off_frames)
else:
  # No report matched the filters: keep an empty, correctly typed table so
  # the following steps do not fail on None.
  wearing_off_periods = pd.DataFrame({
    'started_at': pd.Series(dtype='datetime64[ns]'),
    'finished_at': pd.Series(dtype='datetime64[ns]'),
    'wearing_off': pd.Series(dtype='object'),
  })

# Final layout: index named "activity_target.activity_id", plus the columns
# started_at, finished_at and wearing_off_id (a copy of the index).
wearing_off_periods = wearing_off_periods.set_index('wearing_off')
wearing_off_periods.index.name = "activity_target.activity_id"
wearing_off_periods['wearing_off_id'] = wearing_off_periods.index
wearing_off_periods.sort_values(by="started_at", inplace=True)

# Number of wearing-off periods per day.
display(
  wearing_off_periods.set_index('started_at').resample('D').count()
)

## Process drug intake dataset

In [None]:
# Activity types and record types that identify drug-intake reports.
activity_type_ids = [18, 19]
record_type_ids = [87, 88]

# Successive row filters, applied in this order:
# activity group, activity, record type, patient, collection window.
drug_intake_filters = (
  '`activity_type_group.name` == "Wearing-Off by Nurse"',
  f'`record_type.activity_type_id` in {activity_type_ids}',
  f'`record_type_id` in {record_type_ids}',
  f"`activity_target.customer_id` == {customer_id} or `activity_target.user_id` == {user_id}",
  f'`activity.started_at` >= "{start_date}" and `activity.started_at` < "{end_date_plus_two}"',
)

filtered_fonlog_data = fonlog_data
for _condition in drug_intake_filters:
  filtered_fonlog_data = filtered_fonlog_data.query(_condition)
# Detach the result from fonlog_data.
filtered_fonlog_data = filtered_fonlog_data.copy()

In [None]:
# Expand every drug-intake report into one row per reported period.
# The frames are collected first and concatenated once; concatenating inside
# the loop re-copies all earlier rows on every iteration.
drug_intake_frames = [
  pd.DataFrame(
    convert_list_of_string_periods(v['activity.started_at'], v['value']),
    columns=['started_at', 'finished_at']
  ).assign(
    drug_intake_id = v['activity_target.activity_id'],
    activity_type_id = v['record_type.activity_type_id']
  )
  for i, v in filtered_fonlog_data.loc[
    :, ['activity_target.activity_id', 'value', 'activity.started_at', 'record_type.activity_type_id']
  ].dropna().iterrows()
]

if drug_intake_frames:
  drug_intake = pd.concat(drug_intake_frames)
else:
  # No report matched the filters: keep an empty, correctly typed table so
  # the following steps do not fail on None.
  drug_intake = pd.DataFrame({
    'started_at': pd.Series(dtype='datetime64[ns]'),
    'finished_at': pd.Series(dtype='datetime64[ns]'),
    'drug_intake_id': pd.Series(dtype='object'),
    'activity_type_id': pd.Series(dtype='object'),
  })

# Final layout: index named "activity_target.activity_id", plus the columns
# started_at, finished_at, activity_type_id and drug_intake_id (a copy of
# the index).
drug_intake = drug_intake.set_index('drug_intake_id')
drug_intake.index.name = "activity_target.activity_id"
drug_intake['drug_intake_id'] = drug_intake.index
drug_intake.sort_values(by="started_at", inplace=True)
display(drug_intake)
print(drug_intake.size)

# Garmin

## Heart rate

In [None]:
# Heart-rate sheet of the participant's Garmin workbook, ordered by
# Timestamp and cut to the collection window.
heart_rate = (
    pd.read_excel(
        f'./data/garmin/{user}.xlsx',
        sheet_name='Heart Rate', index_col='Timestamp', engine='openpyxl',
    )
    .sort_values('Timestamp')
    .loc[start_date:end_date_plus_one]
    .rename(columns={'Heart Rate (in Beats per minute)': 'heart_rate'})
)
# display(heart_rate.describe())
# display(heart_rate.head())
# display(heart_rate.tail())

## Steps

In [None]:
# Steps sheet of the participant's Garmin workbook, ordered by Timestamp
# and cut to the collection window.
steps = (
    pd.read_excel(
        f'./data/garmin/{user}.xlsx',
        sheet_name='Steps', index_col='Timestamp', engine='openpyxl',
    )
    .sort_values('Timestamp')
    .loc[start_date:end_date_plus_one]
    .rename(columns={'Number of Steps': 'steps'})
)
# display(steps.describe())
# display(steps.head())
# display(steps.tail())

## Stress

In [None]:
# Stress sheet of the participant's Garmin workbook, ordered by Timestamp
# and cut to the collection window.
stress = (
    pd.read_excel(
        f'./data/garmin/{user}.xlsx',
        sheet_name='Stress', index_col='Timestamp', engine='openpyxl',
    )
    .sort_values('Timestamp')
    .loc[start_date:end_date_plus_one]
    .rename(columns={
        'Stress Score': 'stress_score',
        'Stress Interpretation': 'stress_interpretation',
    })
)
# display(stress.describe())
# display(stress.head())
# display(stress.tail())

## Sleep

In [None]:
# Load the Sleep sheet, indexed by calendar date (one row per sleep segment,
# judging by the 'Sleep Type' / 'Start Time' / 'End Time' columns used below).
sleep = pd.read_excel(f'./data/garmin/{user}.xlsx',
                           sheet_name='Sleep', index_col='Calendar Date',
                           engine='openpyxl')
# Order segments chronologically, then keep the collection window.
# NOTE(review): the rows are sorted by 'Start Time' but sliced on the
# 'Calendar Date' index — confirm the index is still ordered at this point.
sleep.sort_values('Start Time', inplace=True)
sleep = sleep.loc[start_date:end_date_plus_one]

# Compute duration in minutes
sleep['Duration'] = (sleep['End Time'] - sleep['Start Time']) / np.timedelta64(1, "m")

# # Show original sleep data format
# display('Original')
# display(sleep.head())
# display(sleep.tail())

# Transform sleep data by sleep classification type:
# one row per calendar date, one column per sleep type, summed minutes.
sleep = sleep.pivot_table(
    index = 'Calendar Date',
    columns = 'Sleep Type',
    values = 'Duration',
    aggfunc = 'sum'
)
# Round-trip through records to get a plain frame; dates without a given
# sleep type get 0 minutes.
sleep = pd.DataFrame(sleep.to_records()).set_index('Calendar Date').fillna(0)
# Make sure that sleep index is a DateTimeIndex type
sleep.index = pd.to_datetime(sleep.index)
sleep.index.name = 'Timestamp'

# Compute total non-rem sleep
# (requires 'deep', 'light', 'rem' and 'awake' columns to exist after the pivot)
sleep['nonrem_total'] = (sleep['deep'] + sleep['light'])
sleep['total'] = (sleep['nonrem_total'] + sleep['rem'])
sleep['nonrem_percentage'] = sleep['nonrem_total'] / sleep['total']
sleep['sleep_efficiency'] = sleep['total'] / (sleep['total'] + sleep['awake'])

# Ignore unmeasurable column from sleep dataset
if 'unmeasurable' in sleep.columns:
    sleep.drop(columns=['unmeasurable'], inplace=True)

# # Show transformed sleep data
# display('Summary')
# display(sleep.describe())
# display('Transformed')
# display(sleep)

## Complete collection period before resampling

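First, align each Garmin dataset to the full collection period at its own sampling rate. Note: the `-1` fill described in the sections below is currently disabled in the code (`.fillna(-1)` is commented out), so missing values remain `NaN`.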

### Heart rate
* Fill missing values with -1, as per Garmin's documentation for missing values before resampling
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [None]:
# Heart rate is placed on a 15-second grid.
heart_rate_freq = '15s'

# Every grid timestamp of the collection period; the end bound is removed.
reference = pd.DataFrame(
    index=pd.date_range(
        start_date, end_date_plus_two, freq=heart_rate_freq, name='Timestamp'
    ).drop(pd.Timestamp(end_date_plus_two))
)

# Left join onto the grid: timestamps without a reading stay in the frame
# as NaN (the -1 fill is currently disabled).
heart_rate = pd.merge(
    reference,
    heart_rate.resample(heart_rate_freq).mean(),
    on='Timestamp',
    how='left',
)#.fillna(-1)

# display(heart_rate.describe())
# display(heart_rate.head())
# display(heart_rate.tail())

### Steps
* Fill missing values with -1 to standardize with other Garmin datasets
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [None]:
# Steps are placed on a 15-minute grid.
steps_freq = '15min'

# Every grid timestamp of the collection period; the end bound is removed.
reference = pd.DataFrame(
    index=pd.date_range(
        start_date, end_date_plus_two, freq=steps_freq, name='Timestamp'
    ).drop(pd.Timestamp(end_date_plus_two))
)

# Left join onto the grid: timestamps without a reading stay in the frame
# as NaN (the -1 fill is currently disabled).
steps = pd.merge(
    reference,
    steps.resample(steps_freq).mean(),
    on='Timestamp',
    how='left',
)#.fillna(-1)

# display(steps.describe())
# display(steps.head())
# display(steps.tail())

### Stress
* Fill missing values with -1, as per Garmin's documentation for missing values before resampling
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [None]:
# Stress is placed on a 3-minute grid.
stress_freq = '3min'

# Every grid timestamp of the collection period; the end bound is removed.
reference = pd.DataFrame(
    index = pd.date_range(
        start_date, end_date_plus_two,
        freq = stress_freq, name='Timestamp'
    ).drop(
        pd.Timestamp(end_date_plus_two)
    )
)

# Average only the numeric columns. `stress` also carries
# 'stress_interpretation', which presumably holds text labels; pandas >= 2.0
# raises when asked for the mean of a non-numeric column, while older
# versions silently dropped it. Selecting numeric columns first gives the
# same result on both.
stress = reference.merge(
    stress.select_dtypes(include='number').resample(stress_freq).mean(),
    on='Timestamp', how='left'
)#.fillna(-1)

# display(stress.describe())
# display(stress.head())
# display(stress.tail())

### Sleep
* Fill missing values with -1 to standardize with other Garmin datasets
* Missing values for the expected period indicate that the Garmin vivosmart4 was not worn

In [None]:
# Sleep features are daily.
sleep_freq = 'D'

# Every day of the collection period; the end bound is removed.
reference = pd.DataFrame(
    index=pd.date_range(
        start_date, end_date_plus_two, freq=sleep_freq, name='Timestamp'
    ).drop(pd.Timestamp(end_date_plus_two))
)

# Left join onto the grid: days without sleep data stay in the frame as NaN
# (the -1 fill is currently disabled).
sleep = pd.merge(
    reference,
    sleep.resample(sleep_freq).mean(),
    on="Timestamp",
    how='left',
)#.fillna(-1)

# # Show transformed sleep data
# display('Summary')
# display(sleep.describe())
# display('Transformed')
# display(sleep)


In [None]:
# multi_res = pd.concat([heart_rate, steps, stress], axis=1).sort_index(ascending=True)# .to_excel("garmin_multiresolution.xlsx")
# multi_res

# sleep_multi_res = pd.concat([
#     pd.DataFrame(
#         index = pd.date_range(
#             start_date, end_date_plus_two,
#             freq = '15s', name = 'Timestamp'
#         ).drop(
#             pd.Timestamp(end_date_plus_two)
#         )
#     ).merge(
#         sleep.resample('15s').mean(), on='Timestamp', how='left'
#     ).ffill(),

#     pd.DataFrame(
#         index = pd.date_range(
#             start_date, end_date_plus_two,
#             freq = '15min', name = 'Timestamp'
#         ).drop(
#             pd.Timestamp(end_date_plus_two)
#         )
#     ).merge(
#         sleep.resample('15min').mean(), on='Timestamp', how='left'
#     ).ffill(),

#     pd.DataFrame(
#         index = pd.date_range(
#             start_date, end_date_plus_two,
#             freq = '3min', name = 'Timestamp'
#         ).drop(
#             pd.Timestamp(end_date_plus_two)
#         )
#     ).merge(
#         sleep.resample('3min').mean(), on='Timestamp', how='left'
#     ).ffill() 
# ]).reset_index().drop_duplicates(subset=['Timestamp']).set_index('Timestamp').sort_index(ascending=True)
# sleep_multi_res

# multi_res.merge(
#     sleep_multi_res, on='Timestamp', how='left'
# ).sort_index(ascending=True).to_excel("garmin_multiresolution.xlsx")

## Combine Garmin Dataset
Resample according to resampling plan.

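Missing values due to resampling:
* Fill using the previous known value; ``ffill()`` does this step.
* In the current code only the daily sleep features are forward-filled; the other ``ffill()`` calls are commented out, so those gaps remain ``NaN``.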

In [None]:
# Timestamp grid of the collection period at the target frequency; the end
# bound is removed.
reference = pd.DataFrame(
    index=pd.date_range(
        start_date, end_date_plus_two, freq=target_freq, name='Timestamp'
    ).drop(pd.Timestamp(end_date_plus_two))
)

# Resample heart rate, steps and stress to the target frequency (mean per
# bin) and left-join each onto the grid in turn. No forward fill is applied
# to these three.
garmin_data = reference
for _garmin_frame in (heart_rate, steps, stress):
    garmin_data = garmin_data.merge(
        _garmin_frame.resample(target_freq).mean(), on='Timestamp', how='left'
    )

# Sleep is daily, so each day's values are forward-filled across the finer
# target grid before joining.
garmin_data = garmin_data.merge(
    sleep.resample(target_freq).mean().ffill(), on='Timestamp', how='left'
)
# display('Combined Data')
# display(garmin_data)

In [None]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

def display_side_by_side(dfs:list, captions:list, tablespacing=5):
    """Render several DataFrames next to each other to save vertical space.

    Args:
        dfs: list of pandas.DataFrame to show.
        captions: list of table captions, paired with ``dfs`` by position
            (extra items on either side are ignored by ``zip``).
        tablespacing: number of non-breaking spaces placed after each table.
    """
    spacer = tablespacing * "\xa0"
    # Each table is styled inline so the browser lays them out on one row.
    output = "".join(
        df.style.set_table_attributes("style='display:inline'").set_caption(caption)._repr_html_()
        + spacer
        for caption, df in zip(captions, dfs)
    )
    display(HTML(output))

In [None]:
# Preview: first and last rows of the combined Garmin data, next to the
# daily count of wearing-off periods.
display_side_by_side(
  dfs=[
    garmin_data.head(5),
    garmin_data.tail(5),
    wearing_off_periods.set_index('started_at').resample("D").count(),
  ],
  captions=["head", "tail", "wearing-off"],
)

In [None]:
# Save the Garmin-only combined table (before wearing-off / drug-intake
# labels are attached) under the participant's step folder.
garmin_data.to_excel(f'./data/steps/{user}/3-{user}_combined_garmin_data_only.xlsx')

# Combine Datasets
Match wearing-off to combined Garmin data based on wearing-off start and end

In [None]:
# Helper that runs a SQL query against the notebook's global DataFrames.
pysqldf = lambda q: sqldf(q, globals())
# Range join: attach to each Garmin timestamp the wearing-off period that
# contains it (if any) and flag it in a 0/1 'Wearing Off' column.
# NOTE(review): the BETWEEN comparison runs inside the SQL engine on however
# pandasql stores the timestamps — confirm both sides use the same format.
cond_join= '''
    select 
        garmin.*,
        wearing_off_periods.*,
        case
            when wearing_off_periods.[started_at] is not null THEN 1
        else 0
        end as 'Wearing Off'
    from garmin_data as garmin
    left join wearing_off_periods
    on garmin.[Timestamp] BETWEEN wearing_off_periods.[started_at] AND wearing_off_periods.[finished_at]
'''

# Change wearing-off columns
# (wo_columns also turns 'Timestamp' into 'timestamp' and 'Wearing Off'
# into 'wearing_off').
combined_data = pysqldf(cond_join).rename(columns=wo_columns)

# Drop duplicates based on timestamp
# (a timestamp matching several periods keeps only its first joined row)
combined_data = combined_data.drop_duplicates(subset=['timestamp'])

# Set timestamp as index
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])
combined_data = combined_data.set_index('timestamp')

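Compute the wearing-off duration: the time elapsed between the start of the matched wearing-off period and each timestamp, expressed in `target_freq_unit2` units.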

In [None]:
# Create the column first, then overwrite it with the elapsed time.
combined_data['wo_duration'] = ''

# Elapsed time since the matched wearing-off period started; rows without a
# match have no start and therefore no duration.
wo_elapsed = pd.to_datetime(combined_data.index) - pd.to_datetime(combined_data['wo_start'])

# Express the elapsed time as a number of `target_freq_unit2` units.
combined_data['wo_duration'] = wo_elapsed / np.timedelta64(1, target_freq_unit2)

# gid = combined_data['wo_duration'].notnull().cumsum()
# dg = combined_data.groupby(gid)
# base = dg['wo_duration'].transform('last')
# combined_data['wo_duration'] = (  base + ( dg.cumcount() ) * target_freq_as_int)

# display(combined_data.iloc[140:150, ])

Match drug intake to combined Garmin data based on drug intake start and end

In [None]:
# Helper that runs a SQL query against the notebook's global DataFrames.
pysqldf = lambda q: sqldf(q, globals())
# Range join: attach to each combined row the drug-intake period that
# contains its timestamp (if any) and flag it in a 0/1 'drug_intake' column.
cond_join= '''
    select 
        combined_data.*,
        drug_intake.*,
        case
            when drug_intake.[started_at] is not null THEN 1
        else 0
        end as 'drug_intake'
    from combined_data
    left join drug_intake
    on combined_data.[timestamp] BETWEEN drug_intake.[started_at] AND drug_intake.[finished_at]
'''

# Change drug intake columns, then drop duplicates based on timestamp
# (a timestamp matching several intake periods keeps only its first row).
combined_data = pysqldf(cond_join).rename(columns={
    "started_at": "drug_intake_start",
    "finished_at": "drug_intake_end"
}).drop_duplicates(subset=['timestamp'])

# Restore the datetime index and remove the report identifier column that
# came along with the drug_intake table.
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])
combined_data = combined_data.set_index('timestamp')
combined_data = combined_data.drop(columns=['activity_target.activity_id'])

Compute time from last drug taken
* find the difference between the reference timestamp and the drug intake start
* convert difference to target frequency's unit e.g. minute, seconds

In [None]:
# Create the column first, then overwrite it with the elapsed time.
combined_data['time_from_last_drug_taken'] = ''

# Elapsed time since the matched drug intake started; rows without a match
# have no start and therefore no value.
drug_elapsed = pd.to_datetime(combined_data.index) - pd.to_datetime(combined_data['drug_intake_start'])

# Express the elapsed time as a number of `target_freq_unit2` units.
combined_data['time_from_last_drug_taken'] = drug_elapsed / np.timedelta64(1, target_freq_unit2)

## Not Needed

Fill records after a drug intake report by adding target frequency value
* add 1 minute, 15 seconds or 15 minutes to succeeding records after the drug intake end
* reference: https://stackoverflow.com/a/42748625/2303766

# A new group starts at every row that carries a value, so each group is
# one reported value followed by the empty rows after it.
gid = combined_data['time_from_last_drug_taken'].notnull().cumsum()
dg = combined_data.groupby(gid)

# Broadcast each group's reported value to all of its rows.
base = dg['time_from_last_drug_taken'].transform('last')

# Add one target-frequency step per row since the report; rows before the
# first report have no base value and are set to 0.
elapsed_steps = dg.cumcount() * target_freq_as_int
combined_data['time_from_last_drug_taken'] = (base + elapsed_steps).fillna(0)

Generate final symptoms i.e., initial symptom + symptom after drug intake

# Symptom columns reported with the wearing-off questionnaire ...
wo_symptoms = ['wo_pain', 'wo_tremors', 'wo_anxiety', 'wo_rigidity',
               'wo_slowdown', 'wo_slow_thoughts', 'wo_impairment_hands', 
               'wo_moodchange', 'wo_muscle_spasm']

# ... and the matching columns reported after drug intake, in the same order.
drug_intake_symptoms = ['drug_intake_pain', 'drug_intake_tremors',
                        'drug_intake_anxiety', 'drug_intake_rigidity',
                        'drug_intake_slowdown', 'drug_intake_slow_thoughts',
                        'drug_intake_impairment_hands', 
                        'drug_intake_moodchange', 'drug_intake_muscle_spasm']

def _is_missing(value):
    """Return True when ``value`` is None or NaN."""
    return value is None or math.isnan(value)

def generate_final_symptoms(row):
    """Merge the initial and post-drug-intake symptom values of one row.

    For each symptom the drug-intake value is used when present; otherwise
    the wearing-off value is used, or 0 when that is missing as well. The
    drug-intake value is now checked for ``None`` the same way the
    wearing-off value always was, instead of raising ``TypeError``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) holding every column
            named in ``wo_symptoms`` and ``drug_intake_symptoms``.

    Returns:
        pandas.Series with one value per symptom, in ``wo_symptoms`` order,
        followed by a final 1/0 flag that is 1 when the values sum to at
        least 1.
    """
    values = []
    for wo_symptom, drug_intake_symptom in zip(wo_symptoms, drug_intake_symptoms):
        if not _is_missing(row[drug_intake_symptom]):
            values.append(row[drug_intake_symptom])
        elif _is_missing(row[wo_symptom]):
            values.append(0)
        else:
            values.append(row[wo_symptom])
    # Overall flag: any symptom still present after taking both reports into account.
    values.append(1 if sum(values) >= 1 else 0)
    return pd.Series(values)

# One merged symptom vector per row; the last value is the overall flag.
symptoms = combined_data.apply(generate_final_symptoms, axis=1)
symptoms.columns = [
    'pain', 'tremors', 'anxiety', 'rigidity', 'slowdown', 'slow_thoughts',
    'impairment_hands', 'moodchange', 'muscle_spasm', 'wearing_off_post_meds',
]
# Attach the merged symptoms to the combined table (aligned on the index).
combined_data = combined_data.join(symptoms)

Compute for final wearing_off based on
* wearing_off: reported from WoQ-9 Part 1 (symptoms)
* wearing_off_post_meds: reported from WoQ-9 Part 2 (Medicine Intake & its effect on the symptoms)

def combine_wearing_offs(n):
    """Return 1 when ``n`` is greater than zero, otherwise 0."""
    return 1 if n > 0 else 0

# Final label: 1 when either the wearing-off report or the post-medication
# report flags the row, else 0.
wearing_off_total = combined_data.wearing_off + combined_data.wearing_off_post_meds
combined_data["wearing_off"] = wearing_off_total.apply(combine_wearing_offs).values

# Post-Processing

## Include hour & day of the week
Include hour and day of the week

In [None]:
# Calendar features read directly from the timestamp index.
combined_data['timestamp_hour'] = combined_data.index.hour
combined_data['timestamp_dayofweek'] = combined_data.index.dayofweek

## Encode hour-features as cyclical features
Include hour sine & hour cosine

In [None]:
# Timestamps of the combined table as datetimes
date_time = pd.to_datetime(combined_data.index, format='%d.%m.%Y %H:%M:%S')

# Seconds since the epoch for every row
timestamp_s = date_time.map(pd.Timestamp.timestamp)

# Seconds per day and per year
day = 24 * 60 * 60 
year = 365.2425 * day

# Position within the day as an angle, encoded with sine and cosine so that
# the end of one day sits next to the start of the following day
hour_angle = timestamp_s * (2 * np.pi / day)
combined_data['timestamp_hour_sin'] = np.sin(hour_angle)
combined_data['timestamp_hour_cos'] = np.cos(hour_angle)

# Re-append these columns so they end up last, with `wearing_off` at the
# very end of the dataframe
for _column in ('timestamp_dayofweek', 'wearing_off'):
    tmp = combined_data.pop(_column)
    combined_data[_column] = tmp

In [None]:
# Create the figure before plotting so the requested size applies to the
# histogram itself. Calling plt.figure() after plt.hist() opened a second,
# empty figure and left the histogram at the default size.
plt.figure(figsize=(24, 10))

# An "interface" to matplotlib.axes.Axes.hist() method
n, bins, patches = plt.hist(x=combined_data['timestamp_dayofweek'], bins='auto', color='#0504aa',
                            alpha=0.7, rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution Days of the Week')
# plt.text(23, 45, r'$\mu=15, b=3$')
maxfreq = n.max()
# Set a clean upper y-axis limit: the next multiple of 10 above the tallest bar.
plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
plt.show()

In [None]:
# Scale time-since-last-drug so that its maximum maps to one full turn
# (2*pi), then plot the sine and cosine of that angle.
drug_cycle_angle = combined_data['time_from_last_drug_taken'] * (
    2 * np.pi / combined_data['time_from_last_drug_taken'].max()
)
np.sin(drug_cycle_angle).plot(figsize=(24,10))
np.cos(drug_cycle_angle).plot(figsize=(24,10))

In [None]:
# Per column: does the combined table contain at least one missing value?
combined_data.isna().any()

In [None]:
# Save this participant's combined table; the reporting cell reads these
# files back by participant number.
combined_data.to_excel(f'./data/4-combined_data_{user}_{target_freq}.xlsx')

# Combine participants and report on the combined data

In [None]:
# Participants whose 15-minute combined files are stacked together.
participant_ids = [1,2,3,4,5,6,7,8,9,10,12,13]

# One frame per participant, tagged with the participant number.
dfs = [
  pd.read_excel(
    f'./data/4-combined_data_participant{i}_15min.xlsx',
    index_col="timestamp",
    engine='openpyxl',
  ).assign(participant=i)
  for i in participant_ids
]

# Combined data
stacked_participants = pd.concat(dfs, axis=0)
combined_participants = stacked_participants.drop(columns=['activity_type_id']).copy()

def custom_function_sum(values):
  """Return 1 when the sum of ``values`` is positive, otherwise 0."""
  return 1 if np.sum(values) > 0 else 0

# Report
# Calendar date of every row, used as the row key of the daily summaries.
# The previous `index=[lambda x: x.date]` never called `.date`; it only
# produced dates where pandas applied the function to the whole
# DatetimeIndex. Passing the dates explicitly does not depend on that.
report_dates = combined_participants.index.date

# Wearing-Off: number of flagged records per day and participant
wearing_off_summary = combined_participants.pivot_table(
  index=[report_dates],
  columns=['participant'],
  values='wearing_off',
  aggfunc='sum'
)
wearing_off_summary

# Wearing-Off Binary: 1 when the day has at least one flagged record
wearing_off_summary_binary = combined_participants.pivot_table(
  index=[report_dates],
  columns=['participant'],
  values='wearing_off',
  aggfunc=custom_function_sum
)
wearing_off_summary_binary

# Wearing-Off By Hour: one row per participant and date, one column per
# hour, plus the number of flagged hours in 'total'
new_combined_data = combined_participants.copy()
new_combined_data['date'] = new_combined_data.index.date
new_combined_data['hour'] = new_combined_data.index.hour
new_combined_data.set_index(['date', 'hour'], inplace=True)
wearing_off_summary_by_hour_binary = new_combined_data.pivot_table(
  index=["participant", "date"],
  columns=["hour"],
  values="wearing_off",
  aggfunc=custom_function_sum
)
wearing_off_summary_by_hour_binary['total'] = wearing_off_summary_by_hour_binary.sum(axis=1)
wearing_off_summary_by_hour_binary

# Garmin Summary: daily mean of each Garmin feature per participant
garmin_summary = combined_participants.pivot_table(
  index=[report_dates],
  columns=['participant'],
  values=['heart_rate', 'stress_score', 'steps', 'total'],
  aggfunc='mean'
)
garmin_summary

def custom_function_mean(values):
  """Return 1 when the mean of ``values`` is positive, otherwise 0."""
  return 1 if np.mean(values) > 0 else 0

# Garmin Summary Binary: 1 when the day's mean of a feature is positive.
# The rows are keyed by explicit calendar dates; the previous
# `index=[lambda x: x.date]` never called `.date` and only produced dates
# where pandas applied the function to the whole DatetimeIndex.
garmin_summary_binary = combined_participants.pivot_table(
  index=[combined_participants.index.date],
  columns=['participant'],
  values=['heart_rate', 'stress_score', 'steps', 'total'],
  aggfunc=custom_function_mean
)
garmin_summary_binary

# Per day and participant: 0.5 for a day with wearing-off, plus 1 for a day
# whose mean 'total' feature is positive.
wearing_off_x_garmin = ( 
  ( wearing_off_summary_binary.fillna(0) * 0.5 ) + 
  garmin_summary_binary.total.fillna(0) )

# Write to Excel file: the stacked data plus one sheet per summary
combined_data_filename = './data/4-combined_data.xlsx'
with pd.ExcelWriter(combined_data_filename, engine='openpyxl', mode='w') as writer:
    combined_participants.to_excel(writer, sheet_name="combined")
    wearing_off_summary_binary.to_excel(writer, sheet_name="Wearing-Off Summary Binary")
    wearing_off_summary.to_excel(writer, sheet_name="Wearing-Off Summary Count")
    wearing_off_summary_by_hour_binary.to_excel(writer, sheet_name="Wearing-Off Summary by Hour")
    garmin_summary.to_excel(writer, sheet_name="Garmin Summary Average")
    garmin_summary_binary.to_excel(writer, sheet_name="Garmin Summary Binary")
    wearing_off_x_garmin.to_excel(writer, sheet_name="Wearing-Off x Garmin")