In [None]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm


def make_index_date(value):
    """Return a ``pd.Timestamp`` built only from the year, month and day of ``value``.

    ``value`` needs ``year``, ``month`` and ``day`` attributes; nothing else
    from it (such as the time of day) is carried into the result.
    """
    date_parts = {'year': value.year, 'month': value.month, 'day': value.day}
    return pd.Timestamp(**date_parts)

# Register tqdm's pandas integration so that Series.progress_apply is available.
tqdm.pandas()

# Load the raw step records and parse the timestamp column.
df = pd.read_csv("steps(id).csv")
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Calendar day of each record (time of day stripped by make_index_date).
index_dates = df['timestamp'].progress_apply(make_index_date)
df['index_date'] = index_dates

# The 'id' column is not used by the rest of the notebook.
df = df.drop(columns=['id'])

In [None]:
# Attach the sunrise and sunset time of each record's calendar day.
# (The original note states that these times are in Korean local time.)
df_suntime = pd.read_excel("sun_2015_2023_with_wake_up.xlsx")
for col in ('양력(일)', '일출몰(출)', '일출몰(몰)'):
    df_suntime[col] = pd.to_datetime(df_suntime[col])

# Default (inner) join: records whose index_date is absent from the sun table are dropped.
df = df.merge(df_suntime, left_on='index_date', right_on='양력(일)')

# Remove the duplicated join key and the unused wake-up column, then give the
# sunrise / sunset columns English names.
df = df.drop(columns=['양력(일)', 'recommend_wake_up'])
df = df.rename(columns={'일출몰(출)': 'sun_time', '일출몰(몰)': 'night_time'})

In [None]:
# Per-patient end dates, read from CSV.
df_enddate = pd.read_csv(filepath_or_buffer="patient_end_date.csv")

In [None]:
import datetime

def fill_enddate(value):
    """Substitute the sentinel date 9999-12-31 for a missing value.

    Anything for which ``pd.isna`` is false is handed back unchanged.
    """
    return datetime.datetime(9999, 12, 31) if pd.isna(value) else value

# Attach each patient's end date; the left join keeps records without a match.
df = df.merge(df_enddate, how='left', left_on='fitbit_id', right_on='patient_id')
for col in ('timestamp', 'end_date'):
    df[col] = pd.to_datetime(df[col])

# Missing end dates are replaced by the 9999-12-31 sentinel from fill_enddate.
df['end_date'] = df['end_date'].apply(fill_enddate)

# Keep one identifier column: drop the merged-in 'patient_id' and rename
# 'fitbit_id' to take its place.
df = df.drop(columns=['patient_id']).rename(columns={'fitbit_id': 'patient_id'})

# Keep only records stamped on or before the patient's end date.
within_end_date = df['timestamp'] <= df['end_date']
df = df[within_end_date]

In [None]:
# Keep only records of patients whose id appears in all_patient.csv.
df_patient = pd.read_csv("all_patient.csv")
is_listed_patient = df['patient_id'].isin(df_patient['patient_id'])
df = df[is_listed_patient]

In [None]:
# Order the records by patient, then by timestamp within each patient.
df = df.sort_values(by=['patient_id', 'timestamp'])

In [None]:
# NumPy array of the frame's rows; the bucketing loop reads its columns by position.
arr = df.to_numpy()
# Nested accumulator filled by the bucketing loop: patient -> day -> bucket name -> list.
my_dict = dict()

In [None]:
from tqdm import tqdm

# Distribute every record of `arr` into per-patient, per-day buckets in `my_dict`.
# Columns are read from each row by position: [0] is the patient key, [1] the
# record timestamp, [2] the value that is accumulated, and [-3] / [-2] the
# sunrise / sunset times (presumably the 'sun_time' and 'night_time' columns —
# verify the column order if the input files change).
for i in tqdm(range(len(arr)), miniters=1, mininterval=1):
    row = arr[i]
    p, stamp, steps = row[0], row[1], row[2]

    current_date = datetime.datetime(stamp.year, stamp.month, stamp.day)
    hour = stamp.hour
    now = hour + stamp.minute / 60  # time of day in fractional hours

    # Records from 00:00 to 03:59 are counted towards the previous calendar day,
    # so that a night spanning midnight stays on a single day.
    if hour <= 3:
        current_date = current_date - datetime.timedelta(days=1)

    patient_days = my_dict.setdefault(p, {})
    if current_date not in patient_days:
        patient_days[current_date] = {
            'day': [],
            'bedtime': [],
            'morning': [],
            'afternoon': [],
            'evening': [],
            'daytime': [],
        }
    buckets = patient_days[current_date]

    # Every record counts towards the whole-day total.
    buckets['day'].append(steps)

    # Four six-hour windows that together cover the whole day:
    # bedtime 22-04, morning 04-10, afternoon 10-16, evening 16-22.
    if hour >= 22 or hour < 4:
        buckets['bedtime'].append(steps)
    elif 4 <= hour < 10:
        buckets['morning'].append(steps)
    elif 10 <= hour < 16:
        buckets['afternoon'].append(steps)
    elif 16 <= hour < 22:
        # Fix: the previous test `16 < now < 22` excluded records stamped exactly
        # 16:00, which then landed in no time-of-day bucket at all.
        buckets['evening'].append(steps)

    sun_start = row[-3].hour + row[-3].minute / 60
    sun_end = row[-2].hour + row[-2].minute / 60
    # Fix: compare the fractional time of day, not the integer hour, against the
    # fractional sunrise / sunset. With the integer hour, records between the top
    # of the sunrise hour and sunrise were excluded, and records after sunset
    # within the sunset hour were included.
    if sun_start <= now <= sun_end:
        buckets['daytime'].append(steps)
        

In [None]:
# For every patient, create one entry per calendar day between the first and the
# last day that has records (inclusive). All fields start as NaN, so days without
# any record stay NaN in the final table.
DAY_FIELDS = ('day', 'bedtime', 'morning', 'afternoon', 'evening', 'daytime',
              'day_categorical', 'mean_over')
ONE_DAY = datetime.timedelta(days=1)

my_dict_all_day = {}

for p, recorded_days in my_dict.items():
    first_day = min(recorded_days)
    last_day = max(recorded_days)

    calendar = {}
    cursor = first_day
    while cursor <= last_day:
        calendar[cursor] = dict.fromkeys(DAY_FIELDS, np.nan)
        cursor = cursor + ONE_DAY
    my_dict_all_day[p] = calendar

In [None]:
# Fill the per-day entries of `my_dict_all_day` for every day that has records:
# bucket sums, a three-level category of the daily total, and a flag telling
# whether the day reaches the patient's running mean.
SUMMED_BUCKETS = ('day', 'bedtime', 'morning', 'afternoon', 'evening', 'daytime')

for p, recorded_days in my_dict.items():
    sum_of_total = 0

    # count_day is the number of recorded days seen so far, current day included.
    for count_day, day in enumerate(sorted(recorded_days), start=1):
        totals = my_dict_all_day[p][day]
        for bucket in SUMMED_BUCKETS:
            totals[bucket] = np.sum(recorded_days[day][bucket])

        day_total = totals['day']
        sum_of_total = sum_of_total + day_total

        # 1: below 7000, 2: 7000 up to (not including) 8000, 3: 8000 or more.
        # If no comparison holds (e.g. a NaN total) the field keeps its NaN.
        if day_total >= 8000:
            totals['day_categorical'] = 3
        elif day_total >= 7000:
            totals['day_categorical'] = 2
        elif day_total < 7000:
            totals['day_categorical'] = 1

        # The running mean includes the current day; the first recorded day gets NaN.
        if count_day == 1:
            totals['mean_over'] = np.nan
        else:
            running_mean = sum_of_total / count_day
            totals['mean_over'] = 1 if day_total >= running_mean else 0

In [None]:
# Flatten the nested per-patient / per-day dict into one row per patient-day,
# with the days of each patient in ascending order.
# OUTPUT_FIELDS is ordered to line up with the column names given below.
OUTPUT_FIELDS = ('day', 'daytime', 'day_categorical', 'morning',
                 'afternoon', 'evening', 'bedtime', 'mean_over')

my_list = [
    [p, day.date()] + [my_dict_all_day[p][day][field] for field in OUTPUT_FIELDS]
    for p in my_dict_all_day
    for day in sorted(my_dict_all_day[p])
]

feature_columns = ["patient_id","timestamp","day_steps_total","sunrise_and_sunset_steps_total","day_steps_total_categorical",
                   "morning_steps", "afternoon_steps", "evening_steps", "bedtime_steps", "average_step_over"]
df = pd.DataFrame(my_list, columns=feature_columns)
df

In [None]:
# Write the feature table without the index; "utf-8-sig" prepends a byte-order mark.
df.to_csv(path_or_buf="feature_steps.csv", encoding="utf-8-sig", index=False)