In [1]:
import pandas as pd
import numpy as np
import os
import copy
import time

import tqdm

import Config
import Utils

# Root directories used by the loading cells, taken from the project Config module.
# The commented-out line is the alternative data root (extracted instead of resampled).
# DATA_FILE_PATH = Config.DATA_FILE_PATH_CONTINUOUS_EXTRACTED
DATA_FILE_PATH = Config.DATA_FILE_PATH_CONTINUOUS_RESAMPLED
# Directory that contains scale.csv (read by the label-loading cell).
LABEL_FILE_PATH = Config.LABEL_FILE_PATH

In [None]:
# Build `data` (rows of every participant file stacked together) and `labels`
# (binary labels derived from the questionnaire scores in scale.csv).
file_name = 'continuousrri'
label_name = 'PHQ9'
labels = []
df_label = pd.read_csv(f"{LABEL_FILE_PATH}/scale.csv", encoding="utf-8")
# Files are read back below as "<ex_id>.csv", so the part before the first '.'
# is the integer external id; sort the ids numerically.
ex_id_list = sorted(
    int(entry.split('.')[0]) for entry in os.listdir(f"{DATA_FILE_PATH}/{file_name}")
)
print('loading data:')
# Collect per-participant frames and concatenate once after the loop;
# concatenating inside the loop re-copies all accumulated rows each iteration.
frames = []
# `tqdm` is imported as a module (`import tqdm`), so the progress bar is `tqdm.tqdm`;
# calling the module itself raises TypeError.
for ex_id in tqdm.tqdm(ex_id_list):
    label_before = 0
    label_after = 0
    if label_name == 'PHQ9':
        # `.item()` raises ValueError unless exactly one row matches this id.
        subject_rows = df_label['external_id'] == ex_id
        # A score above 4 is labelled 1, otherwise 0.
        label_before = 1 if df_label.loc[subject_rows, label_name + '-1'].item() > 4 else 0
        label_after = 1 if df_label.loc[subject_rows, label_name + '-2'].item() > 4 else 0

    df_data = pd.read_csv(f"{DATA_FILE_PATH}/{file_name}/{ex_id}.csv", encoding="utf-8").drop('timestamp', axis=1)

    # 31 labels per participant: the first 15 use the "-1" score, the remaining 16 the "-2" score.
    # NOTE(review): this count is fixed and independent of len(df_data) — confirm that
    # `labels` and `data` are meant to have different lengths.
    labels += [label_before if slot < 15 else label_after for slot in range(31)]
    frames.append(df_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no files were found.
data = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()

In [2]:
# Load participant 1's continuousrri CSV, run the project's abnormal-value
# processing on the 'rri' column, and order the rows by timestamp.
# NOTE(review): judging by the helper names, values beyond 3 standard deviations are
# masked and then filled from the nearest value — confirm in Utils.
df_data = (
    Utils.abnormal_processing(
        pd.read_csv(f"{DATA_FILE_PATH}/continuousrri/1.csv", encoding="utf-8"),
        'rri',
        Utils.mask_over_3std,
        Utils.fill_nearest,
    )
    .sort_values(by=['timestamp'])
)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2008800 entries, 0 to 2008799
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   rri        float64
 1   sqi        int64  
 2   timestamp  int64  
 3   missing    int64  
dtypes: float64(1), int64(3)
memory usage: 76.6 MB


In [3]:
# Boundary used to split the recording into a "before" and an "after" part.
# NOTE(review): if timestamps are epoch milliseconds (TODO confirm), this value is
# 2021-11-30T16:00:00Z.
SPLIT_TIMESTAMP = 1638288000000

df_before = df_data[df_data['timestamp'] < SPLIT_TIMESTAMP]
df_after = df_data[df_data['timestamp'] >= SPLIT_TIMESTAMP]
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df_before.info()
df_after.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1036800 entries, 0 to 1036799
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   rri        1036800 non-null  float64
 1   sqi        1036800 non-null  int64  
 2   timestamp  1036800 non-null  int64  
 3   missing    1036800 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 39.6 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 972000 entries, 1036800 to 2008799
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   rri        972000 non-null  float64
 1   sqi        972000 non-null  int64  
 2   timestamp  972000 non-null  int64  
 3   missing    972000 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 37.1 MB
None


In [4]:
# Load participant 1's continuousheartrate CSV, run the project's abnormal-value
# processing on the heart-rate column, and order the rows by timestamp.
# Note: this rebinds `df_data`, which previously held the RRI frame.
df_data = (
    Utils.abnormal_processing(
        pd.read_csv(f"{DATA_FILE_PATH}/continuousheartrate/1.csv", encoding="utf-8"),
        'heartRate(beats/min)',
        Utils.mask_over_3std,
        Utils.fill_nearest,
    )
    .sort_values(by=['timestamp'])
)
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22891 entries, 56 to 7786
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   heartRate(beats/min)  22887 non-null  float64
 1   timestamp             22891 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 536.5 KB


In [4]:
import plotly.express as px
import plotly.graph_objects as go

In [5]:
def timestamp_trans(df: pd.DataFrame):
    """Return a copy of ``df`` with 'timestamp' replaced by a converted 'time' column.

    Each value of ``df['timestamp']`` is passed through ``Utils.timestamp2time`` and
    stored under 'time'; the 'timestamp' column is then dropped. The input frame is
    left unmodified.
    """
    converted = df['timestamp'].apply(Utils.timestamp2time)
    return df.assign(time=converted).drop(columns='timestamp')

In [9]:
# Plot a 1000-row window (rows 10000-10999 by position) of the "before" RRI data,
# and overlay the samples whose sqi is below 50 as a separate marker trace.
df_before_trans = timestamp_trans(df_before[10000:11000])
low_sqi_mask = df_before_trans['sqi'] < 50
fig = go.Figure(data=[
    go.Scatter(
        x=df_before_trans['time'],
        y=df_before_trans['rri'],
        mode='lines+markers',
        name='rri',
    ),
    go.Scatter(
        x=df_before_trans.loc[low_sqi_mask, 'time'],
        y=df_before_trans.loc[low_sqi_mask, 'rri'],
        mode='markers',
        name='rri(sqi<50)',
    ),
])
fig.show()

In [9]:
# Line chart of heart rate against converted time for the first 10000 rows of df_data.
fig = px.line(
    data_frame=timestamp_trans(df_data[:10000]),
    x='time',
    y='heartRate(beats/min)',
)
fig.show()

In [7]:
def generate_summary(df):
    """Count rows of ``df`` by their 'missing' flag.

    Args:
        df: DataFrame with a 'missing' column holding 0/1 flags.

    Returns:
        A two-row DataFrame with columns 'name' and 'count': the "missing" row
        counts rows where ``missing == 1`` and the "resampled" row counts rows
        where ``missing == 0``.
    """
    # Use the argument, not the notebook-global df_data, so the result does not
    # depend on whichever frame happens to be bound to that name.
    missing_count = int((df["missing"] == 1).sum())
    resampled_count = int((df["missing"] == 0).sum())
    df_summary = pd.DataFrame({
        "name": ["missing", "resampled"],
        "count": [missing_count, resampled_count],
    })
    return df_summary

# Display the missing/resampled row counts for the frame currently bound to df_data.
# NOTE(review): in top-to-bottom order df_data was last assigned from the
# continuousheartrate CSV, whose info() output lists no 'missing' column —
# confirm the RRI frame is the one intended here.
generate_summary(df_data)

Unnamed: 0,name,count
0,missing,619524
1,resampled,1389276


In [9]:
# Pie chart of the missing vs. resampled row counts.
fig = px.pie(
    data_frame=generate_summary(df_data),
    names='name',
    values='count',
)
fig.show()

In [10]:
# Mean of 'rri' over the rows where missing == 0.
print(df_data.loc[df_data['missing'] == 0, 'rri'].mean())

rri        920.502660
sqi         82.645795
missing      0.000000
dtype: float64
