In [1]:
import pandas as pd
import numpy as np
import os
import copy
import datetime

import plotly.express as px
import plotly.graph_objects as go

import tqdm

import Config
import Utils

# Path of the label file, taken from the project configuration.
LABEL_FILE_PATH = Config.LABEL_FILE_PATH

# Reference instant used by get_hour / get_day / get_week further down.
BEGIN_TIME = Utils.timestr2time(Config.BEGIN_TIME_STR)

# Fixed time steps used for date arithmetic throughout the notebook.
HOUR = datetime.timedelta(hours=1)
DAY = 24 * HOUR

In [2]:
def filter_day(df, date, timestamp_col_name):
    """Keep the rows of ``df`` whose timestamp falls within one day.

    The window is half-open: it starts at ``Utils.time2timestamp(date)``
    (inclusive) and ends at ``Utils.time2timestamp(date + DAY)`` (exclusive).

    Args:
        df: DataFrame with a timestamp column comparable to the values
            returned by ``Utils.time2timestamp``.
        date: Start of the day to keep; must support ``date + DAY``.
        timestamp_col_name: Name of the timestamp column in ``df``.

    Returns:
        The matching rows of ``df``, selected by boolean indexing.
    """
    window_start = Utils.time2timestamp(date)
    window_end = Utils.time2timestamp(date + DAY)
    stamps = df[timestamp_col_name]
    in_window = (stamps >= window_start) & (stamps < window_end)
    return df[in_window]

def filter_day_section(df, date_from, date_to, timestamp_col_name):
    """Keep the rows of ``df`` whose timestamp falls within a span of days.

    The window is half-open: it starts at ``Utils.time2timestamp(date_from)``
    (inclusive) and ends at ``Utils.time2timestamp(date_to + DAY)``
    (exclusive), so the day starting at ``date_to`` is included.

    Args:
        df: DataFrame with a timestamp column comparable to the values
            returned by ``Utils.time2timestamp``.
        date_from: Start of the first day to keep.
        date_to: Start of the last day to keep; must support ``date_to + DAY``.
        timestamp_col_name: Name of the timestamp column in ``df``.

    Returns:
        The matching rows of ``df``, selected by boolean indexing.
    """
    lower_bound = Utils.time2timestamp(date_from)
    upper_bound = Utils.time2timestamp(date_to + DAY)
    stamps = df[timestamp_col_name]
    too_early = stamps < lower_bound
    too_late = stamps >= upper_bound
    return df[~too_early & ~too_late]

### 抽取数据

continuousrri
continuousbloodoxygensaturation
continuousheartrate
dailyworkoutdetail

In [40]:
def generate_figure_extracted(upload_time, external_id, date, file_name):
    """Plot one day of an extracted CSV file as a plotly figure.

    Reads
    ``{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv``,
    casts it with ``Config.EXTRACT_INFO[file_name]['dtype']``, keeps the rows
    of the day starting at ``date`` and draws one line per value column.

    Args:
        upload_time: Path component naming the upload directory.
        external_id: Path component naming the CSV file (without ``.csv``).
        date: Start of the day to plot, passed to ``filter_day``.
        file_name: Key into ``Config.EXTRACT_INFO`` and name of the
            sub-directory that holds the CSV file.

    Returns:
        A ``plotly.graph_objects.Figure``. The figure has no traces when the
        selected day holds no rows.
    """
    extract_info = Config.EXTRACT_INFO[file_name]
    csv_path = f"{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv"
    raw = pd.read_csv(csv_path, encoding="utf-8").astype(extract_info['dtype'])
    df = filter_day(raw, date, 'record_time').sort_values(by=['record_time'])
    df['record_time'] = df['record_time'].apply(Utils.timestamp2time)

    # Bookkeeping columns are never drawn as traces of their own.
    not_plotted = ('record_time', 'external_id', 'activity_name', 'sqi')
    value_cols = [name for name in df.columns if name not in not_plotted]

    fig = go.Figure()
    for col_name in value_cols:
        label = f"{col_name}({extract_info['processor']['detail_unit'][col_name]})"
        fig.add_trace(go.Scattergl(
            x=df['record_time'],
            y=df[col_name], mode='lines+markers',
            name=label))
        # Only label the y axis when the frame is narrow enough (at most four
        # columns) that a single column name describes the plot.
        if len(df.columns) <= 4:
            fig.update_yaxes(title_text=label)

    # The sqi == 0 overlay is added once, after the value traces. Adding it
    # inside the loop would duplicate it for every extra value column.
    if file_name == 'continuousrri' and value_cols:
        sqi_is_zero = df['sqi'] == 0
        fig.add_trace(go.Scattergl(
            x=df['record_time'].loc[sqi_is_zero],
            y=df['rri'].loc[sqi_is_zero],
            mode='markers', name='rri(ms)(sqi=0)'))

    fig.update_xaxes(title_text='record_time')
    fig.update_layout(
        plot_bgcolor='white',
        yaxis=dict(showgrid=True, gridcolor='lightgray')
    )

    return fig

# Example: external_id 17, the day after BEGIN_TIME, blood-oxygen file.
fig = generate_figure_extracted(
    upload_time=1683703843555,
    external_id=17,
    date=BEGIN_TIME + DAY,
    file_name='continuousbloodoxygensaturation',
)
fig.show()

### 重采样数据

minute
continuousrri

In [None]:
def generate_figure_resampled(upload_time, external_id, date, file_name):
    """Plot one day of a resampled CSV file as a plotly figure.

    Reads
    ``{Config.APP_DATA_FILE_PATH}/{upload_time}/resampled/{file_name}/{external_id}.csv``
    and keeps the rows of the day starting at ``date``.

    * ``file_name == 'minute'``: one line per column other than the time
      columns.
    * ``file_name == 'continuousrri'``: the ``rri`` line plus a marker overlay
      of the samples whose ``sqi`` is below 50.
    * Any other ``file_name``: no traces are added.

    The initial x-axis view spans from the first row to the row at one ninth
    of the day's row count.

    Args:
        upload_time: Path component naming the upload directory.
        external_id: Path component naming the CSV file (without ``.csv``).
        date: Start of the day to plot, passed to ``filter_day``.
        file_name: Name of the resampled sub-directory; also selects the
            plotting mode described above.

    Returns:
        A ``plotly.graph_objects.Figure``. When the selected day holds no
        rows the figure is returned without an explicit x-axis range.
    """
    csv_path = f"{Config.APP_DATA_FILE_PATH}/{upload_time}/resampled/{file_name}/{external_id}.csv"
    df = filter_day(
        pd.read_csv(csv_path, encoding="utf-8"), date, 'timestamp'
    ).sort_values(by=['timestamp'])
    df['record_time'] = df['timestamp'].apply(Utils.timestamp2time)

    fig = go.Figure()
    if file_name == 'minute':
        for col_name in df.columns:
            if col_name in ('timestamp', 'record_time'):
                continue
            fig.add_trace(go.Scattergl(
                x=df['record_time'],
                y=df[col_name], mode='lines+markers',
                name=col_name))
    elif file_name == 'continuousrri':
        fig.add_trace(go.Scattergl(
            x=df['record_time'],
            y=df['rri'],
            mode='lines+markers', name="rri(ms)"))
        sqi_below_50 = df['sqi'] < 50
        fig.add_trace(go.Scattergl(
            x=df['record_time'].loc[sqi_below_50],
            y=df['rri'].loc[sqi_below_50],
            mode='markers', name='rri(ms)(sqi<50)'))
        fig.update_yaxes(title_text="rri(ms)")

    x_axis = dict(title_text='record_time')
    # A day without data has no first row, so positional indexing would raise
    # IndexError; the range is only set when there is something to show.
    if not df.empty:
        x_axis['range'] = [
            df.iloc[0]['record_time'],
            df.iloc[int(df.shape[0] / 9)]['record_time'],
        ]
    fig.update_xaxes(**x_axis)
    fig.update_layout(
        plot_bgcolor='white',
        yaxis=dict(showgrid=True, gridcolor='lightgray')
    )

    return fig

# Example: external_id 4, the day starting at BEGIN_TIME, minute-level table.
fig = generate_figure_resampled(
    upload_time=1683703843555,
    external_id=4,
    date=BEGIN_TIME,
    file_name='minute',
)
fig.show()

### 单人统计数据

continuousrri
continuousbloodoxygensaturation
continuousheartrate
dailyworkoutdetail

In [6]:
def quantile_25(x):
    """Return the 0.25 quantile of ``x`` (named so ``agg`` labels the column)."""
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile

def quantile_75(x):
    """Return the 0.75 quantile of ``x`` (named so ``agg`` labels the column)."""
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile

def get_hour(timestamp):
    """Return the number of whole hours between BEGIN_TIME and ``timestamp``.

    The count is zero-based and floored, so an instant before BEGIN_TIME
    yields a negative value.
    """
    elapsed = Utils.timestamp2time(timestamp) - BEGIN_TIME
    whole_hours = elapsed.total_seconds() // 3600
    return int(whole_hours)

def get_day(timestamp):
    """Return the one-based day index of ``timestamp`` counted from BEGIN_TIME.

    The first 24 hours after BEGIN_TIME are day 1.
    """
    elapsed = Utils.timestamp2time(timestamp) - BEGIN_TIME
    return 1 + elapsed.days

def get_week(timestamp):
    """Return the one-based week index of ``timestamp`` counted from BEGIN_TIME.

    The first seven days after BEGIN_TIME are week 1.
    """
    elapsed = Utils.timestamp2time(timestamp) - BEGIN_TIME
    full_weeks = elapsed.days // 7
    return full_weeks + 1

In [18]:
def generate_col_statistics(df, col_name, pid, groupby):
    """Aggregate one column per hour, day or week and tag the result with ``pid``.

    The raw timestamps are read from ``record_time`` when that column exists,
    otherwise from ``timestamp``. From them the function derives the columns
    ``record_time`` (via ``Utils.timestamp2time``), ``hour``, ``day`` and
    ``week``, then groups by ``groupby``.

    The input frame is left untouched; all derived columns live on an
    internal copy.

    Args:
        df: DataFrame with ``col_name`` and a ``record_time`` or
            ``timestamp`` column of raw timestamps.
        col_name: Column to aggregate.
        pid: Value stored in the ``pid`` column of every result row.
        groupby: Column (or list of columns) to group by, e.g. ``'hour'``,
            ``'day'`` or ``'week'``.

    Returns:
        DataFrame with the group key(s), then ``mean``, ``median``, ``min``,
        ``max``, ``quantile_25``, ``quantile_75``, ``std``, ``skew``,
        ``count`` and ``pid``. Groups with any NaN statistic are dropped.
    """
    raw_timestamps = df['record_time'] if 'record_time' in df.columns else df['timestamp']

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and no SettingWithCopyWarning is raised on slices.
    enriched = df.assign(
        timestamp=raw_timestamps,
        record_time=raw_timestamps.apply(Utils.timestamp2time),
        hour=raw_timestamps.apply(get_hour),
        day=raw_timestamps.apply(get_day),
        week=raw_timestamps.apply(get_week),
    )

    stats = enriched.groupby(groupby)[col_name].agg(
        ['mean', 'median', 'min', 'max', quantile_25, quantile_75, 'std', 'skew', 'count']
    ).reset_index().dropna()

    return stats.assign(pid=pid)

In [39]:
def generate_figure_statistics_single(upload_time, external_id, date_from, date_to, file_type, file_name, col_name, groupby):
    """Plot per-group statistics of one column for a single CSV file.

    Loads the extracted or resampled CSV addressed by ``upload_time``,
    ``file_type``, ``file_name`` and ``external_id``, keeps the rows between
    ``date_from`` and ``date_to``, aggregates ``col_name`` with
    ``generate_col_statistics`` and draws one line per statistic.

    For resampled files, rows whose ``col_name`` equals 0 are dropped before
    aggregating.

    Args:
        upload_time: Path component naming the upload directory.
        external_id: Path component naming the CSV file (without ``.csv``).
        date_from: Start of the first day to include.
        date_to: Start of the last day to include.
        file_type: ``'extracted'`` or ``'resampled'``.
        file_name: Sub-directory name; for extracted files also the key into
            ``Config.EXTRACT_INFO``.
        col_name: Column to aggregate.
        groupby: Grouping column produced by ``generate_col_statistics``
            (``'hour'``, ``'day'`` or ``'week'``); used as the x axis.

    Returns:
        A ``plotly.graph_objects.Figure``. The initial x-axis view covers at
        most the first eleven groups; no range is set when there are none.
    """
    df = pd.DataFrame()
    y_title = col_name
    if file_type == 'extracted':
        extract_info = Config.EXTRACT_INFO[file_name]
        df = filter_day_section(pd.read_csv(
            f"{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv",
            encoding="utf-8"
        ).astype(extract_info['dtype']), date_from, date_to, 'record_time').sort_values(by=['record_time'])
        y_title = f"{col_name}({extract_info['processor']['detail_unit'][col_name]})"
    elif file_type == 'resampled':
        df = filter_day_section(pd.read_csv(
            f"{Config.APP_DATA_FILE_PATH}/{upload_time}/resampled/{file_name}/{external_id}.csv",
            encoding="utf-8"
        ), date_from, date_to, 'timestamp').sort_values(by=['timestamp'])
        # Copy after filtering so later column assignments act on an
        # independent frame rather than on a slice.
        df = df[~(df[col_name] == 0)].copy()
    df_agg = generate_col_statistics(df, col_name, 0, groupby)

    fig = go.Figure()
    for agg_col in df_agg.columns:
        if agg_col in [groupby, 'pid']:
            continue
        fig.add_trace(go.Scatter(
            x=df_agg[groupby],
            y=df_agg[agg_col], mode='lines+markers',
            name=agg_col))

    x_axis = dict(
        title_text=groupby,
        tickmode='array',
        tickvals=df_agg[groupby],
    )
    if not df_agg.empty:
        # Position 10 only exists from eleven rows upward; clamp to the last
        # row so a ten-row aggregate does not raise IndexError.
        last_visible = min(10, df_agg.shape[0] - 1)
        x_axis['range'] = [
            df_agg.iloc[0][groupby],
            df_agg.iloc[last_visible][groupby],
        ]
    fig.update_xaxes(**x_axis)
    fig.update_yaxes(title_text=y_title)
    fig.update_layout(
        plot_bgcolor='white',
        yaxis=dict(showgrid=True, gridcolor='lightgray')
    )

    return fig

# Alternative input: the extracted file instead of the resampled minute table.
# fig = generate_figure_statistics_single(1683703843555, 1, BEGIN_TIME, BEGIN_TIME + 31 * DAY, 'extracted', 'continuousbloodoxygensaturation', 'blood_oxygen_saturation', 'day')
fig = generate_figure_statistics_single(
    upload_time=1683703843555,
    external_id=1,
    date_from=BEGIN_TIME,
    date_to=BEGIN_TIME + 31 * DAY,
    file_type='resampled',
    file_name='minute',
    col_name='blood_oxygen_saturation',
    groupby='day',
)
fig.show()

### 多人统计数据

In [20]:
def generate_col_statistics_single_fun(df, col_name, pid, groupby, fun_name):
    """Aggregate one column with a single statistic per hour, day or week.

    The raw timestamps are read from ``record_time`` when that column exists,
    otherwise from ``timestamp``. From them the function derives the columns
    ``record_time`` (via ``Utils.timestamp2time``), ``hour``, ``day`` and
    ``week``, then groups by ``groupby``.

    The input frame is left untouched; all derived columns live on an
    internal copy.

    Args:
        df: DataFrame with ``col_name`` and a ``record_time`` or
            ``timestamp`` column of raw timestamps.
        col_name: Column to aggregate.
        pid: Value stored in the ``pid`` column of every result row.
        groupby: Column (or list of columns) to group by.
        fun_name: ``'quantile_25'`` or ``'quantile_75'`` for the notebook's
            quantile helpers; any other value is handed to pandas ``agg``
            unchanged (e.g. ``'mean'``).

    Returns:
        DataFrame with the group key(s), one statistic column and ``pid``.
        Groups whose statistic is NaN are dropped.
    """
    raw_timestamps = df['record_time'] if 'record_time' in df.columns else df['timestamp']

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns and no SettingWithCopyWarning is raised on slices.
    enriched = df.assign(
        timestamp=raw_timestamps,
        record_time=raw_timestamps.apply(Utils.timestamp2time),
        hour=raw_timestamps.apply(get_hour),
        day=raw_timestamps.apply(get_day),
        week=raw_timestamps.apply(get_week),
    )

    # Resolve the two custom names to their functions without rebinding the
    # fun_name argument to a different type.
    custom_aggregators = {'quantile_25': quantile_25, 'quantile_75': quantile_75}
    aggregator = fun_name
    if isinstance(fun_name, str) and fun_name in custom_aggregators:
        aggregator = custom_aggregators[fun_name]

    stats = enriched.groupby(groupby)[col_name].agg(
        [aggregator]
    ).reset_index().dropna()

    return stats.assign(pid=pid)

In [21]:
def get_pid(upload_time, external_id):
    """Return the participant id for a file; currently just ``external_id``.

    ``upload_time`` is accepted but not used.
    """
    pid = external_id
    return pid

In [42]:
def generate_figure_statistics_multiple(upload_time, external_id_list, date_from, date_to, file_type, file_name, col_name, mode):
    """Compare one column across several CSV files in a single figure.

    For every id in ``external_id_list`` the matching extracted or resampled
    CSV is loaded, restricted to the days between ``date_from`` and
    ``date_to`` and tagged with a ``pid`` column from ``get_pid``. For
    resampled files, rows whose ``col_name`` equals 0 are dropped.

    Args:
        upload_time: Path component naming the upload directory.
        external_id_list: Iterable of ids; each names one CSV file.
        date_from: Start of the first day to include.
        date_to: Start of the last day to include.
        file_type: ``'extracted'`` or ``'resampled'``; other values load
            nothing.
        file_name: Sub-directory name; for extracted files also the key into
            ``Config.EXTRACT_INFO``.
        col_name: Column to compare.
        mode: ``'violin'``, ``'box'``, ``'hist'`` (row count per pid) or
            ``'heat'`` (row density per pid and day). Any other value yields
            an empty figure.

    Returns:
        A plotly figure.
    """
    frames = []
    for external_id in external_id_list:
        if file_type == 'extracted':
            df_ex = filter_day_section(pd.read_csv(
                f"{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv",
                encoding="utf-8"
            ).astype(Config.EXTRACT_INFO[file_name]['dtype']), date_from, date_to, 'record_time').sort_values(by=['record_time'])
        elif file_type == 'resampled':
            df_ex = filter_day_section(pd.read_csv(
                f"{Config.APP_DATA_FILE_PATH}/{upload_time}/resampled/{file_name}/{external_id}.csv",
                encoding="utf-8"
            ), date_from, date_to, 'timestamp').sort_values(by=['timestamp'])
            df_ex = df_ex[~(df_ex[col_name] == 0)]
        else:
            continue
        # assign() builds a new frame, so the pid tag is never written onto a
        # filtered slice.
        frames.append(df_ex.assign(pid=get_pid(upload_time, external_id)))

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

    fig = go.Figure()
    if mode == 'violin':
        fig = px.violin(df, x='pid', y=col_name, color='pid')
    elif mode == 'box':
        fig = px.box(df, x='pid', y=col_name, color='pid')
    elif mode == 'hist':
        fig = px.histogram(df, x='pid').update_layout(bargap=0.2)
    elif mode == 'heat':
        # DataFrame.info() prints its summary itself and returns None, so it
        # is called directly rather than wrapped in print().
        df.info()
        # Extracted files keep their raw timestamps in 'record_time', while
        # resampled files use 'timestamp'.
        time_col = 'record_time' if 'record_time' in df.columns else 'timestamp'
        df = df.assign(day=df[time_col].apply(get_day))
        fig = px.density_heatmap(df, x='pid', y='day')
    fig.update_layout(
        plot_bgcolor='white',
        yaxis=dict(showgrid=True, gridcolor='lightgray')
    )
    return fig

# Alternative input: box plot from the extracted file instead of the heat map.
# fig = generate_figure_statistics_multiple(1683703843555, [i for i in range(1, 5)], BEGIN_TIME, BEGIN_TIME + 31 * DAY, 'extracted', 'continuousbloodoxygensaturation', 'blood_oxygen_saturation', 'box')
fig = generate_figure_statistics_multiple(
    upload_time=1683703843555,
    external_id_list=list(range(1, 5)),
    date_from=BEGIN_TIME,
    date_to=BEGIN_TIME + 31 * DAY,
    file_type='resampled',
    file_name='minute',
    col_name='blood_oxygen_saturation',
    mode='heat',
)
fig.show()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4488 entries, 222 to 30788
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                4488 non-null   int64  
 1   blood_oxygen_saturation  4488 non-null   float64
 2   heart_rate               4488 non-null   int64  
 3   calories_burned          4488 non-null   float64
 4   climb_height             4488 non-null   float64
 5   distance                 4488 non-null   float64
 6   heart_rate_workout       4488 non-null   int64  
 7   step                     4488 non-null   int64  
 8   pid                      4488 non-null   int64  
dtypes: float64(4), int64(5)
memory usage: 350.6 KB
None


### 量表数据

In [None]:
def generate_figure_scale(upload_time, file_name, col_name, external_id=None):
    """Plot one column of an extracted CSV file over its whole time span.

    Reads
    ``{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv``,
    casts it with ``Config.EXTRACT_INFO[file_name]['dtype']``, sorts by
    ``record_time`` and draws ``col_name`` against time. No day filter is
    applied.

    NOTE(review): plotting only ``col_name`` (rather than every value column)
    is inferred from the signature and the example call below; confirm.

    Args:
        upload_time: Path component naming the upload directory.
        file_name: Key into ``Config.EXTRACT_INFO`` and name of the
            sub-directory that holds the CSV file.
        col_name: Column to plot; must have an entry in the file's
            ``detail_unit`` mapping.
        external_id: Path component naming the CSV file (without ``.csv``).
            Keyword with a ``None`` default only to keep the three-argument
            signature valid; a value is required.

    Returns:
        A ``plotly.graph_objects.Figure``.

    Raises:
        ValueError: If ``external_id`` is not provided.
    """
    if external_id is None:
        raise ValueError("generate_figure_scale requires external_id to locate the CSV file")

    extract_info = Config.EXTRACT_INFO[file_name]
    df = pd.read_csv(
        f"{Config.APP_DATA_FILE_PATH}/{upload_time}/extracted/{file_name}/{external_id}.csv",
        encoding="utf-8"
    ).astype(extract_info['dtype']).sort_values(by=['record_time'])
    df['record_time'] = df['record_time'].apply(Utils.timestamp2time)

    label = f"{col_name}({extract_info['processor']['detail_unit'][col_name]})"
    fig = go.Figure()
    fig.add_trace(go.Scattergl(
        x=df['record_time'],
        y=df[col_name], mode='lines+markers',
        name=label))
    if file_name == 'continuousrri':
        # Overlay the rri samples whose sqi is 0 as separate markers.
        sqi_is_zero = df['sqi'] == 0
        fig.add_trace(go.Scattergl(
            x=df['record_time'].loc[sqi_is_zero],
            y=df['rri'].loc[sqi_is_zero],
            mode='markers', name='rri(ms)(sqi=0)'))
    fig.update_yaxes(title_text=label)
    fig.update_xaxes(title_text='record_time')

    return fig

# NOTE(review): external_id=17 mirrors the extracted-data example earlier in
# this notebook; replace it with the participant whose data should be shown.
fig = generate_figure_scale(
    1683703843555,
    'continuousbloodoxygensaturation',
    'blood_oxygen_saturation',
    external_id=17,
)
fig.show()