### Data preprocessing

### Import Modules

In [None]:
import os
import glob
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import math
from scipy.signal import butter, filtfilt
import seaborn as sns

## Calculate Oxy / Deoxy signals

In [None]:
def Oxy_Cal(df):
    '''
    return: dataframe with the columns CH1_Oxy, CH1_Deoxy, ..., CH4_Oxy,
            CH4_Deoxy; the first 10 rows (the baseline rows) are dropped

    inputs:
        df: df of LED information. Columns are read by position: column
            2*k is used as the "730" signal and column 2*k+1 as the "850"
            signal of channel k+1 (k = 0..3), so 8 columns are consumed.
    '''
    def _dpf(wavelength):
        # Same expression as used for both 730 and 850; the constant 15
        # is hard-coded in the formula.
        return 223.3+0.05624*(15**0.8493)-5.723*10**(-7)*(wavelength**3)+0.001245*(wavelength**2)+(-0.9025)*wavelength

    hhb_730, hhb_850 = 1.102, 0.691
    hbo_730, hbo_850 = 0.39, 1.058
    dpf_730 = _dpf(730)
    dpf_850 = _dpf(850)
    # Identical for every channel and for both output columns.
    denominator = (hhb_730*hbo_850 - hhb_850*hbo_730)*0.03

    result = pd.DataFrame()
    for channel in range(1, 5):
        signal_730 = df.iloc[:, 2*(channel-1)]
        signal_850 = df.iloc[:, 2*(channel-1)+1]
        # Each signal is expressed relative to the mean of its first 10 rows.
        od_730 = np.log10(signal_730 / signal_730.iloc[:10].mean())
        od_850 = np.log10(signal_850 / signal_850.iloc[:10].mean())
        result['CH'+str(channel)+'_Oxy'] = 10*(hhb_730*(od_850/dpf_850) - hhb_850*(od_730/dpf_730)) / denominator
        result['CH'+str(channel)+'_Deoxy'] = 10*(hbo_850*(od_730/dpf_730) - hbo_730*(od_850/dpf_850)) / denominator

    # The baseline rows themselves are not part of the output.
    return result.iloc[10:]

## Fill NaN

In [None]:
def fill_nan(df):
    '''
    return: new df in which every +inf / -inf / NaN entry is replaced by the
            last valid value above it in the same column (forward fill)

    inputs:
        df: uncleared dataframe; it is left unmodified

    note:
        Invalid values at the very top of a column have no earlier value to
        copy from, so they are still NaN in the result.
    '''
    # replace() returns a new frame, so the caller's data is not overwritten
    # (the previous in-place masking also modified the input).
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method='ffill').
    return without_inf.ffill()

## Filter

In [None]:
def filtering(df, fs=17, cutoff=0.1, order=4):
    '''
    return: filterd dataframe with the same index and column names as df;
            every column is low-pass filtered with a Butterworth filter
            applied forward and backward (scipy filtfilt)

    inputs:
        df: dataframe without filter; all columns must be numeric
        fs: sampling rate of the rows (default 17, the value that was
            previously hard-coded)
        cutoff: low-pass cutoff, in the same unit as fs (default 0.1)
        order: order of the Butterworth design (default 4)

    note:
        filtfilt raises ValueError when a column is too short for its
        default edge padding.
    '''
    nyq = 0.5 * fs
    # The coefficients do not depend on the column, so design them once.
    b, a = butter(order, cutoff/nyq, 'low')
    filterd = {col: filtfilt(b, a, df[col]) for col in df}
    # Passing the index here also works for a frame that has no columns.
    return pd.DataFrame(filterd, index=df.index)

## Normalize

In [None]:
def normalize(df):
    '''
    return: dataframe in which every column is min-max rescaled, i.e.
            (value - column minimum) / (column maximum - column minimum)

    inputs:
        df: dataframe

    note:
        A column whose maximum equals its minimum divides 0 by 0 and
        therefore comes back as NaN.
    '''
    column_min = df.min()
    column_span = df.max() - column_min
    return df.sub(column_min).div(column_span)

In [None]:
# Batch step: read every raw CSV, clean -> filter -> normalize the eight
# Oxy/Deoxy columns and write the result to the matching processed-data path.

# Collect every directory that os.walk reports under the raw-data root.
group_dir = []
for dirPath, dirNames, fileNames in os.walk(r'path\1_ori_data'):
    group_dir.append(dirPath)
# os.walk yields the root itself first; drop it so only sub-directories
# remain. Slicing (unlike pop(0)) does not raise when the walk yielded
# nothing, e.g. because the root directory does not exist.
group_dir = group_dir[1:]

# Not used in this cell; kept because other cells may rely on the name.
all_feature = pd.DataFrame()

oxy_columns = ['CH1_Oxy', 'CH1_Deoxy', 'CH2_Oxy', 'CH2_Deoxy', 'CH3_Oxy', 'CH3_Deoxy', 'CH4_Oxy', 'CH4_Deoxy']

for group in group_dir:
    # NOTE(review): the pattern is rebuilt relative to '..' from the 4th-last
    # and the last path component, so it only works for Windows-style paths
    # with at least four components -- confirm against the real layout.
    split_list = group.split('\\')
    data_link = os.path.join('..\\', split_list[-4], split_list[-1], "*.csv")
    file_list = glob.glob(data_link)
    for file in file_list:
        df = pd.read_csv(file)
        df_time = df.set_index('Time_Host')
        # Hand fill_nan its own copy: it assigns into the frame it receives,
        # which would otherwise write into a slice of df_time.
        df_clean = fill_nan(df_time[oxy_columns].copy())
        df_filter = filtering(df_clean)
        df_norm = normalize(df_filter)
        # Shift the index so that every file starts at 0.
        df_norm.index = df_norm.index - df_norm.index[0]
        # NOTE(review): if the path does not contain '1_ori_data' the replace
        # is a no-op and the input file would be overwritten -- confirm.
        save_path = file.replace('1_ori_data', '2_processed_data')
        # to_csv does not create missing folders, so make sure the target
        # directory exists first.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        df_norm.to_csv(save_path, index=True)