# Basic

In [None]:
import os
import pandas as pd
import json
import csv
import numpy as np
from sklearn import metrics # for evaluations
from sklearn.cluster import KMeans 
import statistics
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt
import matplotlib.cm as cm

with open("env.json") as f: # inpuy your env file path
    envs = json.load(f)

# 폰트 경로 지정
font_path = envs['FONT_PATH']
prop = FontProperties(fname=font_path)

# Function

In [50]:
# Function 
def attatch_prefix_condition(df, prefix,exclude_col = ['pnum','start_second','end','date','matching']):
    df.columns = [f"{prefix}_{col}" if col not in exclude_col else col for col in df.columns]
    return df

In [None]:
def signal_sum(sampled_signal):
    return np.sum(sampled_signal)

def signal_mean(sampled_signal):

    return np.mean(sampled_signal)

def signal_min(sampled_signal):
    return np.min(sampled_signal)

def signal_max(sampled_signal):
    return np.max(sampled_signal)

def signal_q1(sampled_signal):
    return np.percentile(sampled_signal, 25)

def signal_q2(sampled_signal):
    return np.percentile(sampled_signal, 50)

def signal_q3(sampled_signal):
    return np.percentile(sampled_signal, 75)

def signal_20th(sampled_signal):
    return np.percentile(sampled_signal, 20)

def signal_80th(sampled_signal):
    return np.percentile(sampled_signal, 80)

def signal_standard_deviation(sampled_signal): # 10초에 한번 씩 수집되기 때문에
    try:
        return statistics.stdev(sampled_signal)
    except:
        return np.nan

In [None]:
def get_features(signal):
    features = []
    # mean
    features.append(signal_mean(signal[:, 0].reshape(-1))) # x
    features.append(signal_mean(signal[:, 1].reshape(-1))) # y
    features.append(signal_mean(signal[:, 2].reshape(-1))) # z
    features.append(signal_mean(signal[:, 3].reshape(-1))) # magnitude

    # std
    features.append(signal_standard_deviation(signal[:, 0].reshape(-1)))
    features.append(signal_standard_deviation(signal[:, 1].reshape(-1)))
    features.append(signal_standard_deviation(signal[:, 2].reshape(-1)))
    features.append(signal_standard_deviation(signal[:, 3].reshape(-1))) # magnitude

    # min
    features.append(signal_min(signal[:, 0].reshape(-1)))
    features.append(signal_min(signal[:, 1].reshape(-1)))
    features.append(signal_min(signal[:, 2].reshape(-1)))
    features.append(signal_min(signal[:, 3].reshape(-1))) # magnitude

    # max
    features.append(signal_max(signal[:, 0].reshape(-1)))
    features.append(signal_max(signal[:, 1].reshape(-1)))
    features.append(signal_max(signal[:, 2].reshape(-1)))
    features.append(signal_max(signal[:, 3].reshape(-1))) # magnitude

    # ql
    features.append(signal_q1(signal[:, 0].reshape(-1)))
    features.append(signal_q1(signal[:, 1].reshape(-1)))
    features.append(signal_q1(signal[:, 2].reshape(-1)))
    features.append(signal_q1(signal[:, 3].reshape(-1))) # magnitude

    # q2
    features.append(signal_q2(signal[:, 0].reshape(-1)))
    features.append(signal_q2(signal[:, 1].reshape(-1)))
    features.append(signal_q2(signal[:, 2].reshape(-1)))
    features.append(signal_q2(signal[:, 3].reshape(-1))) # magnitude

    # q3
    features.append(signal_q3(signal[:, 0].reshape(-1)))
    features.append(signal_q3(signal[:, 1].reshape(-1)))
    features.append(signal_q3(signal[:, 2].reshape(-1)))
    features.append(signal_q3(signal[:, 3].reshape(-1))) # magnitude

    return features

In [None]:
def get_hr_features(signal):
    features = []
    # mean
    features.append(signal_mean(signal)) 

    if ~np.isnan(signal_standard_deviation(signal)):
        # std
        features.append(signal_standard_deviation(signal))

        # min
        features.append(signal_min(signal))

        # max
        features.append(signal_max(signal))

        # 20 percentile
        features.append(signal_20th(signal))

        # q2
        features.append(signal_q2(signal))

        # 80 percentile
        features.append(signal_80th(signal))

    else: # 만약 데이터 개수가 1개 미만이면 np.nan으로 추출
        features.extend([np.nan]*6)

    return features


def get_step_features(signal):
    features = []
    # sum 
    features.append(signal_sum(signal)) 
    
    if ~np.isnan(signal_standard_deviation(signal)):
        # mean
        features.append(signal_mean(signal)) 

        # std
        features.append(signal_standard_deviation(signal))

        # min
        features.append(signal_min(signal))

        # max
        features.append(signal_max(signal))

        # q1 percentile
        features.append(signal_q1(signal))

        # q2
        features.append(signal_q2(signal))

        # q3 percentile
        features.append(signal_q3(signal))

    else: # 만약 데이터 개수가 1개 미만이면 np.nan으로 추출
        features.extend([np.nan]*7)

    return features

In [None]:
from datetime import datetime
import pytz
seoul_tz = pytz.timezone('Asia/Seoul')
def find_timestamp(date):
    dt = seoul_tz.localize(datetime.fromtimestamp(date.timestamp()))
    return dt.timestamp()*1000

In [None]:
def kmeans_clutering(X, range_n_clusters):

    # For each number of clusters, perform Silhouette analysis and visualize the results.
    for n_clusters in range_n_clusters:
        
        # Perform k-means.
        kmeans = KMeans(n_clusters=n_clusters, 
                        init = 'k-means++',
                        tol=1e-4,
                        verbose=0,
                        random_state=10)
        y_pred = kmeans.fit_predict(X)
        
        
        # Compute the Silhouette Coefficient for each sample.
        s = metrics.silhouette_samples(X, y_pred)
        
        # Compute the mean Silhouette Coefficient of all data points.
        s_mean = metrics.silhouette_score(X, y_pred)
        
        # For plot configuration -----------------------------------------------------------------------------------
        fig, ax1 = plt.subplots(1, 1)
        fig.set_size_inches(18, 7)
        
        # Configure plot.
        plt.suptitle('Silhouette analysis for K-Means clustering with n_clusters: {}'.format(n_clusters),
                    fontsize=14, fontweight='bold')
        
        # Configure 1st subplot.
        ax1.set_title('Silhouette Coefficient for each sample (Mean Silhouette score: {})'.format(s_mean))
        ax1.set_xlabel("The silhouette coefficient values", fontsize=20,fontproperties=prop)
        ax1.set_ylabel("Cluster", fontsize=20,fontproperties=prop)
        ax1.set_xlim([-1, 1])
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
        # ax1.tick_params(axis='x', labelsize='large')
        # ax1.tick_params(axis='y', labelsize='large')
        
        
        # For 1st subplot ------------------------------------------------------------------------------------------
    
        # Plot Silhouette Coefficient for each sample
        cmap = cm.get_cmap("viridis")
        y_lower = 10
        for i in range(n_clusters):
            ith_s = s[y_pred == i]
            ith_s.sort()
            size_cluster_i = ith_s.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cmap(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_s,
                            facecolor=color, edgecolor=color, alpha=0.7)
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10
            
        # Plot the mean Silhouette Coefficient using red vertical dash line.
        ax1.axvline(x=s_mean, color="red", linestyle="--",label="Mean Silhouette score: {}".format(s_mean.round(3)))
        prop_2 = FontProperties(fname=font_path,size=20)
        ax1.legend(loc='lower left', prop=prop_2)

# CALL LOG

In [None]:
CALL_LOG_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","CALL_LOG")
CALL_LOG_DEST_PATH = os.path.join(envs['DATA_PATH'],"3_feature_extraction")

In [None]:
def call_log_feature_enginering(df):
    df = df.copy()
    df['question_len'] = df['question'].apply(lambda x: len(str(x)) if pd.notna(x) else np.nan)
    df['answer_len'] = df['answer'].apply(lambda x: len(str(x)) if pd.notna(x) else np.nan)
    df.loc[(df['answer_len'].isnull()) & (df['question_len'] >= 0), 'answer_len'] = 0

    # delete
    # df['mute_ratio'] = df.apply(lambda row: row['mute'] / row['total_duration'] if row['total_duration'] != 0 else 0, axis=1)
    # df['mute'] = df['mute'].apply(lambda x: 0 if x==0 else ( 1 if x >0 else x) )
    # df = df.drop(['question','answer'], axis =1)
    return df

call_log = pd.read_csv(os.path.join(CALL_LOG_SOURCE_PATH ,"call_log.csv"),parse_dates=['start_second','end','date'])
call_log = call_log_feature_enginering(call_log)
call_log = attatch_prefix_condition(call_log,'call_log')

# work activity features
call_log = call_log[['pnum','start_second','date','call_log_total_duration','call_log_complain','call_log_mute','call_log_question_len', 'call_log_answer_len']]
call_log.columns = ['pnum','start_second','date','(S)_call_log_duration','(S)_call_log_complaint','(NR)_call_log_mute','(S)_call_log_question_len','(R)_call_log_answer_len']

# temporal features
call_log['(S)_temporal_info_weekday'] = call_log['date'].dt.dayofweek + 1 # 월요일은 1, 일요일은 7
call_log['hour'] = call_log['start_second'].dt.hour
call_log['(S)_temporal_info_hour_category'] = pd.cut(call_log['hour'], bins=[0,8,10, 12, 14, 16, 18,21, np.inf ], labels=[0,1, 2, 3, 4, 5, 6,7])

call_log = call_log[['pnum','start_second','date','(S)_call_log_duration','(S)_call_log_complaint','(NR)_call_log_mute','(S)_temporal_info_weekday','(S)_temporal_info_hour_category']]

call_log.to_csv(os.path.join(CALL_LOG_DEST_PATH ,"call_log.csv"),index=False)

call_log['timestamp_start'] = call_log['start_second'].apply(lambda x: find_timestamp(x))
call_log['timestamp_end'] = call_log['end'].apply(lambda x: find_timestamp(x))

# ACC

In [None]:
ACC_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","ACC")
ACC_DEST_PATH = os.path.join(envs['DATA_PATH'],"3_feature_extraction")

In [None]:
acc_columns=['pnum', 'start_second', 'end','data_count',
                                                                 
        # mean 
        '(NR)_acc_x_mean', '(NR)_acc_y_mean','(NR)_acc_z_mean','(NR)_acc_magnitude_mean',
        # std
        '(NR)_acc_x_std','(NR)_acc_y_std','(NR)_acc_z_std','(NR)_acc_magnitude_std',

        # min
        '(NR)_acc_x_min', '(NR)_acc_y_min', '(NR)_acc_z_min', '(NR)_acc_magnitude_min',

        # max
        '(NR)_acc_x_max', '(NR)_acc_y_max', '(NR)_acc_z_max', '(NR)_acc_magnitude_max',

        # q1
        '(NR)_acc_x_q1', '(NR)_acc_y_q1', '(NR)_acc_z_q1', '(NR)_acc_magnitude_q1',

        # q2
        '(NR)_acc_x_q2', '(NR)_acc_y_q2', '(NR)_acc_z_q2', '(NR)_acc_magnitude_q2',

        # q3
        '(NR)_acc_x_q3', '(NR)_acc_y_q3', '(NR)_acc_z_q3', '(NR)_acc_magnitude_q3'
    ]


In [None]:
acc_all = pd.read_csv(os.path.join(ACC_SOURCE_PATH,"acc_win.csv"),index_col=0)
pnum_list = acc_all['pnum'].unique()
acc_final = pd.DataFrame(columns = acc_columns)

for pnum in pnum_list:

    # 뽑아낼 pnum 
    pnum_call_log = call_log.query('pnum==@pnum')
    pnum_acc = acc_all.query('pnum==@pnum')

    # call log에 맞게 acc 뽑기 
    for idx, row in pnum_call_log.iterrows():
        start_second = row['start_second']
        end_second = row['end']

        start = row['timestamp_start']
        end = row['timestamp_end']
        break_ = row['break']
        
        filtered_call_acc = pnum_acc.query('(Timestamp >= @start) and (Timestamp <=@end)')

        if not filtered_call_acc.empty:
            call_features = get_features(filtered_call_acc[[' accX',' accY',' accZ','magnitude']].values)
            count_call_acc = len(filtered_call_acc[[' accX',' accY',' accZ','magnitude']].notnull())
        else:
            call_features = [np.nan]*28
            count_call_acc = 0
        
        add_row = [pnum,start_second,end_second,count_call_acc]
        add_row.extend(call_features) 
        acc_final.loc[len(acc_final)] = add_row

acc_final = acc_final[['pnum', 'start_second',
                                                                 
        # mean 
        '(NR)_acc_x_mean', '(NR)_acc_y_mean','(NR)_acc_z_mean','(NR)_acc_magnitude_mean',
        # std
        '(NR)_acc_x_std','(NR)_acc_y_std','(NR)_acc_z_std','(NR)_acc_magnitude_std',

        # min
        '(NR)_acc_x_min', '(NR)_acc_y_min', '(NR)_acc_z_min', '(NR)_acc_magnitude_min',

        # max
        '(NR)_acc_x_max', '(NR)_acc_y_max', '(NR)_acc_z_max', '(NR)_acc_magnitude_max'
]]

# Save Data
acc_final.to_csv(os.path.join(ACC_DEST_PATH,"acc.csv"),index=False)

# ENV

In [None]:
ENV_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","ENV")
ENV_DEST_PATH = os.path.join(envs['DATA_PATH'],"3_feature_extraction")

In [None]:
env_columns=['pnum', 'start_second', 'end','data_count',
                                                                
        # mean 
        '(S)_temperature_mean', '(S)_humidity_mean','(S)_co2_mean','(S)_tvoc_mean',
        # std
        '(S)_temperature_std','(S)_humidity_std','(S)_co2_std','(S)_tvoc_std',

        # min
        '(S)_temperature_min', '(S)_humidity_min', '(S)_co2_min', '(S)_tvoc_min',

        # max
        '(S)_temperature_max', '(S)_humidity_max', '(S)_co2_max', '(S)_tvoc_max',

        # q1
        '(S)_temperature_q1', '(S)_humidity_q1', '(S)_co2_q1', '(S)_tvoc_q1',

        # q2
        '(S)_temperature_q2', '(S)_humidity_q2', '(S)_co2_q2', '(S)_tvoc_q2',

        # q3
        '(S)_temperature_q3', '(S)_humidity_q3', '(S)_co2_q3', '(S)_tvoc_q3',
    ]


In [None]:
env_all = pd.read_csv(os.path.join(ENV_SOURCE_PATH,"env.csv"))
pnum_list = env_all['pnum'].unique()
env_final = pd.DataFrame(columns = env_columns)

for pnum in pnum_list:
    # 뽑아낼 pnum
    pnum_call_log = call_log.query('pnum==@pnum')
    pnum_env = env_all.query('pnum==@pnum')

    # call log에 맞게 env 뽑기
    for idx, row in pnum_call_log.iterrows():
        start_second = row['start_second']
        end_second = row['end']

        start = row['timestamp_start']
        end = row['timestamp_end']
        break_ = row['break']

        filtered_call_env = pnum_env.query('(Timestamp >= @start) and (Timestamp <@end)')

        if not filtered_call_env.empty:
            call_features = get_features(filtered_call_env[[' Temperature', ' Humidity', ' CO2'," TVOC"]].values)
            count_call_env = len(filtered_call_env[[' Temperature', ' Humidity', ' CO2'," TVOC"]].notnull())
        else:
            call_features = [np.nan]*28
            count_call_env = 0
        
        add_row = [pnum,start_second,end_second, count_call_env]
        add_row.extend(call_features)
        env_final.loc[len(env_final)] = add_row

env_final = env_final[['pnum', 'start_second',
                                                                
        # mean 
        '(S)_temperature_mean', '(S)_humidity_mean','(S)_co2_mean',
        # std
        '(S)_temperature_std','(S)_humidity_std','(S)_co2_std',

        # min
        '(S)_temperature_min', '(S)_humidity_min', '(S)_co2_min',

        # max
        '(S)_temperature_max', '(S)_humidity_max', '(S)_co2_max']]

env_final.to_csv(os.path.join(ENV_DEST_PATH,"env.csv"),index=False)

# Fitbit

In [None]:
FITBIT_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","FITBIT")
FITBIT_DEST_PATH = os.path.join(envs['DATA_PATH'],"3_feature_extraction")

In [63]:
fitbit_hr_all = pd.read_csv(os.path.join(FITBIT_SOURCE_PATH,'fitbit_hr.csv'),parse_dates=['ReadableTimestamp'])
fitbit_step_all = pd.read_csv(os.path.join(FITBIT_SOURCE_PATH,'fitbit_step.csv'),parse_dates=['ReadableTimestamp'])

fitbit_hr_all['Timestamp'] = fitbit_hr_all['ReadableTimestamp'].apply(lambda x: find_timestamp(x))
fitbit_step_all['Timestamp'] = fitbit_step_all['ReadableTimestamp'].apply(lambda x: find_timestamp(x))

In [None]:

hr_columns=['pnum', 'start_second', 'end','hr_data_count',                            
        ## (NR)_fitbit ##                               
        # mean 
        '(NR)_fitbit_hr_mean', 
        # std
        '(NR)_fitbit_hr_std',

        # min
        '(NR)_fitbit_hr_min', 

        # max
        '(NR)_fitbit_hr_max', 

        # 20
        '(NR)_fitbit_hr_20',

        # q2
        '(NR)_fitbit_hr_q2', 

        # 80
        '(NR)_fitbit_hr_q80', 
    ]


step_columns=[ 'step_data_count'
        # sum
        '(NR)_fitbit_step_sum',

        # mean 
        '(NR)_fitbit_step_mean',
        
        # std
        '(NR)_fitbit_step_std',

        # min
        '(NR)_fitbit_step_min', 

        # max
        '(NR)_fitbit_step_max',

        # 20
        '(NR)_fitbit_step_q1', 

        # q2
        '(NR)_fitbit_step_q2',

        # 80
        '(NR)_fitbit_step_q3' 
    ]



In [None]:
pnum_list = fitbit_hr_all['pnum'].unique()
fitbit_final = pd.DataFrame(columns = hr_columns + step_columns)

for pnum in pnum_list:
    # 뽑아낼 pnum
    pnum_call_log = call_log.query('pnum==@pnum')
    pnum_fitbit_hr = fitbit_hr_all.query('pnum==@pnum')
    pnum_fitbit_step = fitbit_step_all.query('pnum==@pnum')

    # call log에 맞게 fitbit 뽑기
    for idx, row in pnum_call_log.iterrows():
        start_second = row['start_second']
        end_second = row['end']

        start = row['timestamp_start']
        end = row['timestamp_end']
        break_ = row['break']

        filtered_call_fitbit_step = pnum_fitbit_step.query('(Timestamp >= @start) and (Timestamp <=@end)')
        filtered_call_fitbit_hr = pnum_fitbit_hr.query('(Timestamp >= @start) and (Timestamp <=@end)')


        if not filtered_call_fitbit_hr.empty:
                hr_features = get_hr_features(filtered_call_fitbit_hr['heart-intraday_value'])
                count_fitbit_hr = len(filtered_call_fitbit_hr[['heart-intraday_value']].notnull())
        else:
            hr_features = [np.nan]*7
            count_fitbit_hr = 0


        break_timestamp = start - break_ * 1000
        filtered_call_fitbit_step = pnum_fitbit_step.query('(Timestamp < @start) and (Timestamp >= @break_timestamp)')

        # step
        if not filtered_call_fitbit_step.empty:
            step_features = get_step_features(filtered_call_fitbit_step['steps-intraday_value'])
            count_fitbit_step =len(filtered_call_fitbit_step[['steps-intraday_value']].notnull())
        else:
            step_features  = [np.nan]*8
            count_fitbit_step = 0

        add_row = [pnum, start_second, end_second, count_fitbit_hr]
        add_row.extend(hr_features)
        add_row.extend(count_fitbit_step)
        add_row.extend(step_features)
        fitbit_final.loc[len(fitbit_final)] = add_row

In [None]:
fitbit_final = fitbit_final[['pnum', 'start_second',                           
        ## (NR)_fitbit ##                               
        # mean 
        '(NR)_fitbit_hr_mean', 
        # std
        '(NR)_fitbit_hr_std',

        # min
        '(NR)_fitbit_hr_min', 

        # max
        '(NR)_fitbit_hr_max', 

        # 20
        '(NR)_fitbit_hr_20',

        # q2
        '(NR)_fitbit_hr_q2', 

        # 80
        '(NR)_fitbit_hr_q80' ]]

In [None]:
fitbit_final.to_csv(os.path.join(FITBIT_DEST_PATH,"fitbit.csv"),index=False)

# Individual factor

In [None]:
INDI_DEST_PATH = os.path.join(envs['DATA_PATH'],"3_feature_extraction")

DEMO_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","INDIVID_FACTOR")
demo_df = pd.read_csv(os.path.join(DEMO_SOURCE_PATH,"individual_factor.csv"))

exclude_pnum = [15,16,17]
demo_df = demo_df.query('pnum not in @exclude_pnum')

In [None]:
# cluster 개수 정하는 용도
# kmeans_clutering(individual_factor_df[['age']].values, [2,3,4,5])

In [None]:
kmeans = KMeans(n_clusters=4, 
                init = 'k-means++',
                tol=1e-4,
                verbose=0,
                random_state=10)
y_pred = kmeans.fit_predict(demo_df[['age']].values)
demo_df['age_category'] = y_pred

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# cluster 개수 정하는 용도
# kmeans_clutering(individual_factor_df[['career']].values, [2,3,4,5])

In [None]:
kmeans = KMeans(n_clusters=4, 
                init = 'k-means+s+',
                tol=1e-4,
                verbose=0,
                random_state=10)
y_pred = kmeans.fit_predict(demo_df[['career']].values)
demo_df['career_category'] = y_pred

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
demo_df = demo_df[['pnum','gender','education','engage_motivation','age_category','career_category']]
demo_df = attatch_prefix_condition(demo_df,'(IF)_demo')

In [None]:
DAILY_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","SURVEY_DAILY")
daily_survey = pd.read_csv(os.path.join(DAILY_SOURCE_PATH ,'daily_before_work.csv'),parse_dates=['date'])
daily_survey = daily_survey[['pnum','date','before_work_general_health','before_work_stress', 'before_work_arousal',
                            'before_work_valence','self_reported_sleep_time']]
daily_survey = attatch_prefix_condition(daily_survey,'(IF)')

In [None]:
individual_factor_df= daily_survey.merge(demo_df,on=['pnum'],how='left')
individual_factor_df.to_csv(os.path.join(INDI_DEST_PATH,"individual_factor.csv"),index=False)
