# Basic

In [None]:
# basic library
import os
import pandas as pd
import numpy as np
import json
from matplotlib.font_manager import FontProperties

# Set the working directory; the project imports and the relative env.json
# path below are both resolved after this change of directory.
CURRENT_FOLDER = "your_path"
os.chdir(CURRENT_FOLDER)

from src.modeling.metric import *
from src.modeling.preprocessing import *
from src.modeling.MLmodel import *

# Load the environment settings (replace with the path to your own env file).
env_file_path = os.path.join("env.json")
with open(env_file_path) as f:
    envs = json.load(f)

# RQ1. Model Performance Comparison

## General model

In [None]:
# SAVE SETTING
# Output folder for this experiment (edit the folder name as needed).
RESULT_PATH = os.path.join(CURRENT_FOLDER, "results", "general")
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(RESULT_PATH, exist_ok=True)

# Base names of the result files written for each model.
MV_NAME = "MV"
XGBOOST_NAME = "XGBOOST"
RF_NAME = "RF"
DT_NAME = "DT"
AB_NAME = "AB"
LDA_NAME = "LDA"
KNN_NAME = "KNN"
SVM_NAME = "SVM"

# METHOD SELECT
RANDOM_SEED = 42
EXPERIMENT_NUM = 10
USE_SMOTE = True

# DATA SELECT
# pnum values kept when filtering the labels
# (presumably excluded: 15, 16, 17, 4, 8 -- confirm).
INCLUDE_PNUM = [
    1, 2, 3, 7,
    12, 9, 10,
    13, 14, 19, 20, 21, 22, 23,
]

# Feature groups, keyed by the marker contained in their column names.
# Columns matching a key set to "in" are kept; every other key is ignored.
SENSOR = {
    "(S)_call_log": "in",
    "(NR)_call_log": "in",
    "(NR)_acc": "in",
    "(NR)_fitbit": "in",
    "(S)_env": "in",
    "(S)_customer_audio": "in",
    "(S)_customer_utterance": "in",
    "(R)_worker_audio": "in",
    "(R)_worker_utterance": "in",
    "(S)_customer_transcript": "in",
    "(R)_worker_transcript": "in",
    "(IF)_before_work": "in",
    "(IF)_demo": "in",
}


## LOAD DATA ##
# All extracted feature tables are read from the same folder under DATA_PATH.
feature_dir = os.path.join(envs['DATA_PATH'], '3_feature_extraction')

audio = pd.read_csv(os.path.join(feature_dir, 'audio.csv'), parse_dates=['start_second'])
call_log = pd.read_csv(os.path.join(feature_dir, 'call_log.csv'), parse_dates=['start_second'])
fitbit = pd.read_csv(os.path.join(feature_dir, 'fitbit.csv'), parse_dates=['start_second'])
acc = pd.read_csv(os.path.join(feature_dir, 'acc.csv'), parse_dates=['start_second'])
env = pd.read_csv(os.path.join(feature_dir, 'env.csv'), parse_dates=['start_second'])
individual_factor = pd.read_csv(os.path.join(feature_dir, 'individual_factor.csv'), parse_dates=['date'])
transcript = pd.read_csv(os.path.join(feature_dir, 'transcript.csv'), parse_dates=['start_second'], index_col=False)

# label data
call_label = pd.read_csv(os.path.join(feature_dir, 'call_label.csv'), parse_dates=['start_second'])
call_label = call_label.query('pnum in @INCLUDE_PNUM')
call_label = remove_unreliable_labels(call_label)

## Concatenate ##
# Align the sensor tables on (pnum, start_second); the inner join keeps only
# the keys that are present in every table.
join_keys = ['pnum', 'start_second']
combined_df = pd.concat(
    [table.set_index(join_keys) for table in (call_log, fitbit, acc, env, audio, transcript)],
    axis=1, join='inner').reset_index()

# individual_factor is merged on (pnum, date), so derive a temporary
# day-level datetime column from each start_second and drop it afterwards.
combined_df['date'] = pd.to_datetime(combined_df['start_second'].dt.date)
combined_df = combined_df.merge(individual_factor, on=['pnum', 'date'], how='left').drop(['date'], axis=1)
final_df = combined_df.merge(call_label, on=['pnum', 'start_second'], how='inner')

## Drop missing value ## ==> so that all runs have the same number of training samples
final_df = final_df.dropna()

## Data selection ##
sensor_selected_columns = [key for key, value in SENSOR.items() if value == 'in']
selected_feature_columns = [col for col in final_df.columns
                            if any(sensor in col for sensor in sensor_selected_columns)]
# .copy() makes the subset an independent frame, so the dtype assignment in
# the one-hot step below does not trigger SettingWithCopyWarning.
final_df = final_df[selected_feature_columns + ['pnum', 'start_second', 'surface_acting']].copy()

# Distinguishing between feature types
all_features = final_df.columns
basic_feature = ['pnum', 'start_second', 'surface_acting']
categorical_feature_ohe = ['(S)_call_log_complaint', '(S)_call_log_weekday',
                           '(S)_call_log_hour_category', '(IF)_demo_engage_motivation']
categorical_feature_no_ohe = ['(IF)_demo_age_category', '(IF)_demo_career_category']
categorical_feature_binary = ['(IF)_demo_gender', '(IF)_demo_education']
# Keep only the categorical features that belong to a selected ('in') sensor group.
categorical_features = {
    "categorical_feature_ohe": [feature for feature in categorical_feature_ohe
                                if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_no_ohe": [feature for feature in categorical_feature_no_ohe
                                   if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_binary": [feature for feature in categorical_feature_binary
                                   if any(key in feature for key in sensor_selected_columns)],
}
# Everything that is neither a key/label column nor categorical counts as numeric.
non_numeric_features = (basic_feature
                        + categorical_features['categorical_feature_ohe']
                        + categorical_features['categorical_feature_no_ohe']
                        + categorical_features['categorical_feature_binary'])
numeric_feature = [feature for feature in all_features if feature not in non_numeric_features]

## One-hot-encoding
ohe_columns = categorical_features['categorical_feature_ohe']
if ohe_columns:
    # Cast to int before encoding.
    final_df[ohe_columns] = final_df[ohe_columns].astype('int')
    # one-hot encoding
    final_df = one_hot(final_df, ohe_columns)


# Model
# Every *_General runner returns a pair of tables (unpacked below); both are
# written to RESULT_PATH as "<NAME>.csv" and "<NAME>_x_feature.csv".
# NOTE: the LDA/SVM/KNN/AB/DT result files previously had no ".csv" extension
# and the LDA feature file had a stray "+" in its name; all models now follow
# the same naming as XGBoost and RF.
result_xgb_mean_df, x_feature_df = XGBoost_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_xgb_mean_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + "_x_feature.csv"))

result_rf_mean_df, x_feature_df = RandomForest_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_rf_mean_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + "_x_feature.csv"))

result_lda_mean_df, x_feature_df = LDA_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_lda_mean_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + "_x_feature.csv"))

result_svm_mean_df, x_feature_df = SVM_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_svm_mean_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + "_x_feature.csv"))

result_knn_mean_df, x_feature_df = KNN_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_knn_mean_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + "_x_feature.csv"))

result_ab_mean_df, x_feature_df = AdaBoost_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_ab_mean_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + "_x_feature.csv"))

result_dt_mean_df, x_feature_df = DecisionTree_General(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_dt_mean_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + "_x_feature.csv"))

## Personalized model

In [None]:
# SAVE SETTING
# Output folder for this experiment (edit the folder name as needed).
RESULT_PATH = os.path.join(CURRENT_FOLDER, "results", "personalization")
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(RESULT_PATH, exist_ok=True)

# Base names of the result files written for each model.
MV_NAME = "MV_personalized"
XGBOOST_NAME = "XGBOOST_personalized"
RF_NAME = "RF_personalized"
DT_NAME = "DT_personalized"
AB_NAME = "AB_personalized"
LDA_NAME = "LDA_personalized"
KNN_NAME = "KNN_personalized"
SVM_NAME = "SVM_personalized"

# METHOD SELECT
RANDOM_SEED = 42
EXPERIMENT_NUM = 1
USE_SMOTE = True

# DATA SELECT
# pnum values kept when filtering the labels
# (presumably excluded: 15, 16, 17, 4, 8 -- confirm).
INCLUDE_PNUM = [
    1, 2, 3, 7,
    12, 9, 10,
    13, 14, 19, 20, 21, 22, 23,
]

# Feature groups, keyed by the marker contained in their column names.
# Columns matching a key set to "in" are kept; every other key is ignored.
SENSOR = {
    "(S)_call_log": "out",
    "(NR)_call_log": "out",
    "(NR)_acc": "in",
    "(NR)_fitbit": "in",
    "(S)_env": "out",
    "(S)_customer_audio": "out",
    "(S)_customer_utterance": "out",
    "(R)_worker_audio": "out",
    "(R)_worker_utterance": "out",
    "(S)_customer_transcript": "out",
    "(R)_worker_transcript": "out",
    "(IF)_before_work": "out",
    "(IF)_demo": "out",
}



## LOAD DATA ##
# All extracted feature tables are read from the same folder under DATA_PATH.
feature_dir = os.path.join(envs['DATA_PATH'], '3_feature_extraction')

audio = pd.read_csv(os.path.join(feature_dir, 'audio.csv'), parse_dates=['start_second'])
call_log = pd.read_csv(os.path.join(feature_dir, 'call_log.csv'), parse_dates=['start_second'])
fitbit = pd.read_csv(os.path.join(feature_dir, 'fitbit.csv'), parse_dates=['start_second'])
acc = pd.read_csv(os.path.join(feature_dir, 'acc.csv'), parse_dates=['start_second'])
env = pd.read_csv(os.path.join(feature_dir, 'env.csv'), parse_dates=['start_second'])
individual_factor = pd.read_csv(os.path.join(feature_dir, 'individual_factor.csv'), parse_dates=['date'])
transcript = pd.read_csv(os.path.join(feature_dir, 'transcript.csv'), parse_dates=['start_second'], index_col=False)

# label data
call_label = pd.read_csv(os.path.join(feature_dir, 'call_label.csv'), parse_dates=['start_second'])
call_label = call_label.query('pnum in @INCLUDE_PNUM')
call_label = remove_unreliable_labels(call_label)

## Concatenate ##
# Align the sensor tables on (pnum, start_second); the inner join keeps only
# the keys that are present in every table.
join_keys = ['pnum', 'start_second']
combined_df = pd.concat(
    [table.set_index(join_keys) for table in (call_log, fitbit, acc, env, audio, transcript)],
    axis=1, join='inner').reset_index()

# individual_factor is merged on (pnum, date), so derive a temporary
# day-level datetime column from each start_second and drop it afterwards.
combined_df['date'] = pd.to_datetime(combined_df['start_second'].dt.date)
combined_df = combined_df.merge(individual_factor, on=['pnum', 'date'], how='left').drop(['date'], axis=1)
final_df = combined_df.merge(call_label, on=['pnum', 'start_second'], how='inner')

## Drop missing value ## ==> so that all runs have the same number of training samples
final_df = final_df.dropna()

## Data selection ##
sensor_selected_columns = [key for key, value in SENSOR.items() if value == 'in']
selected_feature_columns = [col for col in final_df.columns
                            if any(sensor in col for sensor in sensor_selected_columns)]
# .copy() makes the subset an independent frame, so the dtype assignment in
# the one-hot step below does not trigger SettingWithCopyWarning.
final_df = final_df[selected_feature_columns + ['pnum', 'start_second', 'surface_acting']].copy()

# Distinguishing between feature types
all_features = final_df.columns
basic_feature = ['pnum', 'start_second', 'surface_acting']
categorical_feature_ohe = ['(S)_call_log_complaint', '(S)_call_log_weekday',
                           '(S)_call_log_hour_category', '(IF)_demo_engage_motivation']
categorical_feature_no_ohe = ['(IF)_demo_age_category', '(IF)_demo_career_category']
categorical_feature_binary = ['(IF)_demo_gender', '(IF)_demo_education']
# Keep only the categorical features that belong to a selected ('in') sensor group.
categorical_features = {
    "categorical_feature_ohe": [feature for feature in categorical_feature_ohe
                                if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_no_ohe": [feature for feature in categorical_feature_no_ohe
                                   if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_binary": [feature for feature in categorical_feature_binary
                                   if any(key in feature for key in sensor_selected_columns)],
}
# Everything that is neither a key/label column nor categorical counts as numeric.
non_numeric_features = (basic_feature
                        + categorical_features['categorical_feature_ohe']
                        + categorical_features['categorical_feature_no_ohe']
                        + categorical_features['categorical_feature_binary'])
numeric_feature = [feature for feature in all_features if feature not in non_numeric_features]

## One-hot-encoding
ohe_columns = categorical_features['categorical_feature_ohe']
if ohe_columns:
    # Cast to int before encoding.
    final_df[ohe_columns] = final_df[ohe_columns].astype('int')
    # one-hot encoding
    final_df = one_hot(final_df, ohe_columns)

# Every *_Personalized runner returns a pair of tables (unpacked below); both
# are written to RESULT_PATH as "<NAME>.csv" and "<NAME>_x_feature.csv".
# NOTE: the LDA/SVM/KNN/AB/DT result files previously had no ".csv" extension
# and the LDA feature file had a stray "+" in its name; all models now follow
# the same naming as XGBoost and RF.
result_xgb_mean_df, x_feature_df = XGBoost_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_xgb_mean_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + "_x_feature.csv"))

result_rf_mean_df, x_feature_df = RandomForest_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_rf_mean_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + "_x_feature.csv"))

result_lda_mean_df, x_feature_df = LDA_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_lda_mean_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + "_x_feature.csv"))

result_svm_mean_df, x_feature_df = SVM_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_svm_mean_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + "_x_feature.csv"))

result_knn_mean_df, x_feature_df = KNN_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_knn_mean_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + "_x_feature.csv"))

result_ab_mean_df, x_feature_df = AdaBoost_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_ab_mean_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + "_x_feature.csv"))

result_dt_mean_df, x_feature_df = DecisionTree_Personalized(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_dt_mean_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + "_x_feature.csv"))

## Hybrid model

In [None]:
# SAVE SETTING
# Output folder for this experiment (edit the folder name as needed).
RESULT_PATH = os.path.join(CURRENT_FOLDER, "results", "hybrid")
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(RESULT_PATH, exist_ok=True)

# Base names of the result files written for each model.
MV_NAME = "MV_hybrid"
XGBOOST_NAME = "XGBOOST_hybrid"
RF_NAME = "RF_hybrid"
DT_NAME = "DT_hybrid"
AB_NAME = "AB_hybrid"
LDA_NAME = "LDA_hybrid"
KNN_NAME = "KNN_hybrid"
SVM_NAME = "SVM_hybrid"

# METHOD SELECT
RANDOM_SEED = 42
EXPERIMENT_NUM = 10
USE_SMOTE = True

# DATA SELECT
# pnum values kept when filtering the labels
# (presumably excluded: 15, 16, 17, 4, 8 -- confirm).
INCLUDE_PNUM = [
    1, 2, 3, 7,
    12, 9, 10,
    13, 14, 19, 20, 21, 22, 23,
]

# Feature groups, keyed by the marker contained in their column names.
# Columns matching a key set to "in" are kept; every other key is ignored.
SENSOR = {
    "(S)_call_log": "in",
    "(NR)_call_log": "in",
    "(NR)_acc": "in",
    "(NR)_fitbit": "in",
    "(S)_env": "in",
    "(S)_customer_audio": "in",
    "(S)_customer_utterance": "in",
    "(R)_worker_audio": "in",
    "(R)_worker_utterance": "in",
    "(S)_customer_transcript": "in",
    "(R)_worker_transcript": "in",
    "(IF)_before_work": "in",
    "(IF)_demo": "in",
}


## LOAD DATA ##
# All extracted feature tables are read from the same folder under DATA_PATH.
feature_dir = os.path.join(envs['DATA_PATH'], '3_feature_extraction')

audio = pd.read_csv(os.path.join(feature_dir, 'audio.csv'), parse_dates=['start_second'])
call_log = pd.read_csv(os.path.join(feature_dir, 'call_log.csv'), parse_dates=['start_second'])
fitbit = pd.read_csv(os.path.join(feature_dir, 'fitbit.csv'), parse_dates=['start_second'])
acc = pd.read_csv(os.path.join(feature_dir, 'acc.csv'), parse_dates=['start_second'])
env = pd.read_csv(os.path.join(feature_dir, 'env.csv'), parse_dates=['start_second'])
individual_factor = pd.read_csv(os.path.join(feature_dir, 'individual_factor.csv'), parse_dates=['date'])
transcript = pd.read_csv(os.path.join(feature_dir, 'transcript.csv'), parse_dates=['start_second'], index_col=False)

# label data
call_label = pd.read_csv(os.path.join(feature_dir, 'call_label.csv'), parse_dates=['start_second'])
call_label = call_label.query('pnum in @INCLUDE_PNUM')
call_label = remove_unreliable_labels(call_label)

## Concatenate ##
# Align the sensor tables on (pnum, start_second); the inner join keeps only
# the keys that are present in every table.
join_keys = ['pnum', 'start_second']
combined_df = pd.concat(
    [table.set_index(join_keys) for table in (call_log, fitbit, acc, env, audio, transcript)],
    axis=1, join='inner').reset_index()

# individual_factor is merged on (pnum, date), so derive a temporary
# day-level datetime column from each start_second and drop it afterwards.
combined_df['date'] = pd.to_datetime(combined_df['start_second'].dt.date)
combined_df = combined_df.merge(individual_factor, on=['pnum', 'date'], how='left').drop(['date'], axis=1)
final_df = combined_df.merge(call_label, on=['pnum', 'start_second'], how='inner')

## Drop missing value ## ==> so that all runs have the same number of training samples
final_df = final_df.dropna()

## Data selection ##
sensor_selected_columns = [key for key, value in SENSOR.items() if value == 'in']
selected_feature_columns = [col for col in final_df.columns
                            if any(sensor in col for sensor in sensor_selected_columns)]
# .copy() makes the subset an independent frame, so the dtype assignment in
# the one-hot step below does not trigger SettingWithCopyWarning.
final_df = final_df[selected_feature_columns + ['pnum', 'start_second', 'surface_acting']].copy()

# Distinguishing between feature types
all_features = final_df.columns
basic_feature = ['pnum', 'start_second', 'surface_acting']
categorical_feature_ohe = ['(S)_call_log_complaint', '(S)_call_log_weekday',
                           '(S)_call_log_hour_category', '(IF)_demo_engage_motivation']
categorical_feature_no_ohe = ['(IF)_demo_age_category', '(IF)_demo_career_category']
categorical_feature_binary = ['(IF)_demo_gender', '(IF)_demo_education']
# Keep only the categorical features that belong to a selected ('in') sensor group.
categorical_features = {
    "categorical_feature_ohe": [feature for feature in categorical_feature_ohe
                                if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_no_ohe": [feature for feature in categorical_feature_no_ohe
                                   if any(key in feature for key in sensor_selected_columns)],
    "categorical_feature_binary": [feature for feature in categorical_feature_binary
                                   if any(key in feature for key in sensor_selected_columns)],
}
# Everything that is neither a key/label column nor categorical counts as numeric.
non_numeric_features = (basic_feature
                        + categorical_features['categorical_feature_ohe']
                        + categorical_features['categorical_feature_no_ohe']
                        + categorical_features['categorical_feature_binary'])
numeric_feature = [feature for feature in all_features if feature not in non_numeric_features]

## One-hot-encoding
ohe_columns = categorical_features['categorical_feature_ohe']
if ohe_columns:
    # Cast to int before encoding.
    final_df[ohe_columns] = final_df[ohe_columns].astype('int')
    # one-hot encoding
    final_df = one_hot(final_df, ohe_columns)

# Every *_Hybrid runner returns a pair of tables (unpacked below); both are
# written to RESULT_PATH as "<NAME>.csv" and "<NAME>_x_feature.csv".
# NOTE: the LDA/SVM/KNN/AB/DT result files previously had no ".csv" extension
# and the LDA feature file had a stray "+" in its name; all models now follow
# the same naming as XGBoost and RF.
result_xgb_mean_df, x_feature_df = XGBoost_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_xgb_mean_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, XGBOOST_NAME + "_x_feature.csv"))

result_rf_mean_df, x_feature_df = RandomForest_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_rf_mean_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, RF_NAME + "_x_feature.csv"))

result_lda_mean_df, x_feature_df = LDA_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_lda_mean_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, LDA_NAME + "_x_feature.csv"))

result_svm_mean_df, x_feature_df = SVM_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_svm_mean_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, SVM_NAME + "_x_feature.csv"))

result_knn_mean_df, x_feature_df = KNN_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_knn_mean_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, KNN_NAME + "_x_feature.csv"))

result_ab_mean_df, x_feature_df = AdaBoost_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_ab_mean_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, AB_NAME + "_x_feature.csv"))

result_dt_mean_df, x_feature_df = DecisionTree_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
result_dt_mean_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + '.csv'))
x_feature_df.to_csv(os.path.join(RESULT_PATH, DT_NAME + "_x_feature.csv"))

# RQ2. Ablation Study - ER Framework

In [None]:
# Output folder for this experiment (edit the folder name as needed).
RESULT_PATH = os.path.join(CURRENT_FOLDER, "results", "ER_ablation_RF")
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(RESULT_PATH, exist_ok=True)
    
# METHOD SELECT
RANDOM_SEED = 42
EXPERIMENT_NUM = 5
USE_SMOTE = True

# DATA SELECT
# pnum values kept when filtering the labels
# (presumably excluded: 15, 16, 17, 4, 8 -- confirm).
INCLUDE_PNUM = [
    1, 2, 3, 7,
    12, 9, 10,
    13, 14, 19, 20, 21, 22, 23,
]

def ER_ablation_study(save_name, sensor_dic):
    """Run the hybrid random-forest experiment for one sensor configuration.

    Loads the extracted feature tables and the labels, keeps only the columns
    whose name contains a key marked 'in' in ``sensor_dic``, one-hot encodes
    the selected categorical columns, runs ``RandomForest_Hybrid`` and writes
    its two result tables to ``RESULT_PATH``.

    Reads the module-level settings ``envs``, ``INCLUDE_PNUM``,
    ``EXPERIMENT_NUM``, ``RANDOM_SEED``, ``USE_SMOTE`` and ``RESULT_PATH``.

    Args:
        save_name: File-name stem used for the two CSV outputs.
        sensor_dic: Mapping of column-name marker -> 'in' / 'out'.

    Returns:
        None. Results are written as ``<save_name>.csv`` and
        ``<save_name>_x_feature.csv``.
    """
    ## LOAD DATA ##
    feature_dir = os.path.join(envs['DATA_PATH'], '3_feature_extraction')

    def read_features(file_name, date_column='start_second', **read_options):
        # Read one extracted-feature CSV, parsing its timestamp column.
        return pd.read_csv(os.path.join(feature_dir, file_name),
                           parse_dates=[date_column], **read_options)

    audio = read_features('audio.csv')
    call_log = read_features('call_log.csv')
    fitbit = read_features('fitbit.csv')
    acc = read_features('acc.csv')
    env = read_features('env.csv')
    individual_factor = read_features('individual_factor.csv', date_column='date')
    transcript = read_features('transcript.csv', index_col=False)

    # label data
    call_label = read_features('call_label.csv')
    call_label = call_label.query('pnum in @INCLUDE_PNUM')
    call_label = remove_unreliable_labels(call_label)

    ## Concatenate ##
    # Align the sensor tables on (pnum, start_second); the inner join keeps
    # only the keys that are present in every table.
    join_keys = ['pnum', 'start_second']
    combined_df = pd.concat(
        [table.set_index(join_keys) for table in (call_log, fitbit, acc, env, audio, transcript)],
        axis=1, join='inner').reset_index()
    # individual_factor is merged on (pnum, date), so derive a temporary
    # day-level datetime column from each start_second and drop it afterwards.
    combined_df['date'] = pd.to_datetime(combined_df['start_second'].dt.date)
    combined_df = combined_df.merge(individual_factor, on=['pnum', 'date'], how='left').drop(['date'], axis=1)
    final_df = combined_df.merge(call_label, on=['pnum', 'start_second'], how='inner')

    ## Drop missing value ## ==> so that all runs have the same number of training samples
    final_df = final_df.dropna()

    ## Data selection ##
    sensor_selected_columns = [key for key, value in sensor_dic.items() if value == 'in']
    selected_feature_columns = [col for col in final_df.columns
                                if any(sensor in col for sensor in sensor_selected_columns)]
    # .copy() makes the subset an independent frame, so the dtype assignment
    # in the one-hot step below does not trigger SettingWithCopyWarning.
    final_df = final_df[selected_feature_columns + ['pnum', 'start_second', 'surface_acting']].copy()

    # Distinguishing between feature types
    all_features = final_df.columns
    basic_feature = ['pnum', 'start_second', 'surface_acting']
    categorical_feature_ohe = ['(S)_call_log_complaint', '(S)_call_log_weekday',
                               '(S)_call_log_hour_category', '(IF)_demo_engage_motivation']
    categorical_feature_no_ohe = ['(IF)_demo_age_category', '(IF)_demo_career_category']
    categorical_feature_binary = ['(IF)_demo_gender', '(IF)_demo_education']
    # Keep only the categorical features that belong to a selected sensor group.
    categorical_features = {
        "categorical_feature_ohe": [feature for feature in categorical_feature_ohe
                                    if any(key in feature for key in sensor_selected_columns)],
        "categorical_feature_no_ohe": [feature for feature in categorical_feature_no_ohe
                                       if any(key in feature for key in sensor_selected_columns)],
        "categorical_feature_binary": [feature for feature in categorical_feature_binary
                                       if any(key in feature for key in sensor_selected_columns)],
    }
    # Everything that is neither a key/label column nor categorical counts as numeric.
    non_numeric_features = (basic_feature
                            + categorical_features['categorical_feature_ohe']
                            + categorical_features['categorical_feature_no_ohe']
                            + categorical_features['categorical_feature_binary'])
    numeric_feature = [feature for feature in all_features if feature not in non_numeric_features]

    ## One-hot-encoding
    ohe_columns = categorical_features['categorical_feature_ohe']
    if ohe_columns:
        # Cast to int before encoding.
        final_df[ohe_columns] = final_df[ohe_columns].astype('int')
        # one-hot encoding
        final_df = one_hot(final_df, ohe_columns)

    result_rf_mean_df, x_feature_df = RandomForest_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
    result_rf_mean_df.to_csv(os.path.join(RESULT_PATH, save_name + '.csv'))
    x_feature_df.to_csv(os.path.join(RESULT_PATH, save_name + "_x_feature.csv"))
    # XGBoost variant of the same study (disabled):
    # result_xgb_mean_df, x_feature_df = XGBoost_Hybrid(final_df,numeric_feature,EXPERIMENT_NUM,RANDOM_SEED,USE_SMOTE, RESULT_PATH)
    # result_xgb_mean_df.to_csv(os.path.join(RESULT_PATH,save_name+'.csv'))
    # x_feature_df.to_csv(os.path.join(RESULT_PATH,save_name+"_x_feature.csv"))
    


# Marker combinations to evaluate; each inner list is one configuration and
# its markers are matched against the SENSOR keys below.
ER_ablation = [
    ["(S)"],
    ["(R)"],
    ["(NR)"],
    ["(IF)"],
    ["(S)", "(R)"],
    ["(S)", "(NR)"],
    ["(S)", "(IF)"],
    ["(S)", "(R)", "(NR)"],
    ["(S)", "(R)", "(IF)"],
    ["(R)", "(NR)"],
    ["(R)", "(NR)", "(IF)"],
    ["(S)", "(R)", "(NR)", "(IF)"],
]

# Template of feature-group keys. generate_ablation_sensor_dic reads only the
# keys and assigns a fresh 'in'/'out' value per configuration.
SENSOR = {
    "(S)_call_log": "out",
    "(NR)_call_log": "out",
    "(NR)_acc": "out",
    "(NR)_fitbit": "out",
    "(S)_env": "out",
    "(S)_customer_audio": "out",
    "(S)_customer_utterance": "out",
    "(R)_worker_audio": "out",
    "(R)_worker_utterance": "out",
    "(S)_customer_transcript": "out",
    "(R)_worker_transcript": "in",
    "(IF)_before_work": "out",
    "(IF)_demo": "out",
}

# Builds, for every marker combination, a new sensor dictionary with
# 'in'/'out' values together with a name for that configuration.
def generate_ablation_sensor_dic(ablation_list, sensor_dict):
    """Create one (name, sensor dictionary) pair per ablation configuration.

    Args:
        ablation_list: Iterable of configurations; each configuration is a
            sequence of marker strings.
        sensor_dict: Mapping whose keys are the sensor names. Only the keys
            are read; the values are ignored and the mapping is not modified.

    Returns:
        A list of ``(config_name, new_sensor_dic)`` tuples in the order of
        ``ablation_list``. ``config_name`` is the markers joined with "_";
        ``new_sensor_dic`` maps every key of ``sensor_dict`` to 'in' when the
        key contains at least one marker, otherwise to 'out'.
    """
    configs = []
    for markers in ablation_list:
        # Substring match: a sensor is enabled when any marker occurs in its key.
        sensor_states = {
            sensor_key: ('in' if any(marker in sensor_key for marker in markers) else 'out')
            for sensor_key in sensor_dict
        }
        configs.append(("_".join(markers), sensor_states))
    return configs

# Run the ER ablation study once per generated sensor configuration.
ablation_sensor_configs = generate_ablation_sensor_dic(ER_ablation, SENSOR)
for name, sensor_dic in ablation_sensor_configs:
    ER_ablation_study(save_name=name, sensor_dic=sensor_dic)

# RQ2. Ablation Study - Data modality

In [None]:
# Output folder for this experiment (edit the folder name as needed).
RESULT_PATH = os.path.join(CURRENT_FOLDER, "results", "Data_ablation_more")
# exist_ok=True creates the folder only when missing, without a separate
# check-then-create step.
os.makedirs(RESULT_PATH, exist_ok=True)
    
# METHOD SELECT
RANDOM_SEED = 42
EXPERIMENT_NUM = 1
USE_SMOTE = True

# DATA SELECT
# pnum values kept when filtering the labels
# (presumably excluded: 15, 16, 17, 4, 8 -- confirm).
INCLUDE_PNUM = [
    1, 2, 3, 7,
    12, 9, 10,
    13, 14, 19, 20, 21, 22, 23,
]

def Data_ablation_study(save_name, sensor_dic):
    """Run the hybrid random-forest experiment for one data-modality setup.

    Loads the extracted feature tables and the labels, keeps only the columns
    whose name contains a key marked 'in' in ``sensor_dic``, runs
    ``RandomForest_Hybrid`` and writes its two result tables to
    ``RESULT_PATH``. No one-hot encoding is applied in this study.

    Reads the module-level settings ``envs``, ``INCLUDE_PNUM``,
    ``EXPERIMENT_NUM``, ``RANDOM_SEED``, ``USE_SMOTE`` and ``RESULT_PATH``.

    Args:
        save_name: File-name stem used for the two CSV outputs.
        sensor_dic: Mapping of column-name marker -> 'in' / 'out'.

    Returns:
        None. Results are written as ``<save_name>.csv`` and
        ``<save_name>_x_feature.csv``.
    """
    ## LOAD DATA ##
    feature_dir = os.path.join(envs['DATA_PATH'], '3_feature_extraction')

    def read_features(file_name, date_column='start_second', **read_options):
        # Read one extracted-feature CSV, parsing its timestamp column.
        return pd.read_csv(os.path.join(feature_dir, file_name),
                           parse_dates=[date_column], **read_options)

    audio = read_features('audio.csv')
    # Only the join keys and two call-log columns are used in this study.
    call_log_columns = ['pnum', 'start_second', '(S)_call_log_duration', '(S)_call_log_complaint']
    call_log = read_features('call_log.csv')[call_log_columns]
    fitbit = read_features('fitbit.csv')
    acc = read_features('acc.csv')
    env = read_features('env.csv')
    individual_factor = read_features('individual_factor.csv', date_column='date')
    transcript = read_features('transcript.csv', index_col=False)

    # label data
    call_label = read_features('call_label.csv')
    call_label = call_label.query('pnum in @INCLUDE_PNUM')
    call_label = remove_unreliable_labels(call_label)

    ## Concatenate ##
    # Align the sensor tables on (pnum, start_second); the inner join keeps
    # only the keys that are present in every table.
    join_keys = ['pnum', 'start_second']
    indexed_tables = [table.set_index(join_keys)
                      for table in (call_log, fitbit, acc, env, audio, transcript)]
    combined_df = pd.concat(indexed_tables, axis=1, join='inner').reset_index()
    # Temporary day-level column used to attach the per-day individual factors.
    combined_df['date'] = combined_df['start_second'].dt.date
    combined_df['date'] = pd.to_datetime(combined_df['date'])
    combined_df = combined_df.merge(individual_factor, left_on=['pnum', 'date'], right_on=['pnum', 'date'], how='left')
    combined_df = combined_df.drop(['date'], axis=1)
    final_df = combined_df.merge(call_label, on=['pnum', 'start_second'], how='inner')

    ## Drop missing value ## ==> so that all runs have the same number of training samples
    final_df = final_df.dropna()

    ## Data selection ##
    sensor_selected_columns = [key for key, value in sensor_dic.items() if value == 'in']
    kept_columns = [col for col in final_df.columns
                    if any(sensor in col for sensor in sensor_selected_columns)]
    final_df = final_df[kept_columns + ['pnum', 'start_second', 'surface_acting']]
    print("최종 생성된 데이터: ",final_df.columns)

    # Distinguishing between feature types
    all_features = final_df.columns
    basic_feature = ['pnum', 'start_second', 'surface_acting']
    categorical_feature_ohe = ['(S)_call_log_complaint', '(S)_call_log_weekday',
                               '(S)_call_log_hour_category', '(IF)_demo_engage_motivation']
    categorical_feature_no_ohe = ['(IF)_demo_age_category', '(IF)_demo_career_category']
    categorical_feature_binary = ['(IF)_demo_gender', '(IF)_demo_education']

    def in_selected_group(feature):
        # True when the feature name contains a key marked 'in'.
        return any(key in feature for key in sensor_dic if sensor_dic[key] == 'in')

    categorical_features = {
        "categorical_feature_ohe": [feature for feature in categorical_feature_ohe if in_selected_group(feature)],
        "categorical_feature_no_ohe": [feature for feature in categorical_feature_no_ohe if in_selected_group(feature)],
        "categorical_feature_binary": [feature for feature in categorical_feature_binary if in_selected_group(feature)],
    }
    # Everything that is neither a key/label column nor categorical counts as numeric.
    excluded_features = (basic_feature
                         + categorical_features['categorical_feature_ohe']
                         + categorical_features['categorical_feature_no_ohe']
                         + categorical_features['categorical_feature_binary'])
    numeric_feature = [feature for feature in all_features if feature not in excluded_features]

    # One-hot encoding is disabled in this study (kept for reference):
    # if len(categorical_features['categorical_feature_ohe'])>0:
    #     final_df[categorical_features['categorical_feature_ohe']] = final_df[categorical_features['categorical_feature_ohe']].astype('int')
    #     # one-hot encoding
    #     final_df = one_hot(final_df,categorical_features['categorical_feature_ohe'])

    result_rf_mean_df, x_feature_df = RandomForest_Hybrid(final_df, numeric_feature, EXPERIMENT_NUM, RANDOM_SEED, USE_SMOTE, RESULT_PATH)
    result_rf_mean_df.to_csv(os.path.join(RESULT_PATH, save_name + '.csv'))
    x_feature_df.to_csv(os.path.join(RESULT_PATH, save_name + "_x_feature.csv"))

    # XGBoost variant of the same study (disabled):
    # result_xgb_mean_df, x_feature_df = XGBoost_Hybrid(final_df,numeric_feature,EXPERIMENT_NUM,RANDOM_SEED,USE_SMOTE, RESULT_PATH)
    # result_xgb_mean_df.to_csv(os.path.join(RESULT_PATH,save_name+'.csv'))
    # x_feature_df.to_csv(os.path.join(RESULT_PATH,save_name+"_x_feature.csv"))
    


# Template of feature-group keys. generate_ablation_sensor_dic reads only the
# keys and assigns a fresh 'in'/'out' value per configuration.
SENSOR = {
    "(S)_call_log": "out",
    "(NR)_call_log": "out",
    "(NR)_acc": "out",
    "(NR)_fitbit": "out",
    "(S)_env": "out",
    "(S)_customer_audio": "out",
    "(S)_customer_utterance": "out",
    "(R)_worker_audio": "out",
    "(R)_worker_utterance": "out",
    "(S)_customer_transcript": "out",
    "(R)_worker_transcript": "in",
    "(IF)_before_work": "out",
    "(IF)_demo": "out",
}

# Modality combinations to evaluate; each inner list is one configuration.
# Alternative configurations are kept below as comments.
# NOTE: "(S)_call_log" has to be run separately.
Data_ablation = [
    # ['(R)_worker_transcript','(R)_worker_utterance','(R)_worker_audio'],
    ["(S)_customer_audio", "(S)_customer_transcript"],  # '(S)_call_log', '(S)_customer_utterance'
    # ['(R)_worker_utterance','(R)_worker_audio']
    # ["(S)_customer_audio","(S)_customer_transcript"],['(R)_worker_transcript',"(R)_worker_utterance"]
    # ["(S)_customer_audio","(S)_customer_transcript","(S)_customer_utterance"],
    # ["(S)_customer_audio","(S)_customer_transcript","(S)_customer_utterance",'(S)_call_log']
    # ["(S)_customer_audio"],["(S)_customer_transcript"],["(S)_customer_utterance"],["(S)_env"],['(S)_call_log'],['(R)_worker_audio'],['(R)_worker_transcript'],["(R)_worker_utterance"],['(NR)_fitbit'],["(NR)_call_log",'(NR)_acc'],
    # ["(IF)_before_work"],
    # ["(IF)_demo"]
]

# Builds, for every marker combination, a new sensor dictionary with
# 'in'/'out' values together with a name for that configuration.
def generate_ablation_sensor_dic(ablation_list, sensor_dict):
    """Create one (name, sensor dictionary) pair per ablation configuration.

    Args:
        ablation_list: Iterable of configurations; each configuration is a
            sequence of marker strings.
        sensor_dict: Mapping whose keys are the sensor names. Only the keys
            are read; the values are ignored and the mapping is not modified.

    Returns:
        A list of ``(config_name, new_sensor_dic)`` tuples in the order of
        ``ablation_list``. ``config_name`` is the markers joined with "_";
        ``new_sensor_dic`` maps every key of ``sensor_dict`` to 'in' when the
        key contains at least one marker, otherwise to 'out'.
    """
    configs = []
    for markers in ablation_list:
        # Substring match: a sensor is enabled when any marker occurs in its key.
        sensor_states = {
            sensor_key: ('in' if any(marker in sensor_key for marker in markers) else 'out')
            for sensor_key in sensor_dict
        }
        configs.append(("_".join(markers), sensor_states))
    return configs

# Run the data-modality ablation study once per generated sensor configuration.
ablation_sensor_configs = generate_ablation_sensor_dic(Data_ablation, SENSOR)
for name, sensor_dic in ablation_sensor_configs:
    Data_ablation_study(save_name=name, sensor_dic=sensor_dic)