In [1]:
import os
import config
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## CREATE VISUAL SUMMARY

In [2]:
# Build one "<id>_vision_summary.csv" per participant folder by merging selected
# columns from that participant's CLNF text files (features, 3D features, action
# units, gaze and pose). Participants whose summary already exists are skipped.
vision_summary_dir = os.path.join(config.DATA_DIR, 'Vision Summary')
# List the output folder once instead of on every iteration; names written during
# this run are added to the set below so the skip check stays accurate.
existing_vision_summaries = set(os.listdir(vision_summary_dir))

for f in tqdm(os.listdir(config.RAW_DATA_DIR)):
  # The first three characters of the folder name are used as the participant id.
  target_f_name = f[:3]
  data_path = os.path.join(config.RAW_DATA_DIR, f)
  summary_name = f"{target_f_name}_vision_summary.csv"

  # Only sub-directories of the raw data folder hold participant data.
  if not os.path.isdir(data_path):
    continue
  if summary_name in existing_vision_summaries:
    continue

  clnf_au = os.path.join(data_path, f'{target_f_name}_CLNF_AUs.txt')
  clnf_ft = os.path.join(data_path, f'{target_f_name}_CLNF_features.txt')
  clnf_ft_3d = os.path.join(data_path, f'{target_f_name}_CLNF_features3D.txt')
  clnf_gz = os.path.join(data_path, f'{target_f_name}_CLNF_gaze.txt')
  clnf_ps = os.path.join(data_path, f'{target_f_name}_CLNF_pose.txt')

  ft_df = pd.read_csv(clnf_ft, sep=', ', engine='python', encoding='utf-8')
  ft_3d_df = pd.read_csv(clnf_ft_3d, sep=', ', engine='python', encoding='utf-8')
  au_df = pd.read_csv(clnf_au, sep=', ', engine='python', encoding='utf-8')
  # NOTE(review): the gaze file is the only one read with sep=',' (no trailing
  # space), so any whitespace after the commas in its header stays in the column
  # names. Left unchanged to keep the output column names stable -- TODO confirm.
  gz_df = pd.read_csv(clnf_gz, sep=',', engine='python', encoding='utf-8')
  ps_df = pd.read_csv(clnf_ps, sep=', ', engine='python', encoding='utf-8')

  # Columns present in both the feature and the AU file, taken in the feature
  # file's own order. A set intersection would make the column order depend on
  # string hashing, which can differ from one kernel session to the next.
  uni_col = [col for col in ft_df.columns if col in au_df.columns]
  uni = ft_df[uni_col]

  # Prefix every column with its source so names from different files stay
  # distinguishable after the merge.
  ft_df.columns = 'ft' + ft_df.columns
  ft_3d_df.columns = 'ft_3d' + ft_3d_df.columns
  au_df.columns = 'au' + au_df.columns
  gz_df.columns = 'gz' + gz_df.columns
  ps_df.columns = 'ps' + ps_df.columns

  # filter(like=...) keeps the columns whose (prefixed) name contains the substring.
  ft_x = ft_df.filter(like='x')
  ft_y = ft_df.filter(like='y')
  ft_3d_x = ft_3d_df.filter(like='X')
  ft_3d_y = ft_3d_df.filter(like='Y')
  ft_3d_z = ft_3d_df.filter(like='Z')
  au_r = au_df.filter(like='_r')
  au_c = au_df.filter(like='_c')

  # Gaze columns are split in two groups: names containing '_' but no 'h',
  # and names containing 'h'.
  condition_include = gz_df.columns.str.contains('_')
  condition_exclude = ~gz_df.columns.str.contains('h')
  final_mask = condition_include & condition_exclude
  gz_raw = gz_df.loc[:, final_mask]
  gz_h = gz_df.filter(like='h')

  ps_t = ps_df.filter(like='T')
  ps_r = ps_df.filter(like='R')

  merged_df = pd.concat([uni, ft_x, ft_y, ft_3d_x, ft_3d_y, ft_3d_z, au_r, au_c, gz_raw, gz_h, ps_t, ps_r], axis=1)
  merged_df.to_csv(os.path.join(vision_summary_dir, summary_name), index=False)
  existing_vision_summaries.add(summary_name)

100%|██████████| 189/189 [00:06<00:00, 28.09it/s]


## PROCESS TEXT DATA

In [3]:
# NOTE: things differ starting from 363 (presumably participant id 363 onward -- TODO confirm what exactly changes)

In [4]:
# Re-save each participant's tab-separated raw transcript as a comma-separated
# CSV under DATA_DIR/Transcription; participants already converted are skipped.
for f in tqdm(os.listdir(config.RAW_DATA_DIR)):
  target_f_name = f[:3]

  # Only sub-directories of the raw data folder are processed.
  if not os.path.isdir(os.path.join(config.RAW_DATA_DIR, f)):
    continue

  transcript_out_dir = os.path.join(config.DATA_DIR, 'Transcription')
  transcript_out_name = f"{target_f_name}_transcript.csv"
  if transcript_out_name in os.listdir(transcript_out_dir):
    continue

  raw_transcript_path = os.path.join(
    config.RAW_DATA_DIR,
    f'{target_f_name}_P',
    f'{target_f_name}_TRANSCRIPT.csv',
  )
  text_df = pd.read_csv(raw_transcript_path, sep='\t')
  text_df.to_csv(os.path.join(transcript_out_dir, transcript_out_name), index=False)

100%|██████████| 189/189 [00:00<00:00, 7228.72it/s]


## PROCESS AUDIO FT

In [4]:
def calculate_stats(x):
  """Return the mean, minimum and maximum of ``x`` as a 3-tuple.

  ``x`` must provide ``mean()``, ``min()`` and ``max()`` methods
  (for example a pandas Series or a NumPy array).
  """
  average = x.mean()
  lowest = x.min()
  highest = x.max()
  return average, lowest, highest

In [5]:
# Check COVAREP columns 10, 36, 37, 38 and 39 (0-based positions): for every
# participant, record each column's (mean, min, max) so they can be averaged
# across participants afterwards.
col_10_lst = []
col_36_lst = []
col_37_lst = []
col_38_lst = []
col_39_lst = []
# Column position -> list that accumulates that column's per-participant stats.
stats_by_col = {
  10: col_10_lst,
  36: col_36_lst,
  37: col_37_lst,
  38: col_38_lst,
  39: col_39_lst,
}
# This cell never writes to the output folder, so it is listed once up front.
existing_audio_summaries = set(os.listdir(os.path.join(config.DATA_DIR, 'Audio Summary')))

for f in tqdm(os.listdir(config.RAW_DATA_DIR)):
  # Derive the participant id BEFORE the skip check that uses it. Previously it
  # was assigned only after the check, so the check read a stale value left over
  # from an earlier cell (or raised NameError on a fresh kernel).
  target_f_name = f[:3]
  if not os.path.isdir(os.path.join(config.RAW_DATA_DIR, f)):
    continue
  # NOTE(review): participants whose audio summary already exists are skipped and
  # therefore contribute no statistics here -- confirm this is intended.
  if f"{target_f_name}_audio_summary.csv" in existing_audio_summaries:
    continue

  covarep_df = pd.read_csv(os.path.join(config.RAW_DATA_DIR, f'{target_f_name}_P', f'{target_f_name}_COVAREP.csv'), header=None)
  for col_idx, stats_lst in stats_by_col.items():
    stats_lst.append(calculate_stats(covarep_df.iloc[:, col_idx]))

100%|██████████| 188/188 [00:00<00:00, 725.99it/s]


In [6]:
# Turn each per-participant list of (mean, min, max) tuples into an array.
col_10_np, col_36_np, col_37_np, col_38_np, col_39_np = (
  np.array(stats_lst)
  for stats_lst in (col_10_lst, col_36_lst, col_37_lst, col_38_lst, col_39_lst)
)

# Average over axis 0 (one row per participant) and print the result per column.
for label, stats_np in (
  ('10번 열 통계량(mean, min, max)', col_10_np),
  ('36번 열 통계량(mean, min, max)', col_36_np),
  ('37번 열 통계량(mean, min, max)', col_37_np),
  ('38번 열 통계량(mean, min, max)', col_38_np),
  ('39번 열 통계량(mean, min, max)', col_39_np),
):
  print(label, stats_np.mean(axis=0))

10번 열 통계량(mean, min, max) [0. 0. 0.]
36번 열 통계량(mean, min, max) [0. 0. 0.]
37번 열 통계량(mean, min, max) [0. 0. 0.]
38번 열 통계량(mean, min, max) [0. 0. 0.]
39번 열 통계량(mean, min, max) [0. 0. 0.]


COVAREP 파일에서 (0부터 시작하는 열 번호 기준) 10번, 36번, 37번, 38번, 39번 열은 위 통계에서 mean, min, max가 모두 0으로 나타났으므로 제외한다.

In [7]:
# Spot check on participant 301: drop COVAREP columns at positions 10, 36, 37, 38
# and 39, then print row 30000 before and after the drop to compare them.
target_f_name = '301'
temp_col = [74, 75, 76, 77, 78]
sample_dir = os.path.join(config.RAW_DATA_DIR, f'{target_f_name}_P')
sample_df = pd.read_csv(os.path.join(sample_dir, f'{target_f_name}_COVAREP.csv'), header=None)
sample_f_df = pd.read_csv(os.path.join(sample_dir, f'{target_f_name}_FORMANT.csv'), header=None)
# Relabel the formant columns as 74-78.
sample_f_df.columns = temp_col

delete_target_col = [sample_df.columns[pos] for pos in (10, 36, 37, 38, 39)]
new_df = sample_df.drop(columns=delete_target_col)
s_30000 = sample_df.iloc[30000]
n_30000 = new_df.iloc[30000]


def _print_row(row):
  """Print every value of ``row`` on one line, each followed by a space."""
  for value in row:
    print(value, end=' ')


_print_row(s_30000)
print()
_print_row(n_30000)

92.5 0.0 0.089742 0.32257 2.9205 0.12642 0.10171 -0.42546 1.2936 0.471 0.0 -8.5767 2.0857 -0.22174 0.47126 0.03297 -0.068466 -0.065727 0.014886 0.02652 0.15883 -0.04209 0.036558 -0.10241 0.19617 -0.28809 0.025842 0.1255 -0.12538 -0.049855 0.14323 -0.010365 0.087629 -0.067861 0.10681 -0.0676 0.0 0.0 0.0 0.0 0.0 0.0 0.0066048 -0.079718 -0.36235 -2.5475 2.5188 1.8819 1.9136 1.7211 2.0634 2.1656 2.0323 2.2115 2.0762 1.9933 2.0794 2.3189 2.0805 2.6563 2.936 -0.041149 -0.74974 -0.68285 -0.65799 -0.66151 -0.68335 -0.6321 -0.57434 -0.52392 -0.49954 -0.42295 -0.3791 -0.36672 
92.5 0.0 0.089742 0.32257 2.9205 0.12642 0.10171 -0.42546 1.2936 0.471 -8.5767 2.0857 -0.22174 0.47126 0.03297 -0.068466 -0.065727 0.014886 0.02652 0.15883 -0.04209 0.036558 -0.10241 0.19617 -0.28809 0.025842 0.1255 -0.12538 -0.049855 0.14323 -0.010365 0.087629 -0.067861 0.10681 -0.0676 0.0 0.0 0.0066048 -0.079718 -0.36235 -2.5475 2.5188 1.8819 1.9136 1.7211 2.0634 2.1656 2.0323 2.2115 2.0762 1.9933 2.0794 2.3189 2.0805 2.

In [8]:
# Row counts of the COVAREP sample and the FORMANT sample, shown side by side.
sample_df.shape[0], sample_f_df.shape[0]

(82389, 82390)

In [5]:
# Build one "<id>_audio_summary.csv" per participant: COVAREP features without
# the columns at positions 10, 36, 37, 38 and 39, joined side by side with the
# FORMANT features. Participants whose summary already exists are skipped.
# Labels given to the five formant columns.
# NOTE(review): presumably chosen to continue after COVAREP's integer column
# labels -- TODO confirm.
temp_col = [74, 75, 76, 77, 78]
for f in tqdm(os.listdir(config.RAW_DATA_DIR)):
  target_f_name = f[:3]

  # Only sub-directories of the raw data folder are processed.
  if not os.path.isdir(os.path.join(config.RAW_DATA_DIR, f)):
    continue

  audio_out_dir = os.path.join(config.DATA_DIR, 'Audio Summary')
  audio_out_name = f"{target_f_name}_audio_summary.csv"
  if audio_out_name in os.listdir(audio_out_dir):
    continue

  participant_dir = os.path.join(config.RAW_DATA_DIR, f'{target_f_name}_P')
  covarep_df = pd.read_csv(os.path.join(participant_dir, f'{target_f_name}_COVAREP.csv'), header=None)
  formant_df = pd.read_csv(os.path.join(participant_dir, f'{target_f_name}_FORMANT.csv'), header=None)
  formant_df.columns = temp_col

  delete_target_col = [covarep_df.columns[pos] for pos in (10, 36, 37, 38, 39)]
  covarep_dropped = covarep_df.drop(columns=delete_target_col)

  # concat aligns on the row index, so rows present in only one of the two files
  # contain NaN after the join; dropna() removes every row with a missing value.
  combined_df = pd.concat([covarep_dropped, formant_df], axis=1)
  audio_summary = combined_df.dropna()
  audio_summary.to_csv(os.path.join(audio_out_dir, audio_out_name), index=False)

100%|██████████| 189/189 [00:02<00:00, 72.83it/s]
