In [75]:
import os, config
import pandas as pd
import numpy as np
import torch
from multiprocessing import Manager
from multiprocessing.pool import Pool
from tqdm import tqdm
from functools import partial

## Transcription

In [40]:
# os.listdir returns entries in arbitrary order and may include non-CSV entries,
# so sort the CSV names to make the "first" sample file deterministic.
transcription_dir = os.path.join(config.DATA_DIR, 'Transcription')
transcription_files = sorted(name for name in os.listdir(transcription_dir) if name.endswith('.csv'))
sample_t_df = pd.read_csv(os.path.join(transcription_dir, transcription_files[0]))

In [41]:
# True for every row whose speaker is not 'Ellie'.
is_not_ellie = sample_t_df['speaker'].ne('Ellie')
# A group starts on a non-Ellie row whose previous row is an Ellie row;
# the missing predecessor of the first row is filled with False.
previous_is_not_ellie = is_not_ellie.shift(1, fill_value=False)
new_group_start = is_not_ellie & ~previous_is_not_ellie
# Running group number; only kept on non-Ellie rows.
group_id = new_group_start.cumsum()
sample_t_df['count'] = group_id.where(is_not_ellie, None)
sample_t_df.loc[~is_not_ellie, 'count'] = None

In [42]:
# Keep only the rows that received a group number (i.e. the non-Ellie rows).
participant_df = sample_t_df.loc[sample_t_df['count'].notna()]

In [49]:
# Preview the first five non-Ellie rows together with their group number.
participant_df.head(5)

Unnamed: 0,start_time,stop_time,speaker,value,count
6,62.328,63.178,Participant,good,1.0
9,68.978,70.288,Participant,atlanta georgia,2.0
12,75.028,78.128,Participant,um my parents are from here um,3.0
14,83.808,84.588,Participant,i love it,4.0
16,88.458,89.968,Participant,i like the weather,5.0


In [52]:
# One row per group: earliest start time and latest stop time of its rows.
group_df = (
    participant_df
    .dropna(subset=['count'])
    .groupby('count')
    .agg(
        start_time=pd.NamedAgg(column='start_time', aggfunc='min'),  # earliest start in the group
        stop_time=pd.NamedAgg(column='stop_time', aggfunc='max'),    # latest stop in the group
    )
    .reset_index()
)

In [53]:
# Preview the first five groups with their aggregated start/stop times.
group_df.head(5)

Unnamed: 0,count,start_time,stop_time
0,1.0,62.328,63.178
1,2.0,68.978,70.288
2,3.0,75.028,78.128
3,4.0,83.808,84.588
4,5.0,88.458,97.278


## Visual Encoding

In [14]:
# os.listdir returns entries in arbitrary order and may include non-CSV entries,
# so sort the CSV names to make the "first" sample file deterministic.
vision_dir = os.path.join(config.DATA_DIR, 'Vision Summary')
vision_files = sorted(name for name in os.listdir(vision_dir) if name.endswith('.csv'))
sample_v_df = pd.read_csv(os.path.join(vision_dir, vision_files[0]))

In [15]:
# Column groups of the vision summary, each selected by substring match on the column name.
timestamp = sample_v_df.timestamp
ft_x, ft_y = (sample_v_df.filter(like=tag) for tag in ('ftx', 'fty'))
ft_3d_x, ft_3d_y, ft_3d_z = (sample_v_df.filter(like='ft_3d' + axis) for axis in 'XYZ')

au_df = sample_v_df.filter(like='au')
au_r, au_c = au_df.filter(like='_r'), au_df.filter(like='_c')

gz_df = sample_v_df.filter(like='gz')
# gz columns whose name contains '_' but no 'h'.
condition_include = gz_df.columns.str.contains('_')
condition_exclude = ~gz_df.columns.str.contains('h')
final_mask = condition_include & condition_exclude
gz_raw = gz_df.loc[:, final_mask]
# gz columns whose name contains 'h'.
gz_h = gz_df.filter(like='h')

ps_df = sample_v_df.filter(like='ps')
ps_t, ps_r = ps_df.filter(like='T'), ps_df.filter(like='R')

In [16]:
# Side-by-side frame of the timestamp and the selected column groups.
vision = pd.concat(
    [timestamp, au_r, gz_h, ps_t, ps_r],
    axis='columns',
)

In [22]:
# Show the time range covered by the vision rows.
print(vision.timestamp.min(), vision.timestamp.max(), sep=' -> ')

0.0 -> 648.567


## Audio Encoding (Every row = 0.01s)

In [7]:
# os.listdir returns entries in arbitrary order and may include non-CSV entries,
# so sort the CSV names to make the "first" sample file deterministic.
audio_dir = os.path.join(config.DATA_DIR, 'Audio Summary')
audio_files = sorted(name for name in os.listdir(audio_dir) if name.endswith('.csv'))
sample_a_df = pd.read_csv(os.path.join(audio_dir, audio_files[0]))

In [23]:
# Show the first and last row index of the audio frame.
print(sample_a_df.index.min(), sample_a_df.index.max(), sep=' -> ')

0 -> 64849


## Using Manager

In [54]:
# Start a manager and create a shared list proxy that is handed to the
# pool workers so they can append their results to it.
mgr = Manager()
dataset = mgr.list()

In [None]:
def process_transcription(df):
  """Number consecutive non-Ellie transcript rows as groups and summarise each group.

  The input frame is left untouched; the ``count`` column is added to a copy.

  Args:
    df: Transcript DataFrame with at least the columns ``speaker``,
      ``start_time`` and ``stop_time``.

  Returns:
    A tuple ``(participant_df, group_df)``:
      * ``participant_df`` - the non-Ellie rows of ``df`` plus a ``count``
        column holding the 1-based number of the run of consecutive
        non-Ellie rows each row belongs to.
      * ``group_df`` - one row per ``count`` with the minimum ``start_time``
        and the maximum ``stop_time`` of that run.
  """
  is_not_ellie = df['speaker'] != 'Ellie'
  # A run starts on a non-Ellie row whose previous row is an Ellie row; the
  # missing predecessor of the first row is filled with False.
  new_group_start = is_not_ellie & ~is_not_ellie.shift(1, fill_value=False)
  # Running run number, replaced by NaN on Ellie rows.
  group_id = new_group_start.cumsum().where(is_not_ellie)
  # assign() returns a new frame, so the caller's DataFrame is not mutated.
  labelled_df = df.assign(count=group_id)
  participant_df = labelled_df[labelled_df['count'].notna()]
  group_df = participant_df.groupby('count').agg(
      start_time=('start_time', 'min'),  # earliest start time of the group
      stop_time=('stop_time', 'max')     # latest stop time of the group
  ).reset_index()
  return participant_df, group_df

def process_vision(df):
  """Reduce a vision frame to its timestamp plus a few name-matched column groups.

  Columns are chosen by substring match on their names:
  ``au`` and ``_r``; ``gz`` and ``h``; ``ps`` and ``T``; ``ps`` and ``R``.

  Args:
    df: Vision DataFrame with a ``timestamp`` column.

  Returns:
    A DataFrame with ``timestamp`` first, followed by the matched columns
    in the order listed above.
  """
  au_cols = df.filter(like='au')
  gz_cols = df.filter(like='gz')
  ps_cols = df.filter(like='ps')
  pieces = [
      df.timestamp,
      au_cols.filter(like='_r'),
      gz_cols.filter(like='h'),
      ps_cols.filter(like='T'),
      ps_cols.filter(like='R'),
  ]
  return pd.concat(pieces, axis=1)

def read_tva(id, dataset):
  """Read one id's transcript, vision and audio CSVs and append per-group slices to ``dataset``.

  For every group returned by ``process_transcription`` the function collects
    * the transcript rows of that group,
    * the vision rows whose ``timestamp`` lies in ``[start_time, stop_time]``
      (without the ``timestamp`` column), and
    * the audio rows at positions ``int(start*100)`` up to and including
      ``int(stop*100)``.
  If all three lists end up with the same length, the tuple
  ``(TRANSCRIPTION, VISION, AUDIO)`` is appended to ``dataset``.

  Args:
    id: Identifier used to build the three file names
      (``{id}_transcript.csv``, ``{id}_vision_summary.csv``,
      ``{id}_audio_summary.csv``) under ``config.DATA_DIR``.
    dataset: List-like object with an ``append`` method that receives the result.

  Returns:
    None. Problems are reported via ``print``; any exception is caught and
    printed together with ``id`` instead of being raised.
  """
  try:
    TRANSCRIPTION = []
    VISION = []
    AUDIO = []
    t_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Transcription', f'{id}_transcript.csv'))
    v_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Vision Summary', f'{id}_vision_summary.csv'))
    a_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Audio Summary', f'{id}_audio_summary.csv'))

    participant_df, group_df = process_transcription(t_df)
    vision = process_vision(v_df)

    # Loop-invariant pieces: the timestamps used for masking and the feature
    # columns that are actually stored.
    v_timestamp = vision.timestamp
    v_features = vision.drop(columns=['timestamp'])

    for _, row in group_df.iterrows():
      start = row.start_time
      stop = row.stop_time
      t_target = participant_df.loc[participant_df['count']==row['count']]
      v_target_list = v_features.loc[(start <= v_timestamp) & (v_timestamp <= stop)].values.tolist()
      # Times are converted to row positions by multiplying by 100 and
      # truncating; the +1 makes the stop position inclusive.
      a_target = a_df.iloc[int(start*100):int((stop*100)+1)]
      a_target_list = a_target.values.tolist()
      if len(t_target) > 0:
        TRANSCRIPTION.append(t_target)
      else:
        print("ERROR in transcription")
      if len(v_target_list) > 0:
        VISION.append(v_target_list)
      else:
        print("ERROR in vision")
      if len(a_target_list) > 0:
        AUDIO.append(a_target_list)
      else:
        # Previously reported as "ERROR in vision", which hid audio gaps.
        print("ERROR in audio")
    if len(TRANSCRIPTION) == len(VISION) == len(AUDIO):
      dataset.append((TRANSCRIPTION, VISION, AUDIO))
    else:
      print("Length Not Matched")
  except Exception as e:
    # Best-effort: keep the pool running, but say which id failed.
    print("오류 발생:", id, e)

# Ids to process; each one is handed to read_tva in a worker process.
id = ['300']
# NOTE(review): with the 'spawn' start method the workers must be able to import
# read_tva; functions defined in a notebook cell may not be importable - confirm
# on the target platform.
worker = partial(read_tva, dataset=dataset)
with Pool(processes=10) as p, tqdm(total=len(id)) as pbar:
  # Results are ignored (read_tva appends to `dataset`); the loop only drives the progress bar.
  for v in p.imap_unordered(worker, id):
    pbar.update()

  0%|          | 0/1 [00:00<?, ?it/s]