In [2]:
import os, config
import pandas as pd
import numpy as np
import torch
from multiprocessing import Manager
from multiprocessing.pool import Pool
from tqdm import tqdm
from functools import partial
import gensim.downloader as api

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# 1. 파일 경로 설정 (경로는 사용 환경에 맞게 수정해야 합니다)
glove_txt_file = os.path.join(config.MODEL_DIR, 'glove.840B.300d.txt')
word2vec_bin_file = os.path.join(config.MODEL_DIR, "glove.840B.300d.word2vec.bin")

# 2. GloVe TXT 파일을 Word2Vec 형식으로 변환
print("GloVe TXT 파일을 Word2Vec 형식으로 변환 중입니다...")
# 이 과정은 파일 크기가 크기 때문에 약 10~30분 소요될 수 있습니다.
if not os.path.exists(word2vec_bin_file):
    glove2word2vec(glove_txt_file, word2vec_bin_file)
    print("변환 완료.")
else:
    print("변환된 파일이 이미 존재합니다.")

GloVe TXT 파일을 Word2Vec 형식으로 변환 중입니다...


  glove2word2vec(glove_txt_file, word2vec_bin_file)


변환 완료.


In [8]:
word2vec_bin_file = os.path.join(config.MODEL_DIR, "glove.840B.300d.word2vec.bin")
# 1. Word2Vec 형식 바이너리 파일 로드
print("변환된 바이너리 파일을 메모리에 로드 중입니다...")
# 로드도 용량이 크기 때문에 수 분이 소요될 수 있습니다.
glove_model = KeyedVectors.load_word2vec_format(word2vec_bin_file, binary=True, encoding='latin-1')
print(f"모델 로드 완료. 총 {len(glove_model.index_to_key)}개의 단어 벡터가 로드되었습니다.")

# --- 모델 사용 예시 ---

# 2. 단어 임베딩 벡터 얻기
word_vector = glove_model['interview']
print(f"\n'interview' 단어의 임베딩 벡터 (처음 5개 차원): {word_vector[:5]}")
print(f"임베딩 벡터 차원: {len(word_vector)}")

# 3. 단어 유사성 계산
similarity = glove_model.similarity('question', 'query')
print(f"\n'question'과 'query'의 유사도: {similarity:.4f}")

# 4. 가장 유사한 단어 찾기
similar_words = glove_model.most_similar('job', topn=5)
print(f"\n'job'과 가장 유사한 단어 5가지:\n{similar_words}")

# 5. 단어 유추 (Analogy)
# 예를 들어, 'France' - 'Paris' + 'Tokyo' = 'Japan'
analogy_result = glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(f"\nKing - Man + Woman = {analogy_result[0][0]}")

변환된 바이너리 파일을 메모리에 로드 중입니다...


MemoryError: Unable to allocate 2.45 GiB for an array with shape (2196017, 300) and data type float32

## Transcription

In [2]:
sample_t_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Transcription', os.listdir(os.path.join(config.DATA_DIR, 'Transcription'))[0]))

In [3]:
is_not_ellie = sample_t_df['speaker'] != 'Ellie'
new_group_start = (is_not_ellie) & (~is_not_ellie.shift(1, fill_value=False))
group_id = new_group_start.cumsum()
sample_t_df['count'] = group_id.where(is_not_ellie, None)
sample_t_df.loc[~is_not_ellie, 'count'] = None

In [4]:
participant_df = sample_t_df[~pd.isna(sample_t_df['count'])]

In [6]:
participant_df.head(10)

Unnamed: 0,start_time,stop_time,speaker,value,count
6,62.328,63.178,Participant,good,1.0
9,68.978,70.288,Participant,atlanta georgia,2.0
12,75.028,78.128,Participant,um my parents are from here um,3.0
14,83.808,84.588,Participant,i love it,4.0
16,88.458,89.968,Participant,i like the weather,5.0
17,90.278,93.568,Participant,i like the opportunities,5.0
18,94.738,95.298,Participant,um,5.0
19,96.588,97.278,Participant,yes,5.0
21,102.428,103.268,Participant,um,6.0
22,104.278,105.558,Participant,it took a minute,6.0


In [11]:
" ".join(participant_df.loc[participant_df['count']==5.0].value.tolist())

'i like the weather i like the opportunities um yes'

In [7]:
group_df = participant_df.dropna(subset=['count']).groupby('count').agg(
    start_time=('start_time', 'min'), # 그룹의 가장 이른 시작 시간
    stop_time=('stop_time', 'max')   # 그룹의 가장 늦은 종료 시간
).reset_index()

In [8]:
group_df.head(5)

Unnamed: 0,count,start_time,stop_time
0,1.0,62.328,63.178
1,2.0,68.978,70.288
2,3.0,75.028,78.128
3,4.0,83.808,84.588
4,5.0,88.458,97.278


## Visual Encoding

In [3]:
sample_v_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Vision Summary', os.listdir(os.path.join(config.DATA_DIR, 'Vision Summary'))[0]))

In [4]:
timestamp = sample_v_df.timestamp
ft_x = sample_v_df.filter(like='ftx')
ft_y = sample_v_df.filter(like='fty')
ft_3d_x = sample_v_df.filter(like='ft_3dX')
ft_3d_y = sample_v_df.filter(like='ft_3dY')
ft_3d_z = sample_v_df.filter(like='ft_3dZ')
au_r = sample_v_df.filter(like='au').filter(like='_r')
au_c = sample_v_df.filter(like='au').filter(like='_c')
gz_df = sample_v_df.filter(like='gz')
condition_include = gz_df.columns.str.contains('_')
condition_exclude = ~gz_df.columns.str.contains('h')
final_mask = condition_include & condition_exclude
gz_raw = gz_df.loc[:, final_mask]
gz_h = gz_df.filter(like='h')
ps_t = sample_v_df.filter(like='ps').filter(like='T')
ps_r = sample_v_df.filter(like='ps').filter(like='R')

In [5]:
vision = pd.concat([timestamp, au_r, gz_h, ps_t, ps_r], axis=1)

In [6]:
upgrade_vision = pd.concat([timestamp, ft_x, ft_y, ft_3d_x, ft_3d_y, ft_3d_z, au_r, gz_h, ps_t, ps_r])

In [None]:
upgrade_vision

Unnamed: 0,timestamp,ftx0,ftx1,ftx2,ftx3,ftx4,ftx5,ftx6,ftx7,ftx8,...,gz z_h0,gz x_h1,gz y_h1,gz z_h1,psTx,psTy,psTz,psRx,psRy,psRz
0,0.000000,,,,,,,,,,...,,,,,,,,,,
1,0.033333,,,,,,,,,,...,,,,,,,,,,
2,0.066667,,,,,,,,,,...,,,,,,,,,,
3,0.100000,,,,,,,,,,...,,,,,,,,,,
4,0.133333,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19453,,,,,,,,,,,...,,,,,,,,0.441858,-0.093022,-0.228571
19454,,,,,,,,,,,...,,,,,,,,0.430105,-0.082108,-0.212567
19455,,,,,,,,,,,...,,,,,,,,0.458600,-0.079041,-0.192886
19456,,,,,,,,,,,...,,,,,,,,0.473191,-0.077877,-0.174793


In [22]:
print(vision.timestamp.min(), '->', vision.timestamp.max())

0.0 -> 648.567


## Audio Encoding (Every row = 0.01s)

In [7]:
sample_a_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Audio Summary', os.listdir(os.path.join(config.DATA_DIR, 'Audio Summary'))[0]))

In [23]:
print(sample_a_df.index.min(), '->', sample_a_df.index.max())

0 -> 64849


## Using Manager

In [54]:
mgr = Manager()
dataset = mgr.list()

In [None]:
def process_transcription(df):
  is_not_ellie = df['speaker'] != 'Ellie'
  new_group_start = (is_not_ellie) & (~is_not_ellie.shift(1, fill_value=False))
  group_id = new_group_start.cumsum()
  df['count'] = group_id.where(is_not_ellie, None)
  df.loc[~is_not_ellie, 'count'] = None
  participant_df = df[~pd.isna(df['count'])]
  group_df = participant_df.dropna(subset=['count']).groupby('count').agg(
      start_time=('start_time', 'min'), # 그룹의 가장 이른 시작 시간
      stop_time=('stop_time', 'max')   # 그룹의 가장 늦은 종료 시간
  ).reset_index()
  return participant_df, group_df

def process_vision(df):
  timestamp = df.timestamp
  au_r = df.filter(like='au').filter(like='_r')
  gz_df = df.filter(like='gz')
  gz_h = gz_df.filter(like='h')
  ps_t = df.filter(like='ps').filter(like='T')
  ps_r = df.filter(like='ps').filter(like='R')
  vision = pd.concat([timestamp, au_r, gz_h, ps_t, ps_r], axis=1)
  return vision

def read_tva(id, dataset):
  try:
    TRANSCRIPTION = []
    VISION = []
    AUDIO = []
    t_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Transcription', f'{id}_transcript.csv'))
    v_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Vision Summary', f'{id}_vision_summary.csv'))
    a_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Audio Summary', f'{id}_audio_summary.csv'))

    participant_df, group_df = process_transcription(t_df)
    vision = process_vision(v_df)

    for _, row in group_df.iterrows():
      start = row.start_time
      stop = row.stop_time
      t_target = participant_df.loc[participant_df['count']==row['count']]
      v_target = vision.loc[(start <= vision.timestamp) & (vision.timestamp <= stop)]
      v_target = v_target.drop(columns=['timestamp'])
      v_target_list = v_target.values.tolist()
      a_target = a_df.iloc[int(start*100):int((stop*100)+1)]
      a_target_list = a_target.values.tolist()
      if len(t_target) > 0:
        TRANSCRIPTION.append(t_target)
      else:
        print("ERROR in transcription")
      if len(v_target_list) > 0:
        VISION.append(v_target_list)
      else:
        print("ERROR in vision")
      if len(a_target_list) > 0:
        AUDIO.append(a_target_list)
      else:
        print("ERROR in vision")
    if len(TRANSCRIPTION) == len(VISION) == len(AUDIO):
      dataset.append((TRANSCRIPTION, VISION, AUDIO))
    else:
      print("Length Not Matched")
  except Exception as e:
    print("오류 발생:",e)

id = ['300']
with Pool(processes=10) as p:
  with tqdm(total=len(id)) as pbar:
    for v in p.imap_unordered(partial(read_tva, dataset=dataset),id):
      pbar.update()

  0%|          | 0/1 [00:00<?, ?it/s]