In [44]:
import os, config
import pandas as pd
import numpy as np
import torch
from multiprocessing import Manager
from multiprocessing.pool import Pool
from tqdm import tqdm
from functools import partial
import gensim.downloader as api

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.preprocessing import StandardScaler

In [None]:
# 1. 파일 경로 설정 (경로는 사용 환경에 맞게 수정해야 합니다)
glove_txt_file = os.path.join(config.MODEL_DIR, 'glove.840B.300d.txt')
word2vec_bin_file = os.path.join(config.MODEL_DIR, "glove.840B.300d.word2vec.bin")

# 2. GloVe TXT 파일을 Word2Vec 형식으로 변환
print("GloVe TXT 파일을 Word2Vec 형식으로 변환 중입니다...")
# 이 과정은 파일 크기가 크기 때문에 약 10~30분 소요될 수 있습니다.
if not os.path.exists(word2vec_bin_file):
    glove2word2vec(glove_txt_file, word2vec_bin_file)
    print("변환 완료.")
else:
    print("변환된 파일이 이미 존재합니다.")

GloVe TXT 파일을 Word2Vec 형식으로 변환 중입니다...


  glove2word2vec(glove_txt_file, word2vec_bin_file)


변환 완료.


In [14]:
kv_path = os.path.join(config.MODEL_DIR, "glove_model.kv")

glove_model = KeyedVectors.load(kv_path)
print(f"모델 로드 완료. 총 {len(glove_model.index_to_key)}개의 단어 벡터가 로드되었습니다.")

# --- 모델 사용 예시 ---

# 2. 단어 임베딩 벡터 얻기
word_vector = glove_model['interview']
print(f"\n'interview' 단어의 임베딩 벡터 (처음 5개 차원): {word_vector[:5]}")
print(f"임베딩 벡터 차원: {len(word_vector)}")

# 3. 단어 유사성 계산
similarity = glove_model.similarity('question', 'query')
print(f"\n'question'과 'query'의 유사도: {similarity:.4f}")

# 4. 가장 유사한 단어 찾기
similar_words = glove_model.most_similar('job', topn=5)
print(f"\n'job'과 가장 유사한 단어 5가지:\n{similar_words}")

# 5. 단어 유추 (Analogy)
# 예를 들어, 'France' - 'Paris' + 'Tokyo' = 'Japan'
analogy_result = glove_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(f"\nKing - Man + Woman = {analogy_result[0][0]}")

모델 로드 완료. 총 2196016개의 단어 벡터가 로드되었습니다.

'interview' 단어의 임베딩 벡터 (처음 5개 차원): [-0.56421  0.4346  -0.20186  1.011   -0.22081]
임베딩 벡터 차원: 300

'question'과 'query'의 유사도: 0.3921

'job'과 가장 유사한 단어 5가지:
[('jobs', 0.7647691965103149), ('work', 0.6609548926353455), ('hiring', 0.6495566964149475), ('working', 0.6471851468086243), ('employment', 0.6345195174217224)]

King - Man + Woman = queen


In [15]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\82102\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
sentence = "I like you"
TRANSCRIPTION = []
for word in sentence.split():
  TRANSCRIPTION.append(glove_model[word].tolist())

## Transcription

In [46]:
sample_t_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Transcription', os.listdir(os.path.join(config.DATA_DIR, 'Transcription'))[1]))
scaler = StandardScaler()

In [47]:
def process_transcription(df):
  finish_utterance = ["asked everything", "asked_everything", "it was great chatting with you"]

  search_pattern = '|'.join(finish_utterance)
  condition = df['value'].str.contains(search_pattern, na=False)
  terminate_index = df.index[condition]
  if terminate_index.empty:
    terminate_value = len(df)
  else:
    terminate_value = terminate_index.values[0]
  n_df = df.iloc[:terminate_value].copy()

  is_not_ellie = n_df['speaker'] != 'Ellie'
  new_group_start = (is_not_ellie) & (~is_not_ellie.shift(1, fill_value=False))
  group_id = new_group_start.cumsum()

  n_df['count'] = group_id.where(is_not_ellie, pd.NA)
  participant_df = n_df.dropna(subset=['count'])
  group_df = participant_df.dropna(subset=['count']).groupby('count').agg(
      start_time=('start_time', 'min'), # 그룹의 가장 이른 시작 시간
      stop_time=('stop_time', 'max')   # 그룹의 가장 늦은 종료 시간
  ).reset_index()
  return participant_df, group_df

In [48]:
participant_df, group_df = process_transcription(sample_t_df)

In [49]:
participant_df.head(10)

Unnamed: 0,start_time,stop_time,speaker,value,count
1,32.738,33.068,Participant,thank you,1.0
3,42.088,42.518,Participant,mmm k,2.0
6,54.328,55.758,Participant,i'm doing good thank you,3.0
9,59.858,60.948,Participant,i'm from los angeles,4.0
11,63.538,64.108,Participant,oh great,5.0
13,67.388,69.858,Participant,i live in west los angeles the west side,6.0
16,74.678,75.248,Participant,it's alright,7.0
17,76.398,76.708,Participant,i xxx,7.0
19,78.218,79.588,Participant,no i live alone so,8.0
22,85.398,89.468,Participant,i love it i'm from here so i grew up here it's...,9.0


In [50]:
" ".join(participant_df.loc[participant_df['count']==5.0].value.tolist())

'oh great'

In [51]:
group_df.head(5)

Unnamed: 0,count,start_time,stop_time
0,1.0,32.738,33.068
1,2.0,42.088,42.518
2,3.0,54.328,55.758
3,4.0,59.858,60.948
4,5.0,63.538,64.108


## Visual Encoding

In [77]:
sample_v_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Vision Summary', os.listdir(os.path.join(config.DATA_DIR, 'Vision Summary'))[1]))

In [78]:
timestamp = sample_v_df.timestamp
ft_x = sample_v_df.filter(like='ftx')
ft_y = sample_v_df.filter(like='fty')
ft_3d_x = sample_v_df.filter(like='ft_3dX')
ft_3d_y = sample_v_df.filter(like='ft_3dY')
ft_3d_z = sample_v_df.filter(like='ft_3dZ')
au_r = sample_v_df.filter(like='au').filter(like='_r')
au_c = sample_v_df.filter(like='au').filter(like='_c')
gz_df = sample_v_df.filter(like='gz')
condition_include = gz_df.columns.str.contains('_')
condition_exclude = ~gz_df.columns.str.contains('h')
final_mask = condition_include & condition_exclude
gz_raw = gz_df.loc[:, final_mask]
gz_h = gz_df.filter(like='h')
ps_t = sample_v_df.filter(like='ps').filter(like='T')
ps_r = sample_v_df.filter(like='ps').filter(like='R')

In [79]:
vision = pd.concat([timestamp, ft_x, ft_y, au_r, gz_h, ps_t, ps_r], axis=1)

In [80]:
vision

Unnamed: 0,timestamp,ftx0,ftx1,ftx2,ftx3,ftx4,ftx5,ftx6,ftx7,ftx8,...,gz z_h0,gz x_h1,gz y_h1,gz z_h1,psTx,psTy,psTz,psRx,psRy,psRz
0,0.000000,526.786,529.036,534.847,543.439,556.099,575.026,599.287,628.048,657.383,...,-0.910699,0.062139,0.311942,-0.948067,1.44225,8.25678,663.887,0.152862,-0.021086,-0.123475
1,0.033333,526.824,529.253,535.202,544.043,557.013,576.148,600.460,629.153,658.604,...,-0.908418,0.058509,0.323298,-0.944487,1.53571,8.16685,664.325,0.161258,-0.016155,-0.132484
2,0.066667,526.808,529.343,535.380,544.353,557.477,576.711,601.048,629.701,659.190,...,-0.909355,0.057533,0.325953,-0.943634,1.57809,8.13997,664.429,0.164528,-0.013700,-0.136451
3,0.100000,525.793,528.653,534.997,544.260,557.755,577.378,602.039,630.828,660.157,...,-0.903249,0.071786,0.319817,-0.944756,1.76802,7.06334,666.207,0.150502,-0.028384,-0.143667
4,0.133333,525.089,528.040,534.418,543.754,557.452,577.392,602.300,631.246,660.461,...,-0.904398,0.093860,0.319435,-0.942948,1.65268,6.75033,667.385,0.143607,-0.039160,-0.149124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24716,823.867000,599.432,594.082,589.804,572.667,547.473,551.391,565.993,625.876,698.182,...,-1.000000,0.000000,0.000000,-1.000000,49.68730,-16.62920,282.211,0.290521,-0.781698,0.308885
24717,823.900000,599.432,594.082,589.804,572.667,547.473,551.391,565.993,625.876,698.182,...,-1.000000,0.000000,0.000000,-1.000000,49.68730,-16.62920,282.211,0.290521,-0.781698,0.308885
24718,823.933000,599.432,594.082,589.804,572.667,547.473,551.391,565.993,625.876,698.182,...,-1.000000,0.000000,0.000000,-1.000000,49.68730,-16.62920,282.211,0.290521,-0.781698,0.308885
24719,823.967000,599.432,594.082,589.804,572.667,547.473,551.391,565.993,625.876,698.182,...,-1.000000,0.000000,0.000000,-1.000000,49.68730,-16.62920,282.211,0.290521,-0.781698,0.308885


In [75]:
for count in group_df['count'].tolist():
    word_list = " ".join(participant_df.loc[participant_df['count']==count].value.tolist()).split()
    group_trans_list = []
    for word in word_list:
        try:
            word = word.strip()
            if "'" in word:
                word = word.split("'")[0]
            if '<' in word or '>' in word or '_' in word or '[' in word or ']' in word or word in stop_words_list:
                pass
            elif word == "":
                pass
            else:
                group_trans_list.append(glove_model[word].tolist())
        except KeyError:
            print(f"{word} is not in the model.")

    row = group_df[group_df['count']==count].iloc[0]
    start = row.start_time
    stop = row.stop_time
    v_target = vision.loc[(start <= vision.timestamp) & (vision.timestamp <= stop)]
    v_target = v_target.drop(columns=['timestamp'])
    v_target_list = v_target.values.tolist()

In [76]:
len(v_target_list)

871

In [66]:
v_target_list[10]

[0.3052992027368094,
 0.3230265466191525,
 0.3324003170275849,
 0.3218535734913433,
 0.2914653998858076,
 0.26415970487770674,
 0.23760907149302563,
 0.21455297588554512,
 0.2205180754597112,
 0.253633119038225,
 0.33130658716907957,
 0.3917376540587406,
 0.4200399812259838,
 0.3878293827495666,
 0.34250709770868504,
 0.306381037967437,
 0.27070436161446276,
 0.16579275583865544,
 0.14925727849720444,
 0.14880527001713126,
 0.1486050357763323,
 0.15477227045742234,
 0.1181817095595324,
 0.13046614669266765,
 0.145372456149635,
 0.16014252938230483,
 0.17029049793487516,
 0.14492126919023246,
 0.14118146079221586,
 0.13674578568715404,
 0.13215328439158827,
 0.14740182724235404,
 0.1501698015148118,
 0.15250048418615766,
 0.15549572604767434,
 0.1613552078497376,
 0.16942845749591529,
 0.1611115580020661,
 0.1550799177375509,
 0.16437608507657064,
 0.1597579407862539,
 0.16568813249832573,
 0.1578584257773676,
 0.1584057528116053,
 0.1558807717022003,
 0.16609923412006072,
 0.1522190833

In [67]:
upgrade_vision = pd.concat([timestamp, ft_x, ft_y, ft_3d_x, ft_3d_y, ft_3d_z, au_r, gz_h, ps_t, ps_r])

In [68]:
upgrade_vision

Unnamed: 0,timestamp,ftx0,ftx1,ftx2,ftx3,ftx4,ftx5,ftx6,ftx7,ftx8,...,gz z_h0,gz x_h1,gz y_h1,gz z_h1,psTx,psTy,psTz,psRx,psRy,psRz
0,0.000000,,,,,,,,,,...,,,,,,,,,,
1,0.033333,,,,,,,,,,...,,,,,,,,,,
2,0.066667,,,,,,,,,,...,,,,,,,,,,
3,0.100000,,,,,,,,,,...,,,,,,,,,,
4,0.133333,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24716,,,,,,,,,,,...,,,,,,,,0.290521,-0.781698,0.308885
24717,,,,,,,,,,,...,,,,,,,,0.290521,-0.781698,0.308885
24718,,,,,,,,,,,...,,,,,,,,0.290521,-0.781698,0.308885
24719,,,,,,,,,,,...,,,,,,,,0.290521,-0.781698,0.308885


In [22]:
print(vision.timestamp.min(), '->', vision.timestamp.max())

0.0 -> 648.567


## Audio Encoding (Every row = 0.01s)

In [7]:
sample_a_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Audio Summary', os.listdir(os.path.join(config.DATA_DIR, 'Audio Summary'))[0]))

In [23]:
print(sample_a_df.index.min(), '->', sample_a_df.index.max())

0 -> 64849


## Using Manager

In [54]:
mgr = Manager()
dataset = mgr.list()

In [None]:
def process_transcription(df):
  is_not_ellie = df['speaker'] != 'Ellie'
  new_group_start = (is_not_ellie) & (~is_not_ellie.shift(1, fill_value=False))
  group_id = new_group_start.cumsum()
  df['count'] = group_id.where(is_not_ellie, None)
  df.loc[~is_not_ellie, 'count'] = None
  participant_df = df[~pd.isna(df['count'])]
  group_df = participant_df.dropna(subset=['count']).groupby('count').agg(
      start_time=('start_time', 'min'), # 그룹의 가장 이른 시작 시간
      stop_time=('stop_time', 'max')   # 그룹의 가장 늦은 종료 시간
  ).reset_index()
  return participant_df, group_df

def process_vision(df):
  timestamp = df.timestamp
  au_r = df.filter(like='au').filter(like='_r')
  gz_df = df.filter(like='gz')
  gz_h = gz_df.filter(like='h')
  ps_t = df.filter(like='ps').filter(like='T')
  ps_r = df.filter(like='ps').filter(like='R')
  vision = pd.concat([timestamp, au_r, gz_h, ps_t, ps_r], axis=1)
  return vision

def read_tva(id, dataset):
  try:
    TRANSCRIPTION = []
    VISION = []
    AUDIO = []
    t_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Transcription', f'{id}_transcript.csv'))
    v_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Vision Summary', f'{id}_vision_summary.csv'))
    a_df = pd.read_csv(os.path.join(config.DATA_DIR, 'Audio Summary', f'{id}_audio_summary.csv'))

    participant_df, group_df = process_transcription(t_df)
    vision = process_vision(v_df)

    for _, row in group_df.iterrows():
      start = row.start_time
      stop = row.stop_time
      t_target = participant_df.loc[participant_df['count']==row['count']]
      v_target = vision.loc[(start <= vision.timestamp) & (vision.timestamp <= stop)]
      v_target = v_target.drop(columns=['timestamp'])
      v_target_list = v_target.values.tolist()
      a_target = a_df.iloc[int(start*100):int((stop*100)+1)]
      a_target_list = a_target.values.tolist()
      if len(t_target) > 0:
        TRANSCRIPTION.append(t_target)
      else:
        print("ERROR in transcription")
      if len(v_target_list) > 0:
        VISION.append(v_target_list)
      else:
        print("ERROR in vision")
      if len(a_target_list) > 0:
        AUDIO.append(a_target_list)
      else:
        print("ERROR in vision")
    if len(TRANSCRIPTION) == len(VISION) == len(AUDIO):
      dataset.append((TRANSCRIPTION, VISION, AUDIO))
    else:
      print("Length Not Matched")
  except Exception as e:
    print("오류 발생:",e)

id = ['300']
with Pool(processes=10) as p:
  with tqdm(total=len(id)) as pbar:
    for v in p.imap_unordered(partial(read_tva, dataset=dataset),id):
      pbar.update()

  0%|          | 0/1 [00:00<?, ?it/s]