In [40]:
from gensim.models import Word2Vec
import re

In [41]:
# 임의 문장 생성 feat GPT
sentences = ["Homer Simpson forgot his lunch at home, so he had to buy a burger on his way to work.",
    "Marge was busy knitting a new sweater for Bart's upcoming school play.",
    "Lisa Simpson played a beautiful saxophone solo at the school concert.",
    "Mr. Burns secretly plotted another scheme from his office at the Springfield Nuclear Power Plant.",
    "Ned Flanders offered to help Homer fix the fence between their houses.",
    "Bart Simpson tried a new prank at school, but it didn't go as planned.",
    "Milhouse and Bart spent the afternoon playing video games and forgot to do their homework.",
    "Maggie Simpson's adorable giggle filled the room as she played with her toys.",
    "Apu had a busy day at the Kwik-E-Mart, dealing with a rush of customers.",
    "Krusty the Clown decided to change his show a bit to attract a new audience."]

In [42]:
# 전처리
sentences = [re.sub(r"[/;.]","" ,sentence).lower().split(" ") for sentence in sentences]

In [43]:
sentences[0]

['homer',
 'simpson',
 'forgot',
 'his',
 'lunch',
 'at',
 'home,',
 'so',
 'he',
 'had',
 'to',
 'buy',
 'a',
 'burger',
 'on',
 'his',
 'way',
 'to',
 'work']

In [44]:
# train word2vec model

skip_gram = Word2Vec(sentences, vector_size=300, window=5, sg=1, min_count=1)

In [45]:
print("{} 의 vector representation : \n{}".format('homer', skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])))

homer 의 vector representation : 
[ 4.37010603e-04  2.20553810e-03  3.31862061e-03  2.99401255e-03
 -2.65673967e-03  2.12347833e-03 -1.88211305e-03 -2.38198700e-04
  1.61271979e-04  2.17785477e-03  1.51176867e-03  1.51511212e-03
  3.16741085e-03  1.19458826e-04 -2.02419143e-03 -2.14576861e-03
  2.20433832e-03 -1.76242844e-03 -9.45917913e-04  1.27933640e-03
 -7.24085956e-04 -2.00292631e-03 -7.57622067e-04  4.20411321e-04
  7.65976263e-04  2.03796569e-03 -1.74286042e-03  1.02971040e-03
  2.42782990e-03  6.99461787e-04  1.79673161e-03 -1.59968506e-03
  2.05293740e-03 -2.52368348e-03  1.15334080e-03 -3.07910051e-03
 -8.48860247e-04 -3.05664004e-03 -5.17563312e-04 -1.79819262e-03
 -1.29387691e-03  3.84993793e-04  9.41949547e-04 -5.10141545e-04
 -2.67216726e-03 -1.91109558e-03  2.82451365e-04 -1.28447858e-03
 -3.15148057e-03 -2.41194459e-04  2.21195235e-03  1.98771991e-03
 -3.32846283e-03  1.07913720e-03 -2.04740674e-03 -3.04534333e-03
  3.59066107e-05 -8.09029953e-05 -2.32922519e-03 -2.05692

In [46]:
skip_gram.wv.most_similar("homer")

[('video', 0.14035673439502716),
 ('his', 0.12365606427192688),
 ('adorable', 0.11183690279722214),
 ('burger', 0.10865366458892822),
 ('planned', 0.09786190837621689),
 ('she', 0.09258976578712463),
 ('do', 0.09069041907787323),
 ('as', 0.08781418949365616),
 ('concert', 0.08753110468387604),
 ('lisa', 0.08640197664499283)]

In [47]:
homer_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])
video_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['video'])

In [48]:
# 유사도 계산

import numpy as np
from numpy.linalg import norm

def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    dot_product = np.dot(vector_a, vector_b)
    norm_a = norm(vector_a)
    norm_b = norm(vector_b)
    return dot_product / (norm_a * norm_b)

In [49]:
cosine_similarity(homer_vector, video_vector)

0.14035672

## Simpsons dataset을 활용한 word2vec 모델 학습

In [51]:
from typing import Optional
import nltk
import re
import pandas as pd
import spacy

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/jaypark/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaypark/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [53]:
df = pd.read_csv('./data/simpsons_dataset.csv')
df.shape

(158314, 2)

In [54]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [55]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [57]:
# 전처리

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# 표제어 추출 (good, better, best -> good) 


def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)
    
    
cleaner = (re.sub(r"[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [58]:
txt = [cleaning(doc) for doc in nlp.pipe(cleaner, batch_size=5000)]

In [59]:
txt[0]

'actually little disease magazine news show natural think'

In [60]:
# No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it.

In [61]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85956, 1)

In [63]:
sentences = [s.split(' ') for s in df_clean['clean']]
len(sentences)

85956

In [64]:
sentences[0]

['actually',
 'little',
 'disease',
 'magazine',
 'news',
 'show',
 'natural',
 'think']

In [65]:
# word2vec 모델 학습
from gensim.models import Word2Vec

### help(Word2Vec)
- `window` : 문장 내에서 현재 단어와 예측 단어 사이의 최대 거리. ex) 타겟 단어의 왼쪽과 오른쪽 n번째 단어
- `vector_size` : 단어 벡터의 차원 수
- `min_count` : 이 값보다 총 절대 빈도수가 낮은 모든 단어를 무시함 - (2, 100)
- `sg` : 1은 skip-gram, 0은 CBOW method를 사용

In [67]:
# 모델 정의 하기
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007)

In [68]:
# 문장에 들어있는 각 단어들을 Word2Vec 모델이 인식할 수 있는 형태로 변환
w2v_model.build_vocab(sentences)

In [69]:
# 모델 훈련
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)

(19987512, 54001900)

### dir(w2v_model.wv)
- most_similar : 주어진 조건에 가장 적합한 단어 탐색
- similarity : 주어진 단어들의 유사도 계산
- doesnt_match : 주어진 단어들 중 가장 '덜 유사한' 단어

In [71]:
w2v_model.wv.most_similar(positive=['homer'])

[('marge', 0.42831405997276306),
 ('simpson', 0.3640384376049042),
 ('mr', 0.30622023344039917),
 ('dad', 0.2774527668952942),
 ('bart', 0.26508015394210815),
 ('right', 0.23841038346290588),
 ('barney', 0.2356049120426178),
 ('moe', 0.23117829859256744),
 ('son', 0.22991864383220673),
 ('lisa', 0.22671663761138916)]

In [72]:
w2v_model.wv.most_similar(positive=['bart'])

[('lisa', 0.4833817780017853),
 ('kid', 0.323870986700058),
 ('milhouse', 0.3115254342556),
 ('dad', 0.30365189909935),
 ('mom', 0.2980813682079315),
 ('boy', 0.2954169809818268),
 ('child', 0.28352370858192444),
 ('son', 0.28075528144836426),
 ('think', 0.27276813983917236),
 ('homer', 0.26508015394210815)]

In [73]:
# woman : homer = ________ : marge
w2v_model.wv.most_similar(positive=["woman","homer"], negative=["marge"],topn=3)

[('people', 0.19781005382537842),
 ('young', 0.19512462615966797),
 ('man', 0.19244812428951263)]

In [74]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.2406603842973709),
 ('nelson', 0.22582858800888062),
 ('maggie', 0.22556255757808685)]

In [75]:
w2v_model.wv.doesnt_match(['bart', 'homer', 'marge'])

'bart'

In [76]:
w2v_model.wv.doesnt_match(['bart', 'lisa', 'marge'])

'marge'

### 단어 임베딩의 한계점
- 우리가 사용하는 모든 단어는 context에 따라 의미가 다르다
- 단어 embedding의 경우 이런 유연성을 확보하지 못 함
    - 배를 깎아 먹었다 / 배가 고프다 / 배 멀미를 하다


In [81]:
# sentence embedding
from transformers import BertTokenizer, BertModel
import torch

In [82]:
# pre-trained model tokenizer와 and bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # smaller & uncased model
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [83]:
# bank가 들어간 유사한 문장 두 개
sentence1 = "I deposited money at the bank."
sentence2 = "The ducks swam to the river bank."

In [84]:
# 문장을 BERT가 인식할 수 있는 형태로 Tokenize
encoded_input1 = tokenizer(sentence1, return_tensors='pt') # pytorch
encoded_input2 = tokenizer(sentence2, return_tensors='pt')

In [85]:
encoded_input1

{'input_ids': tensor([[  101,  1045, 14140,  2769,  2012,  1996,  2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

- `input_ids` : 각 단어별로 매핑된 key. 101은 문장의 시작을, 102는 문장의 끝을 의미
- `token_type_ids` : 문장 번호
- `attention_mask` : attention을 가져야 하는 단어는 1, 그렇지 않은 단어는 0. (만약 input이 실제 단어들이라면 1)

In [86]:
# embedding 생성!
with torch.no_grad():
    output1 = model(**encoded_input1)
    output2 = model(**encoded_input2)

In [87]:
# embedding 내에서 bank라는 단어 찾아오기 (문장의 5번째에 있는 단어)
bank_embedding_sentence1 = output1.last_hidden_state[0, 5, :]
bank_embedding_sentence2 = output2.last_hidden_state[0, 5, :]

In [88]:
similarity = cosine_similarity(bank_embedding_sentence1, bank_embedding_sentence2)
# print("Embedding for 'bank' in sentence 1:", bank_embedding_sentence1)
# print("Embedding for 'bank' in sentence 2:", bank_embedding_sentence2)
print("Cosine similarity between the two embeddings:", similarity)

Cosine similarity between the two embeddings: 0.59224117
