word2vec을 이용한 단어 embedding

In [2]:
from gensim.models import Word2Vec
import re

In [3]:
# GPT야, simpsons 캐릭터 이름이 들어간 랜덤 문장 10개를 생성해줘

sentences = ["Homer Simpson forgot his lunch at home, so he had to buy a burger on his way to work.",
    "Marge was busy knitting a new sweater for Bart's upcoming school play.",
    "Lisa Simpson played a beautiful saxophone solo at the school concert.",
    "Mr. Burns secretly plotted another scheme from his office at the Springfield Nuclear Power Plant.",
    "Ned Flanders offered to help Homer fix the fence between their houses.",
    "Bart Simpson tried a new prank at school, but it didn't go as planned.",
    "Milhouse and Bart spent the afternoon playing video games and forgot to do their homework.",
    "Maggie Simpson's adorable giggle filled the room as she played with her toys.",
    "Apu had a busy day at the Kwik-E-Mart, dealing with a rush of customers.",
    "Krusty the Clown decided to change his show a bit to attract a new audience."]

In [4]:
# preprocessing
# get rid of stopwords, lower case

sentences = [re.sub(r"[.',]", "", sentence).lower().split(" ") for sentence in sentences]

In [5]:
sentences[0]

['homer',
 'simpson',
 'forgot',
 'his',
 'lunch',
 'at',
 'home',
 'so',
 'he',
 'had',
 'to',
 'buy',
 'a',
 'burger',
 'on',
 'his',
 'way',
 'to',
 'work']

In [6]:
# train word2vec

skip_gram = Word2Vec(sentences, vector_size=300, min_count=1, window=5, sg=1)

In [7]:
print("{} 의 vector representation: \n{}".format('homer', skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])))

homer 의 vector representation: 
[-1.66068680e-03 -4.20303462e-04  1.09255337e-03 -2.18484551e-03
 -3.23499087e-03 -3.12648294e-03  3.04949097e-03  1.87829603e-03
 -1.61115616e-03 -2.80928891e-03  4.52324020e-04  9.58630932e-04
 -4.38321120e-04  4.20039461e-04 -1.44900673e-03  1.57583959e-03
  5.10560756e-04  2.96067167e-03 -3.31212161e-03 -1.82445510e-03
 -3.01285670e-03 -1.11729649e-04 -2.61634029e-03  1.71959959e-03
 -2.11100094e-03 -2.03485345e-03  1.69092277e-03 -2.72677210e-03
  4.76851710e-04 -2.43410910e-03  3.28659941e-03  2.89322482e-03
  5.95899648e-04  1.94471062e-03  1.52419391e-03 -1.98301394e-03
  3.28957522e-03 -3.26667982e-03  2.68764189e-03  9.17330268e-04
 -9.88418004e-04 -1.19753159e-03  3.04961996e-03 -1.81994250e-03
  2.79433304e-03 -1.95553945e-03  2.79207341e-03 -1.48160951e-04
  2.66201468e-03 -1.02206389e-03  2.01950571e-03  2.95562134e-03
  8.11300066e-04  4.55215428e-04  1.65783032e-03  2.71425396e-03
  2.85594515e-03  2.84643611e-03  2.36571440e-03  2.695694

In [8]:
skip_gram.wv.most_similar('homer')

[('marge', 0.14081521332263947),
 ('offered', 0.13243569433689117),
 ('games', 0.12250109761953354),
 ('her', 0.11486154049634933),
 ('nuclear', 0.10569247603416443),
 ('do', 0.09913021326065063),
 ('toys', 0.0984482690691948),
 ('office', 0.0924413651227951),
 ('bart', 0.09009940177202225),
 ('way', 0.08802291005849838)]

직접 유사도 구해보기

In [9]:
homer_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])
marge_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['marge'])

In [10]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """
    두 벡터간 cosine similarity를 계산
    
    Parameters
    ----------
    vector_a : np.ndarray
        The first input vector.
    vector_b : np.ndarray
        The second input vector.

    Returns
    -------
    float
        The cosine similarity between `vector_a` and `vector_b`, which is a value between -1 and 1.

    """

    dot_product = np.dot(vector_a, vector_b)
    norm_a = norm(vector_a)
    norm_b = norm(vector_b)
    simillarity = dot_product / (norm_a * norm_b)
    return simillarity

In [11]:
cosine_similarity(homer_vector, marge_vector)

0.14081518

In [14]:
from typing import Optional
import nltk
import re
import pandas as pd
import spacy

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\삼성\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\삼성\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [16]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [17]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [18]:
df.loc[0, 'spoken_words']

"No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."

데이터 전처리

In [21]:
# lemmatize and remove the stopwords and non-alphabetic characters for each line of dialogue

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """
    Cleans a spaCy Doc object by lemmatizing its tokens and removing stop words,
    then joins the remaining tokens into a single string if there are more than two tokens left.
    
    Parameters:
    ----------
    doc : spacy.tokens.Doc
        A spaCy Doc object containing the processed text.
    
    Returns:
    ----------
    Optional : str
        A string composed of the lemmatized, non-stop tokens separated by spaces,
        if the resulting list of tokens has more than two elements. Otherwise, returns None.
    """

    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [22]:
# only keep alphabets
cleaner = (re.sub("[^A-Za-z]+", ' ', str(row)).lower() for row in df['spoken_words'])

In [24]:
txt = [cleaning(doc) for doc in nlp.pipe(cleaner, batch_size=5000)]

In [25]:
txt[0]

'yeah s pretty feeling station'

In [27]:
# dataframe에 넣어서 null이 있는 대화는 삭제
# 주로 null은 특정 행동을 했지만 대화가 없었을 때임

df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(89372, 1)

In [28]:
# 하나의 문장을 여러 단위의 단어로 분할
sentences = [s.split(' ') for s in df_clean['clean']]

In [29]:
len(sentences)

89372

In [30]:
sentences[0]

['yeah', 's', 'pretty', 'feeling', 'station']

Word2Vec 모델 훈련

In [31]:
from gensim.models import Word2Vec

In [32]:
help(Word2Vec)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.utils.SaveLoad)
 |  Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True)
 |  
 |  Method resolution order:
 |      Word2Vec
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=100

- window : 문장 내에서 현재 단어와 예측 단어 사이의 최대 거리. ex) 타겟 단어의 왼쪽과 오른쪽 n번째 단어
- vector_size : 단어 벡터의 차원 수
- min_count : 이 값보다 총 절대 빈도수가 낮은 모든 단어를 무시함 - (2, 100)
- sg : 1은 skip-gram, 0은 CBOW method를 사용

In [33]:
# 모델 정의 하기
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007
)

In [34]:
# 문장에 들어있는 각 단어들을 Word2Vec 모델이 인식할 수 있는 형태로 변환
w2v_model.build_vocab(sentences)

In [35]:
# 모델 훈련
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)

(21260182, 60280100)

In [36]:
help(w2v_model.train)

Help on method train in module gensim.models.word2vec:

train(corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs) method of gensim.models.word2vec.Word2Vec instance
    Update the model's neural weights from a sequence of sentences.
    
    Notes
    -----
    To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
    progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
    raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
    that was provided to :meth:`~gensim.models.word2vec.Word2Vec.build_vocab` earlier,
    you can simply use `total_examples=self.corpus_count`.
    
    --------
    To avoid common mistakes around the model's ability to do multiple training passes itself, an
    explicit `epochs` argument 

### 단어간 유사도 확인하기

In [37]:
dir(w2v_model.wv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 '_upconvert_old_d2vkv',
 '_upconvert_old_vocab',
 'add_lifecycle_event',
 'add_vector',
 'add_vectors',
 'allocate_vecattrs',
 'closer_than',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'expandos',
 'fill_norms',
 'get_index',
 'get_mean_vector',
 'get_normed_vectors',
 'get_vecattr',
 'get_vector',
 'has_index_for',
 'index2entity',
 'index2word',
 'index_to_key'

- most_similar : 주어진 조건에 가장 적합한 단어 탐색
- similarity : 주어진 단어들의 유사도 계산
- doesnt_match : 주어진 단어들 중 가장 '덜 유사한' 단어

In [38]:
help(w2v_model.wv.most_similar)

Help on method most_similar in module gensim.models.keyedvectors:

most_similar(positive=None, negative=None, topn=10, clip_start=0, clip_end=None, restrict_vocab=None, indexer=None) method of gensim.models.keyedvectors.KeyedVectors instance
    Find the top-N most similar keys.
    Positive keys contribute positively towards the similarity, negative keys negatively.
    
    This method computes cosine similarity between a simple mean of the projection
    weight vectors of the given keys and the vectors for each key in the model.
    The method corresponds to the `word-analogy` and `distance` scripts in the original
    word2vec implementation.
    
    Parameters
    ----------
    positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
        List of keys that contribute positively. If tuple, second element specifies the weight (default `1.0`)
    negative : list of (str or int or ndarray) or list of ((str,float) or (int,floa

In [39]:
help(w2v_model.wv.similarity)

Help on method similarity in module gensim.models.keyedvectors:

similarity(w1, w2) method of gensim.models.keyedvectors.KeyedVectors instance
    Compute cosine similarity between two keys.
    
    Parameters
    ----------
    w1 : str
        Input key.
    w2 : str
        Input key.
    
    Returns
    -------
    float
        Cosine similarity between `w1` and `w2`.



In [40]:
w2v_model.wv.most_similar(positive=['homer'])

[('marge', 0.6219091415405273),
 ('simpson', 0.5833418369293213),
 ('son', 0.5303625464439392),
 ('m', 0.5297139286994934),
 ('mr', 0.45622143149375916),
 ('way', 0.4336795210838318),
 ('maybe', 0.4258110225200653),
 ('ve', 0.42477402091026306),
 ('friend', 0.4154490828514099),
 ('bart', 0.41092047095298767)]

In [41]:
w2v_model.wv.most_similar(positive=['bart'])

[('lisa', 0.7192193865776062),
 ('milhouse', 0.5457488894462585),
 ('mom', 0.5421731472015381),
 ('boy', 0.5403572916984558),
 ('m', 0.5353553891181946),
 ('dad', 0.5148656964302063),
 ('ll', 0.4800121784210205),
 ('maggie', 0.47311967611312866),
 ('kid', 0.4630797505378723),
 ('school', 0.46018898487091064)]

- Woman : homer = ___ : marge

In [42]:
w2v_model.wv.most_similar(positive=['woman', 'homer'], negative=['marge'], topn=3)

[('man', 0.5459743738174438),
 ('simpson', 0.41761404275894165),
 ('people', 0.4040578305721283)]

In [43]:
w2v_model.wv.most_similar(positive=['woman', 'bart'], negative=['man'], topn=3)

[('lisa', 0.6235493421554565),
 ('hoover', 0.5296967625617981),
 ('mom', 0.5210390686988831)]

In [44]:
w2v_model.wv.doesnt_match(['bard', 'homer', 'marge'])

'homer'

In [45]:
w2v_model.wv.doesnt_match(['bart', 'lisa', 'marge'])

'marge'

### 단어 임베딩의 한계점

In [46]:
bank_vector = w2v_model.wv.get_vector(w2v_model.wv.key_to_index['bank'])

In [47]:
bank_vector

array([ 0.61653596, -0.6019891 , -0.94934744,  0.28376544,  1.1348646 ,
        0.26321322, -0.23323672, -0.27753466, -0.41983634,  1.7742176 ,
       -1.296568  , -0.3656422 ,  0.2235764 ,  0.9179288 ,  0.4883457 ,
        0.8312497 ,  1.0029693 , -0.6748741 , -1.6038657 , -0.7150788 ,
       -0.94990605, -0.2480318 , -0.7942151 ,  0.27103478,  1.2512724 ,
        0.12173048,  0.2943026 , -1.7506382 ,  0.7010823 , -0.4782461 ,
       -0.9024575 , -0.55438185,  0.43307015,  0.05748967, -1.303301  ,
       -0.01815479,  0.50058013, -0.5830111 ,  0.5908915 , -1.6770072 ,
        0.01647623,  0.48206076,  0.90807897, -0.6568238 , -0.5112265 ,
       -0.24112633, -1.2961593 ,  0.7980117 ,  0.47578752,  0.2899129 ,
        0.46487457,  0.8924679 , -1.8701609 , -0.6965123 ,  0.19850846,
       -0.01144549,  0.06661145,  1.1753205 ,  0.66201144,  0.64105207,
       -0.80141616, -0.58304787, -2.07443   ,  1.1676633 , -0.4093226 ,
       -1.0685458 , -0.8839352 , -0.19339511, -1.0874537 , -0.14

- 우리가 사용하는 모든 단어는 context에 따라 의미가 다르다
- 단어 embedding의 경우 이런 유연성을 확보하지 못 함
   - 배를 깎아 먹었다 / 배가 고프다 / 배 멀미를 하다

### sentence embedding

In [48]:
from transformers import BertTokenizer, BertModel
import torch

In [49]:
# pre-trained model tokenizer와 and bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [50]:
# bank가 들어간 유사한 문장 두 개
sentence1 = "I deposited money at the bank."
sentence2 = "The ducks swam to the river bank."

In [51]:
# 문장을 BERT가 인식할 수 있는 형태로 Tokenize
encoded_input1 = tokenizer(sentence1, return_tensors='pt')
encoded_input2 = tokenizer(sentence2, return_tensors='pt')

In [52]:
encoded_input1

{'input_ids': tensor([[  101,  1045, 14140,  2769,  2012,  1996,  2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

- input_ids : 각 단어별로 매핑된 key. 101은 문장의 시작을, 102는 문장의 끝을 의미
- token_type_ids : 문장 번호
- attention_mask : attention을 가져야 하는 단어는 1, 그렇지 않은 단어는 0. (만약 input이 실제 단어들이라면 1)

In [53]:
# embedding 생성!
with torch.no_grad():
    output1 = model(**encoded_input1)
    output2 = model(**encoded_input2)

In [54]:
# embedding 내에서 bank라는 단어 찾아오기 (문장의 5번째에 있는 단어)
bank_embedding_sentence1 = output1.last_hidden_state[0, 5, :]
bank_embedding_sentence2 = output2.last_hidden_state[0, 5, :]

In [55]:
# cosine similarity 계산

similarity = cosine_similarity(bank_embedding_sentence1, bank_embedding_sentence2)
print("Embedding for 'bank' in sentence 1:", bank_embedding_sentence1)
print("Embedding for 'bank' in sentence 2:", bank_embedding_sentence2)
print("Cosine similarity between the two embeddings:", similarity)

Embedding for 'bank' in sentence 1: tensor([ 7.5762e-01, -4.9297e-01, -1.4577e-01,  3.7644e-01,  1.3760e-01,
        -1.1604e-01,  1.9647e-01,  8.5373e-01,  2.8922e-01, -7.7167e-01,
         7.0191e-01,  1.5900e-02, -8.7243e-02, -8.7655e-02,  7.8980e-03,
        -3.0902e-01,  8.2599e-01, -5.8779e-01,  4.4748e-01, -1.5286e-01,
        -3.8126e-01, -3.2590e-03, -1.5456e-01,  8.0130e-01,  4.3496e-01,
         3.9089e-01, -6.2273e-02,  4.3417e-01, -5.5404e-01, -6.3325e-01,
         9.7596e-02,  1.3885e-01, -1.1386e+00, -2.9132e-01,  1.2839e-01,
         1.4953e-01,  2.3165e-01, -4.7182e-01, -1.1090e+00,  2.2412e-01,
        -9.2482e-01, -1.0203e-01,  6.8692e-01, -6.2060e-01,  4.4294e-01,
        -3.7882e-01,  9.6047e-01, -3.6463e-01,  1.4609e-02, -1.1666e+00,
        -4.4949e-01, -2.8636e-01, -2.2656e-02,  1.5022e-01, -2.7032e-01,
         1.5229e+00, -1.6890e-01, -5.8337e-01, -7.3641e-01,  3.8574e-01,
         8.4156e-02, -4.8477e-01,  8.2392e-01, -2.3315e-01,  2.4931e-01,
         6.1016