### 네거티브 샘플링을 이용한 Word2Vec 구현
#### 일부 단어 집합에만 집중하여 마지막 단계를 이진 분류 문제로 변환  
#### 주변 단어를 긍정(positive), 랜덤으로 선택된 단어를 부정(negative)으로 레이블링

### 20뉴스그룹 데이터 전처리하기

In [94]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [95]:
# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, shuffled with a fixed seed so the document order is reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
# Report how many raw documents were loaded.
print('총 샘플 수 :', len(documents))

총 샘플 수 : 11314


In [96]:
# Peek at the first raw document to see what needs cleaning.
documents[:1]

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"]

In [97]:
# Wrap the raw documents in a single-column DataFrame for column-wise cleaning.
news_df = pd.DataFrame({'document':documents})
# Special-character removal is performed in the following cell.

In [98]:
# Show the first three rows of the raw document column.
news_df.head(3)

Unnamed: 0,document
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...


In [99]:
# Replace every non-alphabetic character with a space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# Drop words of length 3 or less (removes short, low-information tokens).
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3])
)
# Lowercase every remaining word.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [100]:
# Inspect the cleaned text of the document at index 1.
news_df.clean_doc[1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [101]:
# Check whether any cell in the frame is null.
news_df.isnull().values.any()

False

In [102]:
# Summarize column dtypes and non-null counts.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   document   11314 non-null  object
 1   clean_doc  11314 non-null  object
dtypes: object(2)
memory usage: 176.9+ KB


In [103]:
# Convert empty strings to NaN so that documents left empty can be detected
# and dropped as missing values.
news_df.replace(to_replace="", value=np.nan, inplace=True)

In [104]:
# Re-check for nulls now that empty strings are represented as NaN.
news_df.isnull().values.any()

True

In [105]:
# Non-null counts per column reveal how many rows are now missing.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11314 entries, 0 to 11313
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   document   11096 non-null  object
 1   clean_doc  10995 non-null  object
dtypes: object(2)
memory usage: 176.9+ KB


In [106]:
# Remove every row that has a missing value in any column.
news_df.dropna(axis=0, how='any', inplace=True)

In [107]:
# Confirm that both columns have the same non-null count after dropping rows.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10995 entries, 0 to 11313
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   document   10995 non-null  object
 1   clean_doc  10995 non-null  object
dtypes: object(2)
memory usage: 257.7+ KB


In [108]:
import nltk
# Download the NLTK stopwords corpus so stopwords.words() can be used.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/leok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [109]:
# English stopword list as defined by NLTK.
stop_words = stopwords.words('english')
# Split each cleaned document on whitespace into a list of tokens.
tokenized_doc = news_df['clean_doc'].str.split()

In [110]:
# Check the container type produced by the tokenizing step.
type(tokenized_doc)

pandas.core.series.Series

In [111]:
# Preview the token lists of the first few documents.
tokenized_doc.head()

0    [well, sure, about, story, seem, biased, what,...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, that, principle, your, str...
3    [notwithstanding, legitimate, fuss, about, thi...
4    [well, will, have, change, scoring, playoff, p...
Name: clean_doc, dtype: object

In [112]:
# Remove stopwords from every document.
# The stopword list is converted to a set once so each membership test is O(1)
# instead of a linear scan of the list for every token.
stop_word_set = set(stop_words)
tokenized_doc = tokenized_doc.apply(
    lambda x: [item for item in x if item not in stop_word_set]
)

In [113]:
# Convert the Series of token lists into a plain Python list of lists.
tokenized_doc = tokenized_doc.tolist()

In [114]:
# Confirm the conversion to a plain list.
type(tokenized_doc)

list

In [115]:
# Collect the positions of documents with at most one token; they are removed
# in a later step.
drop_train = [
    position
    for position, tokens in enumerate(tokenized_doc)
    if len(tokens) <= 1
]

In [116]:
# List the positions of the documents that will be removed.
drop_train

[44,
 260,
 353,
 1651,
 1839,
 2321,
 2336,
 2371,
 2862,
 2963,
 3290,
 3387,
 3395,
 3396,
 3421,
 3563,
 3591,
 3713,
 3874,
 3897,
 4180,
 4524,
 4587,
 4617,
 4947,
 4970,
 5129,
 5525,
 6015,
 6227,
 6652,
 6723,
 6883,
 7080,
 7956,
 8000,
 8156,
 8212,
 8283,
 8588,
 8867,
 8903,
 9045,
 9555,
 9696,
 10439,
 10447,
 10564,
 10707,
 10730,
 10750,
 10838,
 10896,
 10908,
 10967]

In [117]:
# Remove the documents whose positions are listed in drop_train.
# np.delete would coerce the ragged list of token lists into an array itself,
# which raises ValueError on NumPy >= 1.24 (ragged nested sequences are no
# longer converted implicitly). Build a 1-D object array explicitly so each
# element stays a Python list and the result is still a numpy.ndarray.
docs_array = np.empty(len(tokenized_doc), dtype=object)
for position, tokens in enumerate(tokenized_doc):
    docs_array[position] = tokens
tokenized_doc = np.delete(docs_array, drop_train, axis=0)

In [118]:
# Number of documents remaining after the removal.
len(tokenized_doc)

10940

In [119]:
# np.delete returns an ndarray, so the container type has changed.
type(tokenized_doc)

numpy.ndarray

In [120]:
# Inspect the tokens of the document at position 1.
tokenized_doc[1]

['yeah',
 'expect',
 'people',
 'read',
 'actually',
 'accept',
 'hard',
 'atheism',
 'need',
 'little',
 'leap',
 'faith',
 'jimmy',
 'logic',
 'runs',
 'steam',
 'sorry',
 'pity',
 'sorry',
 'feelings',
 'denial',
 'faith',
 'need',
 'well',
 'pretend',
 'happily',
 'ever',
 'anyway',
 'maybe',
 'start',
 'newsgroup',
 'atheist',
 'hard',
 'bummin',
 'much',
 'forget',
 'flintstone',
 'chewables',
 'bake',
 'timmons']

In [121]:
# Fit a Keras Tokenizer on the tokenized documents to build the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# word -> integer index mapping built by the tokenizer
word2idx = tokenizer.word_index

In [122]:
# Show the word -> index mapping.
tokenizer.word_index

{'would': 1,
 'people': 2,
 'like': 3,
 'know': 4,
 'also': 5,
 'think': 6,
 'time': 7,
 'could': 8,
 'well': 9,
 'good': 10,
 'even': 11,
 'first': 12,
 'much': 13,
 'many': 14,
 'make': 15,
 'system': 16,
 'used': 17,
 'right': 18,
 'file': 19,
 'want': 20,
 'said': 21,
 'anyone': 22,
 'need': 23,
 'work': 24,
 'something': 25,
 'problem': 26,
 'since': 27,
 'please': 28,
 'information': 29,
 'year': 30,
 'back': 31,
 'using': 32,
 'really': 33,
 'program': 34,
 'going': 35,
 'still': 36,
 'years': 37,
 'find': 38,
 'believe': 39,
 'must': 40,
 'point': 41,
 'last': 42,
 'available': 43,
 'take': 44,
 'number': 45,
 'space': 46,
 'data': 47,
 'thanks': 48,
 'things': 49,
 'windows': 50,
 'government': 51,
 'might': 52,
 'made': 53,
 'another': 54,
 'mail': 55,
 'without': 56,
 'help': 57,
 'better': 58,
 'sure': 59,
 'read': 60,
 'never': 61,
 'drive': 62,
 'part': 63,
 'long': 64,
 'case': 65,
 'look': 66,
 'however': 67,
 'power': 68,
 'question': 69,
 'world': 70,
 'name': 71,
 'c

In [123]:
# Show the per-word counts collected while fitting the tokenizer.
tokenizer.word_counts

OrderedDict([('well', 2513),
             ('sure', 1174),
             ('story', 278),
             ('seem', 577),
             ('biased', 50),
             ('disagree', 121),
             ('statement', 334),
             ('media', 244),
             ('ruin', 11),
             ('israels', 6),
             ('reputation', 25),
             ('rediculous', 5),
             ('israeli', 350),
             ('world', 1052),
             ('lived', 160),
             ('europe', 166),
             ('realize', 175),
             ('incidences', 3),
             ('described', 183),
             ('letter', 243),
             ('occured', 35),
             ('whole', 562),
             ('ignore', 90),
             ('subsidizing', 5),
             ('existance', 30),
             ('europeans', 33),
             ('least', 946),
             ('degree', 118),
             ('think', 3014),
             ('might', 1228),
             ('reason', 692),
             ('report', 287),
             ('clearly', 257),


In [124]:
# Invert word2idx to obtain an index -> word lookup table.
idx2word = {index: word for word, index in word2idx.items()}

In [125]:
# Show the index -> word mapping.
idx2word

{1: 'would',
 2: 'people',
 3: 'like',
 4: 'know',
 5: 'also',
 6: 'think',
 7: 'time',
 8: 'could',
 9: 'well',
 10: 'good',
 11: 'even',
 12: 'first',
 13: 'much',
 14: 'many',
 15: 'make',
 16: 'system',
 17: 'used',
 18: 'right',
 19: 'file',
 20: 'want',
 21: 'said',
 22: 'anyone',
 23: 'need',
 24: 'work',
 25: 'something',
 26: 'problem',
 27: 'since',
 28: 'please',
 29: 'information',
 30: 'year',
 31: 'back',
 32: 'using',
 33: 'really',
 34: 'program',
 35: 'going',
 36: 'still',
 37: 'years',
 38: 'find',
 39: 'believe',
 40: 'must',
 41: 'point',
 42: 'last',
 43: 'available',
 44: 'take',
 45: 'number',
 46: 'space',
 47: 'data',
 48: 'thanks',
 49: 'things',
 50: 'windows',
 51: 'government',
 52: 'might',
 53: 'made',
 54: 'another',
 55: 'mail',
 56: 'without',
 57: 'help',
 58: 'better',
 59: 'sure',
 60: 'read',
 61: 'never',
 62: 'drive',
 63: 'part',
 64: 'long',
 65: 'case',
 66: 'look',
 67: 'however',
 68: 'power',
 69: 'question',
 70: 'world',
 71: 'name',
 72

In [126]:
# Integer encoding: map each document's tokens to their vocabulary indices.
encoded = tokenizer.texts_to_sequences(tokenized_doc)

In [127]:
# Preview the first two integer-encoded documents.
encoded[:2]

[[9,
  59,
  603,
  207,
  3278,
  1495,
  474,
  702,
  9470,
  13686,
  5533,
  15227,
  702,
  442,
  702,
  70,
  1148,
  1095,
  1036,
  20294,
  984,
  705,
  4294,
  702,
  217,
  207,
  1979,
  15228,
  13686,
  4865,
  4520,
  87,
  1530,
  6,
  52,
  149,
  581,
  661,
  4406,
  4988,
  4866,
  1920,
  755,
  10668,
  1102,
  7837,
  442,
  957,
  10669,
  634,
  51,
  228,
  2669,
  4989,
  178,
  66,
  222,
  4521,
  6066,
  68,
  4295],
 [1026,
  532,
  2,
  60,
  98,
  582,
  107,
  800,
  23,
  79,
  4522,
  333,
  7838,
  864,
  421,
  3825,
  458,
  6488,
  458,
  2700,
  4730,
  333,
  23,
  9,
  4731,
  7262,
  186,
  310,
  146,
  170,
  642,
  1260,
  107,
  33568,
  13,
  985,
  33569,
  33570,
  9471,
  11491]]

In [128]:
# Vocabulary size: number of distinct words plus one, so that the largest
# word index (indices start at 1) is still a valid row of the embedding tables.
vocab_size = len(word2idx) + 1

In [129]:
# Show the computed vocabulary size.
vocab_size

64277

### 네거티브 샘플링을 통한 데이터셋 구성

In [130]:
from tensorflow.keras.preprocessing.sequence import skipgrams

# Negative sampling: build (pairs, labels) for the first 10 encoded documents
# only, as a quick check before processing the whole corpus.
skip_grams = [
    skipgrams(doc_ids, vocabulary_size=vocab_size, window_size=10)
    for doc_ids in encoded[:10]
]

In [131]:
# Check the container type holding the per-document results.
type(skip_grams)

list

In [132]:
# In each pair the first element is the center word and the second is the
# word paired with it.
skip_grams[0]

([[4866, 10140],
  [20294, 41366],
  [5533, 36177],
  [52, 22898],
  [13686, 34926],
  [4521, 24398],
  [87, 4520],
  [87, 1530],
  [634, 37434],
  [70, 64131],
  [705, 47261],
  [4988, 8040],
  [228, 11693],
  [603, 19206],
  [1036, 23433],
  [4988, 60801],
  [957, 59462],
  [1095, 46086],
  [755, 21119],
  [1920, 581],
  [702, 62033],
  [207, 19641],
  [1979, 149],
  [702, 49782],
  [4521, 222],
  [4295, 66],
  [702, 1530],
  [1979, 15228],
  [6, 19223],
  [59, 37543],
  [4295, 3235],
  [442, 5533],
  [984, 53329],
  [705, 17810],
  [4294, 23029],
  [1095, 1036],
  [13686, 1148],
  [10669, 43502],
  [1530, 702],
  [581, 52],
  [755, 957],
  [5533, 70],
  [755, 12068],
  [4406, 23841],
  [68, 60015],
  [6066, 222],
  [702, 15227],
  [4865, 52773],
  [149, 58270],
  [7837, 178],
  [442, 1102],
  [217, 702],
  [217, 1979],
  [52, 4406],
  [6, 661],
  [957, 51],
  [442, 17685],
  [702, 40359],
  [661, 2870],
  [3278, 23167],
  [9, 26399],
  [217, 22089],
  [13686, 49104],
  [474, 24737],

In [133]:
# Split the result for the first document into its word pairs and their labels.
first_sample = skip_grams[0]
pairs = first_sample[0]
labels = first_sample[1]

In [134]:
# Check the container type of the labels.
type(labels)

list

In [135]:
# Preview the first five labels.
labels[:5]

[0, 0, 0, 0, 0]

In [136]:
# Print the first five pairs as "(word (index), word (index)) -> label".
pair_template = "({:s} ({:d}), {:s} ({:d})) -> {:d}"
for i in range(5):
    center, context = pairs[i]
    print(pair_template.format(
        idx2word[center], center,
        idx2word[context], context,
        labels[i],
    ))

(austria (4866), disney (10140)) -> 0
(incidences (20294), skogstad (41366)) -> 0
(reputation (5533), triplet (36177)) -> 0
(might (52), tenuous (22898)) -> 0
(israels (13686), teller (34926)) -> 0


In [137]:
# Number of documents for which skip-gram pairs were generated.
print('전체 샘플 수 :',len(skip_grams))

전체 샘플 수 : 10


In [138]:
# Number of pairs and labels generated for the first newsgroup sample
print(len(pairs))
print(len(labels))

2220
2220


In [139]:
# Run skip-gram generation for every encoded document.
# NOTE(review): this keeps the pairs for the whole corpus in memory at once.
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]

### Skip-Gram with Negative Sampling(SGNS) 구현하기

In [140]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [141]:
# Embedding vector dimensionality: 100, an arbitrarily chosen hyperparameter.
embed_size = 100

In [142]:
# Embedding table for center words (one word index per input example)
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(vocab_size, embed_size)(w_inputs)

# Separate embedding table for context words
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding  = Embedding(vocab_size, embed_size)(c_inputs)

In [146]:
# Dot product of the center and context embeddings along the embedding axis;
# both inputs are (batch, 1, embed_size), so the result is (batch, 1, 1).
dot_product = Dot(axes=2)([word_embedding, context_embedding])
# Flatten to (batch, 1). No input_shape argument is passed: in the functional
# API the shape comes from the incoming tensor, and newer Keras versions warn
# when input_shape is given to a layer.
dot_product = Reshape((1,))(dot_product)
# Sigmoid turns the similarity score into the probability that the pair is a
# real (positive) center/context pair.
output = Activation('sigmoid')(dot_product)

In [147]:
# Assemble the two-input model (center word, context word) -> probability.
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
# Binary cross-entropy matches the positive/negative pair labels.
model.compile(loss='binary_crossentropy', optimizer='adam')
# Write a diagram of the model graph to model3.png.
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "functional_15"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 100)       6427700     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 100)       6427700     input_4[0][0]                    
______________________________________________________________________________________

In [52]:
# Training: 5 epochs, one train_on_batch call per document. Each batch holds
# all (center, context) pairs generated for that document.
for epoch in range(1, 6):
    loss = 0
    for couples, targets in skip_grams:
        # A document that produced no pairs has nothing to train on; indexing
        # into its empty pair list would otherwise raise IndexError.
        if not couples:
            continue
        # Convert the pair list once and slice its columns instead of
        # transposing it with zip() separately for each column.
        couples = np.asarray(couples, dtype='int32')
        first_elem = couples[:, 0]   # center word indices
        second_elem = couples[:, 1]  # context / negative word indices
        labels = np.asarray(targets, dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        # Accumulate the per-batch loss to report one total per epoch.
        loss += model.train_on_batch(X, Y)
    print('Epoch :', epoch, 'Loss :', loss)

Epoch : 1 Loss : 4627.9567530713975
Epoch : 2 Loss : 3660.535947930068
Epoch : 3 Loss : 3503.5169317163527
Epoch : 4 Loss : 3307.7909146454185
Epoch : 5 Loss : 3082.8056223411113


### 결과 확인

In [53]:
import gensim

In [55]:
# Save the learned embedding vectors in word2vec text format:
# a "<word count> <dimension>" header line, then one "<word> <v1> ... <vN>"
# line per word.
# The first weight array of the model is the center-word embedding table.
vectors = model.get_weights()[0]
# The context manager guarantees the file is closed even if a write fails;
# UTF-8 is set explicitly because gensim reads the file as UTF-8 by default.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # vocab_size - 1 because index 0 has no word and is not written.
    f.write('{} {}\n'.format(vocab_size-1, embed_size))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i]))))

In [56]:
# Load the saved text-format vectors into gensim to query word similarity.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [57]:
# Words most similar to 'soldiers' in the learned embedding space.
w2v.most_similar(positive=['soldiers'])

[('refugees', 0.8953213691711426),
 ('azerbaijani', 0.8915454745292664),
 ('azerbaijanis', 0.8913716077804565),
 ('shouted', 0.8870112299919128),
 ('massacred', 0.8830893635749817),
 ('slaughtered', 0.8828731775283813),
 ('escaped', 0.8796411752700806),
 ('turkiye', 0.8779966831207275),
 ('azerbaijan', 0.8753331899642944),
 ('palestinian', 0.8752043843269348)]

In [58]:
# Words most similar to 'doctor' in the learned embedding space.
w2v.most_similar(positive=['doctor'])

[('candida', 0.6401154398918152),
 ('quack', 0.6366071701049805),
 ('medication', 0.6347054243087769),
 ('pain', 0.6213609576225281),
 ('disease', 0.6169787049293518),
 ('treatment', 0.6164973974227905),
 ('lyme', 0.6162136793136597),
 ('yeast', 0.6152898073196411),
 ('dietary', 0.6126024723052979),
 ('patients', 0.6093378067016602)]

In [59]:
# Words most similar to 'police' in the learned embedding space.
w2v.most_similar(positive=['police'])

[('peaceful', 0.6572277545928955),
 ('politicians', 0.6516166925430298),
 ('relatives', 0.6515181660652161),
 ('trabzon', 0.6505950689315796),
 ('threatened', 0.6505939960479736),
 ('greece', 0.6468607187271118),
 ('authorities', 0.6467465162277222),
 ('armed', 0.6350993514060974),
 ('members', 0.6302595138549805),
 ('attorney', 0.627491295337677)]