<a href="https://colab.research.google.com/github/Kimminsu-ds/Deep-Learning-NLP-using-Tensorflow/blob/main/03_05_SGNS(Skip_Gram_Negative_Sampling)_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 러닝스푼즈 - Tensorflow를 활용한 딥러닝 자연어처리

In [28]:
import os
from collections import Counter
from time import time

import numpy as np
import pandas as pd

from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model
from keras.preprocessing.sequence import skipgrams
from nltk.corpus import stopwords

np.random.seed(777)
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [29]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 데이터 세팅

In [30]:
import urllib.request
urllib.request.urlretrieve("https://raw.githubusercontent.com/eds-uga/csci1360e-su18/master/assignments/A9/abcnews-date-text.csv", filename="abcnews-date-text.csv")

corpus = pd.read_csv("abcnews-date-text.csv").iloc[:, 1]
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

## 전처리

In [31]:
# 소문자로 변경
corpus = corpus.str.lower()

# 숫자/알파벳/공백을 제외하고 전부 제고
corpus = corpus.str.replace('[^a-z0-9]+', ' ', regex=True)
corpus.head(10)

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
6           antic delighted with record breaking barca
7    aussie qualifier stosur wastes four memphis match
8         aust addresses un security council over iraq
9           australia is locked into war timetable opp
Name: headline_text, dtype: object

In [32]:
corpus_list = corpus.values.tolist()
len(corpus_list)

1103665

In [33]:
corpus_list[0:5]

['aba decides against community broadcasting licence',
 'act fire witnesses must be aware of defamation',
 'a g calls for infrastructure protection summit',
 'air nz staff in aust strike for pay rise',
 'air nz strike to affect australian travellers']

In [34]:
# corpus에 있는 모든 line에서 word를 추출해서 하나의 list에 정리
words = np.concatenate(np.core.defchararray.split(corpus_list)).tolist()
print(words[0:100])

['aba', 'decides', 'against', 'community', 'broadcasting', 'licence', 'act', 'fire', 'witnesses', 'must', 'be', 'aware', 'of', 'defamation', 'a', 'g', 'calls', 'for', 'infrastructure', 'protection', 'summit', 'air', 'nz', 'staff', 'in', 'aust', 'strike', 'for', 'pay', 'rise', 'air', 'nz', 'strike', 'to', 'affect', 'australian', 'travellers', 'ambitious', 'olsson', 'wins', 'triple', 'jump', 'antic', 'delighted', 'with', 'record', 'breaking', 'barca', 'aussie', 'qualifier', 'stosur', 'wastes', 'four', 'memphis', 'match', 'aust', 'addresses', 'un', 'security', 'council', 'over', 'iraq', 'australia', 'is', 'locked', 'into', 'war', 'timetable', 'opp', 'australia', 'to', 'contribute', '10', 'million', 'in', 'aid', 'to', 'iraq', 'barca', 'take', 'record', 'as', 'robson', 'celebrates', 'birthday', 'in', 'bathhouse', 'plans', 'move', 'ahead', 'big', 'hopes', 'for', 'launceston', 'cycling', 'championship', 'big', 'plan', 'to', 'boost']


In [35]:
# 불용어 설정
stopWords = set(stopwords.words("english"))
print(stopWords)

{'at', "you've", 'both', 'd', 'as', 'herself', 'will', 'again', 'down', 'in', 'on', 'all', 'didn', 'that', 'isn', 'nor', "mightn't", 'itself', 'once', "shouldn't", 'until', 'having', 'ma', 'any', 'above', 'out', 'i', 'been', 'by', 'where', 'themselves', "didn't", "it's", 'hers', 'yours', 'against', 's', 'be', 'what', 'their', 'have', 'himself', 'who', 'him', 'am', 'here', 'no', 'further', 'my', "weren't", 'these', "hadn't", 'because', 'he', 'from', 'his', 'o', 'its', 'to', 'doesn', 'below', 'very', 'had', "haven't", 'about', "should've", 'but', 'now', 'just', 'weren', 'ourselves', 'our', 'why', 'when', "needn't", 'mightn', 'an', 'then', 'under', 'other', 'between', "you'd", 'some', 'haven', 'over', 'more', "aren't", 'or', "won't", 't', 'those', 'y', 'it', 'which', 'few', "don't", 'before', "she's", 'mustn', 'does', 'them', "isn't", 'aren', 'so', "wasn't", 'her', 'm', 'while', 'same', 'the', 'won', 'there', 'needn', 'each', 'too', 'me', 'not', "that'll", 'most', 'shan', 'than', 'a', 're

In [36]:
# 단어의 등장 빈도수를 카운트
counter = Counter(words)
len(counter)

96722

In [37]:
# 등장 빈도 상위 100개
counter.most_common(100)

[('to', 214201),
 ('in', 135982),
 ('for', 130239),
 ('of', 80759),
 ('on', 73037),
 ('over', 50306),
 ('the', 49810),
 ('police', 35986),
 ('at', 31723),
 ('with', 29676),
 ('after', 29661),
 ('new', 29095),
 ('man', 28110),
 ('a', 24885),
 ('and', 22748),
 ('up', 20990),
 ('as', 20481),
 ('says', 20001),
 ('from', 19086),
 ('by', 17727),
 ('us', 17536),
 ('out', 17184),
 ('govt', 16935),
 ('court', 16383),
 ('council', 16363),
 ('be', 15774),
 ('more', 15247),
 ('interview', 15025),
 ('fire', 13910),
 ('not', 13741),
 ('nsw', 12919),
 ('australia', 12680),
 ('plan', 12307),
 ('water', 11877),
 ('qld', 11790),
 ('wa', 11534),
 ('crash', 11208),
 ('death', 11174),
 ('into', 10901),
 ('sydney', 10761),
 ('off', 10731),
 ('against', 10427),
 ('health', 10373),
 ('australian', 10352),
 ('charged', 10148),
 ('back', 10134),
 ('no', 10050),
 ('report', 9589),
 ('down', 9555),
 ('call', 9292),
 ('murder', 9217),
 ('sa', 9121),
 ('an', 9053),
 ('day', 8827),
 ('hospital', 8821),
 ('car', 8690

In [38]:
# 불용어 제거
words = [word for word in words if word not in stopWords]
print(counter.most_common(100))

[('to', 214201), ('in', 135982), ('for', 130239), ('of', 80759), ('on', 73037), ('over', 50306), ('the', 49810), ('police', 35986), ('at', 31723), ('with', 29676), ('after', 29661), ('new', 29095), ('man', 28110), ('a', 24885), ('and', 22748), ('up', 20990), ('as', 20481), ('says', 20001), ('from', 19086), ('by', 17727), ('us', 17536), ('out', 17184), ('govt', 16935), ('court', 16383), ('council', 16363), ('be', 15774), ('more', 15247), ('interview', 15025), ('fire', 13910), ('not', 13741), ('nsw', 12919), ('australia', 12680), ('plan', 12307), ('water', 11877), ('qld', 11790), ('wa', 11534), ('crash', 11208), ('death', 11174), ('into', 10901), ('sydney', 10761), ('off', 10731), ('against', 10427), ('health', 10373), ('australian', 10352), ('charged', 10148), ('back', 10134), ('no', 10050), ('report', 9589), ('down', 9555), ('call', 9292), ('murder', 9217), ('sa', 9121), ('an', 9053), ('day', 8827), ('hospital', 8821), ('car', 8690), ('may', 8534), ('coast', 8419), ('calls', 8401), ('a

In [39]:
# 상위 80%만 남기고 UNK 토큰 추가
top_n_ratio = 0.8

counter = Counter(dict(counter.most_common(int(top_n_ratio * len(counter)))))
vocab = list(counter) + ["UNK"]
vocab[-10:]

['haytossing',
 'mitter',
 'flyboss',
 'bigga',
 'vandeleur',
 'accredits',
 'ides',
 'certifi',
 'jackknifes',
 'UNK']

In [40]:
vocab_size = len(vocab)
vocab_size

77378

In [41]:
word2index = {word:index for index, word in enumerate(vocab)}
index2word = {index:word for word, index in word2index.items()}

In [42]:
word2index['iraq']

132

In [43]:
index2word[96]

'china'

In [44]:
indexed_corpus_list = []
for doc in corpus_list:
  indexed_corpus_list.append([word2index[word] if word in word2index else word2index["UNK"] for word in doc.split()])

In [45]:
corpus_list[0]

'aba decides against community broadcasting licence'

In [46]:
indexed_corpus_list[0]

[11212, 6092, 41, 182, 11058, 1320]

In [47]:
word2index['aba']

11212

In [48]:
word2index['licence']

1320

## SGNS 데이터셋 생성

In [49]:
# 네거티브 샘플링이 섞인 데이터셋 생성
def generating_wordpairs(indexed_corpus, vocab_size, window_size=4):
    X = []
    Y = []
    for row in indexed_corpus:
        x, y = skipgrams(sequence=row, vocabulary_size=vocab_size, window_size=window_size,
                        negative_samples=1.0, shuffle=True, categorical=False, sampling_table=None, seed=None)
        X = X + list(x)
        Y = Y + list(y)
    return X, Y

In [50]:
X, Y = generating_wordpairs(indexed_corpus_list[0:100], vocab_size, window_size=4)

In [51]:
print(X[0])

[6092, 11058]


In [52]:
print(Y[0])

1


## Embedding

In [53]:
#embedding_dim=300
embedding_dim=30

input_target = Input((1,))
input_context = Input((1,))

embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)

target_embedding = embedding_layer(input_target)
target_embedding = Reshape((embedding_dim, 1))(target_embedding)
context_embedding = embedding_layer(input_context)
context_embedding = Reshape((embedding_dim, 1))(context_embedding)

hidden_layer = Dot(axes=1)([target_embedding, context_embedding])
hidden_layer = Reshape((1,))(hidden_layer)

output = Dense(16, activation='sigmoid')(hidden_layer)
output = Dense(1, activation='sigmoid')(output)

model = Model(inputs=[input_target, input_context], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='sgd')

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 30)        2321340     input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 30, 1)        0           embedding_1[0][0]          

In [54]:
# epochs = 100000
epochs = 1000
batch_size = 512

for i in range(epochs):
    idx_batch = np.random.choice(len(indexed_corpus_list), batch_size)
    X, Y = generating_wordpairs(np.array(indexed_corpus_list)[idx_batch].tolist(), vocab_size)

    word_target, word_context = zip(*X)
    word_target = np.array(word_target, dtype=np.int32)
    word_context = np.array(word_context, dtype=np.int32)

    target = np.zeros((1,))
    context = np.zeros((1,))
    label = np.zeros((1,))
    idx = np.random.randint(0, len(Y)-1)
    target[0,] = word_target[idx]
    context[0,] = word_context[idx]
    label[0,] = Y[idx]
    loss = model.train_on_batch([target, context], label)
    if i % 10 == 0:
        print("Iteration {}, loss={}".format(i, loss))

  import sys


Iteration 0, loss=0.7868516445159912
Iteration 10, loss=0.8306280374526978
Iteration 20, loss=0.6297745108604431
Iteration 30, loss=0.6377366185188293
Iteration 40, loss=0.7446576952934265
Iteration 50, loss=0.6953948140144348
Iteration 60, loss=0.6652116775512695
Iteration 70, loss=0.7419299483299255
Iteration 80, loss=0.7604449987411499
Iteration 90, loss=0.7254554033279419
Iteration 100, loss=0.6725188493728638
Iteration 110, loss=0.6272990703582764
Iteration 120, loss=0.6587733626365662
Iteration 130, loss=0.6162236332893372
Iteration 140, loss=0.816859245300293
Iteration 150, loss=0.6613016128540039
Iteration 160, loss=0.6964981555938721
Iteration 170, loss=0.6480298042297363
Iteration 180, loss=0.701492190361023
Iteration 190, loss=0.7031185626983643
Iteration 200, loss=0.7314386367797852
Iteration 210, loss=0.6613101363182068
Iteration 220, loss=0.5771893858909607
Iteration 230, loss=0.8629833459854126
Iteration 240, loss=0.8716559410095215
Iteration 250, loss=0.8198219537734985

In [55]:
word2vec_file_path = 'word2vec.txt'
f = open(word2vec_file_path, 'w')
f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
vectors = model.get_weights()[0]
for word, i in word2index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

In [56]:
from gensim.models.keyedvectors import Word2VecKeyedVectors

word_vectors = Word2VecKeyedVectors.load_word2vec_format(word2vec_file_path, binary=False)
vector = word_vectors['computer']
vector

array([ 7.9439878e-03,  3.6232356e-02, -1.7401122e-02,  3.7665080e-02,
        4.7997203e-02,  2.6497137e-02, -3.4903705e-02,  5.1197186e-03,
        4.9720753e-02,  1.9438755e-02,  3.3077013e-02,  1.2119353e-02,
       -3.3054519e-02,  3.1138372e-02,  1.0858893e-02, -4.8681926e-02,
        8.4899366e-05, -4.1210093e-02, -2.0462228e-02,  8.3420873e-03,
        3.8035344e-02,  2.1159958e-02, -8.4199198e-03, -1.3231263e-03,
       -4.4698525e-02,  3.5169031e-02, -1.0928415e-02, -2.6792645e-02,
        4.9998078e-02, -1.0963894e-02], dtype=float32)

In [57]:
word_vectors.similar_by_word("cat")

[('passchendaele', 0.6930641531944275),
 ('meharry', 0.6720962524414062),
 ('recreativo', 0.621709942817688),
 ('bodyline', 0.6203605532646179),
 ('solastor', 0.617520272731781),
 ('kevat', 0.6149469017982483),
 ('mendelsohn', 0.6098932027816772),
 ('zellweger', 0.609707772731781),
 ('japan', 0.6096819043159485),
 ('speakds', 0.5981526374816895)]