# Word2Vec tutorial

In [1]:
from collections import Counter
import os, sys
import nltk
import re
import pandas as pd
import numpy as np
from gensim.models.word2vec import Word2Vec

## 1. Prepare data

### 1.1 Read data

In [2]:
# Load the paper metadata table from the CSV file next to this notebook.
papers = pd.read_csv(filepath_or_buffer='./text_mining_paper.csv')
# Preview the first five rows.
papers.head(n=5)

Unnamed: 0.1,Unnamed: 0,abstract,author,meta,subject,title
0,0,This paper advances the state of the art in te...,"Hossein Hematialam, Wlodek Zadrozny","Tue, 13 Jun 2017 18:02:27 GMT (14kb) [v2] Wed,...",Computation and Language (cs.CL),Identifying Condition-Action Statements in Med...
1,1,The large scale of scholarly publications pose...,"Muthu Kumar Chandrasekaran, Kokil Jaidka, Phil...","Thu, 8 Jun 2017 10:53:57 GMT (48kb)",Digital Libraries (cs.DL),Joint Workshop on Bibliometric-enhanced Inform...
2,2,Recognizing textual entailment is a fundamenta...,"Zhipeng Xie, Junfeng Hu","Thu, 25 May 2017 05:45:42 GMT (14kb)",Computation and Language (cs.CL),Max-Cosine Matching Based Neural Models for Re...
3,3,It is oftentimes impossible to understand how ...,"Wenbo Guo, Kaixuan Zhang, Lin Lin, Sui Huang, ...","Tue, 23 May 2017 23:51:37 GMT (731kb,D)",Learning (cs.LG),Towards Interrogating Discriminative Machine L...
4,4,"In this paper, we demonstrate how the state-of...","Tao Ding, Warren K. Bickel, Shimei Pan","Tue, 16 May 2017 10:37:52 GMT (78kb,D) [v2] We...",Computation and Language (cs.CL),Social Media-based Substance Use Prediction


### 1.2 Preprocess data

- abstract 컬럼을 소문자로 변환한 뒤, 알파벳 2글자 이상으로 이루어진 토큰만 뽑아낸다.
- Counter를 활용해서 빈도수를 구하고 2번 이상 반복되어 나온 단어만 뽑아낸다.

In [3]:
# Build the training corpus: one list of tokens per abstract.
# Rows with a missing abstract are dropped first; pandas reads empty cells as
# NaN (a float), which has no `.lower()` and would raise AttributeError.
abstracts = list(papers['abstract'].dropna())

# A token is a run of two or more ASCII letters taken from the lower-cased text.
token_pattern = re.compile('[a-z]{2,}')
corpus = [token_pattern.findall(abstract.lower()) for abstract in abstracts]

# Count every token in a single pass over the corpus. Flattening with
# `sum(corpus, [])` copies the growing list once per abstract (quadratic time);
# feeding a generator to Counter is linear and keeps first-seen order.
token_counts = Counter(token for document in corpus for token in document)

# Keep only tokens that occur at least twice, in first-seen order.
tokens = [token for token, count in token_counts.items() if count >= 2]

## 2. Word2Vec

### 2.1 Vectorize

- 단어 리스트를 벡터화하고 메모리 정리
- configuration
    + size : feature vector의 차원
    + alpha : initial learning rate
    + window : 앞 뒤로 몇 단어까지 볼 것인지
    + sg : skip-gram 방법의 사용여부. 1이면 skip-gram, 0이면 CBOW
    + min_count : 단어의 최소 빈도. 전체 빈도가 이 값 미만인 단어는 무시한다.
    + workers : 학습을 병렬처리할 worker thread의 수 (아래 config에서는 지정하지 않으므로 gensim 기본값이 사용된다)

In [9]:
# Training hyper-parameters: 100-dimensional vectors, initial learning rate
# 0.025, context window of 5, skip-gram (sg=1), minimum word frequency of 2.
config = dict(size=100, alpha=0.025, window=5, sg=1, min_count=2)

# Train the model on the tokenised abstracts.
model = Word2Vec(sentences=corpus, **config)

# Unload memory that is no longer needed once training is finished.
model.init_sims(replace=True)

### 2.2 단어 간 유사도 체크

In [17]:
# Similarity between the first two frequent tokens.
# The lookup goes through `model.wv` (the trained word vectors): in gensim 3.x
# the model-level `similarity` is a deprecated proxy that was removed in 4.0.
first_word, second_word = tokens[0], tokens[1]
word_sim = model.wv.similarity(first_word, second_word)
print("words: '{}', '{}'".format(first_word, second_word))
print("similarity: {}".format(word_sim))

words: 'this', 'paper'
similarity: 0.9951527331584714


### 2.3 유사한 단어

In [18]:
# The 20 words closest to the first frequent token. Queried via `model.wv`
# because the model-level `most_similar` proxy is deprecated in gensim 3.x
# and removed in 4.0.
model.wv.most_similar(tokens[0], topn=20)

[('we', 0.9972541928291321),
 ('in', 0.9955449104309082),
 ('paper', 0.9951527714729309),
 ('propose', 0.9938578009605408),
 ('study', 0.9935561418533325),
 ('method', 0.9916635751724243),
 ('approach', 0.991506040096283),
 ('also', 0.991470456123352),
 ('new', 0.9914051294326782),
 ('problem', 0.9913595914840698),
 ('present', 0.9911965131759644),
 ('an', 0.9911403656005859),
 ('the', 0.9908382892608643),
 ('novel', 0.9907150268554688),
 ('how', 0.9906813502311707),
 ('proposed', 0.9906740188598633),
 ('show', 0.9902957677841187),
 ('is', 0.9901669025421143),
 ('presents', 0.9901626110076904),
 ('technique', 0.9900466799736023)]

### 2.4 단어 간 관계 (King-Queen)

- positive: 질의 벡터에 더해지는 단어들 (결과가 이 단어들과 가까워지는 방향으로 기여)
- negative: 질의 벡터에서 빼는 단어들 (결과가 이 단어들과 멀어지는 방향으로 기여)

In [19]:
# Analogy-style query: words listed in `positive` are added to the query and
# words in `negative` are subtracted from it. Queried via `model.wv` because
# the model-level `most_similar` proxy is deprecated in gensim 3.x and removed
# in 4.0.
model.wv.most_similar(positive=['experimental', 'words'], negative=['that'])

[('popular', 0.9987698793411255),
 ('sentences', 0.9987324476242065),
 ('ontologies', 0.9987238049507141),
 ('database', 0.9987122416496277),
 ('categories', 0.998700737953186),
 ('science', 0.9986943602561951),
 ('numbers', 0.9986836910247803),
 ('indices', 0.9986716508865356),
 ('mns', 0.9986706972122192),
 ('answering', 0.9986652135848999)]

### 2.5 Embedding된 words

In [26]:
# Report how many words received an embedding, then show the first ten
# entries of the model's vocabulary list.
index_to_word = model.wv.index2word
words_length = len(index_to_word)
print('Words length: {}'.format(words_length))
print(index_to_word[0:10])

Words length: 2281
['the', 'of', 'and', 'to', 'in', 'is', 'for', 'we', 'text', 'this']


In [30]:
# Collect the embedding vector of every vocabulary word, in vocabulary order.
# Vectors are read from `model.wv`: indexing the model itself (`model[token]`)
# is a deprecated shortcut in gensim 3.x and no longer works in 4.0.
my_word = model.wv.index2word
embedding = [model.wv[token] for token in my_word]

# Show the vector of the first vocabulary word.
embedding[0]

array([ 0.05594978,  0.05807013, -0.03238869, -0.07171033,  0.05283086,
       -0.06667535,  0.06541217, -0.04155858, -0.03341519,  0.11654437,
        0.20107394,  0.14403497, -0.06954887,  0.1761362 , -0.04633851,
       -0.07623592, -0.05029421, -0.04052194, -0.05250368,  0.06357939,
       -0.21229286,  0.04722983, -0.03840394,  0.03564699,  0.07652467,
        0.01386526, -0.04466556,  0.05193227, -0.09246389,  0.05370577,
        0.282792  , -0.1118412 , -0.17727992,  0.07671122, -0.06332225,
       -0.01372707,  0.01543607, -0.13429877,  0.17193122,  0.16334276,
       -0.0075834 ,  0.00768877,  0.04423469,  0.09267427,  0.10473999,
       -0.23148984, -0.04500943,  0.04297513,  0.05064177, -0.03340086,
        0.0583695 ,  0.10503582,  0.15592501,  0.04505303, -0.02601505,
       -0.02186228, -0.06606573, -0.15155126,  0.07642452, -0.00750054,
        0.17220958, -0.17409481, -0.0294145 , -0.06806629, -0.0271793 ,
       -0.1173178 , -0.00774782,  0.07125244,  0.20547967, -0.01

In [32]:
# Convert the collected word vectors into a single NumPy array and report
# its shape.
embedding = np.asarray(embedding)
shape_message = "embedding shape: {}"
print(shape_message.format(embedding.shape))

embedding shape: (2281, 100)
