# Word2Vec

## IMDB data

In [61]:
from gensim.models import Word2Vec, KeyedVectors
import csv
import re

def open_csv():
    # csv파일을 연다!
    f = open('./data/IMDB_dataset.csv', 'r', encoding='utf-8')
    csvreader = csv.reader(f)
    
    doc_list = []

    next(csvreader)
    for f in csvreader:
        line = re.compile("[^\w]").sub(' ', f[0].lower())
        doc_list.append(line.split())

    return doc_list

In [63]:
doc_list = open_csv()
# print(doc_list[0])

model = Word2Vec(sentences=doc_list, size=100, window=3, min_count=3, workers=-1, sg=0)

['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', 'you', 'll', 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'br', 'br', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'br', 'br', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'wher

## Word2Vec 함수 인자에 대한 설명!

- sentences : 문장들

- size : 임베딩 벡터의 크기

- window : 고려할 앞/뒤 단어의 갯수

- min_count : 최소 단어 길이

- workers : 사용할 프로세서의 수

- sg : 0=cbow, 1=**skipgram**

In [8]:
model_result = model.wv.most_similar("man")

print(model_result)

[('grabbed', 0.39505261182785034), ('dumbrille', 0.38505125045776367), ('cynic', 0.36763641238212585), ('cothk', 0.36432337760925293), ('obligation', 0.3579435348510742), ('prayed', 0.3572878837585449), ('deviate', 0.3572033643722534), ('quicksilver', 0.35572564601898193), ('masters', 0.3493580222129822), ('passing', 0.3471279740333557)]


In [9]:
model.wv.save_word2vec_format('imdb_w2v')
imdb_model = KeyedVectors.load_word2vec_format("imdb_w2v")

## 네이버 영화 리뷰로 w2v 모델 만들기!

- stopwords 제외 리스트를 만들어서, 조사는 제외한다.

- ratings.txt 파일만 사용한다.

In [86]:
from gensim.models import Word2Vec, KeyedVectors
import csv
import re

def open_csv():
    f = open('./data/ratings.txt', 'r', encoding='utf-8')

    next(f)
    naver_doc_list = []
    for line in f:
        stc_list = line.split('\t')
        naver_doc_list.append(stc_list[1].split())

    return naver_doc_list

In [87]:
naver_doc_list = open_csv()
model_naver = Word2Vec(sentences=naver_doc_list, size=50, window=3, min_count=3, workers=-1, sg=1)

In [88]:
naver_doc_list[0]

['어릴때보고', '지금다시봐도', '재밌어요ㅋㅋ']

In [89]:
model_result = model_naver.wv.most_similar("남자")

print(model_result)

[('그래픽만', 0.5392969846725464), ('인셉션', 0.5247762799263), ('제이크', 0.5021212697029114), ('가지지', 0.5011077523231506), ('우울함.', 0.49976545572280884), ('못하네...', 0.49665191769599915), ('신기함.', 0.4886792004108429), ('턱쟁이형님의', 0.4859004318714142), ('두렵지', 0.48356252908706665), ('바보연기', 0.4813482463359833)]


In [90]:
# 저장
model_naver.wv.save_word2vec_format('naver_ratings_w2v')

# 불러오기
naver_model = KeyedVectors.load_word2vec_format("naver_ratings_w2v")


- 나중에 추가

    - Konlpy 사용

    - stopword 사용