# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 0. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [1]:
import pandas as pd

# Location of the filtered (already tokenized) movie review CSV.
data_filename = './data/Korean_movie_reviews_2016_filtered.csv'

# Load the reviews into a DataFrame and preview the first rows.
review_df = pd.read_csv(data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [2]:
# Summarize the columns: dtypes and non-null counts (reveals missing reviews).
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788189 entries, 0 to 788188
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   review  785448 non-null  object
 1   rate    788189 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


In [3]:
# Build the training corpus: one list of whitespace-separated tokens per review.
# The review column contains missing values (NaN); calling str() on them would
# produce the literal token 'nan' and pollute the corpus, so drop them first.
# The remaining object-typed values are cast to str before splitting.
review_list = review_df.review.dropna().astype(str).tolist()
corpus = [review.split() for review in review_list]
corpus[:5]

[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [4]:
# Create and train a Word2Vec model on the tokenized reviews
# (skip-gram, 10 negative samples).
from gensim.models import Word2Vec

model_sg_n10 = Word2Vec(
    corpus,
    vector_size=100,  # embedding dimensionality
    window=3,         # context window size
    min_count=3,      # ignore words seen fewer than 3 times
    sg=1,             # 1 = skip-gram
    negative=10,      # number of negative samples
)

In [5]:
# Look up the learned embedding vector of a word
model_sg_n10.wv['이정재']

array([-0.28264958, -0.0172856 , -0.33901787, -0.21196248, -0.01020698,
        0.08712021,  0.15715712, -0.0145459 ,  0.66060567,  0.1339974 ,
       -0.14450881, -0.19346508, -0.26811245,  0.55313617, -0.26885796,
        0.24601103, -0.175366  , -0.26247567,  0.3224126 , -0.16400638,
        0.19680712,  0.0701113 ,  0.24706282,  0.22691871, -0.21238172,
        0.02449687,  0.7999493 ,  0.30805385,  0.2503203 ,  0.0247461 ,
        0.13525616,  0.0335877 , -0.02549328, -0.7642966 ,  0.11388669,
       -0.36613297,  0.12432896, -0.02299633, -0.28443685, -0.1561737 ,
       -0.5304416 , -0.10075071, -0.27796715,  0.23346242, -0.275031  ,
        0.33448368, -0.15503767,  0.2156552 ,  0.01364183,  0.60480756,
       -0.18533173, -1.0216093 , -0.01696062, -0.37514004,  0.2698937 ,
        0.03835453, -0.09038824, -0.45301875,  0.41017392,  0.26201543,
       -0.31184077, -0.31511658, -0.00475753, -0.2692985 , -0.6435737 ,
       -0.09522572,  0.27801904, -0.61141694,  0.08818445,  0.08

In [6]:
# Check the dimensionality of a word's embedding vector (should equal vector_size)
len(model_sg_n10.wv['이정재'])

100

In [7]:
# Check the similarity score between two words
model_sg_n10.wv.similarity('이정재', '정우성')

0.74404585

In [8]:
# Extract the 20 words most similar to a given word
model_sg_n10.wv.most_similar('이정재', topn=20)

[('송강호', 0.8240769505500793),
 ('공유', 0.8207034468650818),
 ('이범수', 0.7999857068061829),
 ('이성민', 0.7608655095100403),
 ('정우성', 0.7440459132194519),
 ('김범수', 0.7407302260398865),
 ('이병헌', 0.7339077591896057),
 ('김남길', 0.7326019406318665),
 ('조재현', 0.7258879542350769),
 ('이진욱', 0.7235654592514038),
 ('김성균', 0.7132023572921753),
 ('김명민', 0.7127196192741394),
 ('리암', 0.708563506603241),
 ('마동석', 0.7076446413993835),
 ('주지훈', 0.706768810749054),
 ('김윤석', 0.7052406072616577),
 ('곽도원', 0.7039344906806946),
 ('황정민', 0.7013072967529297),
 ('송광호', 0.6964501738548279),
 ('박해일', 0.6936976313591003)]

In [9]:
# Extract the 20 words most similar to a given word: 재밌
model_sg_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.9041098356246948),
 ('재밌네', 0.8312935829162598),
 ('재밌었', 0.8170600533485413),
 ('잼남', 0.8093922138214111),
 ('재밋음', 0.8060846328735352),
 ('잼슴', 0.8011594414710999),
 ('재밌어', 0.8001789450645447),
 ('재밌아', 0.783962070941925),
 ('재밋었어', 0.778581440448761),
 ('재밋었음', 0.7763713002204895),
 ('쟈밋', 0.7722235321998596),
 ('재밋어용', 0.7707235813140869),
 ('재밋엇어용', 0.7662728428840637),
 ('재밋엇음', 0.7640242576599121),
 ('재밋네용', 0.7605178356170654),
 ('엇', 0.7598572969436646),
 ('재밌더', 0.7574360370635986),
 ('재미있었', 0.7549335360527039),
 ('재밋네', 0.7510827779769897),
 ('재밋엇', 0.7496328353881836)]

### Skipgram, negative=5 인 경우

In [11]:
# Create and train the model: skip-gram with 5 negative samples.
model_sg_n5 = Word2Vec(
    corpus,
    vector_size=100,  # embedding dimensionality
    window=3,         # context window size
    min_count=3,      # ignore words seen fewer than 3 times
    sg=1,             # 1 = skip-gram
    negative=5,       # number of negative samples
)

In [24]:
# Look up the learned embedding vector of a word
model_sg_n5.wv['이정재']

array([-0.01500924,  0.1886691 , -0.47442573, -0.56149423, -0.18427987,
       -0.00257667,  0.26962665,  0.24900845,  0.44140688,  0.06200967,
       -0.44759765, -0.18634643,  0.01703296,  0.22040427, -0.3083917 ,
       -0.5078271 , -0.2216234 , -0.3839562 ,  0.61998004, -0.53469235,
        0.06420291, -0.08190833,  0.21335864, -0.10974576,  0.20330657,
        0.11586396,  0.16200915,  0.13004926,  0.45783433,  0.10238216,
        0.07577568,  0.32794192, -0.0295987 , -0.4882616 ,  0.02664905,
       -0.7118783 , -0.09794995, -0.13484834, -0.3194473 , -0.14193954,
       -0.29103827,  0.29178748,  0.16663119,  0.3037012 , -0.35431048,
        0.51077384,  0.00416152,  0.37806025,  0.29581103, -0.12638533,
        0.0928093 , -0.5443455 , -0.2212055 , -0.6548067 ,  0.18526234,
       -0.01332427,  0.02697273, -0.5125231 ,  0.1383291 ,  0.01369506,
       -0.75773615, -0.39115173,  0.36098823, -0.42351195, -0.559229  ,
       -0.29801938,  0.50022995, -0.17849073, -0.0022199 , -0.35

In [25]:
# Extract the 20 words most similar to a given word: 재밌
model_sg_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.901084303855896),
 ('재밌네', 0.8202382326126099),
 ('재밋음', 0.8157777786254883),
 ('잼남', 0.8145461082458496),
 ('재밌었', 0.8144572973251343),
 ('재밌어', 0.8133275508880615),
 ('재밋엇음', 0.7879084348678589),
 ('재밋었음', 0.7727741599082947),
 ('재밋어용', 0.7699772119522095),
 ('재밋네', 0.767364501953125),
 ('잼슴', 0.7631867527961731),
 ('재밋어', 0.7577487826347351),
 ('엇', 0.7568485140800476),
 ('재미있었', 0.7526331543922424),
 ('재밋네용', 0.7514306306838989),
 ('재밌아', 0.7503105401992798),
 ('재밋습니', 0.7502946257591248),
 ('재밋었습니', 0.7471709251403809),
 ('쟈밋', 0.7471442222595215),
 ('재밌슴', 0.7465928792953491)]

### CBOW, negative=10 인 경우

In [26]:
# Create and train the model: CBOW with 10 negative samples.
model_cbow_n10 = Word2Vec(
    corpus,
    vector_size=100,  # embedding dimensionality
    window=3,         # context window size
    min_count=3,      # ignore words seen fewer than 3 times
    sg=0,             # 0 = CBOW
    negative=10,      # number of negative samples
)

In [27]:
# Look up the learned embedding vector of a word
model_cbow_n10.wv['이정재']

array([-8.9919108e-01, -8.8714010e-01,  5.7895768e-02,  1.5683074e-01,
       -6.0539514e-01,  1.3230885e+00,  4.4545803e-02,  8.5013890e-01,
        9.3974447e-01, -5.7417297e-01, -2.0943589e+00, -1.7734858e+00,
        1.0844117e+00, -9.7167838e-01, -7.3306638e-01,  7.5530952e-01,
       -1.9544092e-01,  1.5405792e-01, -6.0065466e-01,  8.5874373e-01,
       -3.6709663e-02,  6.6852671e-01,  1.9916800e+00,  3.5523981e-01,
       -5.5526280e-01,  3.9929789e-01, -1.7751653e-03,  5.9887111e-01,
        7.0121318e-01,  6.0211565e-02,  8.3799496e-02, -5.0383568e-01,
        5.4552609e-01, -8.1528914e-01, -5.4370880e-01, -4.9348456e-01,
        3.6052004e-01, -5.1253760e-01,  1.5824192e+00, -8.6374110e-01,
       -1.2727438e+00,  7.1773976e-01,  1.2232053e+00, -8.6998397e-01,
        1.1450955e-01, -7.3784627e-02,  2.1709895e+00, -1.5808544e-01,
       -7.6308131e-01, -3.1788343e-01,  5.7938850e-01, -9.3422288e-01,
        1.2690709e+00, -5.5343205e-01,  5.2151394e-01,  3.0873999e-01,
      

In [28]:
# Extract the 20 words most similar to a given word: 재밌
model_cbow_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.8938097953796387),
 ('재밌네', 0.8076400756835938),
 ('재밌어', 0.80181485414505),
 ('재밋음', 0.7982984185218811),
 ('재밌었', 0.7776693105697632),
 ('재밌는', 0.7167515754699707),
 ('재밋어', 0.7135028839111328),
 ('잼남', 0.7085589170455933),
 ('재미있네', 0.7054212689399719),
 ('재미있었', 0.7051866054534912),
 ('재밋네', 0.6850832104682922),
 ('재미있어', 0.68181312084198),
 ('재밋엇어', 0.6803247332572937),
 ('재밌더', 0.6750017404556274),
 ('재밌던', 0.6671257615089417),
 ('재밋었', 0.655978262424469),
 ('재밋', 0.6548629999160767),
 ('재밌고', 0.6475374102592468),
 ('재미있던', 0.6447578072547913),
 ('재밌다', 0.6423748731613159)]

### CBOW, negative=5 인 경우

In [19]:
# Create and train the model: CBOW with 5 negative samples.
model_cbow_n5 = Word2Vec(
    corpus,
    vector_size=100,  # embedding dimensionality
    window=3,         # context window size
    min_count=3,      # ignore words seen fewer than 3 times
    sg=0,             # 0 = CBOW
    negative=5,       # number of negative samples
)

In [29]:
# Look up the learned embedding vector of a word
model_cbow_n5.wv['이정재']

array([-0.7257281 , -0.6768872 ,  0.01604948, -0.43529376, -0.90390944,
        1.964444  ,  0.4274932 ,  0.6243791 ,  0.9392622 , -2.0857666 ,
       -0.9065436 , -1.5355284 ,  1.1443706 ,  0.16622242, -0.17079057,
       -0.6859102 , -0.04101451, -0.49447504,  0.9447038 , -0.08656523,
       -2.1754222 ,  1.4861469 ,  1.3940623 , -0.2840115 ,  0.45519274,
        0.6459767 ,  0.46770948,  0.06651357,  0.03557469,  0.16170168,
       -0.6115209 ,  0.42957988,  1.4333256 , -0.5049513 ,  0.40964872,
       -0.4606669 , -0.17447163, -0.15535106,  0.81863546, -0.26015243,
        0.21874781,  0.9974713 ,  0.03283563, -0.8402102 , -0.69674283,
        0.65132546,  0.9764183 ,  0.11590831, -0.777245  , -0.04731341,
        0.22244439, -1.0835397 , -0.43185407, -0.3140751 ,  0.05750364,
        0.03399146,  0.7603365 ,  0.74769974,  0.21113636,  1.2892708 ,
       -0.4380216 , -1.7699184 , -0.954818  , -0.71359247,  0.29221568,
        0.12428732, -0.24269916, -1.4926057 ,  0.6896363 , -0.49

In [30]:
# Extract the 20 words most similar to a given word: 재밌
model_cbow_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.896662175655365),
 ('재밌네', 0.8267502188682556),
 ('재밌어', 0.8146780133247375),
 ('재밌었', 0.7945839762687683),
 ('재밋음', 0.7935121059417725),
 ('재밌는', 0.7337284684181213),
 ('재미있네', 0.7322923541069031),
 ('재미있었', 0.7259728908538818),
 ('재밌더', 0.7246324419975281),
 ('재밋어', 0.7125368714332581),
 ('재밋엇어', 0.7086813449859619),
 ('재미있어', 0.7072996497154236),
 ('재밋네', 0.6992778778076172),
 ('재밌던', 0.6914545297622681),
 ('잼남', 0.6844478249549866),
 ('꿀잼', 0.6698039770126343),
 ('재밌다', 0.6647352576255798),
 ('재밋', 0.6495932340621948),
 ('재미있는', 0.6467329859733582),
 ('재밌고', 0.6437668204307556)]

### OOV(Out of Vocabulary) 문제

In [31]:
# Check whether a word is present in the model vocabulary: 우주평화
'우주평화' in model_sg_n10.wv.key_to_index

False

In [32]:
# Look up the embedding vector of a word that is not in the vocabulary;
# with Word2Vec vectors this lookup raises KeyError.
model_sg_n10.wv['우주평화']

KeyError: "Key '우주평화' not present"

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [35]:
# Create and train a FastText model on the tokenized reviews
# (skip-gram, 10 negative samples, character n-grams of length 2 only).
from gensim.models import FastText

model_ft = FastText(
    corpus,
    vector_size=100,  # embedding dimensionality
    window=3,         # context window size
    min_count=3,      # ignore words seen fewer than 3 times
    min_n=2,          # minimum character n-gram length
    max_n=2,          # maximum character n-gram length
    sg=1,             # 1 = skip-gram
    negative=10,      # number of negative samples
)

In [36]:
# Look up the learned embedding vector of a word: 이정재
model_ft.wv['이정재']

array([-0.05169896,  0.16818574, -0.04339173, -0.15347156,  0.14051971,
       -0.531576  , -0.25006512,  0.65206397,  0.84572726,  0.08389471,
       -0.24879734,  0.1188952 , -0.26153913,  0.16347216, -0.20695135,
        0.0676625 , -0.14017183, -0.60060346,  0.35699087, -0.26028365,
        0.14190087, -0.3592915 ,  0.09560583, -0.0468838 , -0.09837441,
        0.0343164 , -0.13617301, -0.18737139, -0.5888349 , -0.27951506,
        0.15847711, -0.7028911 ,  0.09472108, -0.04818245, -0.35567364,
       -0.1103575 ,  0.25230667, -0.3994171 , -0.20207676,  0.17026538,
       -0.464085  ,  0.03017764, -0.13890031, -0.18342747, -0.65599537,
       -0.35056853, -0.37994274, -0.46942443,  0.03500858, -0.00756844,
       -0.18770854,  0.04803809, -0.18730707,  0.00186331, -0.12313239,
       -0.11449697,  0.44439763,  0.1970244 , -0.03129076,  0.18433014,
        0.05261855, -0.19110547, -0.33274016,  0.2377933 , -0.23131213,
       -0.4410348 ,  0.33354935,  0.06155592, -0.03299984, -0.32

In [37]:
# Check whether a word is present in the model vocabulary: 우주평화

'우주평화' in model_ft.wv.key_to_index

False

In [38]:
# Look up the embedding vector of a word that is not in the vocabulary;
# the FastText vectors still return a vector for it instead of raising KeyError.
model_ft.wv['우주평화']

array([ 0.2967518 ,  0.2343233 ,  0.00172641,  0.20807576, -0.05652617,
       -0.09737702,  0.04573008,  0.74781996,  0.45849705,  0.38437745,
        0.03538332,  0.18617114,  0.05591177,  0.4414092 , -0.28732228,
       -0.17661577, -0.05991974, -0.22013848, -0.03765762,  0.0770898 ,
        0.08464812, -0.15494066, -0.00712381, -0.06891862, -0.05067811,
       -0.05582947, -0.26789472, -0.21552852, -0.35749108, -0.3058882 ,
        0.05957737, -0.17786992, -0.19696817, -0.27347738, -0.16240588,
        0.30668893,  0.1665832 ,  0.25036648, -0.29722467, -0.12628658,
       -0.13547114,  0.07897984, -0.17955746,  0.10827844, -0.32224923,
        0.13662365, -0.09270506,  0.12101535, -0.16317353,  0.1267141 ,
        0.19367105,  0.18681398, -0.0428108 , -0.16587329, -0.21910882,
        0.0106194 ,  0.2018141 ,  0.20136964, -0.17180622,  0.18294592,
        0.09741277,  0.46779776, -0.54443544, -0.18406925, -0.13866779,
       -0.3828046 ,  0.01543306, -0.19727364,  0.23495188, -0.01

In [39]:
# Extract the words most similar to a word that is not in the vocabulary
model_ft.wv.most_similar('우주평화')

[('평화', 0.8095723390579224),
 ('우주', 0.8057762980461121),
 ('우장', 0.8000538349151611),
 ('우주비행사', 0.7988510727882385),
 ('쉘', 0.7948558926582336),
 ('우방', 0.7924414277076721),
 ('볕', 0.7850431799888611),
 ('회색곰', 0.7814286947250366),
 ('아비규환', 0.781424880027771),
 ('경복궁', 0.7811520099639893)]