## Part 1 - Preprocessing

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re
from bs4 import BeautifulSoup
import pandas as pd       
# Load the three tab-separated review files with identical parser settings:
# the first row is the header, and quoting=3 (csv.QUOTE_NONE) keeps the
# double quotes inside the reviews as literal characters.
train, test, unlabeled_train = (
    pd.read_csv(tsv_path, header=0, delimiter="\t", quoting=3)
    for tsv_path in (
        "labeledTrainData.tsv",
        "testData.tsv",
        "unlabeledTrainData.tsv",
    )
)

In [3]:
# Print each frame's (rows, columns) shape first, then the number of entries
# in its 'review' column, always in the order train, test, unlabeled_train.
frames = (train, test, unlabeled_train)

for frame in frames:
    print(frame.shape)

for frame in frames:
    print(frame['review'].size)

(25000, 3)
(25000, 2)
(50000, 2)
25000
25000
50000


In [4]:
# Preview the first rows of the labeled training data (id, sentiment, review).
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
# Preview the first rows of the test data, which has no sentiment column.
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [6]:
# Count the reviews per sentiment label to check the class balance.
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [7]:
from Word2VecUtil import Word2VecUtil
# Sanity check: tokenize the first training review and show its first 10 tokens.
# NOTE(review): the recorded output suggests the helper lower-cases and stems
# words ('going' -> 'go'); confirm in Word2VecUtil.
Word2VecUtil.review_to_wordlist(train['review'][0])[:10]

['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with']

In [8]:
import nltk
# Download the NLTK 'punkt' data package.
# Presumably required by Word2VecUtil.review_to_sentences for sentence
# splitting -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LGPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Start the training corpus from the labeled reviews: split every review into
# sentences and flatten them into one list. Stop words are kept
# (remove_stopwords=False).
sentences = [
    sentence
    for review_text in train['review']
    for sentence in Word2VecUtil.review_to_sentences(
        review_text, remove_stopwords=False)
]

In [10]:
# Append the sentences of the unlabeled reviews to the same corpus.
# NOTE: this extends `sentences` in place, so re-running this cell without
# re-running the previous one adds the unlabeled sentences a second time.
for review in unlabeled_train['review']:
    sentences.extend(
        Word2VecUtil.review_to_sentences(review, remove_stopwords=False)
    )

# This counts sentences, not words: deciding whether a word such as "like" is
# positive or negative requires its surrounding context.
len(sentences)

795538

In [14]:
# Inspect the first 10 tokens of the first sentence in the corpus.
sentences[0][:10]

['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with']

In [20]:
import logging
# Show log records of level INFO and above, each prefixed with a timestamp and
# its severity, so the training progress is visible in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: %(levelname)s : %(message)s',
)

In [21]:
# Train the Word2Vec model on the collected sentences.
from gensim.models import word2vec

# Hyperparameters
num_vector = 300      # dimensionality of the word vectors
min_word_count = 40   # words occurring fewer times than this are ignored
num_workers = 4       # number of worker threads used for training
windows = 10          # context window size (words before and after the target)
downsampling = 1e-3   # down-sampling threshold for frequent words (= 0.001)

# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size`. Confirm the installed gensim version before re-running.
model = word2vec.Word2Vec(
    sentences,
    size=num_vector,
    window=windows,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

2020-04-30 11:17:36,478: INFO : collecting all words and their counts
2020-04-30 11:17:36,501: INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-30 11:17:37,080: INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 12465 word types
2020-04-30 11:17:37,314: INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 17070 word types
2020-04-30 11:17:37,476: INFO : PROGRESS: at sentence #30000, processed 671314 words, keeping 20370 word types
2020-04-30 11:17:37,711: INFO : PROGRESS: at sentence #40000, processed 897814 words, keeping 23125 word types
2020-04-30 11:17:37,941: INFO : PROGRESS: at sentence #50000, processed 1116962 words, keeping 25365 word types
2020-04-30 11:17:38,186: INFO : PROGRESS: at sentence #60000, processed 1338403 words, keeping 27283 word types
2020-04-30 11:17:38,465: INFO : PROGRESS: at sentence #70000, processed 1561579 words, keeping 29024 word types
2020-04-30 11:17:38,970: INFO : PROGRESS: at sentenc

In [22]:
# Free memory once training is done (unload): init_sims(replace=True) keeps
# only the L2-normalized vectors. Per the gensim docs the model can then be
# queried but not trained any further.
model_name = "300vector_40min_10text"
model.init_sims(replace=True)
model.save(model_name)

2020-04-30 11:23:40,787: INFO : precomputing L2-norms of word weight vectors
2020-04-30 11:23:40,862: INFO : saving Word2Vec object under 300vector_40min_10text, separately None
2020-04-30 11:23:40,894: INFO : not storing attribute vectors_norm
2020-04-30 11:23:40,943: INFO : not storing attribute cum_table
2020-04-30 11:23:41,585: INFO : saved 300vector_40min_10text


## Model evaluation


In [23]:
# Ask the model which of the given words fits least with the others.
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [29]:
# Expected odd one out: 'tokyo', the only city among country names.
# NOTE(review): the recorded output below is 'england', so the trained model
# does not reproduce that expectation.
model.wv.doesnt_match("france england korea germany tokyo".split())



'england'

In [25]:
# List the vocabulary words whose vectors are most similar to "man".
model.wv.most_similar("man")

[('woman', 0.6406874656677246),
 ('ladi', 0.5394983291625977),
 ('lad', 0.5073885917663574),
 ('millionair', 0.5039361119270325),
 ('businessman', 0.49491316080093384),
 ('loner', 0.49450647830963135),
 ('widow', 0.4717475175857544),
 ('policeman', 0.4698629677295685),
 ('farmer', 0.4688703715801239),
 ('men', 0.46664324402809143)]

In [26]:
# List the vocabulary words whose vectors are most similar to "school".
model.wv.most_similar("school")

[('schooler', 0.7042346596717834),
 ('colleg', 0.6707702279090881),
 ('junior', 0.6026756763458252),
 ('noon', 0.5779948234558105),
 ('tech', 0.5303436517715454),
 ('gym', 0.5116730332374573),
 ('student', 0.4988793432712555),
 ('graduat', 0.48448991775512695),
 ('class', 0.48017966747283936),
 ('classroom', 0.47576698660850525)]

In [27]:
# List the vocabulary words whose vectors are most similar to "film".
model.wv.most_similar("film")

[('movi', 0.8528131246566772),
 ('flick', 0.6047983169555664),
 ('documentari', 0.5563231706619263),
 ('pictur', 0.5506850481033325),
 ('cinema', 0.5181126594543457),
 ('masterpiec', 0.4901534616947174),
 ('it', 0.4856082797050476),
 ('sequel', 0.4792662262916565),
 ('effort', 0.46896636486053467),
 ('thriller', 0.4656416177749634)]

In [30]:
# List the vocabulary words most similar to "happi". The query uses the
# stemmed form because the recorded outputs show stemmed tokens
# (e.g. 'movi', 'unhappi').
model.wv.most_similar("happi")

[('unhappi', 0.4541730582714081),
 ('bitter', 0.41757822036743164),
 ('afraid', 0.4127125144004822),
 ('satisfi', 0.40617939829826355),
 ('sad', 0.4056433439254761),
 ('happier', 0.40464216470718384),
 ('glad', 0.385473370552063),
 ('upset', 0.38082295656204224),
 ('proud', 0.3802988529205322),
 ('lucki', 0.379236102104187)]

## Part 2 - Word2Vec

## 모델 결과 탐색

### Word2Vec으로 벡터화 한 단어를 t-SNE 를 통해 시각화

### 모델 생성

### Test