In [1]:
import pandas as pd
train = pd.read_csv( "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
print("Read %d labeled train reviews\n, %d labeled test reviews\n, and %d unlabeled reiews" % (train["review"].size, test["review"].size, unlabeled_train["review"].size))

Read 25000 labeled train reviews
, 25000 labeled test reviews
, and 50000 unlabeled reiews


In [2]:
#Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
def review_to_wordlist(review, remove_stopwords=False):
    #문서를 단어들의 집합으로 바꾸는 함수
    #선택적으로 stop words를 없애고, 단어의 list를 반환
    #1. HTML을 없애기
    review_text = BeautifulSoup(review).get_text()
    #2. 글자가 아닌 것을 없애기
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #3. 소문자화하고 나누기
    words = review_text.lower().split()
    #4. 선택적으로 stop words를 없애기 (기본적으론 False)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #5. 단어 리스트를 반환
    return(words)

In [3]:
import nltk
nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# 리뷰를 파싱된 문장으로 바꾸기 위한 함수
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # 리뷰를 파싱된 문장으로 바꾸기 위한 함수. 문장의 리스트를 반환.
    # 각 문장은 단어들의 리스트로 이루어져 있음.
    # 1. NLTK tokenizer를 이용해서 단락을 문장으로 바꾼다.
    raw_sentences = tokenizer.tokenize(review.strip())
    # 2. 각 문장들을 Loop(순환)
    sentences = []
    for raw_sentence in raw_sentences:
        #만약 문장이 비어있으면, 스킵
        if len(raw_sentence) > 0:
            # 그렇지 않으면, 단어의 리스트를 받아오기 위해서
            # review_to_wordlist를 call해옴
            sentences.append( review_to_wordlist(raw_sentence,remove_stopwords) )
    # 문장의 리스트를 반환(각 문장은 단어 리스트로 이루어져 있음. 즉
    # 리스트들의 리스트를 반환하는 것)
    return sentences

[nltk_data] Downloading package punkt to /Users/mongdu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:

sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer, remove_stopwords=False)
print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer, remove_stopwords=False)

Parsing sentences from training set
Parsing sentences from unlabeled set


In [10]:
print(len(sentences))

795538


In [7]:
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [8]:
print(sentences[1])

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [11]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [12]:
num_features = 300
logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s', level=logging.INFO)

In [21]:
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, size = num_features, min_count=min_word_count, window = context, sample = downsampling)
Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2020-07-25 16:15:07,684 : INFO : collecting all words and their counts
2020-07-25 16:15:07,688 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-25 16:15:07,762 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2020-07-25 16:15:07,825 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types


Training model...


2020-07-25 16:15:07,893 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2020-07-25 16:15:07,979 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2020-07-25 16:15:08,038 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2020-07-25 16:15:08,098 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2020-07-25 16:15:08,159 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2020-07-25 16:15:08,219 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2020-07-25 16:15:08,292 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2020-07-25 16:15:08,369 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2020-07-25 16:15:08,446 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 

NameError: name 'Word2Vec' is not defined

In [22]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
2020-07-25 16:18:55,182 : INFO : precomputing L2-norms of word weight vectors
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [23]:
model.doesnt_match("paris berlin london austria".split())
'paris'

  """Entry point for launching an IPython kernel.


'paris'

In [25]:
model.most_similar("coffee")

  """Entry point for launching an IPython kernel.


[('beer', 0.7703843712806702),
 ('lunch', 0.6994674801826477),
 ('drink', 0.6896820664405823),
 ('kitchen', 0.678272008895874),
 ('bottle', 0.6762230396270752),
 ('soda', 0.6750398874282837),
 ('wine', 0.6666887998580933),
 ('shopping', 0.6614474654197693),
 ('shop', 0.6610244512557983),
 ('pot', 0.6606205701828003)]