In [3]:
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/labeledTrainData.tsv/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/content/drive/MyDrive/testData.tsv/testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("/content/drive/MyDrive/unlabeledTrainData.tsv/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

print("Read %d labeled train reviews, %d labeled test reviews, " \
      "and %d unlabeled reviews\n" % (train["review"].size,
                                      test["review"].size, unlabeled_train["review"].size))

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [5]:
!pip install BeautifulSoup4



In [7]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
  review_text = BeautifulSoup(review).get_text()

  review_text = re.sub("[^a-zA-Z]"," ", review_text)

  words = review_text.lower().split()

  if remove_stopwords:
    stops = set(stopwords.words("english"))
    words = [w for w in words if not w in stops]

  return(words)

In [8]:
import nltk.data
nltk.download()

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_sentences(review, tokenizer, remove_stopwords=False):
  raw_sentences = tokenizer.tokenize(review.strip())

  sentences = []
  for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
      sentences.append(review_to_wordlist(raw_sentence, \
                                          remove_stopwords))
  return sentences

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all


    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package averaged_perceptron_tagger to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger.zip.
       | Downloading package averaged_perceptron_tagger_ru to
       |     /root/nltk_data...
       |   Unzipping taggers/averaged_perceptron_tagger_ru.zip.
       | Downloading package basque_grammars to /root/nltk_data...
       |   Unzipping grammars/basque_grammars.zip.
       | Downloading package bcp47 to /root/nltk_data...
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package bllip_wsj_no_aux to /root/nltk_data...
       |   Unzipping models/bllip_wsj_no_aux.zip.
       | Downloading package book_grammars to


---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [9]:
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
  sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
  sentences += review_to_sentences(review, tokenizer)

Parsing sentences from training set


  review_text = BeautifulSoup(review).get_text()


Parsing sentences from unlabeled set


In [10]:
print(len(sentences))

75000


In [11]:
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [12]:
print(sentences[1])

['the', 'classic', 'war', 'of', 'the', 'worlds', 'by', 'timothy', 'hines', 'is', 'a', 'very', 'entertaining', 'film', 'that', 'obviously', 'goes', 'to', 'great', 'effort', 'and', 'lengths', 'to', 'faithfully', 'recreate', 'h', 'g', 'wells', 'classic', 'book']


In [16]:
import logging
logging.basicConfig(format='%(asctime)s : %(levekname)s : %(message)s', \
    level=logging.INFO)

num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, vector_size = num_features,
                          min_count = min_word_count, window = context, sample = downsampling)

model.init_sims(replace=True)

model_name = "300featrues_40minwords_10context"
model.save(model_name)

Training model...


  model.init_sims(replace=True)


In [19]:
model.wv.doesnt_match("man woman child kitchen".split())



'child'

In [20]:
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [21]:
model.wv.doesnt_match("paris berlin london austria".split())



'paris'

In [22]:
model.wv.most_similar("man")

[('scientist', 0.7268587946891785),
 ('boy', 0.7222631573677063),
 ('woman', 0.7192825078964233),
 ('doctor', 0.7145560383796692),
 ('girl', 0.6911660432815552),
 ('cop', 0.6910780072212219),
 ('guy', 0.6677914261817932),
 ('photographer', 0.656509518623352),
 ('himself', 0.6535006761550903),
 ('named', 0.6519820094108582)]

In [23]:
model.wv.most_similar("queen")

[('mrs', 0.8437225818634033),
 ('southern', 0.8428744077682495),
 ('mary', 0.8375134468078613),
 ('model', 0.8349179625511169),
 ('former', 0.8334486484527588),
 ('ruth', 0.8330400586128235),
 ('widow', 0.8286206722259521),
 ('owner', 0.8264226317405701),
 ('boss', 0.8256161212921143),
 ('prince', 0.8255757689476013)]

In [24]:
model.wv.most_similar("awful")

[('horrible', 0.8597468137741089),
 ('terrible', 0.8068706393241882),
 ('awesome', 0.667228102684021),
 ('dreadful', 0.6645852327346802),
 ('ridiculous', 0.6623292565345764),
 ('unbelievable', 0.6550235748291016),
 ('atrocious', 0.6548013687133789),
 ('amazing', 0.6534857749938965),
 ('crap', 0.6397861838340759),
 ('boring', 0.6205078959465027)]