### Load Gensim Library

In [None]:
!pip install gensim --quiet

In [None]:
import gensim

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                    level=logging.INFO)

### Load Text Data

Data can be downloaded from https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [None]:
#This is needed only if you have uploaded data to Google drive
from google.colab import drive
drive.mount('/gdrive')

In [None]:
import pandas as pd

#change file path to point to where you have stored the zip file.
df = pd.read_csv('/gdrive/My Drive/AI-ML/unlabeledTrainData.tsv.zip', header=0, delimiter="\t", quoting=3)

In [6]:
print('Number of examples in Dataset: ', df.shape)
df.head()

Number of examples in Dataset:  (50000, 2)


Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [7]:
df.loc[0, 'review']

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

### Function to Clean up data

In [8]:
import re, string

def clean_str(string):
  """
  String cleaning before vectorization
  """
  try:    
    string = re.sub(r'^https?:\/\/<>.*[\r\n]*', '', string, flags=re.MULTILINE)
    string = re.sub(r"[^A-Za-z]", " ", string)         
    words = string.strip().lower().split()    
    words = [w for w in words if len(w)>=1]
    return " ".join(words)
  except:
    return ""

### Clean the Data using routine above

In [9]:
df['clean_review'] = df['review'].apply(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


### Convert Review to a Word List

In [10]:
#List to hold all words in each review
documents = []

#Iterate over each review
for doc in df['clean_review']:
    documents.append(doc.split(' '))

In [11]:
print(len(documents))

50000


In [12]:
print(documents[0])

['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


### Build the Model

In [13]:
#Build the model
model = gensim.models.Word2Vec(documents, #Word list
                               min_count=10, #Ignore all words with total frequency lower than this                           
                               workers=4, #Number of CPU Cores
                               size=50,  #Embedding size
                               window=5, #Neighbours on the left and right
                               iter=10   #Number of iterations over the text corpus
                              ) 

2021-01-17 08:11:51,636 : INFO : collecting all words and their counts
2021-01-17 08:11:51,638 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-17 08:11:52,148 : INFO : PROGRESS: at sentence #10000, processed 2399440 words, keeping 51654 word types
2021-01-17 08:11:52,664 : INFO : PROGRESS: at sentence #20000, processed 4835846 words, keeping 69077 word types
2021-01-17 08:11:53,175 : INFO : PROGRESS: at sentence #30000, processed 7267977 words, keeping 81515 word types
2021-01-17 08:11:53,692 : INFO : PROGRESS: at sentence #40000, processed 9669772 words, keeping 91685 word types
2021-01-17 08:11:54,207 : INFO : collected 100479 word types from a corpus of 12084660 raw words and 50000 sentences
2021-01-17 08:11:54,209 : INFO : Loading a fresh vocabulary
2021-01-17 08:11:54,630 : INFO : effective_min_count=10 retains 28322 unique words (28% of original 100479, drops 72157)
2021-01-17 08:11:54,631 : INFO : effective_min_count=10 leaves 11910457 word cor

In [None]:
#documents[0]

# Exploring the model

### How many words in the model

In [14]:
#Model size
model.wv.vectors.shape

(28322, 50)

In [15]:
# Vocablury of the model
model.wv.vocab

{'watching': <gensim.models.keyedvectors.Vocab at 0x7f90096aca20>,
 'time': <gensim.models.keyedvectors.Vocab at 0x7f90096aca90>,
 'chasers': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b470>,
 'it': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b518>,
 'obvious': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b588>,
 'that': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b5c0>,
 'was': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b5f8>,
 'made': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b630>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b668>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b6a0>,
 'bunch': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b6d8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b710>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b748>,
 'maybe': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b780>,
 'they': <gensim.models.keyedvectors.Vocab at 0x7f8fc220b7b8>,
 'were': <gensim.models.keyedvectors.Vocab at 0x7f

### Get an embedding for a word

In [19]:
model.wv['corners']

array([ 3.1250823e-01,  6.2953025e-01, -5.3930151e-01, -2.2235082e-01,
        3.4406394e-02, -4.0225539e-02, -8.1245676e-03,  8.2767147e-01,
        8.2249916e-01, -1.0689138e-05,  3.7456724e-01,  1.9993152e-01,
       -8.2618707e-01,  4.6319214e-01,  2.0958680e-01,  2.1144494e-01,
       -9.5636356e-01,  2.4882241e-01, -3.2372776e-01,  8.6474085e-01,
       -1.8678010e-01, -2.1509089e-01, -1.1593798e+00,  4.8650544e-02,
        3.8986304e-01, -3.6180177e-01,  7.2912753e-02, -6.1790681e-01,
        1.4091675e+00, -5.7116175e-01, -4.4975586e-02, -3.3175018e-01,
        1.3980058e+00, -9.6290588e-01, -5.3014588e-01,  8.0354804e-01,
        5.3642583e-01, -2.7390775e-01, -3.5282066e-01, -5.4663581e-01,
        2.9263896e-01,  1.0129520e+00, -2.7111357e-01, -5.3921586e-01,
        2.5332987e-01,  1.5646270e-02,  1.1138320e-02, -8.5327888e-01,
        4.8182723e-01, -3.2837741e-02], dtype=float32)

### Finding Words which have similar meaning

In [20]:
model.wv.most_similar('great', topn=15)

2021-01-17 08:23:38,819 : INFO : precomputing L2-norms of word weight vectors


[('fantastic', 0.8950309157371521),
 ('wonderful', 0.8747765421867371),
 ('terrific', 0.8665562868118286),
 ('fine', 0.8425877690315247),
 ('good', 0.8256796598434448),
 ('brilliant', 0.8110665082931519),
 ('superb', 0.7785057425498962),
 ('perfect', 0.7527344822883606),
 ('nice', 0.7472909092903137),
 ('amazing', 0.73293536901474),
 ('marvelous', 0.730586051940918),
 ('awesome', 0.72437584400177),
 ('fabulous', 0.7235651612281799),
 ('spectacular', 0.7229869365692139),
 ('remarkable', 0.7209872007369995)]

### Find the word which is not like others

In [21]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

### Saving the model

In [22]:
model.save('word2vec-movie-50')

2021-01-17 08:29:57,141 : INFO : saving Word2Vec object under word2vec-movie-50, separately None
2021-01-17 08:29:57,143 : INFO : not storing attribute vectors_norm
2021-01-17 08:29:57,145 : INFO : not storing attribute cum_table
2021-01-17 08:29:57,367 : INFO : saved word2vec-movie-50


In [25]:
!ls -l

total 18536
drwxr-xr-x 1 root root     4096 Jan  6 18:10 sample_data
-rw-r--r-- 1 root root 18975625 Jan 17 08:29 word2vec-movie-50


In [23]:
#Load model from memory
model = gensim.models.Word2Vec.load('word2vec-movie-50')

2021-01-17 08:30:03,112 : INFO : loading Word2Vec object from word2vec-movie-50
2021-01-17 08:30:03,273 : INFO : loading wv recursively from word2vec-movie-50.wv.* with mmap=None
2021-01-17 08:30:03,274 : INFO : setting ignored attribute vectors_norm to None
2021-01-17 08:30:03,275 : INFO : loading vocabulary recursively from word2vec-movie-50.vocabulary.* with mmap=None
2021-01-17 08:30:03,276 : INFO : loading trainables recursively from word2vec-movie-50.trainables.* with mmap=None
2021-01-17 08:30:03,276 : INFO : setting ignored attribute cum_table to None
2021-01-17 08:30:03,277 : INFO : loaded word2vec-movie-50


1. Equation king + man = queen + ?
2. In this case there may not be enough data for this equation

In [24]:
model.most_similar(positive=['king','man'], negative=['queen'])

  """Entry point for launching an IPython kernel.
2021-01-17 08:32:15,592 : INFO : precomputing L2-norms of word weight vectors


[('soldier', 0.5844407677650452),
 ('scientist', 0.5803942084312439),
 ('marine', 0.5649991631507874),
 ('buio', 0.5623047947883606),
 ('master', 0.5462765693664551),
 ('warlord', 0.5370714664459229),
 ('seed', 0.5318151712417603),
 ('cop', 0.5304993987083435),
 ('filmmaker', 0.5286292433738708),
 ('joker', 0.5281445384025574)]

In [None]:
model.wv['king'] + model.wv['man'] - model.wv['queen']