In [20]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from tqdm.notebook import tqdm
import spacy  # For preprocessing
import nltk
import string
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [27]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/gunjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gunjan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gunjan/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.shape

(50000, 2)

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
TEXT_COL = 'review'

## Cleaning:

In [28]:
Word = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [13]:
def clean(raw):
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result) # greater than sign
    result = re.sub('&#x27;', "'", result) # apostrophe
    # result = re.sub('&quot;', '"', result) 
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result) # paragraph tag
    result = re.sub('<i>', ' ', result) #italics tag
    result = re.sub('</i>', '', result) 
    result = re.sub('&#62;', '', result)
    result = re.sub("\n", '', result) # newline 
    return result

In [14]:
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    text = re.sub(r"[^a-zA-Z0-9?!.,]+", ' ', text)
    return text

In [15]:
def lower_case(df):
    df[TEXT_COL] = df[TEXT_COL].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [22]:
def lemmatize(df):
    df[TEXT_COL] = df[TEXT_COL].apply(lambda x: " ".join([Word.lemmatize(word) for word in x.split()]))

In [17]:
def stop_words_remove(df):
    df[TEXT_COL] = df[TEXT_COL].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

In [23]:
def preprocess(df):
    df[TEXT_COL] = df[TEXT_COL].apply(clean)
    df[TEXT_COL] = df[TEXT_COL].apply(lambda x: remove_punct(x))
    lower_case(df)
    stop_words_remove(df)
    lemmatize(df)

In [29]:
t = time()

preprocess(df)

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.56 mins


## Bigrams

In [31]:
from gensim.models.phrases import Phrases, Phraser

In [32]:
sent = [row.split() for row in df[TEXT_COL]]

In [33]:
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value.
phrases = Phrases(sent, min_count=20, progress_per=5000)

INFO - 11:26:30: collecting all words and their counts
INFO - 11:26:30: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 11:26:31: PROGRESS: at sentence #5000, processed 608451 words and 481638 word types
INFO - 11:26:31: PROGRESS: at sentence #10000, processed 1215835 words and 861349 word types
INFO - 11:26:32: PROGRESS: at sentence #15000, processed 1827330 words and 1206409 word types
INFO - 11:26:33: PROGRESS: at sentence #20000, processed 2431867 words and 1525124 word types
INFO - 11:26:34: PROGRESS: at sentence #25000, processed 3043520 words and 1831358 word types
INFO - 11:26:34: PROGRESS: at sentence #30000, processed 3646687 words and 2118658 word types
INFO - 11:26:35: PROGRESS: at sentence #35000, processed 4256720 words and 2401073 word types
INFO - 11:26:36: PROGRESS: at sentence #40000, processed 4859876 words and 2668485 word types
INFO - 11:26:37: PROGRESS: at sentence #45000, processed 5477871 words and 2934388 word types
INFO - 11:26:38: collecte

In [34]:
bigram = Phraser(phrases)

INFO - 11:26:51: exporting phrases from Phrases<3184082 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
INFO - 11:26:57: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<4902 phrases, min_count=20, threshold=10.0> from Phrases<3184082 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 5.93s', 'datetime': '2022-06-25T11:26:57.843759', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [35]:
sentences = bigram[sent]

### Most frequent words

In [36]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

168334

In [37]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['movie',
 'film',
 'br',
 'one',
 'like',
 'good',
 'time',
 'character',
 'get',
 'would']

## Training the Model

In [38]:
import multiprocessing

from gensim.models import Word2Vec

In [39]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [42]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     sg=1,
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 11:30:13: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-06-25T11:30:13.447139', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [43]:
# Building vocabulary table
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:30:45: collecting all words and their counts
INFO - 11:30:45: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:30:46: PROGRESS: at sentence #10000, processed 1140281 words, keeping 70769 word types
INFO - 11:30:47: PROGRESS: at sentence #20000, processed 2280442 words, keeping 101555 word types
INFO - 11:30:48: PROGRESS: at sentence #30000, processed 3419370 words, keeping 126312 word types
INFO - 11:30:49: PROGRESS: at sentence #40000, processed 4557220 words, keeping 148615 word types
INFO - 11:30:50: collected 168334 word types from a corpus of 5701144 raw words and 50000 sentences
INFO - 11:30:50: Creating a fresh vocabulary
INFO - 11:30:50: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 21975 unique words (13.05% of original 168334, drops 146359)', 'datetime': '2022-06-25T11:30:50.710895', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc

Time to build vocab: 0.08 mins


### Training of model

In [44]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 11:31:20: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 21975 vocabulary and 300 features, using sg=1 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2022-06-25T11:31:20.467640', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 11:31:21: EPOCH 0 - PROGRESS: at 4.96% examples, 161011 words/s, in_qsize 21, out_qsize 1
INFO - 11:31:22: EPOCH 0 - PROGRESS: at 10.28% examples, 165291 words/s, in_qsize 21, out_qsize 0
INFO - 11:31:23: EPOCH 0 - PROGRESS: at 16.30% examples, 176505 words/s, in_qsize 17, out_qsize 4
INFO - 11:31:24: EPOCH 0 - PROGRESS: at 22.84% examples, 186091 words/s, in_qsize 22, out_qsize 0
INFO - 11:31:25: EPOCH 0 - PROGRESS: at 29.01% examples, 185517 words/s, in_qsize 22, out_qsize 0
INFO - 11:31:26: EPOCH 0 - PROGRESS: at 36.01% examples, 192490 words/s, in_qsize 21, out_qsize 0
INFO - 11:31:27

Time to train the model: 7.91 mins


In [46]:
w2v_model.save('imdb_review_w2v.bin')

INFO - 11:40:40: Word2Vec lifecycle event {'fname_or_handle': 'imdb_review_w2v.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-25T11:40:40.931394', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 11:40:40: not storing attribute cum_table
INFO - 11:40:40: saved imdb_review_w2v.bin


## Exploring the model

In [76]:
# check if the word is in the vocabulary before feeding in

#Get most similar words
w2v_model.wv.most_similar(positive=["christopher_nolan"])

[('batman_begin', 0.37667492032051086),
 ('david_mamet', 0.37097829580307007),
 ('batman_robin', 0.3625071942806244),
 ('nolans', 0.3601624071598053),
 ('memento', 0.3583686351776123),
 ('great', 0.3400098979473114),
 ('entire_cast', 0.32693397998809814),
 ('ridley_scott', 0.32323116064071655),
 ('excellent', 0.32223260402679443),
 ('look_forward', 0.31359919905662537)]

In [69]:
w2v_model.wv.similarity("batman", 'joker')

0.46241245

In [70]:
w2v_model.wv.doesnt_match(['batman', 'joker', 'hulk'])

'hulk'

In [78]:
w2v_model.wv.most_similar(positive=["titanic", "james_cameron"], negative=["christopher_nolan"], topn=3)

[('bridge', 0.28366515040397644),
 ('kleenex', 0.28308799862861633),
 ('port', 0.27012956142425537)]