In [3]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from tqdm.notebook import tqdm
import spacy  # For preprocessing
import nltk
import string
import glob
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [9]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/gunjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gunjan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gunjan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/gunjan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
txt_files = glob.glob('marvel/*.txt')

print(txt_files[:3], end='\n\n')
print("Total number of movie scripts:", len(txt_files))

['marvel/Iron-Man.2.txt', 'marvel/Spider-Man.Far.From.Home.txt', 'marvel/Captain.Marvel.txt']

Total number of movie scripts: 23


In [6]:
marvel_corpus = ''

for file in txt_files:
    with open(file, 'r', encoding="ISO-8859-1") as f:
        text = f.read()
        marvel_corpus += text

In [7]:
marvel_corpus = marvel_corpus.lower()
print('Sample text from the corpus:\n\n', marvel_corpus[9500:10000])

Sample text from the corpus:

 urned. we're adjourned for today.
- okay. - you've been a delight.
my bond is with the people.
and i will serve this great nation at the pleasure of myself.
if there's one thing i've proven
it's that you can count on me to pleasure myself.
wake up. daddy's home.
welcome home, sir.
congratulations on the opening ceremonies.
they were such a success, as was your senate hearing.
and may i say how refreshing it is
to finally see you in a video with your clothing on, sir.
you!
i swear to god i'll dis


## Cleaning:

In [20]:
Word = WordNetLemmatizer()
stop_words = stopwords.words('english')

In [21]:
t = time()

text = marvel_corpus
text=re.sub(r"[^.A-Za-z]",' ',text)
sentence=text.split('.')
tokens=[nltk.word_tokenize(words) for words in sentence]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.04 mins


In [33]:
tokens_filtered = []
temp = []
for sent in tokens:
    temp = []
    for word in sent:
        word = Word.lemmatize(word)
        if word not in stop_words:
            temp.append(word)
    tokens_filtered.append(temp)
       

## bIGRAMS

In [34]:
from gensim.models.phrases import Phrases, Phraser

In [35]:
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value.
phrases = Phrases(tokens_filtered, min_count=20, progress_per=10000)

INFO - 12:30:42: collecting all words and their counts
INFO - 12:30:42: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:30:42: PROGRESS: at sentence #10000, processed 27569 words and 21158 word types
INFO - 12:30:42: PROGRESS: at sentence #20000, processed 56951 words and 40162 word types
INFO - 12:30:42: PROGRESS: at sentence #30000, processed 86987 words and 57782 word types
INFO - 12:30:42: collected 66014 token types (unigram + bigrams) from a corpus of 103118 words and 36103 sentences
INFO - 12:30:42: merged Phrases<66014 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
INFO - 12:30:42: Phrases lifecycle event {'msg': 'built Phrases<66014 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 0.14s', 'datetime': '2022-06-25T12:30:42.224933', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [36]:
bigram = Phraser(phrases)

INFO - 12:30:45: exporting phrases from Phrases<66014 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
INFO - 12:30:45: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<13 phrases, min_count=20, threshold=10.0> from Phrases<66014 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 0.12s', 'datetime': '2022-06-25T12:30:45.435063', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [37]:
sentences = bigram[tokens_filtered]

### Most frequent words

In [38]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

10005

In [39]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['know', 'wa', 'right', 'get', 'yeah', 'like', 'one', 'got', 'okay', 'go']

## Training the Model

In [40]:
import multiprocessing

from gensim.models import Word2Vec

In [41]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [53]:
w2v_model = Word2Vec(workers=cores-1)

INFO - 12:35:47: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2022-06-25T12:35:47.465703', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [54]:
# Building vocabulary table
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 12:35:50: collecting all words and their counts
INFO - 12:35:50: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:35:50: PROGRESS: at sentence #10000, processed 27142 words, keeping 4910 word types
INFO - 12:35:50: PROGRESS: at sentence #20000, processed 56184 words, keeping 7393 word types
INFO - 12:35:50: PROGRESS: at sentence #30000, processed 85788 words, keeping 9269 word types
INFO - 12:35:50: collected 10005 word types from a corpus of 101576 raw words and 36103 sentences
INFO - 12:35:50: Creating a fresh vocabulary
INFO - 12:35:50: Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2833 unique words (28.32% of original 10005, drops 7172)', 'datetime': '2022-06-25T12:35:50.495412', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
INFO - 12:35:50: Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 898

Time to build vocab: 0.0 mins


### Training of model

In [55]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 12:35:55: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 2833 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2022-06-25T12:35:55.363403', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 12:35:55: EPOCH 0: training on 101576 raw words (79137 effective words) took 0.2s, 381511 effective words/s
INFO - 12:35:55: EPOCH 1: training on 101576 raw words (79273 effective words) took 0.2s, 399413 effective words/s
INFO - 12:35:55: EPOCH 2: training on 101576 raw words (79226 effective words) took 0.2s, 396452 effective words/s
INFO - 12:35:56: EPOCH 3: training on 101576 raw words (79364 effective words) took 0.2s, 365748 effective words/s
INFO - 12:35:56: EPOCH 4: training on 101576 raw words (79159 effective words) took 0.4s, 217270 effective words/s
INFO - 12:35:56: EPOCH 5: tra

Time to train the model: 0.04 mins


In [51]:
w2v_model.save('marvel_w2v.bin')

INFO - 12:34:20: Word2Vec lifecycle event {'fname_or_handle': 'marvel_w2v.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-25T12:34:20.175789', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 12:34:20: not storing attribute cum_table
INFO - 12:34:20: saved marvel_w2v.bin


## Exploring the model

In [62]:
# check if the word is in the vocabulary before feeding in

#Get most similar words
w2v_model.wv.most_similar(positive=["hero"])

[('group', 0.9990216493606567),
 ('created', 0.9989855885505676),
 ('company', 0.9989451766014099),
 ('vibranium', 0.998924195766449),
 ('drone', 0.998919665813446),
 ('finally', 0.9989016652107239),
 ('cross', 0.9988691806793213),
 ('enemy', 0.9988670945167542),
 ('pilot', 0.9988518357276917),
 ('fact', 0.9988443851470947)]

In [69]:
w2v_model.wv.similarity("batman", 'joker')

0.46241245

In [70]:
w2v_model.wv.doesnt_match(['batman', 'joker', 'hulk'])

'hulk'

In [78]:
w2v_model.wv.most_similar(positive=["titanic", "james_cameron"], negative=["christopher_nolan"], topn=3)

[('bridge', 0.28366515040397644),
 ('kleenex', 0.28308799862861633),
 ('port', 0.27012956142425537)]

In [66]:
voc = w2v_model.wv.vocab
len(voc)

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4