In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from tqdm.notebook import tqdm
import spacy  # For preprocessing
import nltk
import string
import glob
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer
import warnings

warnings.filterwarnings("ignore")

import logging  # Setting up the loggings to monitor gensim

logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)


In [2]:
from gensim.parsing.preprocessing import preprocess_string


In [3]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")


[nltk_data] Downloading package stopwords to /home/gunjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gunjan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gunjan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/gunjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
txt_files = glob.glob("marvel/*.txt")

print(txt_files[:3], end="\n\n")
print("Total number of movie scripts:", len(txt_files))


['marvel/Iron-Man.2.txt', 'marvel/Spider-Man.Far.From.Home.txt', 'marvel/Captain.Marvel.txt']

Total number of movie scripts: 23


In [5]:
marvel_corpus = ""

for file in txt_files:
    with open(file, "r", encoding="ISO-8859-1") as f:
        text = f.read()
        marvel_corpus += text


In [6]:
marvel_corpus = marvel_corpus.lower()
print("Sample text from the corpus:\n\n", marvel_corpus[9500:10000])


Sample text from the corpus:

 urned. we're adjourned for today.
- okay. - you've been a delight.
my bond is with the people.
and i will serve this great nation at the pleasure of myself.
if there's one thing i've proven
it's that you can count on me to pleasure myself.
wake up. daddy's home.
welcome home, sir.
congratulations on the opening ceremonies.
they were such a success, as was your senate hearing.
and may i say how refreshing it is
to finally see you in a video with your clothing on, sir.
you!
i swear to god i'll dis


## Cleaning:


In [7]:
Word = WordNetLemmatizer()
stop_words = stopwords.words("english")


In [13]:
t = time()

text = marvel_corpus
text = re.sub(r"[^.A-Za-z]", " ", text)
sentence_list = text.split(".")
print(sentence_list[0])
sentences_without_stopword = []
for sentence in sentence_list:
    temp = []
    for word in sentence.split():
        if word not in stop_words:
            temp.append(word)
    sentences_without_stopword.append(" ".join(temp))
sentences_without_stopword[0]

been a while since i was up here in front of you


'since front'

In [14]:
tokens=[nltk.word_tokenize(words) for words in sentences_without_stopword]
tokens[0]

['since', 'front']

## bIGRAMS


In [15]:
from gensim.models.phrases import Phrases, Phraser


In [16]:
# min_count (float, optional) – Ignore all words and bigrams with total collected count lower than this value.
phrases = Phrases(tokens, min_count=20, progress_per=10000)


INFO - 17:26:54: collecting all words and their counts
INFO - 17:26:54: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:26:54: PROGRESS: at sentence #10000, processed 27219 words and 21466 word types
INFO - 17:26:54: PROGRESS: at sentence #20000, processed 56151 words and 40845 word types
INFO - 17:26:54: PROGRESS: at sentence #30000, processed 85759 words and 58899 word types
INFO - 17:26:54: collected 67354 token types (unigram + bigrams) from a corpus of 101708 words and 36103 sentences
INFO - 17:26:54: merged Phrases<67354 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
INFO - 17:26:54: Phrases lifecycle event {'msg': 'built Phrases<67354 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 0.13s', 'datetime': '2022-06-27T17:26:54.473865', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [17]:
bigram = Phraser(phrases)


INFO - 17:26:57: exporting phrases from Phrases<67354 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
INFO - 17:26:57: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<15 phrases, min_count=20, threshold=10.0> from Phrases<67354 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 0.11s', 'datetime': '2022-06-27T17:26:57.625815', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [18]:
sentences = bigram[tokens]


### Most frequent words


In [19]:
word_freq = defaultdict(int)
for sent in sentences:
    for word in sent:
        word_freq[word] += 1
len(word_freq)


11178

In [20]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]


['know', 'right', 'get', 'yeah', 'like', 'one', 'got', 'okay', 'gon_na', 'go']

## Training the Model


In [21]:
import multiprocessing

from gensim.models import Word2Vec


In [22]:
cores = multiprocessing.cpu_count()  # Count the number of cores in a computer


In [32]:
w2v_model = Word2Vec(min_count=20,
                     vector_size=300,
                     sample=6e-5,
                     sg=1,
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)


INFO - 17:28:54: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2022-06-27T17:28:54.594144', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'created'}


In [33]:
# Building vocabulary table
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print("Time to build vocab: {} mins".format(round((time() - t) / 60, 2)))


INFO - 17:28:55: collecting all words and their counts
INFO - 17:28:55: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:28:55: PROGRESS: at sentence #10000, processed 26761 words, keeping 5350 word types
INFO - 17:28:55: PROGRESS: at sentence #20000, processed 55327 words, keeping 8187 word types
INFO - 17:28:55: PROGRESS: at sentence #30000, processed 84479 words, keeping 10338 word types
INFO - 17:28:55: collected 11178 word types from a corpus of 100068 raw words and 36103 sentences
INFO - 17:28:55: Creating a fresh vocabulary
INFO - 17:28:55: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 866 unique words (7.75% of original 11178, drops 10312)', 'datetime': '2022-06-27T17:28:55.802218', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'prepare_vocab'}
INFO - 17:28:55: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 6

Time to build vocab: 0.0 mins


### Training of model


In [50]:
t = time()

w2v_model.train(
    sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1
)

print("Time to train the model: {} mins".format(round((time() - t) / 60, 2)))


INFO - 17:30:47: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 866 vocabulary and 300 features, using sg=1 hs=0 sample=6e-05 negative=20 window=5 shrink_windows=True', 'datetime': '2022-06-27T17:30:47.149422', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'train'}
INFO - 17:30:47: EPOCH 0: training on 100068 raw words (17217 effective words) took 0.2s, 93866 effective words/s
INFO - 17:30:47: EPOCH 1: training on 100068 raw words (17292 effective words) took 0.2s, 91929 effective words/s
INFO - 17:30:47: EPOCH 2: training on 100068 raw words (17356 effective words) took 0.2s, 102355 effective words/s
INFO - 17:30:48: EPOCH 3: training on 100068 raw words (17237 effective words) took 0.3s, 53461 effective words/s
INFO - 17:30:48: EPOCH 4: training on 100068 raw words (17083 effective words) took 0.2s, 95196 effective words/s
INFO - 17:30:48: EPOCH 5: trainin

Time to train the model: 0.1 mins


In [65]:
w2v_model.save("marvel_w2v.bin")


INFO - 17:32:35: Word2Vec lifecycle event {'fname_or_handle': 'marvel_w2v.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-06-27T17:32:35.947632', 'gensim': '4.2.0', 'python': '3.8.0 (default, Nov  6 2019, 21:49:08) \n[GCC 7.3.0]', 'platform': 'Linux-5.13.0-51-generic-x86_64-with-glibc2.10', 'event': 'saving'}
INFO - 17:32:35: not storing attribute cum_table
INFO - 17:32:35: saved marvel_w2v.bin


## Exploring the model


In [64]:
# check if the word is in the vocabulary before feeding in

# Get most similar words
w2v_model.wv.most_similar(positive=[""])


[('throne', 0.9986668229103088),
 ('king', 0.998475968837738),
 ('loki', 0.9984367489814758),
 ('father', 0.998172402381897),
 ('without', 0.9980611205101013),
 ('destroyed', 0.9980530738830566),
 ('came', 0.9980035424232483),
 ('yet', 0.997901201248169),
 ('war', 0.9978365898132324),
 ('much', 0.9978256225585938)]

In [69]:
w2v_model.wv.similarity("batman", "joker")


0.46241245

In [70]:
w2v_model.wv.doesnt_match(["batman", "joker", "hulk"])


'hulk'

In [78]:
w2v_model.wv.most_similar(
    positive=["titanic", "james_cameron"], negative=["christopher_nolan"], topn=3
)


[('bridge', 0.28366515040397644),
 ('kleenex', 0.28308799862861633),
 ('port', 0.27012956142425537)]

In [66]:
voc = w2v_model.wv.vocab
len(voc)


AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4