In [3]:
pip install -U spacy

Collecting spacy
  Downloading spacy-3.2.0-cp38-cp38-win_amd64.whl (12.1 MB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp38-cp38-win_amd64.whl (113 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp38-cp38-win_amd64.whl (36 kB)
Collecting typer<0.5.0,>=0.3.0
  Downloading typer-0.4.0-py3-none-any.whl (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.2-cp38-cp38-win_amd64.whl (452 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
Collecting blis<0.8.0,>=0.4.0
  Downloading blis-0.7.5-cp38-cp38-win_amd64.whl (6.6 MB)
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.13-cp38-cp38-win_amd64.whl (1.0 MB)
Collecting langcodes<4.0.0,>=3.2.0
  Do

In [24]:
from spacy.cli.download import download
download(model="en_core_web_sm")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


## loading the Libraries and the data

In [85]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing
import gensim
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [4]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

## Preprocessing the data:

In [6]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [7]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [8]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [10]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 6.54 mins


In [83]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean=df_clean.reset_index(drop=True)

In [84]:
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
1,know sure like talk touch lesson plan teach
2,life worth live
3,poll open end recess case decide thought final...
4,victory party slide


In [166]:
df_clean.shape

(85952, 1)

## Tokenizing and creating a bigram content:

In [150]:
content=[gensim.utils.simple_preprocess(row) for row in df_clean['clean']]

In [151]:
content

[['actually',
  'little',
  'disease',
  'magazine',
  'news',
  'show',
  'natural',
  'think'],
 ['know', 'sure', 'like', 'talk', 'touch', 'lesson', 'plan', 'teach'],
 ['life', 'worth', 'live'],
 ['poll',
  'open',
  'end',
  'recess',
  'case',
  'decide',
  'thought',
  'final',
  'statement',
  'martin'],
 ['victory', 'party', 'slide'],
 ['mr', 'bergstrom', 'mr', 'bergstrom'],
 ['hey',
  'hey',
  'move',
  'morning',
  'new',
  'job',
  'take',
  'copernicus',
  'costume'],
 ['think', 'take', 'train', 'capital', 'city'],
 ['train', 'like', 'traditional', 'environmentally', 'sound'],
 ['yes',
  'backbone',
  'country',
  'leland',
  'stanford',
  'drive',
  'golden',
  'spike',
  'promontory',
  'point'],
 ['hey', 'thank', 'vote', 'man'],
 ['vote', 'voting', 'geek'],
 ['get', 'right', 'thank', 'vote', 'girl'],
 ['sweat', 'long', 'couple', 'people', 'right', 'milhouse'],
 ['martin', 'martin', 'like', 'recount'],
 ['want', 'sure', 'martin', 'martin'],
 ['way', 'mister', 'president'],

In [97]:
#Creating relevant phrases from the list of sentences by using Gensim Phrases package
from gensim.models.phrases import Phrases, Phraser

In [100]:
phrases = Phrases(content, min_count=30, progress_per=10000)

INFO - 14:11:36: collecting all words and their counts
INFO - 14:11:36: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 14:11:36: PROGRESS: at sentence #10000, processed 63557 words and 52763 word types
INFO - 14:11:36: PROGRESS: at sentence #20000, processed 130938 words and 99744 word types
INFO - 14:11:36: PROGRESS: at sentence #30000, processed 192959 words and 138351 word types
INFO - 14:11:37: PROGRESS: at sentence #40000, processed 249832 words and 172387 word types
INFO - 14:11:37: PROGRESS: at sentence #50000, processed 311271 words and 208254 word types
INFO - 14:11:37: PROGRESS: at sentence #60000, processed 373576 words and 243325 word types
INFO - 14:11:37: PROGRESS: at sentence #70000, processed 436427 words and 278322 word types
INFO - 14:11:37: PROGRESS: at sentence #80000, processed 497902 words and 311462 word types
INFO - 14:11:37: collected 330189 token types (unigram + bigrams) from a corpus of 537083 words and 85952 sentences
INFO - 14:11:37: m

In [101]:
bigram = Phraser(phrases)

INFO - 14:11:39: exporting phrases from Phrases<330189 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 14:11:40: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<127 phrases, min_count=30, threshold=10.0> from Phrases<330189 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 1.79s', 'datetime': '2021-12-11T14:11:40.854033', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042-SP0', 'event': 'created'}


In [104]:
#Transform the corpus based on the  detected bigrams
sentences = bigram[content]

## Building and training W2V Model:

In [251]:
import multiprocessing

from gensim.models import Word2Vec

In [252]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [288]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)
w2v_model.build_vocab(sentences, progress_per=10000)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

INFO - 15:33:38: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.03)', 'datetime': '2021-12-11T15:33:38.701736', 'gensim': '4.1.2', 'python': '3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042-SP0', 'event': 'created'}
INFO - 15:33:38: collecting all words and their counts
INFO - 15:33:38: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:33:38: PROGRESS: at sentence #10000, processed 61694 words, keeping 9501 word types
INFO - 15:33:39: PROGRESS: at sentence #20000, processed 127310 words, keeping 14369 word types
INFO - 15:33:39: PROGRESS: at sentence #30000, processed 187765 words, keeping 17424 word types
INFO - 15:33:39: PROGRESS: at sentence #40000, processed 243261 words, keeping 20100 word types
INFO - 15:33:39: PROGRESS: at sentence #50000, processed 303119 words, keeping 22539 word types
INFO - 15:33:40: PROGRESS: at sentence #60000, processed 363855 words, keeping 

INFO - 15:34:09: worker thread finished; awaiting finish of 2 more threads
INFO - 15:34:09: worker thread finished; awaiting finish of 1 more threads
INFO - 15:34:09: worker thread finished; awaiting finish of 0 more threads
INFO - 15:34:09: EPOCH - 9 : training on 523523 raw words (199454 effective words) took 3.2s, 62172 effective words/s
INFO - 15:34:10: EPOCH 10 - PROGRESS: at 31.40% examples, 60516 words/s, in_qsize 0, out_qsize 0
INFO - 15:34:11: EPOCH 10 - PROGRESS: at 61.47% examples, 58333 words/s, in_qsize 2, out_qsize 0
INFO - 15:34:12: EPOCH 10 - PROGRESS: at 97.47% examples, 62948 words/s, in_qsize 2, out_qsize 1
INFO - 15:34:12: worker thread finished; awaiting finish of 2 more threads
INFO - 15:34:12: worker thread finished; awaiting finish of 1 more threads
INFO - 15:34:12: worker thread finished; awaiting finish of 0 more threads
INFO - 15:34:12: EPOCH - 10 : training on 523523 raw words (199092 effective words) took 3.1s, 64357 effective words/s
INFO - 15:34:14: EPOCH

INFO - 15:34:51: EPOCH - 23 : training on 523523 raw words (199299 effective words) took 2.7s, 73702 effective words/s
INFO - 15:34:52: EPOCH 24 - PROGRESS: at 29.43% examples, 57510 words/s, in_qsize 5, out_qsize 0
INFO - 15:34:53: EPOCH 24 - PROGRESS: at 65.29% examples, 62311 words/s, in_qsize 6, out_qsize 0
INFO - 15:34:54: worker thread finished; awaiting finish of 2 more threads
INFO - 15:34:54: worker thread finished; awaiting finish of 1 more threads
INFO - 15:34:54: worker thread finished; awaiting finish of 0 more threads
INFO - 15:34:54: EPOCH - 24 : training on 523523 raw words (199251 effective words) took 2.8s, 70694 effective words/s
INFO - 15:34:55: EPOCH 25 - PROGRESS: at 31.40% examples, 61982 words/s, in_qsize 1, out_qsize 0
INFO - 15:34:56: EPOCH 25 - PROGRESS: at 47.90% examples, 46427 words/s, in_qsize 3, out_qsize 1
INFO - 15:34:57: EPOCH 25 - PROGRESS: at 78.48% examples, 51318 words/s, in_qsize 3, out_qsize 1
INFO - 15:34:57: worker thread finished; awaiting fi

(5984442, 15705690)

## Exploring The Model:

## 1.	Getting term index:

In [269]:
w2v_model.wv['woman'].shape

(300,)

In [270]:
w2v_model.wv.get_index('love')

26

In [271]:
import numpy as np
np.sum(w2v_model.wv[w2v_model.wv.get_index('love') ] != w2v_model.wv['love'])

0

In [340]:
w2v_model.wv.index_to_key[30]

'need'

In [272]:
w2v_model.wv.has_index_for('abracadabra'), w2v_model.wv.has_index_for('love')

(False, True)

In [278]:
w2v_model.wv.cosine_similarities(w2v_model.wv['television'], w2v_model.wv.vectors).shape

(3323,)

## 2.	Finding the most similar words to specific word:

In [259]:
w2v_model.wv.most_similar(positive=["maggie"])

[('awww', 0.6959978342056274),
 ('lisa', 0.6897205710411072),
 ('diaper', 0.6832014918327332),
 ('mommy', 0.6822347044944763),
 ('honey', 0.6789132356643677),
 ('baby', 0.6729987263679504),
 ('asleep', 0.668789803981781),
 ('sweetie', 0.6681798696517944),
 ('strangle', 0.6657243967056274),
 ('babysitter', 0.6609887480735779)]

In [297]:
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('pleased', 0.6862821578979492),
 ('congratulation', 0.6820638179779053),
 ('recent', 0.6686651706695557),
 ('select', 0.6625051498413086),
 ('speaker', 0.6583868265151978),
 ('easily', 0.657917320728302),
 ('waylon', 0.6530686020851135),
 ('montgomery_burn', 0.642357349395752),
 ('governor', 0.6418781876564026),
 ('united_states', 0.6395424604415894)]

In [302]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8351496458053589),
 ('mom', 0.7507618069648743),
 ('strangle', 0.7376942038536072),
 ('hearing', 0.7370407581329346),
 ('babysitter', 0.7178239822387695),
 ('mom_dad', 0.71009761095047),
 ('grownup', 0.7027170062065125),
 ('homework', 0.7025983929634094),
 ('badly', 0.6985287070274353),
 ('creepy', 0.6970906257629395)]

In [280]:
w2v_model.wv.most_similar(positive = ['enemy'],topn=10 )

[('capture', 0.6891812086105347),
 ('powerful', 0.6724203824996948),
 ('ancient', 0.6638416051864624),
 ('lead', 0.6283242702484131),
 ('culture', 0.6201581358909607),
 ('ben', 0.6042076349258423),
 ('roman', 0.6034662127494812),
 ('springfielder', 0.6022446155548096),
 ('rare', 0.5997748374938965),
 ('appear', 0.5927237272262573)]

In [285]:
w2v_model.wv.most_similar(positive = ['bush'],topn=10 )

[('washington', 0.8255041241645813),
 ('george', 0.8212107419967651),
 ('clinton', 0.7739414572715759),
 ('w', 0.7218510508537292),
 ('smith', 0.7072834968566895),
 ('president', 0.7069959044456482),
 ('pope', 0.675169050693512),
 ('kennedy', 0.6705697178840637),
 ('neighborhood', 0.6596847772598267),
 ('dean', 0.6257457733154297)]

In [334]:
from sklearn.preprocessing import normalize

v = normalize(w2v_model.wv['man'].reshape(1,-1))  \
    - normalize(w2v_model.wv['woman'].reshape(1,-1)) \
    + normalize(w2v_model.wv['king'].reshape(1,-1))

[(key, sim) for key, sim in w2v_model.wv.most_similar(v) if key not in ['king', 'men', 'women']] # remove source words

[('man', 0.5787461996078491),
 ('sprinkle', 0.5294414162635803),
 ('jolly', 0.4946066737174988),
 ('aisle', 0.4904380440711975),
 ('knight', 0.47524911165237427),
 ('se', 0.466854989528656),
 ('france', 0.46579408645629883),
 ('barrel', 0.4623803496360779),
 ('crown', 0.45758697390556335)]

## 3.	How similar are two words to each other :

In [317]:
w2v_model.wv.similarity('maggie', 'awww')

0.72678053

In [314]:
w2v_model.wv.similarity("moe", 'tavern')

0.86759853

In [312]:
w2v_model.wv.similarity('homer', 'marge')

0.7433727

In [311]:
w2v_model.wv.similarity('bart', 'nelson')

0.54328275

##  4.	Closer than (odd):

In [265]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [266]:
w2v_model.wv.doesnt_match(['homer', 'patty', 'selma'])

'homer'

In [276]:
w2v_model.wv.doesnt_match(['cat', 'dog', 'chicken', 'lion'])

'chicken'

In [287]:
w2v_model.wv.doesnt_match(['bush', 'clinton', 'kennedy', 'pope'])

'pope'

In [275]:
w2v_model.wv.doesnt_match(['window', 'door', 'chair','cat'])

'cat'

## 5.	Most  similar words to another related couple of words:

In [267]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=4)

[('married', 0.6111018657684326),
 ('man', 0.5843933820724487),
 ('admire', 0.5635032653808594),
 ('boyfriend', 0.5424638390541077)]

In [268]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7200313806533813),
 ('pregnant', 0.6605644226074219),
 ('parent', 0.6503780484199524)]

In [336]:
w2v_model.wv.most_similar(positive = ['king', 'woman'], negative=['man'],topn=10 )

[('queen', 0.5765933394432068),
 ('arthur', 0.5398191809654236),
 ('critic', 0.5121352076530457),
 ('david', 0.5054819583892822),
 ('plow', 0.5037469863891602),
 ('nonsense', 0.49150320887565613),
 ('coincidence', 0.4874555170536041),
 ('larry', 0.48521852493286133),
 ('simon', 0.47430115938186646),
 ('disturb', 0.46996352076530457)]

## 6.	closer_than (key1, key2):
Get all keys that are closer to key1 than key2 is to key1

In [328]:
w2v_model.wv.closer_than('hope', 'wish')

['feeling',
 'decision',
 'relationship',
 'rest_life',
 'decent',
 'sacrifice',
 'humiliate',
 'babysitter',
 'arrange',
 'grownup',
 'embarrassing',
 'adopt',
 'awake',
 'bribe',
 'eliza',
 'haunt',
 'anyhoo',
 'exploit',
 'badly']

## 7.Compute distance between terms:

In [274]:
w2v_model.wv.distance('dog', 'cat'), w2v_model.wv.distance('cool', 'amazing'),  w2v_model.wv.distance('car', 'truck')

(0.2969825267791748, 0.7218841314315796, 0.3452696204185486)

In [332]:
distances = w2v_model.wv.distances('america') # distance to all words
[model.wv.index_to_key[key] for key in np.argsort(distances)[:10]]

['america',
 'nearly',
 'hopeless',
 'pure',
 'shh',
 'suggestion',
 'bulletin',
 'sh',
 'punishment',
 'discipline']