# The **Simpsons**

Data source: https://www.kaggle.com/datasets/pierremegret/dialogue-lines-of-the-simpsons

In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Used to surface gensim's progress messages in the notebook
# Emit records at INFO level and above, formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)



In [3]:
# Load the dialogue lines; the path is relative to the notebook's working directory.
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [4]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [6]:
# Drop every row with a missing speaker or a missing line, then renumber the index from 0.
# NOTE: `df` is rebound here, so the unfiltered frame is only recoverable by re-running the load cell.
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [9]:
import spacy

# Load the small English pipeline with the 'ner' and 'parser' components disabled for speed;
# `cleaning` only reads each token's lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc, min_tokens=3):
    """Lemmatize a parsed line and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        A spaCy ``Doc`` (any iterable whose items expose ``lemma_`` and
        ``is_stop`` works).
    min_tokens : int, default 3
        Minimum number of lemmas that must survive stop-word removal for the
        line to be kept. The default reproduces the original hard-coded
        ``len(txt) > 2`` rule.

    Returns
    -------
    str or None
        The surviving lemmas joined by single spaces, or ``None`` when fewer
        than ``min_tokens`` remain.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec learns a word's vector from its surrounding context words, so a
    # line of only one or two tokens contributes almost nothing to training.
    if len(lemmas) < min_tokens:
        return None
    return ' '.join(lemmas)

In [10]:
# Replace every run of characters other than ASCII letters and apostrophes with a
# single space, then lower-case the line, before it is handed to spaCy.
# Built as a list rather than a generator: a generator is exhausted after one pass,
# so re-running the spaCy cell without re-running this one would silently yield
# an empty result.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']]

In [12]:
# Start the wall-clock timer for the spaCy pass.
t = time()

# Stream the pre-cleaned lines through the pipeline in batches of 5000 and
# post-process every parsed Doc with `cleaning` (None marks a rejected line).
txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)))

print('Time to clean up everything: {} mins'.format(
    round((time() - t) / 60, 2)
))

Time to clean up everything: 3.38 mins


In [13]:
# One-column frame of cleaned lines; entries that `cleaning` returned as None
# and exact repeats are removed in the same chain.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(85954, 1)

In [14]:
from gensim.models.phrases import Phrases, Phraser

In [15]:
# Split each cleaned line on whitespace into a list of tokens (one list per line).
sent = [row.split() for row in df_clean['clean']]

In [16]:
# Gather phrase statistics over the tokenised lines. Per the gensim Phrases docs,
# min_count=30 ignores words/bigrams seen fewer than 30 times; progress_per=10000
# presumably sets how often progress is logged — confirm against the installed gensim.
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [18]:
# Wrap the fitted Phrases model in a Phraser, which is what gets applied to the corpus next.
# NOTE(review): Phraser is presumably the lighter, frozen form of Phrases — confirm against the installed gensim version.
bigram = Phraser(phrases)

In [19]:
# Apply the bigram model to the whole tokenised corpus; detected phrases appear as
# single underscore-joined tokens (e.g. 'dr_hibbert' in the similarity output further down).
sentences = bigram[sent]

In [20]:
# Count how often each token (including merged bigrams) occurs in the corpus.
# The loop variables are deliberately NOT named `sent`: that global holds the full
# list of tokenised lines, and rebinding it here would make any later re-run of
# a cell that reads `sent` operate on a single sentence instead of the corpus.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

29674

In [21]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [22]:
import multiprocessing

from gensim.models import Word2Vec

In [23]:
cores = multiprocessing.cpu_count() # Number of CPUs reported for this machine; passed to Word2Vec as `workers` below

In [24]:
cores

2

In [25]:
# Create the (still untrained) Word2Vec model; vocabulary building and training
# happen in the following cells.
# Parameter meanings, per the gensim Word2Vec documentation:
#   min_count=20       ignore words seen fewer than 20 times
#   window=2           context reaches at most 2 tokens either side of the target
#   size=300           dimensionality of the word vectors
#   sample=6e-5        down-sampling threshold for very frequent words
#   alpha / min_alpha  initial learning rate and the value it decays to
#   negative=20        number of noise words drawn for negative sampling
#   workers=cores      number of training threads
# NOTE(review): `size` is the gensim 3.x keyword (4.x renamed it `vector_size`) — confirm the pinned gensim version.
# NOTE(review): no `seed` is passed, so results are presumably not reproducible run to run — confirm whether that matters here.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores)

In [26]:
# Restart the timer for the vocabulary pass.
t = time()

# Scan the bigram-transformed corpus to build the model's vocabulary
# (progress_per=10000 presumably controls how often progress is logged — confirm).
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.07 mins


In [27]:
# Restart the timer for the training pass.
t = time()

# Train for 30 epochs over the same corpus; total_examples reuses the model's own
# corpus_count (presumably populated by build_vocab above — confirm).
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 1.64 mins


In [28]:
# Precompute L2-normalised vectors for the similarity queries below. Per the gensim 3.x
# docs, replace=True overwrites the raw vectors to save memory, after which the model
# cannot be trained any further.
# NOTE(review): init_sims is deprecated in gensim 4.x — confirm the pinned gensim version.
w2v_model.init_sims(replace=True)

In [29]:
w2v_model.wv.most_similar(positive=["homer"])

[('gee', 0.8141945600509644),
 ('marge', 0.7981431484222412),
 ('sweetheart', 0.7931554317474365),
 ('depressed', 0.7922211289405823),
 ('crummy', 0.761616587638855),
 ('bongo', 0.7575360536575317),
 ('sure', 0.7565381526947021),
 ('hammock', 0.7481997609138489),
 ('snuggle', 0.747738242149353),
 ('creepy', 0.7474150061607361)]

In [31]:
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.7981431484222412),
 ('married', 0.7713540196418762),
 ('darling', 0.7662019729614258),
 ('snuggle', 0.7654224038124084),
 ('sorry', 0.7621225714683533),
 ('sure', 0.7612461447715759),
 ('crummy', 0.7574580907821655),
 ('grownup', 0.7557697892189026),
 ('becky', 0.7557134628295898),
 ('brunch', 0.7550239562988281)]

In [32]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8335794806480408),
 ('convince', 0.7746198177337646),
 ('upset', 0.771702766418457),
 ('badly', 0.7695316076278687),
 ('dr_hibbert', 0.7613595128059387),
 ('jealous', 0.7587845325469971),
 ('grownup', 0.7543818950653076),
 ('maggie', 0.7497103214263916),
 ('mom', 0.7486655712127686),
 ('creepy', 0.7472660541534424)]

In [36]:
w2v_model.wv.similarity('moe', 'tavern')

0.9039857

In [37]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'nelson'

## Visualization