In [31]:
# One-time setup: uncomment and run once to download the spaCy English model loaded below.
# !python -m spacy download en_core_web_sm

In [25]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from gensim.models import Word2Vec

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
# Configure the root logger so library progress messages show up in the
# notebook: "<LEVEL> - <HH:MM:SS>: <message>" for everything at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [26]:
# NOTE: machine-specific absolute path -- point this at your own copy of
# definitions.csv before running the notebook on another machine.
definitions_csv = 'C:/Users/ggiam/OneDrive/Documents/Projects/Word2Vec_Project/WordNet_Extraction/definitions.csv'
df = pd.read_csv(definitions_csv)


In [27]:
# Sanity check: number of missing values in each column of the raw frame.
df.isna().sum()

definitions    0
dtype: int64

In [110]:
# Load the spaCy English pipeline; the docs it produces supply the `lemma_`
# and `is_stop` token attributes read by cleaning().
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

def cleaning(doc, min_tokens=2):
    """Lemmatize a parsed definition and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        A spaCy ``Doc``, or any iterable whose items expose ``lemma_`` and
        ``is_stop``.
    min_tokens : int, default 2
        Minimum number of tokens that must survive stop-word removal for the
        definition to be kept. The default reproduces the previous hard-coded
        ``len(txt) > 1`` check, i.e. two-word definitions are kept because
        short dictionary definitions can still be informative here.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when fewer
        than ``min_tokens`` lemmas remain.
    """
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    # Word2Vec learns a target word's vector from its context words, so a
    # text with too few tokens contributes very little to training.
    if len(lemmas) < min_tokens:
        return None
    return ' '.join(lemmas)
    
# Replace every run of characters other than letters and apostrophes with a
# single space, then lowercase, before the text is handed to spaCy.
# Built as a list rather than a one-shot generator: a generator is exhausted
# after its first pass, so re-running the cell that consumes it would
# silently produce an empty corpus.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['definitions']]

In [111]:
t = time()

# Stream the pre-cleaned text through the spaCy pipeline in batches of 5000
# and reduce every parsed doc to its cleaned string (or None).
parsed_docs = nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)
txt = list(map(cleaning, parsed_docs))

elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

Time to clean up everything: 2.18 mins


In [112]:
# One row per cleaned definition; rows that cleaning() rejected (None) and
# exact duplicates are removed in a single chain.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(114674, 1)

In [113]:
# Split each cleaned definition on whitespace to get one token list per row,
# then collect phrase statistics over the tokenised corpus.
sent = list(map(str.split, df_clean['clean']))
phrases = Phrases(
    sent,
    min_count=30,
    progress_per=10000,
)

INFO - 17:28:48: collecting all words and their counts
INFO - 17:28:48: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:28:49: PROGRESS: at sentence #10000, processed 78695 words and 76862 word types
INFO - 17:28:49: PROGRESS: at sentence #20000, processed 142890 words and 129930 word types
INFO - 17:28:49: PROGRESS: at sentence #30000, processed 210435 words and 178468 word types
INFO - 17:28:49: PROGRESS: at sentence #40000, processed 275415 words and 219402 word types
INFO - 17:28:49: PROGRESS: at sentence #50000, processed 339744 words and 259214 word types
INFO - 17:28:49: PROGRESS: at sentence #60000, processed 410641 words and 304371 word types
INFO - 17:28:49: PROGRESS: at sentence #70000, processed 482503 words and 345415 word types
INFO - 17:28:49: PROGRESS: at sentence #80000, processed 542463 words and 378538 word types
INFO - 17:28:49: PROGRESS: at sentence #90000, processed 631686 words and 416319 word types
INFO - 17:28:49: PROGRESS: at sentence #1

In [114]:
# Export the phrases collected in `phrases` into a Phraser object (the log
# output of this cell reports it as a FrozenPhrases export).
bigram = Phraser(phrases)
# Apply the phrase model to the whole tokenised corpus `sent`.
# NOTE(review): presumably this is evaluated lazily on each iteration rather
# than materialised as a list -- confirm against the gensim docs.
sentences = bigram[sent]

INFO - 17:28:54: exporting phrases from Phrases<514949 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 17:28:54: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<265 phrases, min_count=30, threshold=10.0> from Phrases<514949 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.82s', 'datetime': '2025-01-23T17:28:54.999601', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


In [115]:
# Count how often each token occurs across the phrase-transformed corpus and
# display the number of distinct tokens.
# The loop variables are deliberately not named `sent`: that global holds the
# tokenised corpus, and rebinding it here would leave it pointing at the last
# sentence only, breaking any re-run of the cells that read `sent`.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

43042

In [116]:
# Rank tokens from most to least frequent and show the ten most common.
tokens_by_frequency = sorted(word_freq, key=word_freq.get, reverse=True)
tokens_by_frequency[:10]

['have',
 'small',
 'especially',
 'relate',
 'large',
 'person',
 'form',
 'usually',
 'act',
 'manner']

In [117]:
# Number of CPUs reported by multiprocessing; used when constructing the
# Word2Vec model to size its `workers` argument.
cores = multiprocessing.cpu_count()

In [118]:
# Create the (still untrained, empty-vocabulary) Word2Vec model.
# One core is left free for the rest of the system, but the worker count is
# clamped to at least 1: on a single-CPU machine `cores - 1` would be 0 and
# the model would have no training workers.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

INFO - 17:29:02: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2025-01-23T17:29:02.859766', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


In [119]:
t = time()

# Scan the corpus once to build the model's vocabulary, logging progress
# every 10000 sentences.
w2v_model.build_vocab(
    sentences,
    progress_per=10000,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

INFO - 17:29:04: collecting all words and their counts
INFO - 17:29:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:29:05: PROGRESS: at sentence #10000, processed 78351 words, keeping 14753 word types
INFO - 17:29:05: PROGRESS: at sentence #20000, processed 141766 words, keeping 22220 word types
INFO - 17:29:05: PROGRESS: at sentence #30000, processed 207945 words, keeping 26084 word types
INFO - 17:29:05: PROGRESS: at sentence #40000, processed 270866 words, keeping 29118 word types
INFO - 17:29:05: PROGRESS: at sentence #50000, processed 334435 words, keeping 31093 word types
INFO - 17:29:05: PROGRESS: at sentence #60000, processed 404139 words, keeping 33120 word types
INFO - 17:29:05: PROGRESS: at sentence #70000, processed 473470 words, keeping 34945 word types
INFO - 17:29:05: PROGRESS: at sentence #80000, processed 531260 words, keeping 36716 word types
INFO - 17:29:05: PROGRESS: at sentence #90000, processed 612509 words, keeping 39292 word types

Time to build vocab: 0.02 mins


In [120]:
t = time()

# Train for 30 epochs over the corpus; total_examples is the sentence count
# the model recorded for itself (corpus_count).
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

INFO - 17:29:08: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 5927 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2025-01-23T17:29:08.526618', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
INFO - 17:29:09: EPOCH 0 - PROGRESS: at 73.04% examples, 286967 words/s, in_qsize 13, out_qsize 5
INFO - 17:29:09: EPOCH 0: training on 774361 raw words (405306 effective words) took 1.2s, 345503 effective words/s
INFO - 17:29:10: EPOCH 1 - PROGRESS: at 48.11% examples, 186039 words/s, in_qsize 13, out_qsize 3
INFO - 17:29:11: EPOCH 1: training on 774361 raw words (405524 effective words) took 1.4s, 296973 effective words/s
INFO - 17:29:12: EPOCH 2 - PROGRESS: at 64.94% examples, 258534 words/s, in_qsize 13, out_qsize 0
INFO - 17:29:12: EPOCH 2: training on 774361 raw words

Time to train the model: 0.63 mins


In [46]:
# `init_sims(replace=True)` is deprecated in gensim 4 (it emits a warning) and
# destructively overwrites the trained vectors with unit-normalised copies.
# `fill_norms()` is the recommended replacement: it precomputes the vector
# norms used by the similarity queries without modifying the vectors, so the
# cosine-based queries that follow return the same results.
w2v_model.wv.fill_norms()

  w2v_model.init_sims(replace=True)


In [125]:
# Nearest neighbours of a probe word in the trained embedding space.
probe_word = "metal"
w2v_model.wv.most_similar(positive=[probe_word])

[('steel', 0.701933741569519),
 ('hammer', 0.662426769733429),
 ('perforate', 0.6590691804885864),
 ('iron', 0.6523343324661255),
 ('alloy', 0.6490715146064758),
 ('molten', 0.6463151574134827),
 ('tin', 0.6458144187927246),
 ('zinc', 0.6158932447433472),
 ('revolving', 0.6124913096427917),
 ('oxide', 0.596883237361908)]

In [126]:
# Ask the model which of the candidate words fits least with the others.
# NOTE(review): 'arian' is an unusual token -- confirm it is the intended
# word and that it is actually present in the model's vocabulary.
candidates = ['man', 'woman', 'arian']
w2v_model.wv.doesnt_match(candidates)



'man'

In [128]:
# Similarity score between two words that are expected to be unrelated.
first_word, second_word = 'law', 'drink'
w2v_model.wv.similarity(first_word, second_word)

0.07828937