In [6]:
# !python -m spacy download en_core_web_sm

In [26]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from gensim.models.phrases import Phrases, Phraser
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [8]:
df = pd.read_csv('C:/Users/ggiam/OneDrive/Documents/Projects/Word2Vec_Project/WordNet_Extraction/definitions.csv')


In [9]:
df.isnull().sum()

definitions    0
dtype: int64

In [10]:
nlp = spacy.load("en_core_web_sm")

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    
    # Removing this temporarily as dictionary definitions with two words 
    # can be helpful in this context 
    
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 1:
        return ' '.join(txt)
    
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['definitions'])

In [11]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 2.34 mins


In [12]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(114674, 1)

In [13]:
sent = [row.split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 16:12:17: collecting all words and their counts
INFO - 16:12:17: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:12:17: PROGRESS: at sentence #10000, processed 78695 words and 76862 word types
INFO - 16:12:17: PROGRESS: at sentence #20000, processed 142890 words and 129930 word types
INFO - 16:12:17: PROGRESS: at sentence #30000, processed 210435 words and 178468 word types
INFO - 16:12:17: PROGRESS: at sentence #40000, processed 275415 words and 219402 word types
INFO - 16:12:17: PROGRESS: at sentence #50000, processed 339744 words and 259214 word types
INFO - 16:12:17: PROGRESS: at sentence #60000, processed 410641 words and 304371 word types
INFO - 16:12:18: PROGRESS: at sentence #70000, processed 482503 words and 345415 word types
INFO - 16:12:18: PROGRESS: at sentence #80000, processed 542463 words and 378538 word types
INFO - 16:12:18: PROGRESS: at sentence #90000, processed 631686 words and 416319 word types
INFO - 16:12:18: PROGRESS: at sentence #1

In [14]:
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 16:12:18: exporting phrases from Phrases<514949 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:12:19: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<265 phrases, min_count=30, threshold=10.0> from Phrases<514949 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.77s', 'datetime': '2025-01-24T16:12:19.148723', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


In [15]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

43042

In [16]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['have',
 'small',
 'especially',
 'relate',
 'large',
 'person',
 'form',
 'usually',
 'act',
 'manner']

In [17]:
cores = multiprocessing.cpu_count()

In [28]:
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 16:52:49: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2025-01-24T16:52:49.477557', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}


In [29]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:52:52: collecting all words and their counts
INFO - 16:52:52: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:52:52: PROGRESS: at sentence #10000, processed 78351 words, keeping 14753 word types
INFO - 16:52:52: PROGRESS: at sentence #20000, processed 141766 words, keeping 22220 word types
INFO - 16:52:52: PROGRESS: at sentence #30000, processed 207945 words, keeping 26084 word types
INFO - 16:52:53: PROGRESS: at sentence #40000, processed 270866 words, keeping 29118 word types
INFO - 16:52:53: PROGRESS: at sentence #50000, processed 334435 words, keeping 31093 word types
INFO - 16:52:53: PROGRESS: at sentence #60000, processed 404139 words, keeping 33120 word types
INFO - 16:52:53: PROGRESS: at sentence #70000, processed 473470 words, keeping 34945 word types
INFO - 16:52:53: PROGRESS: at sentence #80000, processed 531260 words, keeping 36716 word types
INFO - 16:52:53: PROGRESS: at sentence #90000, processed 612509 words, keeping 39292 word types

Time to build vocab: 0.02 mins


In [30]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:52:56: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 43042 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2025-01-24T16:52:56.307270', 'gensim': '4.3.3', 'python': '3.11.11 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:34:19) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
INFO - 16:52:57: EPOCH 0 - PROGRESS: at 44.28% examples, 245920 words/s, in_qsize 14, out_qsize 2
INFO - 16:52:58: EPOCH 0: training on 774361 raw words (559827 effective words) took 1.7s, 330321 effective words/s
INFO - 16:52:59: EPOCH 1 - PROGRESS: at 45.50% examples, 247180 words/s, in_qsize 14, out_qsize 0
INFO - 16:52:59: EPOCH 1: training on 774361 raw words (559434 effective words) took 1.6s, 345032 effective words/s
INFO - 16:53:00: EPOCH 2 - PROGRESS: at 50.39% examples, 262156 words/s, in_qsize 13, out_qsize 0
INFO - 16:53:01: EPOCH 2: training on 774361 raw word

Time to train the model: 0.79 mins


In [31]:
w2v_model.wv.most_similar(positive=["metal"])

[('casting', 0.7682483196258545),
 ('tungsten', 0.7507283091545105),
 ('molten', 0.7444027662277222),
 ('netting', 0.7438525557518005),
 ('weld', 0.7362831234931946),
 ('platinum', 0.7281267046928406),
 ('soldering', 0.7176530957221985),
 ('solder', 0.7149897217750549),
 ('hammer', 0.7075612545013428),
 ('electrolysis', 0.7000240087509155)]

In [32]:
w2v_model.wv.doesnt_match(['man', 'woman', 'arian'])



'man'

In [33]:
w2v_model.wv.index_to_key

['have',
 'small',
 'especially',
 'relate',
 'large',
 'person',
 'form',
 'usually',
 'act',
 'manner',
 'cause',
 'plant',
 'leave',
 'genus',
 'state',
 'united_states',
 'time',
 'water',
 'body',
 'long',
 'people',
 'work',
 'flower',
 'family',
 'tree',
 "'",
 'consist',
 'place',
 'use',
 'group',
 'resemble',
 'order',
 'contain',
 'give',
 'produce',
 'shape',
 'light',
 'high',
 'like',
 'play',
 'hold',
 'member',
 'system',
 'white',
 'good',
 'child',
 'quality',
 'point',
 'bear',
 'man',
 'line',
 'area',
 'surface',
 'head',
 'new',
 'animal',
 'low',
 'language',
 'find',
 'result',
 'great',
 'color',
 'end',
 'fruit',
 'cover',
 'sound',
 'take',
 'law',
 'lack',
 'red',
 'characterize',
 'grow',
 'include',
 'force',
 'common',
 'position',
 'change',
 'short',
 'woman',
 'food',
 'provide',
 'write',
 'make',
 'process',
 'word',
 'branch',
 'life',
 'number',
 'set',
 'mark',
 'base',
 'city',
 'property',
 'action',
 'money',
 'live',
 'right',
 'day',
 'serve'

In [34]:
w2v_model.wv.save_word2vec_format("dictionary_model.bin", binary=True)

INFO - 16:54:02: storing 43042x300 projection weights into dictionary_model.bin
