In [22]:
#Setting Up the Environment
import re #For Preprocessing
import pandas as pd #For Data Handling
from time import time #To Time the Operations
from collections import defaultdict #Use for word frequency

import spacy #More Preprocessing

import logging #Setting up the loggings to monitor gensim
# Configure the root logger so INFO-level records (e.g. gensim's progress messages)
# are rendered as "LEVEL - HH:MM:SS: message".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [23]:
# Load the resume dataset, keeping only the raw text column and its category label.
df = pd.read_csv('Resume.csv', usecols = ['Resume_str', 'Category'])
# Last expression of the cell: renders the (rows, columns) shape of the loaded frame.
df.shape

(235, 2)

In [24]:
# Preview the first rows to sanity-check the two loaded columns.
df.head()

Unnamed: 0,Resume_str,Category
0,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
1,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
2,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
3,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
4,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [25]:
# Count missing values per column before any rows are dropped.
df.isnull().sum()

Resume_str    0
Category      0
dtype: int64

In [26]:
# Drop rows with any missing value and renumber the index so it stays contiguous.
df = df.dropna().reset_index(drop=True)
# Only the last expression of a cell is rendered, so a bare `df.isnull().sum()` in the
# middle of the cell was silently discarded; state the invariant explicitly instead.
assert not df.isnull().any().any(), "dropna() should have removed every missing value"
df.head()

Unnamed: 0,Resume_str,Category
0,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
1,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
2,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
3,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
4,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [27]:
# Load the small English pipeline with both Named Entity Recognition and the parser
# disabled for speed; cleaning() only reads each token's lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a parsed document and drop its stop words.

    Args:
        doc: An iterable of tokens exposing ``lemma_`` and ``is_stop``
            (in this notebook, a spaCy ``Doc``).

    Returns:
        The lemmas of the non-stop-word tokens joined by single spaces, or
        ``None`` when two or fewer lemmas remain. Word2Vec learns a word's
        vector from its context words, so such short texts add very little
        to training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to provide useful context: signal "discard" with None.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [28]:
# Lower-case every resume and collapse each run of characters other than ASCII letters
# and apostrophes into a single space before handing the text to spaCy.
# Materialised as a list rather than a one-shot generator: a generator is exhausted after
# the first nlp.pipe() pass, so re-running the cleaning cell on its own would silently
# produce an empty corpus.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['Resume_str']]

In [29]:
t = time()

# Parse the pre-cleaned strings with spaCy in batches and run cleaning() on each parsed document.
txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=10000)))

print(f'Time to clean up everything: {round((time() - t) / 60, 2)} mins')

Time to clean up everything: 0.19 mins


In [30]:
# Collect the cleaned documents, dropping the ones cleaning() rejected (None) and exact duplicates.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
# A bare `df_clean.shape` in the middle of a cell is never rendered (only the last
# expression is), so print it to make the post-cleaning row count visible.
print(df_clean.shape)
df_clean.head()

Unnamed: 0,clean
0,information technology summary dedicated inf...
1,information technology specialist gs experie...
2,information technology supervisor summary se...
3,information technology instructor summary se...
4,information technology manager analyst profe...


In [31]:
from gensim.models.phrases import Phrases, Phraser


In [32]:
# Tokenize each cleaned document on whitespace; every document is fed to gensim as one "sentence".
sent = list(map(str.split, df_clean['clean']))
# Collect unigram/bigram statistics for phrase detection, using a minimum count of 30.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 19:07:17: collecting all words and their counts
INFO - 19:07:17: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 19:07:17: collected 99142 token types (unigram + bigrams) from a corpus of 146950 words and 235 sentences
INFO - 19:07:17: merged Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 19:07:17: Phrases lifecycle event {'msg': 'built Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.16s', 'datetime': '2021-10-31T19:07:17.361132', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [33]:
# Export the collected phrase statistics into a Phraser (the log reports it as FrozenPhrases).
bigram = Phraser(phrases)
# Apply the phrase model to the tokenized corpus; the result is the training corpus used
# below (tokens such as "information_technology" are queried later in the notebook).
sentences = bigram[sent]

INFO - 19:07:17: exporting phrases from Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 19:07:17: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<64 phrases, min_count=30, threshold=10.0> from Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.17s', 'datetime': '2021-10-31T19:07:17.568320', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [34]:
# Count how often each token occurs across the phrase-transformed corpus.
# The loop variables deliberately avoid the name `sent`: iterating with
# `for sent in sentences` rebinds the notebook-level `sent` (the tokenized corpus) to the
# last document, so re-running the `bigram[sent]` cell afterwards would operate on a
# single document instead of the whole corpus.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens seen.
len(word_freq)

9765

In [35]:
# The ten most frequent tokens, highest count first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['system',
 'management',
 'project',
 'design',
 'network',
 'support',
 'state',
 'work',
 'team',
 'engineering']

In [36]:
import multiprocessing

from gensim.models import Word2Vec

In [37]:
# Number of CPU cores on this machine; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [38]:
# Configure the Word2Vec model; the vocabulary is built and the model trained in later cells.
w2v_model = Word2Vec(
    min_count=20,        # ignore tokens that occur fewer than 20 times
    window=2,            # maximum distance between the target word and a context word
    vector_size=300,     # dimensionality of the learned word vectors
    sample=6e-5,         # down-sampling threshold for very frequent tokens
    alpha=0.03,          # initial learning rate
    min_alpha=0.0007,    # learning rate floor reached as training progresses
    negative=20,         # number of negative-sampling noise words
    # Leave one core free, but never request zero workers on a single-core machine
    # (the previous `cores-1` evaluated to 0 there).
    workers=max(1, cores - 1),
)

INFO - 19:07:17: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.03)', 'datetime': '2021-10-31T19:07:17.866203', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [39]:
t = time()

# Scan the corpus once to build the model's vocabulary.
w2v_model.build_vocab(sentences, progress_per=10000)

print(f'Time to build vocab: {round((time() - t) / 60, 2)} mins')

INFO - 19:07:17: collecting all words and their counts
INFO - 19:07:17: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 19:07:18: collected 9765 word types from a corpus of 141678 raw words and 235 sentences
INFO - 19:07:18: Creating a fresh vocabulary
INFO - 19:07:18: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 1273 unique words (13.036354326676907%% of original 9765, drops 8492)', 'datetime': '2021-10-31T19:07:18.038359', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 19:07:18: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 112507 word corpus (79.41035305410861%% of original 141678, drops 29171)', 'datetime': '2021-10-31T19:07:18.039361', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_voca

Time to build vocab: 0.0 mins


In [40]:
t = time()

# Train for 30 passes over the corpus; total_examples reuses the document count recorded
# by build_vocab(), and progress is logged every second.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

print(f'Time to train the model: {round((time() - t) / 60, 2)} mins')

INFO - 19:07:18: Word2Vec lifecycle event {'msg': 'training model with 23 workers on 1273 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2021-10-31T19:07:18.102417', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 19:07:18: worker thread finished; awaiting finish of 22 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 21 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 20 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 19 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 18 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 17 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 16 more threads
INFO - 19:07:18: worker thread finished; awaiting finish of 15 more threads
I

Time to train the model: 0.09 mins


In [41]:
# Tokens whose vectors are most similar to the combination of "information_technology"
# and "specialist", returned as (token, similarity) pairs.
# NOTE(review): both tokens must be in the trained vocabulary (min_count=20); presumably
# a missing token raises KeyError - confirm before reusing on another corpus.
w2v_model.wv.most_similar(positive=["information_technology", "specialist"])

[('professional', 0.9582669138908386),
 ('director', 0.9513384699821472),
 ('current_company', 0.9442229866981506),
 ('overview', 0.9387771487236023),
 ('accounting', 0.9209386110305786),
 ('manager', 0.9201720356941223),
 ('law', 0.9143115282058716),
 ('defense', 0.9139789342880249),
 ('administrative', 0.9077222347259521),
 ('assistant', 0.9058296084403992)]

In [46]:
# Tokens whose vectors are most similar to "machine", returned as (token, similarity) pairs.
# NOTE(review): assumes "machine" is in the trained vocabulary - confirm on other corpora.
w2v_model.wv.most_similar(positive=["machine"])

[('circuit', 0.9702987670898438),
 ('panel', 0.9658087491989136),
 ('diagram', 0.9637961387634277),
 ('layout', 0.9611995220184326),
 ('transmission', 0.9548798203468323),
 ('motor', 0.9542325735092163),
 ('light', 0.9451925158500671),
 ('schematic', 0.9419359564781189),
 ('signal', 0.9394065141677856),
 ('machinery', 0.9380564093589783)]

In [42]:
# Persist the trained model to disk in the working directory.
# NOTE(review): this appears to write gensim's own save format rather than the word2vec
# binary format, so the ".bin" extension may mislead - confirm, and reload it with
# Word2Vec.load() if so.
w2v_model.save("basemodel.bin")

INFO - 19:07:23: Word2Vec lifecycle event {'fname_or_handle': 'basemodel.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-10-31T19:07:23.450875', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
INFO - 19:07:23: not storing attribute cum_table
INFO - 19:07:23: saved basemodel.bin
