In [47]:
#Setting Up the Environment
import re #For Preprocessing
import pandas as pd #For Data Handling
from time import time #To Time the Operations
from collections import defaultdict #Use for word frequency

import spacy #More Preprocessing

import logging #Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [48]:
# Load only the two columns used in this notebook: the raw resume text and its category label.
df = pd.read_csv('Resume.csv', usecols = ['Resume_str', 'Category'])
# (rows, columns) of the loaded frame
df.shape

(235, 2)

In [49]:
# Preview the first rows to sanity-check the loaded text and labels
df.head()

Unnamed: 0,Resume_str,Category
0,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
1,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
2,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
3,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
4,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [50]:
# Number of missing values per column
df.isnull().sum()

Resume_str    0
Category      0
dtype: int64

In [51]:
# Drop rows with any missing value and renumber the index from 0.
df = df.dropna().reset_index(drop=True)
# Only the last expression of a cell is rendered, so a bare `df.isnull().sum()` here would be
# silently discarded. Check the invariant explicitly instead.
assert not df.isnull().values.any(), "unexpected missing values after dropna()"
df.head()

Unnamed: 0,Resume_str,Category
0,INFORMATION TECHNOLOGY Summar...,INFORMATION-TECHNOLOGY
1,INFORMATION TECHNOLOGY SPECIALIST\tGS...,INFORMATION-TECHNOLOGY
2,INFORMATION TECHNOLOGY SUPERVISOR ...,INFORMATION-TECHNOLOGY
3,INFORMATION TECHNOLOGY INSTRUCTOR ...,INFORMATION-TECHNOLOGY
4,INFORMATION TECHNOLOGY MANAGER/ANALYS...,INFORMATION-TECHNOLOGY


In [52]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition and the dependency parser for speed

def cleaning(doc):
    """Lemmatize ``doc`` and drop its stopwords.

    Args:
        doc: An iterable of tokens exposing ``lemma_`` and ``is_stop``
            (in this notebook, a spaCy ``Doc``).

    Returns:
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its context words, so a text that is
    # only one or two words long contributes very little to training.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [53]:
# Replace every run of characters other than letters and apostrophes with a single space,
# then lowercase the text.
# Built as a list rather than a generator: a generator is exhausted after one pass, so
# re-running the cell that consumes `brief_cleaning` would silently produce an empty result.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['Resume_str']]

In [54]:
t = time()

# Run the pre-cleaned texts through the spaCy pipeline in batches and
# reduce each resulting Doc to its lemmatized, stopword-free string (or None).
txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=10000)))

elapsed_mins = round((time() - t) / 60, 2)
print('Time to clean up everything: {} mins'.format(elapsed_mins))

Time to clean up everything: 0.18 mins


In [55]:
# Texts for which cleaning() returned None become missing values and are dropped here,
# together with exact duplicate texts.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
# Only the last expression of a cell is rendered, so a bare `df_clean.shape` in the middle
# of the cell is a silent no-op. Print it so the remaining row count is actually shown.
print(df_clean.shape)
df_clean.head()

Unnamed: 0,clean
0,information technology summary dedicated inf...
1,information technology specialist gs experie...
2,information technology supervisor summary se...
3,information technology instructor summary se...
4,information technology manager analyst profe...


In [56]:
from gensim.models.phrases import Phrases, Phraser


In [57]:
# Split each cleaned resume on whitespace; every resume is passed on as one token list ("sentence").
sent = [row.split() for row in df_clean['clean']]
# Collect unigram and bigram counts over the tokenised corpus for phrase detection.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 14:24:03: collecting all words and their counts
INFO - 14:24:03: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 14:24:03: collected 99142 token types (unigram + bigrams) from a corpus of 146950 words and 235 sentences
INFO - 14:24:03: merged Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 14:24:03: Phrases lifecycle event {'msg': 'built Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.15s', 'datetime': '2021-11-07T14:24:03.609413', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [58]:
# Export the detected phrases into a frozen phrase model.
bigram = Phraser(phrases)
# Apply the phrase model to the whole tokenised corpus, so detected bigrams appear as
# single underscore-joined tokens (e.g. 'mechanical_engineering').
sentences = bigram[sent]

INFO - 14:24:03: exporting phrases from Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 14:24:03: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<64 phrases, min_count=30, threshold=10.0> from Phrases<99142 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.16s', 'datetime': '2021-11-07T14:24:03.805397', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [59]:
# Count how often each token (single word or merged bigram) occurs across the corpus.
# The loop variables are deliberately not named `sent`: that name holds the tokenised
# corpus used to build `sentences`, and rebinding it here would break a later re-run of
# the `sentences = bigram[sent]` cell.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens
len(word_freq)

9765

In [60]:
# The ten most frequent tokens, most frequent first
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['system',
 'management',
 'project',
 'design',
 'network',
 'support',
 'state',
 'work',
 'team',
 'engineering']

In [61]:
import multiprocessing

from gensim.models import Word2Vec

In [62]:
threads = multiprocessing.cpu_count() # Number of CPUs reported for this machine; used to size the Word2Vec worker pool
print(threads)

24


In [63]:
# Configure the model only; the vocabulary is built and training is run in later cells.
w2v_model = Word2Vec(min_count=20,      #Ignores all words with total frequency lower than this
                     window=3,          #Number of words looked at on either side of the target word
                     vector_size=300,   #Dimensionality of the word vectors
                     sample=6e-5,       #Threshold for randomly downsampling higher-frequency words; highly influential, useful range (0, 1e-5)
                     alpha=0.03,        #The initial learning rate
                     min_alpha=0.0007,  #Learning rate drops linearly to this value as training progresses. Rule of thumb: alpha - (min_alpha * epochs) ~ 0.0
                     negative=20,       #Specifies how many "noise words" should be drawn for negative sampling
                     workers=max(1, threads - 1), #Leave one CPU free, but never go below one worker (threads - 1 is 0 on a single-CPU machine)
                     sg=0)              #Training algorithm: Continuous Bag Of Words CBOW (0) or Skip-Gram (1)

INFO - 14:24:04: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.03)', 'datetime': '2021-11-07T14:24:04.111256', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [64]:
t = time()

# Collect the word counts from the corpus and create the model's vocabulary
# (words below the configured min_count are dropped).
w2v_model.build_vocab(sentences, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

INFO - 14:24:04: collecting all words and their counts
INFO - 14:24:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:24:04: collected 9765 word types from a corpus of 141678 raw words and 235 sentences
INFO - 14:24:04: Creating a fresh vocabulary
INFO - 14:24:04: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 1273 unique words (13.036354326676907%% of original 9765, drops 8492)', 'datetime': '2021-11-07T14:24:04.277745', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
INFO - 14:24:04: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 112507 word corpus (79.41035305410861%% of original 141678, drops 29171)', 'datetime': '2021-11-07T14:24:04.277745', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_voca

Time to build vocab: 0.0 mins


In [65]:
t = time()

# Train for 30 passes over the corpus; the number of examples per pass is the
# corpus_count value stored on the model.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

INFO - 14:24:04: Word2Vec lifecycle event {'msg': 'training model with 23 workers on 1273 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=3 shrink_windows=True', 'datetime': '2021-11-07T14:24:04.330793', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 14:24:04: worker thread finished; awaiting finish of 22 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 21 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 20 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 19 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 18 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 17 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 16 more threads
INFO - 14:24:04: worker thread finished; awaiting finish of 15 more threads
I

Time to train the model: 0.09 mins


In [66]:
# Sanity check: vocabulary words most similar to "engineering", with their similarity scores
w2v_model.wv.most_similar(positive=["engineering"])

[('engineering_intern', 0.9672421216964722),
 ('chemical', 0.959324061870575),
 ('engineer', 0.9496195912361145),
 ('industrial', 0.9363162517547607),
 ('technological', 0.9306609630584717),
 ('pharmacy', 0.9276175498962402),
 ('april', 0.9261232614517212),
 ('june', 0.9190258383750916),
 ('december', 0.9188588857650757),
 ('mechanical_engineering', 0.9186879992485046)]

In [67]:
# Words similar to "machine" while being pushed away from "technology"
w2v_model.wv.most_similar(positive=["machine"], negative=["technology"]) 

[('generator', 0.742243766784668),
 ('schematic', 0.7408343553543091),
 ('repair', 0.727952241897583),
 ('meter', 0.7125956416130066),
 ('oscilloscope', 0.7064136266708374),
 ('assembly', 0.6986247897148132),
 ('diagram', 0.6909878849983215),
 ('equipment', 0.6901581287384033),
 ('frequency', 0.6839236617088318),
 ('inspection', 0.6825978755950928)]

In [68]:
# Planned usage: model.wv.similarity(<Student ID>, <List of Skills>)
# The average of these results is meant to serve as a base metric.
# For now, print the similarity between 'engineering' and a few reference words.
for other_word in ('machine', 'chemical', 'mechanical_engineering'):
    print(w2v_model.wv.similarity('engineering', other_word))

0.44402772
0.959324
0.91868794


In [69]:
# Persist the trained model to disk.
# NOTE(review): Word2Vec.save() writes gensim's own format (reload with Word2Vec.load), not the
# word2vec C binary format the ".bin" extension may suggest — confirm the extension is intentional.
w2v_model.save("basemodel.bin")

INFO - 14:24:09: Word2Vec lifecycle event {'fname_or_handle': 'basemodel.bin', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-11-07T14:24:09.633594', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
INFO - 14:24:09: not storing attribute cum_table
INFO - 14:24:09: saved basemodel.bin
