In [2]:
import pandas as pd
from pymongo import MongoClient
import warnings
warnings.filterwarnings('ignore')
from nltk import WordNetLemmatizer, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF, TruncatedSVD
from gensim.models import Word2Vec
import string
import numpy as np
import json
import pickle

#### Loading the Data

In [3]:
# Connect to MongoDB with pymongo's default connection settings (no host/port given).
client = MongoClient()
# Show which databases are available on this server.
client.list_database_names()

['admin',
 'books',
 'catalog',
 'config',
 'events',
 'legistlation',
 'local',
 'resumes']

In [4]:
# Handle to the `resumes_cleaned` collection in the `resumes` database.
resumes = client.resumes.resumes_cleaned
# Collection.count() is deprecated (and removed in PyMongo 4);
# count_documents({}) returns the number of documents matching the empty filter, i.e. all of them.
resumes.count_documents({})

21068

In [5]:
# Fields requested from each resume document.
resume_fields = [
    'Companies', 'Resume Summary', 'Current Location',
    'Start Dates', 'Work Experiences', 'Universities', 'Analyst',
    'Engineer', 'Scientist', 'Bachelors', 'Masters', 'PhD',
    'in', 'Skills', 'Additionals', 'y',
]
# Mongo projection: include every listed field and leave out the ObjectId.
projection = {field_name: 1 for field_name in resume_fields}
projection['_id'] = 0

# Only documents that actually have a `Title` field are loaded.
matching_resumes = resumes.find({'Title': {'$exists': 1}}, projection)
df = pd.DataFrame(list(matching_resumes))
df.head()

Unnamed: 0,Additionals,Analyst,Bachelors,Companies,Current Location,Engineer,Masters,PhD,Resume Summary,Scientist,Skills,Start Dates,Universities,Work Experiences,in,y
0,,1,1,"' Yoox Net- A- Porter Group', ' Yoox Net- A- P...",'',0,0,0,' Goal-oriented team player with experience in...,0,"' SERVICE', '3 years', ' SALES', '3 years', ' ...",6,' Bloomsburg University of Pennsylvania',' Formulate sales reports and other metrics us...,' Business Administration and Marketing',Analyst
1,,1,1,"' Thimble LLC/ contract', ' Florida State Univ...","' Washington, DC'",0,0,0,"' Detail-oriented, methodical, and analytical,...",0,"' Microsoft Office', '5 years', ' SAS', '2 yea...",2,' Florida State University',' Manually adjusted and corrected data collect...,' Statistics',Analyst
2,,0,1,"' New York State Office of Mental Health', ' N...","' New York, NY'",1,1,0,' Over 15 years of experience in information t...,0,"' ORACLE', '10+ years', ' PL/ SQL', '10+ years...",18,"' Arizona State University', ' Annamalai Unive...",'- Worked on Data Warehouse projects from ince...,"' Information Systems', ' Mechanical and Produ...",Engineer
3,,1,1,"' Allied Benefit Systems', ' Expedia, Inc.', '...","' Chicago, IL'",0,0,0,,0,"' VBA', '3 years', ' Excel', '3 years', ' Powe...",5,' Statistics B. Sc. Concordia University Montr...,"'- Collected, cleaned, organized data from mul...",' Mathematics of Finance',Analyst
4,'/ Licenses VMware Certified Professional ( VC...,0,1,"' Presidio', ' Presidio', ' Global', ' The Geo...","' Reston, VA'",1,1,0,""" 6+ years of experience in Data Center design...",0,"' Vmware', '6 years', ' Cisco', '6 years', ' E...",8,"' The George Washington University', ' Univers...",' Presidio was founded to fill a niche overloo...,"' Electrical Engineering', ' Electronics Engin...",Engineer


In [6]:
# Cast every column to str so each cell can be passed to the tokenizer below.
# NOTE(review): astype(str) turns missing values into literal strings such as 'nan' — confirm
# the collection has no missing fields, otherwise those strings end up in the text.
df = df.astype(str)

### Tokenization

In [7]:
lemmatize = WordNetLemmatizer()
# Translation table that deletes every character in string.punctuation.
translator = str.maketrans('', '', string.punctuation)


def token_text(text):
    """Tokenize `text` and return the normalized tokens joined by single spaces.

    Each token from `word_tokenize` is lower-cased, stripped of surrounding
    whitespace, has its punctuation characters removed, and is then lemmatized.
    Tokens that end up empty (punctuation-only tokens) are kept, so the result
    may contain leading, trailing, or repeated spaces.
    """
    normalized = []
    for raw_token in word_tokenize(text):
        bare_token = raw_token.lower().strip().translate(translator)
        normalized.append(lemmatize.lemmatize(bare_token))
    return ' '.join(normalized)

In [8]:
def token_columns(df, df_):
    """Overwrite each column of `df_` with the tokenized text of the same column in `df`.

    `df_` is modified in place and also returned. Every column named in
    `df_.columns` is processed, so it must exist in `df` as well.
    """
    for column_name in df_.columns:
        tokenized = df[column_name].apply(token_text)
        df_[column_name] = tokenized
    return df_


# Tokenize into a copy so the string-cast frame `df` stays untouched.
df_ = token_columns(df, df.copy())

In [10]:
# Preview the tokenized frame.
df_.head()

Unnamed: 0,Additionals,Analyst,Bachelors,Companies,Current Location,Engineer,Masters,PhD,Resume Summary,Scientist,Skills,Start Dates,Universities,Work Experiences,in,y
0,,1,1,yoox net a porter group yoox net a porter ...,,0,0,0,goaloriented team player with experience in v...,0,service 3 year sale 3 year sale 2...,6,bloomsburg university of pennsylvania,formulate sale report and other metric using ...,business administration and marketing,analyst
1,,1,1,thimble llc contract florida state univers...,washington dc,0,0,0,detailoriented methodical and analytical a...,0,microsoft office 5 year sa 2 year d...,2,florida state university,manually adjusted and corrected data collecti...,statistic,analyst
2,,0,1,new york state office of mental health new...,new york ny,1,1,0,over 15 year of experience in information tec...,0,oracle 10 year pl sql 10 year sql ...,18,arizona state university annamalai univers...,worked on data warehouse project from incepti...,information system mechanical and production,engineer
3,,1,1,allied benefit system expedia inc aspi...,chicago il,0,0,0,,0,vba 3 year excel 3 year power bi ...,5,statistic b sc concordia university montreal,collected cleaned organized data from multi...,mathematics of finance,analyst
4,license vmware certified professional vcp j...,0,1,presidio presidio global the george ...,reston va,1,1,0,6 year of experience in data center design i...,0,vmware 6 year cisco 6 year emc 6 ...,8,the george washington university universit...,presidio wa founded to fill a niche overlooke...,electrical engineering electronics enginee...,engineer


In [11]:
# Free-text columns that are merged into one document per resume.
text_columns = ['Companies', 'Skills', 'Additionals', 'Current Location', 'Resume Summary',
                'Universities', 'Work Experiences', 'in']

# Join with an explicit space. Plain `+` concatenation has no separator, so the last
# word of one column can fuse with the first word of the next (e.g. "group" + "service"
# -> "groupservice") whenever the tokenized text does not happen to start or end with a space.
df_combined = df_[text_columns].apply(' '.join, axis=1).to_frame(name='text')
df_combined.head()

Unnamed: 0,text
0,yoox net a porter group yoox net a porter ...
1,thimble llc contract florida state univers...
2,new york state office of mental health new...
3,allied benefit system expedia inc aspi...
4,presidio presidio global the george ...


In [57]:
# Persist the combined text frame for later use.
with open('df.pkl', 'wb') as handle:
    pickle.dump(df_combined, handle)

### Word2Vec

In [45]:
# One whitespace-split token list per resume.
sentences = df_combined['text'].str.split()

# 100-dimensional vectors, context window of 5, words seen fewer than 5 times are ignored.
# NOTE(review): `size` is the gensim 3.x keyword; newer gensim releases call it `vector_size`.
model = Word2Vec(sentences, size=100, window=5, min_count=5)

# Quick sanity check of the embedding: nearest neighbours of "scientist".
model.wv.most_similar('scientist')

[('enthusiast', 0.6711856722831726),
 ('statistician', 0.6542973518371582),
 ('analyst', 0.6500447988510132),
 ('mathematician', 0.6346460580825806),
 ('aspiring', 0.6344203948974609),
 ('resultsdriven', 0.6298266649246216),
 ('researcher', 0.6288260221481323),
 ('seasoned', 0.6143649816513062),
 ('strategist', 0.6112597584724426),
 ('engineer', 0.6069817543029785)]

In [46]:
# Function to take a document as a list of words and return the document vector
def get_doc_vec(words, model):
    """Return the mean word vector of `words`, or None if no word is known.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object with a `wv` attribute
        `model.wv` must support `word in model.wv` and indexing with a list of
        words that returns an array with one row per word (as gensim's keyed
        vectors do).

    Returns
    -------
    The element-wise mean over the vectors of all in-vocabulary words
    (`.mean(axis=0)` of the stacked vectors), or None when no word of the
    document is in the vocabulary.
    """
    keyed_vectors = model.wv
    # Membership test instead of try / bare `except:`: out-of-vocabulary words are
    # skipped, while unrelated errors (and KeyboardInterrupt) are no longer swallowed.
    good_words = [word for word in words if word in keyed_vectors]
    # If no words are in the original model
    if not good_words:
        return None
    # Return the mean of the vectors for all the good words
    return keyed_vectors[good_words].mean(axis=0)

In [48]:
# Average the word vectors of each resume; get_doc_vec returns None when no token is in the vocabulary.
token_lists = df_combined['text'].str.split()
vecs = token_lists.map(lambda words: get_doc_vec(words, model))

# Attach the vectors to the tokenized frame and discard resumes without one.
df_['vecs'] = vecs
df_.dropna(subset=['vecs'], inplace=True)
df_.head()

Unnamed: 0,Additionals,Analyst,Bachelors,Companies,Current Location,Engineer,Masters,PhD,Resume Summary,Scientist,Skills,Start Dates,Universities,Work Experiences,in,y,vecs
0,,1,1,yoox net a porter group yoox net a porter ...,,0,0,0,goaloriented team player with experience in v...,0,service 3 year sale 3 year sale 2...,6,bloomsburg university of pennsylvania,formulate sale report and other metric using ...,business administration and marketing,analyst,"[0.41427964, -0.88509446, 0.3524626, 0.5670323..."
1,,1,1,thimble llc contract florida state univers...,washington dc,0,0,0,detailoriented methodical and analytical a...,0,microsoft office 5 year sa 2 year d...,2,florida state university,manually adjusted and corrected data collecti...,statistic,analyst,"[0.2914878, -0.59264576, 0.43134314, 0.6466263..."
2,,0,1,new york state office of mental health new...,new york ny,1,1,0,over 15 year of experience in information tec...,0,oracle 10 year pl sql 10 year sql ...,18,arizona state university annamalai univers...,worked on data warehouse project from incepti...,information system mechanical and production,engineer,"[0.58659595, -0.24278928, 0.47927913, 0.431336..."
3,,1,1,allied benefit system expedia inc aspi...,chicago il,0,0,0,,0,vba 3 year excel 3 year power bi ...,5,statistic b sc concordia university montreal,collected cleaned organized data from multi...,mathematics of finance,analyst,"[0.93572193, -0.3007371, 0.44101924, 0.4377344..."
4,license vmware certified professional vcp j...,0,1,presidio presidio global the george ...,reston va,1,1,0,6 year of experience in data center design i...,0,vmware 6 year cisco 6 year emc 6 ...,8,the george washington university universit...,presidio wa founded to fill a niche overlooke...,electrical engineering electronics enginee...,engineer,"[0.17530711, -0.3703728, -0.076127686, 0.11814..."


In [49]:
# Create a Numpy array of the document vectors
# Create a Numpy array of the document vectors.
# The width is read from the trained model instead of a hard-coded 100, so changing the
# Word2Vec dimensionality cannot cause a shape mismatch here.
np_vecs = np.zeros((len(df_), model.wv.vector_size))
for row, vec in enumerate(df_['vecs']):
    np_vecs[row, :] = vec

# Combine the full dataframe with the labels.
# reset_index() renumbers the rows 0..n-1 (keeping the old labels in an `index` column) so
# they line up with the freshly built vector frame.
w2v_data = pd.concat([df_.reset_index(), pd.DataFrame(np_vecs)], axis=1)

In [50]:
# Columns no longer needed once the text is represented by the embedding columns:
# the old row labels, the per-row vector object, and the raw text fields.
columns_to_drop = ['index', 'Additionals', 'vecs', 'Companies', 'Current Location',
                   'Resume Summary', 'Skills', 'Universities', 'Work Experiences', 'in']
w2v_data = w2v_data.drop(columns=columns_to_drop)
w2v_data.head()

Unnamed: 0,Analyst,Bachelors,Engineer,Masters,PhD,Scientist,Start Dates,y,0,1,...,90,91,92,93,94,95,96,97,98,99
0,1,1,0,0,0,0,6,analyst,0.41428,-0.885094,...,0.25369,0.018478,-0.135905,-0.812658,-0.206039,-0.429455,-0.375995,0.705037,0.211959,0.305924
1,1,1,0,0,0,0,2,analyst,0.291488,-0.592646,...,0.091699,-0.087341,-0.188046,-0.413816,-0.357783,-0.675707,-0.790313,0.477981,-0.170263,-0.303014
2,0,1,1,1,0,0,18,engineer,0.586596,-0.242789,...,0.607049,-0.277285,0.089873,-0.936028,0.136684,-0.193377,-0.427384,0.174778,0.108993,0.274367
3,1,1,0,0,0,0,5,analyst,0.935722,-0.300737,...,0.339273,-0.294313,0.121354,-0.538947,-0.113011,-0.777224,-0.537407,-0.062251,-0.406997,-0.05285
4,0,1,1,1,0,0,8,engineer,0.175307,-0.370373,...,-0.032867,-0.170324,-0.097706,-0.499279,-0.043168,-0.114414,-0.118862,0.3765,-0.09799,0.213679


#### Store data in MongoDB and pickle files.

In [59]:
client = MongoClient('localhost', 27017)
db = client['resumes']

# Round-trip through JSON so every row becomes a plain dict with string keys and
# native Python values; orient='records' yields one dict per row directly.
w2v_data_coll = json.loads(w2v_data.to_json(orient='records'))
# Collection.insert() is deprecated (and removed in PyMongo 4); insert_many() is the
# bulk replacement. `.inserted_ids` is the list of ObjectIds of the stored documents.
db.w2v.insert_many(w2v_data_coll).inserted_ids

[ObjectId('5b9aad759bd8d2174df93767'),
 ObjectId('5b9aad759bd8d2174df93768'),
 ObjectId('5b9aad759bd8d2174df93769'),
 ObjectId('5b9aad759bd8d2174df9376a'),
 ObjectId('5b9aad759bd8d2174df9376b'),
 ObjectId('5b9aad759bd8d2174df9376c'),
 ObjectId('5b9aad759bd8d2174df9376d'),
 ObjectId('5b9aad759bd8d2174df9376e'),
 ObjectId('5b9aad759bd8d2174df9376f'),
 ObjectId('5b9aad759bd8d2174df93770'),
 ObjectId('5b9aad759bd8d2174df93771'),
 ObjectId('5b9aad759bd8d2174df93772'),
 ObjectId('5b9aad759bd8d2174df93773'),
 ObjectId('5b9aad759bd8d2174df93774'),
 ObjectId('5b9aad759bd8d2174df93775'),
 ObjectId('5b9aad759bd8d2174df93776'),
 ObjectId('5b9aad759bd8d2174df93777'),
 ObjectId('5b9aad759bd8d2174df93778'),
 ObjectId('5b9aad759bd8d2174df93779'),
 ObjectId('5b9aad759bd8d2174df9377a'),
 ObjectId('5b9aad759bd8d2174df9377b'),
 ObjectId('5b9aad759bd8d2174df9377c'),
 ObjectId('5b9aad759bd8d2174df9377d'),
 ObjectId('5b9aad759bd8d2174df9377e'),
 ObjectId('5b9aad759bd8d2174df9377f'),
 ObjectId('5b9aad759bd8d2

In [43]:
# Save the trained Word2Vec model.
with open('W2V.pkl', 'wb') as handle:
    pickle.dump(model, handle)

In [51]:
# Save the Word2Vec feature frame.
with open('W2V_data.pkl', 'wb') as handle:
    pickle.dump(w2v_data, handle)

### Count Vectorizer

In [22]:
# set ngram range, stop words only english, regex patterns, set to lowercase
count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True)

# fit and transform the text
cv_data = count_vectorizer.fit_transform(df_combined['text'])

#### LSA

In [23]:
# Reduce the count matrix to 100 dimensions.
# random_state is fixed because TruncatedSVD's default solver is randomized; without it
# the components (and the pickled model / stored features) change from run to run.
lsa_cv = TruncatedSVD(n_components=100, random_state=42)

# fit and transform the text
lsa_cv_data = lsa_cv.fit_transform(cv_data)

In [24]:
# combine the lsa_cv text with the non-vectorized features
# Label and indicator columns taken from the string-cast frame `df` and appended
# next to the LSA components of the count matrix.
cv_meta_columns = ['y', 'Analyst', 'Scientist', 'Engineer', 'Start Dates', 'Bachelors',
                   'Masters', 'PhD']
lsa_cv_frame = pd.DataFrame(lsa_cv_data)
CV_LSA = pd.concat([lsa_cv_frame, df[cv_meta_columns]], axis=1)

In [25]:
# Preview the count-vectorizer LSA features together with the label columns.
CV_LSA.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,y,Analyst,Scientist,Engineer,Start Dates,Bachelors,Masters,PhD
0,8.338605,5.322429,4.703971,-3.301757,-2.369379,1.787536,0.355131,-2.653454,-0.439998,-2.412284,...,0.586919,0.530225,Analyst,1,0,0,6,1,0,0
1,16.898709,-1.338696,13.986113,-0.990019,-3.746873,-2.221212,3.844439,-0.669413,-0.592488,-1.099078,...,-0.029255,1.751036,Analyst,1,0,0,2,1,0,0
2,57.576805,13.517422,-1.897632,0.838416,1.465622,-7.855823,-1.659711,-0.591778,7.723553,-3.061332,...,-1.46194,-0.486254,Engineer,0,0,1,18,1,1,0
3,5.408049,1.395045,1.935934,-1.420569,-1.844288,-0.799884,2.049063,0.853001,0.657492,-1.677186,...,-0.37971,-0.350758,Analyst,1,0,0,5,1,0,0
4,14.585224,5.018236,7.206179,-3.417568,0.346719,-4.410483,-2.474445,1.600515,-3.146223,7.442911,...,-0.527037,-2.555981,Engineer,0,0,1,8,1,1,0


#### Store data in MongoDB and pickle files.

In [32]:
# save the model
# Write the fitted LSA model and the resulting feature frame to their pickle files.
lsa_cv_artifacts = (
    ('LSA_CV.pkl', lsa_cv),        # the model
    ('LSA_CV_data.pkl', CV_LSA),   # the data
)
for pickle_path, artifact in lsa_cv_artifacts:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(artifact, handle)

In [60]:
# store the data in MongoDB
client = MongoClient('localhost', 27017)
db = client['resumes']

# Round-trip through JSON so every row becomes a plain dict with string keys and
# native Python values; orient='records' yields one dict per row directly.
CV_LSA_coll = json.loads(CV_LSA.to_json(orient='records'))
# Collection.insert() is deprecated (and removed in PyMongo 4); insert_many() is the
# bulk replacement. `.inserted_ids` is the list of ObjectIds of the stored documents.
db.lsa_cv.insert_many(CV_LSA_coll).inserted_ids

[ObjectId('5b9aadaa9bd8d2174df98999'),
 ObjectId('5b9aadaa9bd8d2174df9899a'),
 ObjectId('5b9aadaa9bd8d2174df9899b'),
 ObjectId('5b9aadaa9bd8d2174df9899c'),
 ObjectId('5b9aadaa9bd8d2174df9899d'),
 ObjectId('5b9aadaa9bd8d2174df9899e'),
 ObjectId('5b9aadaa9bd8d2174df9899f'),
 ObjectId('5b9aadaa9bd8d2174df989a0'),
 ObjectId('5b9aadaa9bd8d2174df989a1'),
 ObjectId('5b9aadaa9bd8d2174df989a2'),
 ObjectId('5b9aadaa9bd8d2174df989a3'),
 ObjectId('5b9aadaa9bd8d2174df989a4'),
 ObjectId('5b9aadaa9bd8d2174df989a5'),
 ObjectId('5b9aadaa9bd8d2174df989a6'),
 ObjectId('5b9aadaa9bd8d2174df989a7'),
 ObjectId('5b9aadaa9bd8d2174df989a8'),
 ObjectId('5b9aadaa9bd8d2174df989a9'),
 ObjectId('5b9aadaa9bd8d2174df989aa'),
 ObjectId('5b9aadaa9bd8d2174df989ab'),
 ObjectId('5b9aadaa9bd8d2174df989ac'),
 ObjectId('5b9aadaa9bd8d2174df989ad'),
 ObjectId('5b9aadaa9bd8d2174df989ae'),
 ObjectId('5b9aadaa9bd8d2174df989af'),
 ObjectId('5b9aadaa9bd8d2174df989b0'),
 ObjectId('5b9aadaa9bd8d2174df989b1'),
 ObjectId('5b9aadaa9bd8d2

### TFIDF Vectorizer

In [26]:
# set ngram range, stop words only english, regex patterns, set to lowercase
# Vectorizer settings: unigrams and bigrams, English stop words removed, lower-cased input,
# and tokens restricted to runs of at least two letters a-z.
tfidf_vectorizer_params = dict(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
    lowercase=True,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_vectorizer_params)

# Learn the vocabulary / idf weights and build the TF-IDF matrix in one step.
tfidf_data = tfidf_vectorizer.fit_transform(df_combined['text'])

In [38]:
# Save the fitted TF-IDF vectorizer.
with open('tfidf_vectorizer.pkl', 'wb') as handle:
    pickle.dump(tfidf_vectorizer, handle)

#### LSA

In [27]:
# Reduce the TF-IDF matrix to 100 dimensions.
# random_state is fixed because TruncatedSVD's default solver is randomized; without it
# the components (and the pickled model / stored features) change from run to run.
lsa_tfidf = TruncatedSVD(n_components=100, random_state=42)

# fit and transform the text
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)

In [28]:
# combine the lsa_tfidf text with the non-vectorized features
# Label and indicator columns taken from the string-cast frame `df` and appended
# next to the LSA components of the TF-IDF matrix.
tfidf_meta_columns = ['y', 'Analyst', 'Scientist', 'Engineer', 'Start Dates', 'Bachelors',
                      'Masters', 'PhD']
lsa_tfidf_frame = pd.DataFrame(lsa_tfidf_data)
TFIDF_LSA = pd.concat([lsa_tfidf_frame, df[tfidf_meta_columns]], axis=1)

In [29]:
# Preview the TF-IDF LSA features together with the label columns.
TFIDF_LSA.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,y,Analyst,Scientist,Engineer,Start Dates,Bachelors,Masters,PhD
0,0.116538,0.064366,-0.048105,0.031719,-0.016665,0.017482,-0.034778,-0.048679,-0.017124,-0.001049,...,-0.002355,-0.015654,Analyst,1,0,0,6,1,0,0
1,0.142568,0.124692,0.021172,-0.040459,0.040853,-0.01998,-0.037097,-0.008927,0.024049,-0.008748,...,0.013125,0.015625,Analyst,1,0,0,2,1,0,0
2,0.296641,-0.054903,-0.090104,0.000666,0.012748,-0.012809,0.027657,0.037692,-0.057642,-0.070316,...,-0.018806,-0.005984,Engineer,0,0,1,18,1,1,0
3,0.104624,0.012313,-0.016656,-0.013489,0.014368,0.027677,-0.005994,-0.018574,-0.008545,-0.037078,...,0.006485,-0.011479,Analyst,1,0,0,5,1,0,0
4,0.111532,0.030344,-0.037361,0.048654,0.012509,0.02582,0.000665,0.112654,-0.025883,0.088656,...,0.002234,-0.019777,Engineer,0,0,1,8,1,1,0


#### Store data in MongoDB and pickle files.

In [33]:
# Write the fitted LSA model and the resulting feature frame to their pickle files.
lsa_tfidf_artifacts = (
    ('LSA_TFIDF.pkl', lsa_tfidf),        # the model
    ('TFIDF_LSA_data.pkl', TFIDF_LSA),   # the data
)
for pickle_path, artifact in lsa_tfidf_artifacts:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(artifact, handle)

In [61]:
client = MongoClient('localhost', 27017)
db = client['resumes']

# Round-trip through JSON so every row becomes a plain dict with string keys and
# native Python values; orient='records' yields one dict per row directly.
TFIDF_LSA_coll = json.loads(TFIDF_LSA.to_json(orient='records'))
# Collection.insert() is deprecated (and removed in PyMongo 4); insert_many() is the
# bulk replacement. `.inserted_ids` is the list of ObjectIds of the stored documents.
db.lsa_tfidf.insert_many(TFIDF_LSA_coll).inserted_ids

[ObjectId('5b9aadb39bd8d2174df9dbe6'),
 ObjectId('5b9aadb39bd8d2174df9dbe7'),
 ObjectId('5b9aadb39bd8d2174df9dbe8'),
 ObjectId('5b9aadb39bd8d2174df9dbe9'),
 ObjectId('5b9aadb39bd8d2174df9dbea'),
 ObjectId('5b9aadb39bd8d2174df9dbeb'),
 ObjectId('5b9aadb39bd8d2174df9dbec'),
 ObjectId('5b9aadb39bd8d2174df9dbed'),
 ObjectId('5b9aadb39bd8d2174df9dbee'),
 ObjectId('5b9aadb39bd8d2174df9dbef'),
 ObjectId('5b9aadb39bd8d2174df9dbf0'),
 ObjectId('5b9aadb39bd8d2174df9dbf1'),
 ObjectId('5b9aadb39bd8d2174df9dbf2'),
 ObjectId('5b9aadb39bd8d2174df9dbf3'),
 ObjectId('5b9aadb39bd8d2174df9dbf4'),
 ObjectId('5b9aadb39bd8d2174df9dbf5'),
 ObjectId('5b9aadb39bd8d2174df9dbf6'),
 ObjectId('5b9aadb39bd8d2174df9dbf7'),
 ObjectId('5b9aadb39bd8d2174df9dbf8'),
 ObjectId('5b9aadb39bd8d2174df9dbf9'),
 ObjectId('5b9aadb39bd8d2174df9dbfa'),
 ObjectId('5b9aadb39bd8d2174df9dbfb'),
 ObjectId('5b9aadb39bd8d2174df9dbfc'),
 ObjectId('5b9aadb39bd8d2174df9dbfd'),
 ObjectId('5b9aadb39bd8d2174df9dbfe'),
 ObjectId('5b9aadb39bd8d2