In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pyLDAvis.gensim
import re
import spacy
import squarify

from collections import Counter
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load in the Data

In [4]:
df = pd.read_csv('indeed_fswd_4_16_2020.csv', index_col=0)
df.head()

Unnamed: 0,job_title,company_name,location,summary,salary
0,Full Stack Web Developer,ItsWorthMore.com LLC,"Sanford, FL 32771",Seeking an experienced full stack developer (P...,
1,In-House Full Stack Web Developer (Open Cart E...,"Central Infusion Alliance, Inc.","Skokie, IL 60076",Functionality of the lead form on the homepage...,$50 - $75 an hour
2,Full Stack Software Developer,Vectrus,"Dahlgren, VA",Perform full stack development to include back...,
3,.NET Full Stack Developer or Entry Level .NET ...,"Steel King Industries, Inc.","Stevens Point, WI 54481",Only full-time employees eligible.\nYou will c...,
4,Full Stack Developer,Vivid Racing,"Gilbert, AZ 85233",Solid understanding of web security.\nWrite co...,"$70,000 - $80,000 a year"


# Create the Tokenizer and filter common stopwords

In [5]:
STOPWORDS = set(STOPWORDS)

In [6]:
# Tokenizer for text
def tokenizer(text):
    new_text = re.sub(r'[^a-zA-Z ^0-9]', '', text)
    return [token for token in simple_preprocess(new_text) if token not in STOPWORDS]

In [8]:
# Just a quick view of this in action
tokenizer(df['summary'][0])

['seeking',
 'experienced',
 'stack',
 'developer',
 'php',
 'javascript',
 'mysql',
 'htmlcss',
 'convert',
 'existing',
 'internal',
 'application',
 'saas',
 'application',
 'continue']

In [9]:
# Apply to the summary column
df['tokenized_summary'] = df['summary'].apply(tokenizer)
df['tokenized_summary'][:5]

0    [seeking, experienced, stack, developer, php, ...
1    [functionality, lead, form, homepageweb, devel...
2    [perform, stack, development, include, backend...
3    [fulltime, employees, eligibleyou, collaborate...
4    [solid, understanding, web, securitywrite, cod...
Name: tokenized_summary, dtype: object

# Vector Representation of the Data

In [11]:
# First, join the tokens together in a new column
text = [' '.join(doc) for doc in df['tokenized_summary']]
df['joined_tokens'] = text

In [12]:
# Next, define the vectorizor we will use and fit the model
# We will use TFIDF vectorizer as it is good for a baseline in document models
tfidf = TfidfVectorizer(stop_words='english',
                       ngram_range=(1,3))
tfidf.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [14]:
# after transforming the vectors, we show the tokens and n_grams in the text
dtm = tfidf.transform(text)
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
dtm

Unnamed: 0,abreast,abreast ofdevelopments,abreast ofdevelopments web,academicproject,academicproject experience,academicproject experience working,actively,actively recruiting,actively recruiting fullstack,administering,...,write optimize,write optimize mysql,year,year preferred,years,years direct,years direct work,years preferred,years preferredsass,years preferredsass syntactically
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.296384,0.0,0.0,0.166207,0.166207,0.166207
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.183091,0.183091,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000
59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000


In [16]:
# Next, we will fit a NearestNeighbors model on the data
# When we do this, we will be able to query with dummy-job description
# to find out which job listings closely match our string.

# Parameters:
# - n_neighbors (the number of closely-related searches)
# - algorithm (how it compiles the data)
nn = NearestNeighbors(n_neighbors=15, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                 radius=1.0)

In [20]:
# Write  resume with which to query
fake_resume = ['''
Full Stack Developer with a track record of creating effective programs and projects quickly, without sacrificing quality or client needs.
Lifelong learner committed to staying current on new technologies.  
Team player willing to take the lead on executing tasks and experimenting with new ideas. 
''']

In [21]:
new = tfidf.transform(fake_resume)

In [22]:
new

<1x674 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [40]:
nn.kneighbors(new.todense())[1][0]

array([29,  6, 58, 23,  0, 42, 19,  2, 57, 25, 36, 13, 52, 33, 10],
      dtype=int64)

In [59]:
for i in nn.kneighbors(new.todense())[1][0]:
    print(df['company_name'][i], '\n')

Rethink 

Rethink 

Rethink 

ItsWorthMore.com LLC 

ItsWorthMore.com LLC 

ItsWorthMore.com LLC 

FullStackTechies 

Vectrus 

Vectrus 

Vectrus 

Cloud Haven Solutions 

Cloud Haven Solutions 

Cloud Haven Solutions 

Sparksoft Corporation 

Sparksoft Corporation 

