In [108]:
import pdfplumber
import itertools
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

from gensim.summarization import keywords
from gensim.summarization.summarizer import summarize

import nltk

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
stops = stopwords.words('english')
from string import punctuation

In [109]:
def extractPDFContent(filePath):
    """Return the text of the first page of a PDF.

    Parameters
    ----------
    filePath : str or path-like
        Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns
    -------
    str
        Text extracted from page 0 with ``x_tolerance=3``. An empty string
        is returned when the document has no pages or when the extractor
        yields nothing for the page, so callers can always treat the result
        as a string (e.g. call ``.split('\\n')`` on it).
    """
    # The context manager closes the underlying file handle even if
    # extraction raises; the original left the PDF open.
    with pdfplumber.open(filePath) as pdf:
        if not pdf.pages:
            return ''
        text = pdf.pages[0].extract_text(x_tolerance=3)
    # Normalise a None result to '' so the return type is always str.
    return text or ''

In [110]:
# Load the job description, keeping it both as a list of lines and as a
# single string (the lines retain their trailing newlines, so joining with
# '' reproduces the file content).
with open('./jobDesciption.txt') as f:
    jobDespLines = list(f)
jobDespText = ''.join(jobDespLines)

# Whitespace-split every line, flatten the per-line word lists into one
# sequence, and Porter-stem each word to build a stemmed copy of the text.
wordList = [line.split() for line in jobDespLines]
flattenWordList = list(itertools.chain.from_iterable(wordList))
porter = PorterStemmer()
stemmedWords = list(map(porter.stem, flattenWordList))
stemmedJobDespText = ' '.join(stemmedWords)


In [111]:
# Tokenize the raw (unstemmed) job-description text with NLTK's wordpunct
# tokenizer. No filtering is applied here, so every token it yields is kept.
tokens = nltk.wordpunct_tokenize(jobDespText)

In [112]:
# Compare normalizers side by side: the frame is indexed by the original
# tokens and has one column per stemmer/lemmatizer holding that token's
# normalized form.
normalizers = {
    'porter_stemmer': porter_stemmer.stem,
    'lancaster_stemmer': lancaster_stemmer.stem,
    'snowball_stemmer': snowball_stemmer.stem,
    'wordnet_lemmatizer': wordnet_lemmatizer.lemmatize,
}
tokenDf = pd.DataFrame(
    {
        name: [normalize(token) for token in tokens]
        for name, normalize in normalizers.items()
    },
    index=tokens,
)

In [113]:
# Upper bound on how many keywords are kept from each extracted keyword list
# (used as a slice length in the cells below).
keywordNum = 10

In [114]:
# For every normalizer column, rebuild a text from its normalized tokens,
# run gensim keyword extraction on it, and keep at most `keywordNum` of the
# returned keywords (gensim returns them newline-separated).
idxs = tokenDf.columns.tolist()
keywordDic = {
    column: keywords(' '.join(tokenDf[column]), ratio=0.1).split('\n')[:keywordNum]
    for column in idxs
}

In [115]:
# Tabulate the per-normalizer keyword lists, one column per normalizer.
# Rows are labelled with the keywords extracted from the raw text at the same
# position; the label is only a positional reference, because each column is
# extracted independently and is not aligned term-by-term with the index.
rawKeywords = keywords(jobDespText, ratio=0.1).split('\n')[:keywordNum]
keywordDf = pd.DataFrame(keywordDic, index=rawKeywords)

In [116]:
# Display the keyword comparison table as this cell's output.
keywordDf

Unnamed: 0,porter_stemmer,lancaster_stemmer,snowball_stemmer,wordnet_lemmatizer
data,data,dat,data,data
business,use,busy,use,business
experience,experi,expery,experi,experience
experiences,model,model,model,model
models,busi,develop,busi,modeling
modeling,develop,techn,develop,statistical
model,statist comput,solv,statist comput,statistics
statistical,regress,stat comput,regress,statistic
statistics,techniqu,analys company,techniqu,development
development,tool,regress,tool,develop


In [117]:
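# NOTE: `wordnet_lemmatizer` is the WordNetLemmatizer instance, not a sequence of strings,
# so joining it directly would raise TypeError; join the lemmatized token column instead.
# test = ' '.join(tokenDf['wordnet_lemmatizer'])
# print(f"[ Keywords ]\n{keywords(test, ratio=0.1)}")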

In [118]:
# Extract the resume text from the PDF (extractPDFContent reads only the
# first page) and also keep it split into lines.
filePath = './resume_v2.pdf'
resumeText = extractPDFContent(filePath)
resumeTextList = resumeText.split('\n')

In [119]:
# Match Score
# Vectorize the resume and the job description as raw term counts over a
# shared vocabulary, then report the cosine similarity between the two
# documents as a percentage rounded to two decimals.
inputText = [resumeText, jobDespText]
cv = CountVectorizer()
countMatrix = cv.fit_transform(inputText)
similarityMatrix = cosine_similarity(countMatrix)
# Entry [0, 1] is resume (row 0) versus job description (row 1).
matchPercentage = round(similarityMatrix[0, 1] * 100, 2)
print(f"[ Resume Match Score ]\n{matchPercentage}")

[ Resume Match Score ]
65.78


In [120]:
# Print gensim's summary of the raw job description; ratio=0.1 is the
# fraction argument passed to summarize (presumably the share of sentences
# to keep — confirm against the gensim docs for the installed version).
print(f"[ Summary ]\n{summarize(jobDespText, ratio=0.1)}")

[ Summary ]
We are looking for a Data Scientist who will support our product, sales, leadership and marketing teams with insights gained from analyzing company data.
The ideal candidate is adept at using large data sets to find opportunities for product and process optimization and using models to test the effectiveness of different courses of action.
Mine and analyze data from company databases to drive optimization and improvement of product development, marketing techniques and business strategies.
