In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import docx2txt
from nltk.tokenize import WhitespaceTokenizer

import plotly.graph_objects as go
import plotly.express as px

import chart_studio.plotly as py

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('UpdatedResumeDataSet.csv', encoding='utf-8')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [4]:
import re

resumeDataSet = df.copy()
resumeDataSet['cleaned_resume'] = ''

def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))

resumeDataSet.head()

Unnamed: 0,Category,Resume,cleaned_resume
0,Data Science,Skills * Programming Languages: Python (pandas...,Skills Programming Languages Python pandas num...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,Education Details May 2013 to May 2017 B E UIT...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",Areas of Interest Deep Learning Control System...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",Education Details MCA YMCAUST Faridabad Haryan...


## Stopword Removal


In [5]:
import spacy
import pandas as pd

# Load the Spacy language model
nlp = spacy.load('en_core_web_sm')

# Function to remove stop words using Spacy
def removeStopwords(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return ' '.join(tokens)

# Apply stop word removal to the 'cleaned_resume' column
resumeDataSet['stopword_removed'] = resumeDataSet['cleaned_resume'].apply(removeStopwords)

# Create a table of job description and stop-word-removed data
job_description = resumeDataSet['Category']
stopword_removed_data = resumeDataSet['stopword_removed']
data = pd.DataFrame({'Job Description': job_description, 'Stopword Removed Data': stopword_removed_data})

# Print the resulting table
data.head()


Unnamed: 0,Job Description,Stopword Removed Data
0,Data Science,Skills Programming Languages Python pandas num...
1,Data Science,Education Details 2013 2017 B E UIT RGPV Data ...
2,Data Science,Areas Interest Deep Learning Control System De...
3,Data Science,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,Education Details MCA YMCAUST Faridabad Haryan...


## Tokenization

In [7]:
def tokenizeData(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

data['tokenized_data'] = data['Stopword Removed Data'].apply(tokenizeData)
data = data.drop(['Stopword Removed Data'], axis=1)
data.head()

Unnamed: 0,Job Description,Stopword Removed Data,tokenized_data
0,Data Science,Skills Programming Languages Python pandas num...,"[Skills, Programming, Languages, Python, panda..."
1,Data Science,Education Details 2013 2017 B E UIT RGPV Data ...,"[Education, Details, 2013, 2017, B, E, UIT, RG..."
2,Data Science,Areas Interest Deep Learning Control System De...,"[Areas, Interest, Deep, Learning, Control, Sys..."
3,Data Science,Skills R Python SAP HANA Tableau SAP HANA SQL ...,"[Skills, R, Python, SAP, HANA, Tableau, SAP, H..."
4,Data Science,Education Details MCA YMCAUST Faridabad Haryan...,"[Education, Details, MCA, YMCAUST, Faridabad, ..."


In [13]:

data.head()

Unnamed: 0,Job Description,tokenized_data
0,Data Science,"[Skills, Programming, Languages, Python, panda..."
1,Data Science,"[Education, Details, 2013, 2017, B, E, UIT, RG..."
2,Data Science,"[Areas, Interest, Deep, Learning, Control, Sys..."
3,Data Science,"[Skills, R, Python, SAP, HANA, Tableau, SAP, H..."
4,Data Science,"[Education, Details, MCA, YMCAUST, Faridabad, ..."


## Encoding

In [16]:
var_mod = ['Job Description']
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

data.head()

Unnamed: 0,Job Description,tokenized_data
0,6,"[Skills, Programming, Languages, Python, panda..."
1,6,"[Education, Details, 2013, 2017, B, E, UIT, RG..."
2,6,"[Areas, Interest, Deep, Learning, Control, Sys..."
3,6,"[Skills, R, Python, SAP, HANA, Tableau, SAP, H..."
4,6,"[Education, Details, MCA, YMCAUST, Faridabad, ..."


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500)
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)

print ("Feature completed .....")

X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=0, test_size=0.2)
print(X_train.shape)
print(X_test.shape)

Feature completed .....
(769, 1500)
(193, 1500)


In [15]:
clf = KNeighborsClassifier(n_neighbors=15)
clf = clf.fit(X_train, y_train)
yp = clf.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of KNeighbors Classifier on training set: 0.95
Accuracy of KNeighbors Classifier on test set: 0.94


In [16]:
class JobPredictor:
    def __init__(self) -> None:
        self.le = le
        self.word_vectorizer = word_vectorizer
        self.clf = clf

    def predict(self, resume):
        feature = self.word_vectorizer.transform([resume])
        predicted = self.clf.predict(feature)
        resume_position = self.le.inverse_transform(predicted)[0]
        return resume_position

    def predict_proba(self, resume):
        feature = self.word_vectorizer.transform([resume])
        predicted_prob = self.clf.predict_proba(feature)
        return predicted_prob[0]

In [17]:
job_description = """

Skills Required:

• Hands on years of working experience with ETL integration, Core JAVA, Spring Boot and APIs
• Good knowledge of DB2 or Azure SQL server (experience developing SQL queries)
• Understanding of File Transfer protocols and processes ie. FTP, SFTP, PGP Encryption
• Understanding mainframe integration for ETL processing
• Technical working experience with UNIX shell scripting
• Knowledge and understanding of Web Services
• Experience in developing ETL processes (preferably Talend, iWay, DataStage)
• Experience in writing/creating/updating technical documents
• Experience in batch job/process scheduling
• Familiarity with data integration and data streaming, WebSphere MQ and Communication Networks
• Familiarity with event driven programming concepts
• Exposure to Data Modelling and Data Architecture

Roles & Responsibilities:
• Act as an expert technical resource for problem analysis and solution implementation
• Work closely with Delivery and Technical Architecture teams, Product Owners and Technical Platform teams to design and develop high quality solutions supporting enterprise architecture and business process improvements that support our business and technical strategies
• Deal effectively with external Vendors, Business Partners, internal Stakeholders and Management
• Implement new systems or enhancements including, reviewing programs written by team members, establishing and supporting system test procedures, developing implementation plan, developing the required program and system documentation and ensuring all functionality has been delivered as required
• Provide post implementation support and training to the Production Support staff on the production processing functionality
• Support other development areas providing technical expertise, guidance, advice and knowledge transfer to staff and more junior Developers
• Coordinate and accommodate with a geographically dispersed team
• Pager rotation mandatory during critical processing times
"""


job_description2 = """

Skills Required:

Proven working experience as a Data Analyst or Business Data Analyst
Technical expertise regarding data models, database design development, data mining and segmentation techniques
Strong knowledge of and experience with reporting packages (Business Objects etc), databases (SQL etc), programming (XML, Javascript, or ETL frameworks)
Knowledge of statistics and experience using statistical packages for analyzing datasets (Excel, SPSS, SAS etc)
Strong analytical skills with the ability to collect, organize, analyze, and disseminate significant amounts of information with attention to detail and accuracy
Adept at queries, report writing and presenting findings
BS in Mathematics, Economics, Computer Science, Information Management or Statistics

Roles & Responsibilities:

Interpret data, analyze results using statistical techniques and provide ongoing reports
Develop and implement databases, data collection systems, data analytics and other strategies that optimize statistical efficiency and quality
Acquire data from primary or secondary data sources and maintain databases/data systems
Identify, analyze, and interpret trends or patterns in complex data sets
Filter and “clean” data by reviewing computer reports, printouts, and performance indicators to locate and correct code problems
Work with management to prioritize business and information needs
Locate and define new process improvement opportunities

"""

In [18]:
resume_position = JobPredictor().predict(job_description)
f'JD uploaded! Position: {resume_position}'

'JD uploaded! Position: ETL Developer'

In [19]:
text_tokenizer= WhitespaceTokenizer()
remove_characters= str.maketrans("", "", "±§!@#$%^&*()-_=+[]}{;'\:,./<>?|")
cv = CountVectorizer()

resume_docx = docx2txt.process('resume3.docx')

#takes the texts in a list
text_docx= [resume_docx, job_description]
#creating the list of words from the word document
words_docx_list = text_tokenizer.tokenize(resume_docx)
#removing speacial charcters from the tokenized words 
words_docx_list=[s.translate(remove_characters) for s in words_docx_list]
#giving vectors to the words
count_docx = cv.fit_transform(text_docx)
#using the alogorithm, finding the match between the resume/cv and job description
similarity_score_docx = cosine_similarity(count_docx)
match_percentage_docx= round((similarity_score_docx[0][1]*100),2)
f'Match percentage with the Job description: {match_percentage_docx}'

'Match percentage with the Job description: 42.27'

In [20]:
similarity_score_docx

array([[1.        , 0.42269407],
       [0.42269407, 1.        ]])

In [24]:
fig = go.Figure(go.Indicator(
    mode = "gauge+number",
    value = match_percentage_docx,
    domain = {'x': [0, 1], 'y': [0, 1]},
    title = {'text': "Match with JD"}))

fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
job_predictor = JobPredictor()
resume_position = job_predictor.predict(resume_docx)

chart_data = pd.DataFrame({
    "position": [cl for cl in job_predictor.le.classes_],
    "match": job_predictor.predict_proba(resume_docx)
})

fig = px.bar(chart_data, x="position", y="match",
                title=f'Resume matched to: {resume_position}')
fig.show()

In [None]:
uploaded_files = ['resume.docx', 'resume2.docx', 'resume3.docx']
job_predictor = JobPredictor()
job_positions = {x: 0 for x in [cl for cl in job_predictor.le.classes_]}
match_percentage = {}
for uploaded_file in uploaded_files:
    resume_docx = docx2txt.process(uploaded_file)
    resume_position = job_predictor.predict(resume_docx)
    job_positions[resume_position] += 1

#     job_description= docx2txt.process("temp_jd.docx")
    text_docx= [resume_docx, job_description]
    words_docx_list = text_tokenizer.tokenize(resume_docx)
    words_docx_list=[s.translate(remove_characters) for s in words_docx_list]
    count_docx = cv.fit_transform(text_docx)
    similarity_score_docx = cosine_similarity(count_docx)
    match_percentage_docx= round((similarity_score_docx[0][1]*100),2)
    match_percentage[uploaded_file.split('/')[-1]] = match_percentage_docx

In [None]:
match_chart_data = pd.DataFrame({
    "document": match_percentage.keys(),
    "percentage": match_percentage.values()
})

fig = px.bar(match_chart_data, x="document", y="percentage", title='Document Matched Percentage')
fig.show()