In [48]:
!pip install contractions
!pip install pdfplumber
import os
import re
# import PyPDF2
import pdfplumber
import pandas as pd
import numpy as np

import string # for text cleaning
import contractions # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")



In [49]:
# Define a function to extract information from a PDF
def extract_information(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        resume_text = ""
        for page in pdf.pages:
            resume_text = " ".join([resume_text, page.extract_text()])
    resume_text = resume_text.strip()
    return resume_text
    
!pip install python-docx

from docx import Document
from docx.oxml import OxmlElement

def extract_information_docx(docx_path):
    doc = Document(docx_path)
    resume_text = ""

    for paragraph in doc.paragraphs:
        resume_text += paragraph.text + " "

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    resume_text += paragraph.text + " "

    resume_text = resume_text.strip()
    return resume_text



In [50]:
import re

def extract_details(resume_text):
    # Define regular expressions to extract Skills, Education, and Experience
    skills_pattern = r'Skills\s*([\s\S]*?)(?=\n[A-Z]|$)'
    education_pattern = r'Education\s*([\s\S]*?)(?=\n[A-Z]+[^\n]*\n|$)'
    experience_pattern = r'Experience[^\s]*\s+([\s\S]*?)(?=\n[A-Z]+[^\n]*\n|$)'
    
    # Get Skills, Education, and Experience
    skills_match = re.findall(skills_pattern, resume_text, re.DOTALL | re.IGNORECASE)
    education_match = re.findall(education_pattern, resume_text, re.DOTALL | re.IGNORECASE)
    experience_match = re.findall(experience_pattern, resume_text, re.DOTALL | re.IGNORECASE)
    
    # Skills, Education, and Experience
    if len(skills_match) != 0:
        skills = skills_match[0]
    else:
        skills_pattern = r'skills\n((?:.*)*)' 
        skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
        if len(skills_match) != 0:
            skills = skills_match[0]
        else:
            skills = None
    
    if len(education_match) != 0:
        education = education_match[0]
    else:
        education = None
    
    if len(experience_match) != 0:
        experience = experience_match[0]
    else:
        experience = None
    
    return {
        'Skills': skills,
        'Education': education,
        'Experience': experience
    }


In [51]:
%%time

# Your existing code
data_folder = 'data/data'
resume_data = []

# Iterate through sub-folders and files
for category_folder in os.listdir(data_folder):
    category_path = os.path.join(data_folder, category_folder)
    if os.path.isdir(category_path):
        for file in os.listdir(category_path):
            file_path = os.path.join(category_path, file)
            if file.endswith('.pdf'):
                text = extract_information(file_path)
            elif file.endswith('.docx'):
                text = extract_information_docx(file_path)
            else:
                continue

            details = extract_details(text)
            details['ID'] = file.replace('.pdf', '').replace('.docx', '')
            details['Category'] = category_folder
            resume_data.append(details)

print('Extraction Done!')

Extraction Done!
CPU times: user 21min 21s, sys: 1.01 s, total: 21min 22s
Wall time: 21min 44s


In [52]:
resume_df = pd.DataFrame(resume_data)
resume_df.to_csv('./pdf_extracted_skills_education.csv', index=False)

In [53]:
resume_df.shape

(2484, 5)

In [54]:
# Null values
resume_df.isna().sum()

# It seems that Education was not extracted properly

Skills        25
Education     22
Experience    55
ID             0
Category       0
dtype: int64

In [55]:
resume_df['Skills'] = resume_df['Skills'].fillna("Unknown")
resume_df['Education'] = resume_df['Education'].fillna("Unknown")
resume_df['Experience'] = resume_df['Experience'].fillna("Unknown")
resume_df = resume_df.reset_index( drop=True)

In [56]:
# We have 15 Resumes where Skills & Education were not extracted
# So, let's remove them

print(resume_df[(resume_df.Skills.isna() & resume_df.Education.isna())])

Empty DataFrame
Columns: [Skills, Education, Experience, ID, Category]
Index: []


In [57]:
resume_df.isna().sum()

Skills        0
Education     0
Experience    0
ID            0
Category      0
dtype: int64

In [58]:
# We are left with 2469 resumes after removing those 15 resumes with null data in both of them

print(resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].shape)

cv_df = resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].reset_index(drop=True)
cv_df.head()

(2484, 5)


Unnamed: 0,Skills,Education,Experience,ID,Category
0,"care planning, Case Management, Home Health, H...",and Training\n1976,09/2010 to 12/2011,11174187,ADVOCATE
1,with a strong motivation to succeed. I am seen as,"Bachelor of Science : Psychology , 04/2017",Unknown,25873425,ADVOCATE
2,", time management and multi-tasking abilities;","Bachelors of Science , Physical Education 2010...",working in a student-focused learning environm...,22391901,ADVOCATE
3,Active listening,High School Diploma : General-Business Communi...,Customer Service Advocate,20400279,ADVOCATE
4,Experience,"levels, physical abilities, and career goals.",Advocate 01/2012 to Current Company Name City ...,37348041,ADVOCATE


In [59]:
# New number of null values in Skills & Education Section
cv_df.isna().sum()

Skills        0
Education     0
Experience    0
ID            0
Category      0
dtype: int64

In [60]:
# Null values in Skills Section
cv_df[cv_df.Skills.isna()]

Unnamed: 0,Skills,Education,Experience,ID,Category


In [61]:
# Null values in Education Section
cv_df[cv_df.Education.isna()]


Unnamed: 0,Skills,Education,Experience,ID,Category


In [62]:
cv_df.Category.value_counts()

Category
BUSINESS-DEVELOPMENT      120
INFORMATION-TECHNOLOGY    120
FINANCE                   118
CHEF                      118
ENGINEERING               118
ADVOCATE                  118
ACCOUNTANT                118
FITNESS                   117
AVIATION                  117
SALES                     116
CONSULTANT                115
HEALTHCARE                115
BANKING                   115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64

In [63]:
!pip install contractions

import pandas as pd
import string
import contractions
import re
from tqdm import tqdm

def text_cleaning(text: str) -> str:
    if pd.isnull(text):
        return
    
    # lower-case everything
    text = text.lower().strip()
    
    # For removing punctuations
    translator = str.maketrans('', '', string.punctuation)
    
    # Expand all the short-form words
    text = contractions.fix(text)
    
    # Remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)  # Remove phone numbers
    text = text.translate(translator)  # Remove punctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove other non-alphanumeric characters
    
    return text.strip()

# Let's stitch together Skills & Education, similar to given in job description.
resume_df = cv_df.copy()  # Assuming cv_df is your original DataFrame

# Filling the null values in Skills & Education with Empty String before concatenating them
resume_df = resume_df.fillna(value='')

resume_df['CV'] = resume_df['Skills'] + ' ' + resume_df['Education']
resume_df['CV'] = resume_df['CV'].apply(text_cleaning)




In [64]:
resume_df.head()

Unnamed: 0,Skills,Education,Experience,ID,Category,CV
0,"care planning, Case Management, Home Health, H...",and Training\n1976,09/2010 to 12/2011,11174187,ADVOCATE,care planning case management home health hosp...
1,with a strong motivation to succeed. I am seen as,"Bachelor of Science : Psychology , 04/2017",Unknown,25873425,ADVOCATE,with a strong motivation to succeed i am seen ...
2,", time management and multi-tasking abilities;","Bachelors of Science , Physical Education 2010...",working in a student-focused learning environm...,22391901,ADVOCATE,time management and multitasking abilities bac...
3,Active listening,High School Diploma : General-Business Communi...,Customer Service Advocate,20400279,ADVOCATE,active listening high school diploma generalb...
4,Experience,"levels, physical abilities, and career goals.",Advocate 01/2012 to Current Company Name City ...,37348041,ADVOCATE,experience levels physical abilities and caree...


In [65]:
category_stats = []

for category in resume_df['Category'].unique():
    category_wise_cv = resume_df[resume_df['Category'] == category]['CV']
    stats = category_wise_cv.str.split().str.len().describe(percentiles=[0.05, 0.5, 0.8, 0.9, 0.95])
    category_stats.append({'Category': category, **stats.to_dict()})

stats_df = pd.DataFrame(category_stats)

# Display the resulting DataFrame
stats_df

Unnamed: 0,Category,count,mean,std,min,5%,50%,80%,90%,95%,max
0,ADVOCATE,118.0,14.957627,7.539519,2.0,4.0,15.0,20.6,23.0,27.6,37.0
1,TEACHER,102.0,15.166667,9.284378,0.0,2.05,15.0,22.0,25.9,29.95,52.0
2,ACCOUNTANT,118.0,16.686441,9.112496,2.0,4.0,15.0,23.0,30.0,34.0,47.0
3,HR,110.0,15.918182,8.86818,0.0,3.45,15.0,23.0,27.0,31.1,43.0
4,BPO,22.0,18.727273,9.842482,0.0,3.1,19.5,26.0,28.9,33.75,35.0
5,CONSULTANT,115.0,16.695652,8.678643,2.0,4.0,16.0,24.0,29.0,32.3,37.0
6,FITNESS,117.0,15.897436,8.273748,2.0,4.0,16.0,23.8,27.0,30.2,35.0
7,APPAREL,97.0,15.845361,7.650354,1.0,3.8,17.0,22.0,24.4,28.0,37.0
8,ARTS,103.0,15.475728,8.291083,0.0,3.0,17.0,21.6,25.0,29.8,36.0
9,DESIGNER,107.0,17.336449,9.394844,2.0,3.3,16.0,24.0,30.4,33.0,48.0


In [66]:
overall_infos = []
for i in range(0, resume_df.shape[0]):
    overall_infos.append(resume_df['Skills'][i]+' '+resume_df['Education'][i]+' '+resume_df['Experience'][i])
resume_df['overall_infos'] = overall_infos

In [67]:
df_new = resume_df[['overall_infos']]
df_new.head(2)

Unnamed: 0,overall_infos
0,"care planning, Case Management, Home Health, H..."
1,with a strong motivation to succeed. I am seen...


In [68]:
#Stopwords help us to get rid of unwanted words like: a, an, are, is, ...
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [69]:
def text_preprocessing(column):
    #make all words with lower letters
    column = column.str.lower()
    #getting rid of any punctution
    column = column.str.replace('http\S+|www.\S+|@|%|:|,|', '', case=False)
    #spliting each sentence to words to apply previous funtions on them 
    word_tokens = column.str.split()
    keywords = word_tokens.apply(lambda x: [item for item in x if item not in stop])
    #assemble words of each sentence again and assign them in new column
    for i in range(len(keywords)):
        keywords[i] = " ".join(keywords[i])
        column = keywords

    return column

In [70]:
df_new.loc[:, 'cleaned_infos'] = text_preprocessing(df_new['overall_infos'])

In [71]:
resume_df['overall_infos']

0       care planning, Case Management, Home Health, H...
1       with a strong motivation to succeed. I am seen...
2       , time management and multi-tasking abilities;...
3       Active listening High School Diploma : General...
4       Experience levels, physical abilities, and car...
                              ...                        
2479    Strong problem solving ability High School Dip...
2480    ; talent for quickly establishing and Training...
2481    billing, credit, English, forms, insurance, pr...
2482    in promoting products and boosting revenue by ...
2483    OSHA inspections opportunities for all staff 1...
Name: overall_infos, Length: 2484, dtype: object

In [72]:
df_new['cleaned_infos']

0       care planning, case management, home health, h...
1       strong motivation succeed. seen bachelor scien...
2       , time management multi-tasking abilities; bac...
3       active listening high school diploma : general...
4       experience levels, physical abilities, career ...
                              ...                        
2479    strong problem solving ability high school dip...
2480    ; talent quickly establishing training seminar...
2481    billing, credit, english, forms, insurance, pr...
2482    promoting products boosting revenue connecting...
2483    osha inspections opportunities staff 12/2013 0...
Name: cleaned_infos, Length: 2484, dtype: object

In [73]:
!pip install torch

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity



In [74]:
import re

def clean_text(text):
    # Remove special characters and non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

# Example job descriptions
job_descriptions = [
    '''
Job Description

 

Deliver and manage cybersecurity services and solutions .
Perform vulnerability assessment using VA tools (Nessus, Burp, etc…)
Perform external and internal penetration tests including manual testing for vulnerability exploitation (preparing scripts etc.).
Perform web app and mobile app (android & iOS) penetration tests
Perform configuration review for network devices
Perform source code reviews
Identify risks and its impact on business
Develop remediation plans and implement fixes
Manage client relation as part of project delivery
Work with company’s team to develop and enhance Business’s Security services and solutions.
Support the sales team during pre-sales activities
 

 

Required Experience & Skills

5+ years of experience in pen testing .
Technical skills in external and internal penetration tests.
Technical skills in web application testing
Vulnerability assessment and business impact analysis.
Familiar with manual testing for vulnerability exploitation (preparing scripts etc.).
Familiar with testing tools (Nessus, Burp)
Strong reporting, communication and presentation skills.
 

Education

Bachelor/Master’s degree in computer science or equivalent.
Certifications in offensive security (OSCP or equivalent) preferred.

'''
]

# Apply the cleaning function to each job description
cleaned_descriptions = [clean_text(desc) for desc in job_descriptions]

# Display the cleaned job descriptions
for original, cleaned in zip(job_descriptions, cleaned_descriptions):
    print(f"Original: {original}\nCleaned: {cleaned}\n")


Original: 
Job Description

 

Deliver and manage cybersecurity services and solutions .
Perform vulnerability assessment using VA tools (Nessus, Burp, etc…)
Perform external and internal penetration tests including manual testing for vulnerability exploitation (preparing scripts etc.).
Perform web app and mobile app (android & iOS) penetration tests
Perform configuration review for network devices
Perform source code reviews
Identify risks and its impact on business
Develop remediation plans and implement fixes
Manage client relation as part of project delivery
Work with company’s team to develop and enhance Business’s Security services and solutions.
Support the sales team during pre-sales activities
 

 

Required Experience & Skills

5+ years of experience in pen testing .
Technical skills in external and internal penetration tests.
Technical skills in web application testing
Vulnerability assessment and business impact analysis.
Familiar with manual testing for vulnerability explo

In [19]:
# # We have 15 Resumes where Skills & Education were not extracted -- REFER EDA notebook
# # So, let's remove them
# cv_df = resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].reset_index(drop=True)

# # Filling the null values in Skills & Education with Empty String before concatinating them
# cv_df = cv_df.fillna(value='')

# # Let's stitch together Skills & Education, similar to given in job description.
# cv_df['CV'] = cv_df['Skills'] + ' ' + cv_df['Education']

# # Doing text cleaning
# cv_df['CV'] = cv_df['CV'].apply(text_cleaning)

In [20]:
# cv_df.shape

(4, 5)

In [21]:


# # Sample resumes (replace with your extracted resume data)
# resumes = cv_df['CV'].to_list()

In [75]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Assuming df_new['cleaned_infos'][4] contains a list of resumes
resumes = df_new['cleaned_infos'][4]


# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize and embed job descriptions
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

# Tokenize and embed resumes
resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D


In [76]:
# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.21590823, 0.23894486, 0.2134819 , 0.21590823, 0.24512848,
        0.23702353, 0.21590823, 0.201846  , 0.24643421, 0.21590823,
        0.05252843, 0.21735987, 0.21590823, 0.20046145, 0.21590823,
        0.21735987, 0.23658565, 0.23169848, 0.05252843, 0.2134819 ,
        0.18756524, 0.25091588, 0.23658565, 0.23702353, 0.24643421,
        0.23533645, 0.21735987, 0.05252843, 0.23533645, 0.2581148 ,
        0.23702353, 0.21735987, 0.23702353, 0.20158684, 0.23702353,
        0.21590823, 0.23658565, 0.23169848, 0.05252843, 0.24643421,
        0.23533645, 0.24512848, 0.21590823, 0.21590823, 0.24512848,
        0.05252843, 0.22676277, 0.23226923, 0.23533645, 0.21735987,
        0.23658565, 0.2526767 , 0.05252843, 0.23533645, 0.21614738,
        0.20046145, 0.23226923, 0.24643421, 0.23533645, 0.20158684,
        0.21590823, 0.05252843, 0.22303173, 0.1777793 , 0.23395087,
        0.2046431 , 0.22303173, 0.1777793 , 0.2046431 , 0.05252843,
        0.24643421, 0.21831267, 0.24512848, 0.24

In [78]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(cleaned_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(cleaned_descriptions):
    print(f"Top candidates for JD {i+1}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f}")
        # Print additional information if needed
        # print(f"  Resume: {resumes[candidate_index]}")
    print()

Top candidates for JD 1
  Candidate 30 - Similarity Score: 0.2581
  Candidate 52 - Similarity Score: 0.2527
  Candidate 22 - Similarity Score: 0.2509
  Candidate 85 - Similarity Score: 0.2509
  Candidate 95 - Similarity Score: 0.2509

