In [72]:
!pip install contractions
!pip install pdfplumber
import os
import re
# import PyPDF2
import pdfplumber
import pandas as pd
import numpy as np

import string # for text cleaning
import contractions # for expanding short form words
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")



In [73]:
# Define a function to extract information from a PDF
def extract_information(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        resume_text = ""
        for page in pdf.pages:
            resume_text = " ".join([resume_text, page.extract_text()])
    resume_text = resume_text.strip()
    return resume_text
    
!pip install python-docx

from docx import Document
from docx.oxml import OxmlElement

def extract_information_docx(docx_path):
    doc = Document(docx_path)
    resume_text = ""

    for paragraph in doc.paragraphs:
        resume_text += paragraph.text + " "

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    resume_text += paragraph.text + " "

    resume_text = resume_text.strip()
    return resume_text



In [74]:
# Define a function to extract Skills, and Education
def extract_details(resume_text):
    # Define regular expressions to extract Skills & Education
    skills_pattern = r'Skills\s*([\s\S]*?)(?=\n[A-Z]|$)'
    education_pattern = r'Education\s*([\s\S]*?)(?=\n[A-Z]+[^\n]*\n|$)'
    
    # Get Skills & Education
    skills_match = re.findall(skills_pattern, resume_text, re.DOTALL |re.IGNORECASE)
    education_match = re.findall(education_pattern, resume_text, re.DOTALL |re.IGNORECASE)
    
    # Skills & Education
    if len(skills_match)!=0:
        skills = skills_match[0]
    else:
        skills_pattern = r'skills\n((?:.*)*)' 
        skills_match = re.findall(skills_pattern, resume_text, re.DOTALL)
        if len(skills_match)!=0:
            skills = skills_match[0]
        else:
            skills = None
            
    if len(education_match)!=0:
        education = education_match[0]
    else:
        education = None
    
    return {
        'Skills': skills,
        'Education': education
    }

In [75]:
%%time

# Your existing code
data_folder = 'media/cvs'
resume_data = []

# Iterate through sub-folders and files
for category_folder in os.listdir(data_folder):
    category_path = os.path.join(data_folder, category_folder)
    if os.path.isdir(category_path):
        for file in os.listdir(category_path):
            file_path = os.path.join(category_path, file)
            if file.endswith('.pdf'):
                text = extract_information(file_path)
            elif file.endswith('.docx'):
                text = extract_information_docx(file_path)
            else:
                continue

            details = extract_details(text)
            details['ID'] = file.replace('.pdf', '').replace('.docx', '')
            details['Category'] = category_folder
            resume_data.append(details)

print('Extraction Done!')

Extraction Done!
CPU times: user 374 ms, sys: 4.03 ms, total: 378 ms
Wall time: 399 ms


In [76]:
resume_df = pd.DataFrame(resume_data)
resume_df.to_csv('./pdf_extracted_skills_education.csv', index=False)

In [77]:
resume_df.shape

(5, 4)

In [78]:
# Null values
resume_df.isna().sum()

# It seems that Education was not extracted properly

Skills       2
Education    1
ID           0
Category     0
dtype: int64

In [79]:
# We have 15 Resumes where Skills & Education were not extracted
# So, let's remove them

print(resume_df[(resume_df.Skills.isna() & resume_df.Education.isna())])

  Skills Education                      ID Category
4   None      None  sprint_meeting_minutes       IT


In [80]:
# We are left with 2469 resumes after removing those 15 resumes with null data in both of them

print(resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].shape)

cv_df = resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].reset_index(drop=True)
cv_df.head()

(4, 4)


Unnamed: 0,Skills,Education,ID,Category
0,● Background and advanced knowledge in compute...,● Refactory Academy | Software Engineering | M...,VANESSA SUZAN NALUGYA,IT
1,": Computer-literate, well-versed with most co...",AND QUALIFICATIONS NDEJJE UNIVERSITY 2010-...,BRENDA CV,IT
2,● Backgroundandadvancedknowledgeincomputerdeve...,● Refactory Academy | Software Engineering | M...,CV,IT
3,,: University of the North,John_Snow_CV,IT


In [70]:
# New number of null values in Skills & Education Section
cv_df.isna().sum()

Skills       0
Education    2
ID           0
Category     0
dtype: int64

In [71]:
# Null values in Skills Section
cv_df[cv_df.Skills.isna()]

Unnamed: 0,Skills,Education,ID,Category


In [38]:
# Null values in Education Section
cv_df[cv_df.Education.isna()]


Unnamed: 0,Skills,Education,ID,Category


In [39]:
cv_df.Category.value_counts()

Category
IT    1
Name: count, dtype: int64

In [40]:
!pip install contractions

import pandas as pd
import string
import contractions
import re
from tqdm import tqdm

def text_cleaning(text: str) -> str:
    if pd.isnull(text):
        return
    
    # lower-case everything
    text = text.lower().strip()
    
    # For removing punctuations
    translator = str.maketrans('', '', string.punctuation)
    
    # Expand all the short-form words
    text = contractions.fix(text)
    
    # Remove any special chars
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)  # Remove phone numbers
    text = text.translate(translator)  # Remove punctuations
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove other non-alphanumeric characters
    
    return text.strip()

# Let's stitch together Skills & Education, similar to given in job description.
resume_df = cv_df.copy()  # Assuming cv_df is your original DataFrame

# Filling the null values in Skills & Education with Empty String before concatenating them
resume_df = resume_df.fillna(value='')

resume_df['CV'] = resume_df['Skills'] + ' ' + resume_df['Education']
resume_df['CV'] = resume_df['CV'].apply(text_cleaning)




In [41]:
resume_df.head()

Unnamed: 0,Skills,Education,ID,Category,CV
0,● Backgroundandadvancedknowledgeincomputerdeve...,● Refactory Academy | Software Engineering | M...,CV,IT,backgroundandadvancedknowledgeincomputerdevelo...


In [42]:
category_stats = []

for category in resume_df['Category'].unique():
    category_wise_cv = resume_df[resume_df['Category'] == category]['CV']
    stats = category_wise_cv.str.split().str.len().describe(percentiles=[0.05, 0.5, 0.8, 0.9, 0.95])
    category_stats.append({'Category': category, **stats.to_dict()})

stats_df = pd.DataFrame(category_stats)

# Display the resulting DataFrame
stats_df

Unnamed: 0,Category,count,mean,std,min,5%,50%,80%,90%,95%,max
0,IT,1.0,24.0,,24.0,24.0,24.0,24.0,24.0,24.0,24.0


In [43]:
!pip install torch

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity



In [44]:
import re

def clean_text(text):
    # Remove special characters and non-alphanumeric characters
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

# Example job descriptions
job_descriptions = [
    '''
Job Description

 

Deliver and manage cybersecurity services and solutions .
Perform vulnerability assessment using VA tools (Nessus, Burp, etc…)
Perform external and internal penetration tests including manual testing for vulnerability exploitation (preparing scripts etc.).
Perform web app and mobile app (android & iOS) penetration tests
Perform configuration review for network devices
Perform source code reviews
Identify risks and its impact on business
Develop remediation plans and implement fixes
Manage client relation as part of project delivery
Work with company’s team to develop and enhance Business’s Security services and solutions.
Support the sales team during pre-sales activities
 

 

Required Experience & Skills

5+ years of experience in pen testing .
Technical skills in external and internal penetration tests.
Technical skills in web application testing
Vulnerability assessment and business impact analysis.
Familiar with manual testing for vulnerability exploitation (preparing scripts etc.).
Familiar with testing tools (Nessus, Burp)
Strong reporting, communication and presentation skills.
 

Education

Bachelor/Master’s degree in computer science or equivalent.
Certifications in offensive security (OSCP or equivalent) preferred.

'''
]

# Apply the cleaning function to each job description
cleaned_descriptions = [clean_text(desc) for desc in job_descriptions]

# Display the cleaned job descriptions
for original, cleaned in zip(job_descriptions, cleaned_descriptions):
    print(f"Original: {original}\nCleaned: {cleaned}\n")


Original: 
Job Description

 

Deliver and manage cybersecurity services and solutions .
Perform vulnerability assessment using VA tools (Nessus, Burp, etc…)
Perform external and internal penetration tests including manual testing for vulnerability exploitation (preparing scripts etc.).
Perform web app and mobile app (android & iOS) penetration tests
Perform configuration review for network devices
Perform source code reviews
Identify risks and its impact on business
Develop remediation plans and implement fixes
Manage client relation as part of project delivery
Work with company’s team to develop and enhance Business’s Security services and solutions.
Support the sales team during pre-sales activities
 

 

Required Experience & Skills

5+ years of experience in pen testing .
Technical skills in external and internal penetration tests.
Technical skills in web application testing
Vulnerability assessment and business impact analysis.
Familiar with manual testing for vulnerability explo

In [45]:
# We have 15 Resumes where Skills & Education were not extracted -- REFER EDA notebook
# So, let's remove them
cv_df = resume_df[~(resume_df['Skills'].isna() & resume_df['Education'].isna())].reset_index(drop=True)

# Filling the null values in Skills & Education with Empty String before concatinating them
cv_df = cv_df.fillna(value='')

# Let's stitch together Skills & Education, similar to given in job description.
cv_df['CV'] = cv_df['Skills'] + ' ' + cv_df['Education']

# Doing text cleaning
cv_df['CV'] = cv_df['CV'].apply(text_cleaning)

In [46]:
cv_df.shape

(1, 5)

In [47]:


# Sample resumes (replace with your extracted resume data)
resumes = cv_df['CV'].to_list()

In [48]:
%%time

# Initialize the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


# Tokenize and embed job descriptions
job_description_embeddings = []
for description in job_descriptions:
    tokens = tokenizer(description, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    job_description_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

# Tokenize and embed resumes
resume_embeddings = []
for resume in resumes:
    tokens = tokenizer(resume, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**tokens)
    embeddings = output.last_hidden_state.mean(dim=1).numpy()
    resume_embeddings.append(embeddings[0])  # Flatten the embeddings to 1D

CPU times: user 1.38 s, sys: 227 ms, total: 1.61 s
Wall time: 2.51 s


In [49]:
# Calculate cosine similarity between job descriptions and resumes
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.7740958]], dtype=float32)

In [50]:
# Rank candidates for each job description based on similarity scores
num_top_candidates = 5
top_candidates = []

for i, job_description in enumerate(cleaned_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

# Print the top candidates for each job description
for i, job_description in enumerate(cleaned_descriptions):
    print(f"Top candidates for JD {i+1}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f}")
        # Print additional information if needed
        # print(f"  Resume: {resumes[candidate_index]}")
    print()

Top candidates for JD 1
  Candidate 1 - Similarity Score: 0.7741

