In [1]:
import os
from PyPDF2 import PdfReader
import re

# Define a function to extract key details from a PDF file
def extract_details(pdf_file):
    # Open the PDF file
    with open(pdf_file, 'rb') as pdf:
        pdf_reader = PdfReader(pdf)
        
        # Initialize variables to store extracted details
        category = None
        skills = None
        education = None

        # Combine all text from the PDF
        all_text = ''.join(page.extract_text() for page in pdf_reader.pages)

        # Extract Category (Job Role)
        category_match = re.search(r'(.+?)\nExperience', all_text)
        if category_match:
            category = category_match.group(1).strip()

        # Extract Skills
        skills_match = re.search(r'Skills\n(.*?)\n', all_text)
        if skills_match:
            skills = skills_match.group(1).strip()

        # Extract Education (Degree and Institution)
        education_match = re.search(r'Education\n(.*?)\n', all_text)
        if education_match:
            education = education_match.group(1).strip()

        return {
            'Category': category,
            'Skills': skills,
            'Education': education
        }

# Specify the path to the main folder containing subfolders for different job categories
main_folder = r'C:\Users\Lenovo\Downloads\archive (1)\data\data'

# Initialize a dictionary to store resume details for each job category
job_details = {}

# Traverse through subdirectories
for job_category in os.listdir(main_folder):
    job_folder = os.path.join(main_folder, job_category)

    # Ensure that it's a directory
    if os.path.isdir(job_folder):
        # List all PDF files in the job category folder
        pdf_files = [os.path.join(job_folder, filename) for filename in os.listdir(job_folder) if filename.endswith('.pdf')]

        # Extract details from each PDF file and store them in a list
        resume_details = []
        for pdf_file in pdf_files:
            details = extract_details(pdf_file)
            resume_details.append(details)

        # Store the resume details for this job category in the dictionary
        job_details[job_category] = resume_details

# Print the extracted details for each job category
for job_category, details_list in job_details.items():
    print(f"Job Category: {job_category}")
    for i, details in enumerate(details_list):
        print(f"Resume {i + 1} Details:")
        print("Category:", details['Category'])
        print("Skills:", details['Skills'])
        print("Education:", details['Education'])
        print("\n")


Job Category: ACCOUNTANT
Resume 1 Details:
Category: deobligate over $5M in duplicate obligations.
Skills: None
Education: Northern Maine Community College


Resume 2 Details:
Category: Business Administration Finance
Skills: accounting, accounts payable, Accounts Receivable, ADP, advertising, AR, balance sheet, balance, bank reconciliations, benefits, billing, billings,
Education: Bachelor of Science


Resume 3 Details:
Category: Customer Service
Skills: accounts payables, accounts receivables, Accounts Payable, Accounts Receivable, administrative functions, trial balance, banking, budget, bi,
Education: Computer Applications Specialist Certificate Program


Resume 4 Details:
Category: SENIOR ACCOUNTANT
Skills: accounting, balance sheet, budgets, client, clients, derivatives, drafting, equity, financial, financial accounting, financial statements, fixed assets,
Education: EMORY UNIVERSITY, Goizueta Business School


Resume 5 Details:
Category: None
Skills: Aderant/CMS
Education: Bache

In [2]:
!pip install datasets
from datasets import load_dataset

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063




In [3]:
# Load the job descriptions dataset
job_descriptions_dataset = load_dataset('jacob-hugging-face/job-descriptions')


In [4]:
# Display 10-15 job descriptions
sample_job_descriptions = job_descriptions_dataset['train']['job_description'][:15]
for idx, job_description in enumerate(sample_job_descriptions, start=1):
    print(f"Job Description {idx}:\n{job_description}\n")

Job Description 1:
minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organiz

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, TFBertModel

In [6]:

# Sample job descriptions
job_descriptions = [
    "Looking for a biology teacher with experience in STEM education.",
    "Seeking a clinical lab manager with strong molecular diagnostics skills.",
    "Hiring a quality control technician with experience in TCA/E coordination.",
]

# Sample CV details (embeddings should be calculated)
cv_details = [
    "I am a biology teacher with a background in STEM education.",
    "I have extensive experience as a clinical lab manager in molecular diagnostics.",
    "I am a quality control technician with expertise in TCA/E coordination.",
    "I have skills in biology education and experience as a lab manager.",
    "Experienced quality control technician with a background in molecular diagnostics.",
]

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode job descriptions
job_desc_tokens = tokenizer(job_descriptions, padding=True, truncation=True, return_tensors="tf", max_length=128)
job_desc_embeddings = model(job_desc_tokens)['last_hidden_state'].numpy().mean(axis=1)


# Tokenize and encode CV details
cv_tokens = tokenizer(cv_details, padding=True, truncation=True, return_tensors="tf", max_length=128)
cv_embeddings = model(cv_tokens)['last_hidden_state'].numpy().mean(axis=1)


# Calculate cosine similarity between job descriptions and CVs
similarities = cosine_similarity(job_desc_embeddings, cv_embeddings)

# Rank CVs based on similarity for each job description
top_cv_indices = np.argsort(similarities, axis=1)[:, -5:][:, ::-1]


# Print the top 5 CVs for each job description
for i, job_desc in enumerate(job_descriptions):
    print(f"Top 5 CVs for Job Description {i+1} - {job_desc}\n")
    for j, cv_idx in enumerate(top_cv_indices[i]):
        print(f"CV {j+1} - Similarity: {similarities[i, cv_idx]}")
        print(cv_details[cv_idx])
        print("-" * 50)
    print("=" * 50)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Top 5 CVs for Job Description 1 - Looking for a biology teacher with experience in STEM education.

CV 1 - Similarity: 0.8201055526733398
I am a biology teacher with a background in STEM education.
--------------------------------------------------
CV 2 - Similarity: 0.7896248698234558
I have skills in biology education and experience as a lab manager.
--------------------------------------------------
CV 3 - Similarity: 0.7731002569198608
Experienced quality control technician with a background in molecular diagnostics.
--------------------------------------------------
CV 4 - Similarity: 0.7311567664146423
I have extensive experience as a clinical lab manager in molecular diagnostics.
--------------------------------------------------
CV 5 - Similarity: 0.6833124756813049
I am a quality control technician with expertise in TCA/E coordination.
--------------------------------------------------
Top 5 CVs for Job Description 2 - Seeking a clinical lab manager with strong molecular diagn

In [7]:
!pip install pymupdf



DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063





In [8]:
import os
import fitz  # PyMuPDF
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datasets import load_dataset

# Function to preprocess and tokenize text
def preprocess_text(text, tokenizer, max_length):
    inputs = tokenizer(text, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    return inputs

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_path):
    text = ''
    try:
        # Open the PDF file
        pdf_document = fitz.open(pdf_path)
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {str(e)}")
    
    return text

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

# Define the path to the 'data' folder containing CVs
data_folder = r'C:\Users\Lenovo\Downloads\archive (1)\data\data' # Replace with the actual path to your 'data' folder

# Load the job descriptions dataset
# Load the job descriptions dataset
dataset = load_dataset("jacob-hugging-face/job-descriptions")
job_descriptions = dataset["train"]["job_description"]


# Initialize a dictionary to store CV embeddings
cv_embeddings = {}

# Iterate through job folders and process CVs
for job_folder in os.listdir(data_folder):
    print(f"Processing job folder: {job_folder}")
    job_folder_path = os.path.join(data_folder, job_folder)
    cv_embeddings[job_folder] = []

    # Get PDF files for the current job
    pdf_files = [os.path.join(job_folder_path, file) for file in os.listdir(job_folder_path) if file.endswith(".pdf")]

    # Process each CV for the current job
    for pdf_file in pdf_files:
        # Extract text from CV PDF using PyMuPDF
        text = extract_text_from_pdf(pdf_file)

        # Tokenize and preprocess CV text
        cv_tokens = preprocess_text(text, tokenizer, max_length=128)

        # Calculate embeddings
        with torch.no_grad():
            output = model(**cv_tokens)
        embeddings = output.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
        cv_embeddings[job_folder].append(embeddings)

# Process job descriptions and calculate embeddings
job_description_embeddings = {}
for idx, description in enumerate(job_descriptions):
    print(f"Processing job description {idx + 1}/{len(job_descriptions)}")
    inputs = preprocess_text(description, tokenizer, max_length=128)
    with torch.no_grad():
        output = model(**inputs)
    embeddings = output.last_hidden_state.mean(dim=1)  # Mean pooling over tokens
    job_description_embeddings[idx] = embeddings

# Rank CVs for each job description based on cosine similarity
top_cv_matches = {}
for job_idx, job_description_embedding in job_description_embeddings.items():
    similarity_scores = []
    for cv_folder, cv_embeddings_list in cv_embeddings.items():
        for cv_embedding in cv_embeddings_list:
            similarity = cosine_similarity(job_description_embedding, cv_embedding)
            similarity_scores.append((cv_folder, similarity.item()))

    # Sort CVs by similarity score
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_cv_matches[job_descriptions[job_idx]] = similarity_scores[:5]

# Display top 5 CV matches for each job description
for job_description, cv_matches in top_cv_matches.items():
    print(f"\nTop 5 CV matches for Job Description:\n{job_description}")
    for cv_folder, similarity in cv_matches:
        print(f"Similarity Score: {similarity:.3f}")
        print(f"CV Folder: {cv_folder}")


Processing job folder: ACCOUNTANT
Processing job folder: ADVOCATE
Processing job folder: AGRICULTURE
Processing job folder: APPAREL
Processing job folder: ARTS
Processing job folder: AUTOMOBILE
Processing job folder: AVIATION
Processing job folder: BANKING
Processing job folder: BPO
Processing job folder: BUSINESS-DEVELOPMENT
Processing job folder: CHEF
Processing job folder: CONSTRUCTION
Processing job folder: CONSULTANT
Processing job folder: DESIGNER
Processing job folder: DIGITAL-MEDIA
Processing job folder: ENGINEERING
Processing job folder: FINANCE
Processing job folder: FITNESS
Processing job folder: HEALTHCARE
Processing job folder: HR
Processing job folder: INFORMATION-TECHNOLOGY
Processing job folder: PUBLIC-RELATIONS
Processing job folder: SALES
Processing job folder: TEACHER
Processing job description 1/853
Processing job description 2/853
Processing job description 3/853
Processing job description 4/853
Processing job description 5/853
Processing job description 6/853
Proc

Processing job description 216/853
Processing job description 217/853
Processing job description 218/853
Processing job description 219/853
Processing job description 220/853
Processing job description 221/853
Processing job description 222/853
Processing job description 223/853
Processing job description 224/853
Processing job description 225/853
Processing job description 226/853
Processing job description 227/853
Processing job description 228/853
Processing job description 229/853
Processing job description 230/853
Processing job description 231/853
Processing job description 232/853
Processing job description 233/853
Processing job description 234/853
Processing job description 235/853
Processing job description 236/853
Processing job description 237/853
Processing job description 238/853
Processing job description 239/853
Processing job description 240/853
Processing job description 241/853
Processing job description 242/853
Processing job description 243/853
Processing job descr

Processing job description 453/853
Processing job description 454/853
Processing job description 455/853
Processing job description 456/853
Processing job description 457/853
Processing job description 458/853
Processing job description 459/853
Processing job description 460/853
Processing job description 461/853
Processing job description 462/853
Processing job description 463/853
Processing job description 464/853
Processing job description 465/853
Processing job description 466/853
Processing job description 467/853
Processing job description 468/853
Processing job description 469/853
Processing job description 470/853
Processing job description 471/853
Processing job description 472/853
Processing job description 473/853
Processing job description 474/853
Processing job description 475/853
Processing job description 476/853
Processing job description 477/853
Processing job description 478/853
Processing job description 479/853
Processing job description 480/853
Processing job descr

Processing job description 688/853
Processing job description 689/853
Processing job description 690/853
Processing job description 691/853
Processing job description 692/853
Processing job description 693/853
Processing job description 694/853
Processing job description 695/853
Processing job description 696/853
Processing job description 697/853
Processing job description 698/853
Processing job description 699/853
Processing job description 700/853
Processing job description 701/853
Processing job description 702/853
Processing job description 703/853
Processing job description 704/853
Processing job description 705/853
Processing job description 706/853
Processing job description 707/853
Processing job description 708/853
Processing job description 709/853
Processing job description 710/853
Processing job description 711/853
Processing job description 712/853
Processing job description 713/853
Processing job description 714/853
Processing job description 715/853
Processing job descr

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

