In [1]:
import torch

In [2]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next. A page whose extraction yields
    nothing (``None`` or an empty string) contributes an empty string instead
    of raising.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The page texts in page order, separated by a newline; an empty
        string when the PDF has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the ``with`` block, as in the original, so
        # the reader's underlying file is still open while pages are read.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    return '\n'.join(page_texts)


In [3]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and model. The checkpoint name lives in a
# single constant so the tokenizer and the model are always loaded from the
# same checkpoint.
BERT_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = BertModel.from_pretrained(BERT_MODEL_NAME)

def get_bert_embedding(text):
    """Embed ``text`` using the module-level BERT ``tokenizer`` and ``model``.

    The input is tokenized with truncation at ``max_length=512``, passed
    through the model with gradient tracking disabled, and the model's
    ``last_hidden_state`` is averaged over dimension 1 (presumably the token
    axis -- confirm against the model's output layout).

    Args:
        text: Input handed directly to ``tokenizer``.

    Returns:
        The mean of ``last_hidden_state`` over dim 1 with every size-1
        dimension squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze()

def preprocess_text(text):
    """Normalize text before embedding.

    Lowercases the text, then removes leading and trailing whitespace.
    Additional preprocessing steps can be added here.

    Args:
        text: The string to normalize.

    Returns:
        str: The lowercased, stripped text.
    """
    lowered = text.lower()
    return lowered.strip()


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(job_description, resume):
    """Score how similar a resume is to a job description.

    Each text is passed through ``preprocess_text`` and ``get_bert_embedding``
    (job description first, then resume), and the two embeddings are compared
    with ``cosine_similarity``.

    Args:
        job_description: Text of the job description.
        resume: Text of the resume.

    Returns:
        The single entry ``[0][0]`` of the 1x1 result returned by
        ``cosine_similarity``.
    """
    job_vector, resume_vector = (
        get_bert_embedding(preprocess_text(document))
        for document in (job_description, resume)
    )
    score_matrix = cosine_similarity([job_vector], [resume_vector])
    return score_matrix[0][0]


In [5]:
def rank_resumes(job_description, resume_paths):
    """Rank resume PDFs by their similarity to a job description.

    For each path, in the order given, the PDF text is extracted with
    ``extract_text_from_pdf`` and scored against ``job_description`` with
    ``calculate_similarity``.

    Args:
        job_description: Text of the job description.
        resume_paths: Iterable of paths to resume PDF files.

    Returns:
        list: ``(resume_path, similarity)`` tuples sorted by similarity,
        highest first. Ties keep their input order because the sort is stable.
    """
    scored = [
        (path, calculate_similarity(job_description, extract_text_from_pdf(path)))
        for path in resume_paths
    ]
    # Highest similarity first.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored

In [6]:
# Job description that every resume is scored against; it is passed to
# rank_resumes() in a later cell.
job_description = """Job Description:

We are seeking a talented and motivated Data Scientist to join our team. In this role, you will work on advanced machine learning models, leverage artificial intelligence techniques, and utilize Python to analyze complex data sets and deliver actionable insights.
Key Responsibilities:

Develop, implement, and optimize machine learning models to solve business challenges.
Use Python and related libraries (Numpy, Pandas, Scikit-learn, TensorFlow, etc.) for data analysis, model training, and prediction.
Collaborate with cross-functional teams to understand business requirements and translate them into data-driven solutions.
Perform data preprocessing, including cleaning, normalization, and feature engineering, to prepare data for modeling.
Conduct exploratory data analysis (EDA) to uncover trends, patterns, and relationships in data.
Design and execute experiments to validate model performance and improve accuracy.
Deploy and monitor machine learning models in production environments.
Stay current with the latest advancements in AI/ML and propose the integration of new technologies to enhance our capabilities.
Utilize AI tools like GitHub Copilot for efficient coding and model development.
Communicate findings and insights to stakeholders through clear and concise reports, dashboards, and visualizations."""

In [7]:
# Resume PDFs to rank; collected into resume_paths in the next cell.
# NOTE(review): these are absolute paths on a local D: drive, so the notebook
# only runs on a machine with this exact layout -- consider a configurable base
# directory. The `pd` prefix is also easy to confuse with the usual pandas alias.
pd1 = "D:\\self\\Resume\\JOY_BISWAS.pdf"
pd2 = "D:\\self\\Resume\\Software\\JoyBiswas_060.pdf"
pd3 = "D:\\self\\Resume\\Data Science\\JOY-BISWAS_BWU-BTA-21-060.pdf"

In [8]:
# Gather the resume paths defined in the previous cell.
resume_paths = [pd1,pd2,pd3]

# Score every resume against the job description; the result is a list of
# (resume_path, similarity) tuples sorted from highest to lowest similarity.
ranked_resumes = rank_resumes(job_description, resume_paths)

# Print the ranking, one line per resume, numbered from 1 with the score
# shown to four decimal places.
for rank, (resume_path, score) in enumerate(ranked_resumes, start=1):
    print(f"Rank {rank}: {resume_path} with similarity score {score:.4f}")


Rank 1: D:\self\Resume\JOY_BISWAS.pdf with similarity score 0.8398
Rank 2: D:\self\Resume\Software\JoyBiswas_060.pdf with similarity score 0.4341
Rank 3: D:\self\Resume\Data Science\JOY-BISWAS_BWU-BTA-21-060.pdf with similarity score 0.2820


In [9]:
# Save the tokenizer and model to the local directories 'bert_tokenizer' and
# 'bert_model' (relative to the current working directory).
# NOTE(review): no fine-tuning is visible in this notebook, so this presumably
# just writes a local copy of the stock pre-trained checkpoint -- confirm that
# is the intent.
tokenizer.save_pretrained('bert_tokenizer')
model.save_pretrained('bert_model')