In [1]:
!pip install numpy pandas pdfplumber python-docx nltk scikit-learn



In [2]:
import pdfplumber
import docx
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Fetch the NLTK stop-word corpus, then keep the English entries in a set
# so that the membership checks in clean_text() are cheap.
nltk.download("stopwords")
stop_words = {word for word in stopwords.words("english")}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Function to extract text from PDF/DOCX
def extract_text(file_path):
    """Extract the plain text of a PDF or DOCX file.

    The file type is chosen from the file name's extension, compared
    case-insensitively so that names such as ``CV.PDF`` or ``cv.Docx`` are
    recognised too.

    Args:
        file_path: Location of the document, as a ``str`` or a path-like
            object such as ``pathlib.Path``.

    Returns:
        str: The text of every PDF page that yielded text, or of every DOCX
        paragraph, joined with newlines and stripped of leading/trailing
        whitespace. An empty string is returned when the extension is
        neither ``.pdf`` nor ``.docx``.
    """
    # Normalise to str once: it gives us .lower()/.endswith() for path-like
    # inputs and a plain string to hand to both reader libraries.
    path = str(file_path)
    lowered = path.lower()

    chunks = []
    if lowered.endswith(".pdf"):
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                extracted = page.extract_text()
                # Skip pages for which no text came back.
                if extracted:
                    chunks.append(extracted)
    elif lowered.endswith(".docx"):
        chunks.extend(para.text for para in docx.Document(path).paragraphs)

    return "\n".join(chunks).strip()

In [5]:
# Text normalisation helper
def clean_text(text, remove_stopwords=False):
    """Normalise raw text for vectorisation.

    Every non-word character (regex ``\\W``) is replaced by a single space
    and the result is lower-cased. Runs of spaces are left as they are
    unless stop-word removal is requested.

    Args:
        text: The string to normalise.
        remove_stopwords: When true, split on whitespace, drop every token
            found in the module-level ``stop_words`` set, and re-join the
            remaining tokens with single spaces.

    Returns:
        str: The normalised text.
    """
    normalized = re.sub(r'\W', ' ', text).lower()
    if not remove_stopwords:
        return normalized

    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [6]:
def calculate_similarity(job_desc, resume_text):
    """Score how closely one resume matches a job description.

    Both texts are passed through ``clean_text`` and vectorised together
    with a TF-IDF model over 1- and 2-grams (``stop_words="english"``); the
    score is the cosine similarity of the two resulting vectors.

    Args:
        job_desc: Job description text.
        resume_text: Resume text.

    Returns:
        The cosine similarity multiplied by 100, i.e. as a percentage.
    """
    documents = [clean_text(job_desc), clean_text(resume_text)]

    tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words="english").fit_transform(documents)

    # Row 0 is the job description, row 1 the resume.
    job_vector, resume_vector = tfidf[0:1], tfidf[1:]
    score = cosine_similarity(job_vector, resume_vector)[0, 0]

    return score * 100

In [7]:
# Function to rank multiple resumes
def rank_resumes(job_desc, resumes):
    """Rank resumes by TF-IDF cosine similarity to a job description.

    The job description and all resumes are passed through ``clean_text``
    and vectorised together with a TF-IDF model over 1- and 2-grams
    (``stop_words="english"``). Each resume is then scored by the cosine
    similarity between its vector and the job description's vector.

    Args:
        job_desc: Job description text.
        resumes: Iterable of resume texts. Any iterable is accepted,
            including one-shot iterators such as generators.

    Returns:
        list[tuple]: ``(resume, score)`` pairs sorted by descending score,
        where ``resume`` is the original (uncleaned) text and ``score`` is
        the similarity multiplied by 100. An empty list when ``resumes``
        is empty.
    """
    # The resumes are traversed twice below (cleaning, then pairing with the
    # scores); materialise them so a generator is not exhausted in between.
    resumes = list(resumes)
    if not resumes:
        # Nothing to compare against: return early instead of handing
        # cosine_similarity a matrix with zero rows.
        return []

    corpus = [clean_text(job_desc)] + [clean_text(resume) for resume in resumes]

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Row 0 is the job description; rows 1.. line up with ``resumes``.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    ranked_resumes = sorted(
        zip(resumes, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [(resume, score * 100) for resume, score in ranked_resumes]

In [8]:
# Demo inputs: one job description and two sample resumes to rank against it.
job_description = "Looking for a Python developer with experience in ML and NLP."

resume_texts = [
    "Python developer skilled in NLP and Machine Learning. Experience with TensorFlow and PyTorch.",
    "Experienced Java developer with expertise in Spring Boot and Microservices.",
]

# List of (resume, score) pairs, highest score first.
ranked_results = rank_resumes(job_desc=job_description, resumes=resume_texts)

In [9]:
# Show every resume with its rank and score, best match first; only the
# first 300 characters of each resume are printed.
for i, ranked_entry in enumerate(ranked_results):
    resume, score = ranked_entry
    print(f"Rank {i+1}: Score = {score:.2f}%\n{resume[:300]}\n")

Rank 1: Score = 23.62%
Python developer skilled in NLP and Machine Learning. Experience with TensorFlow and PyTorch.

Rank 2: Score = 3.37%
Experienced Java developer with expertise in Spring Boot and Microservices.

