In [6]:
!pip install python-docx pypdf2 nltk scikit-learn

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: python-docx, pypdf2, nltk
Successfully installed nltk-3.9.1 pypdf2-3.0.1 python-docx-1.1.2


In [9]:
from functools import wraps
from docx import Document
from PyPDF2 import PdfReader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class RelvanceScore(object):
    def __init__(self) -> None:
        self.punctuations = string.punctuation + string.digits + '’' + '“' + '”'
        self.Lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))
        self.Vectorizer = TfidfVectorizer()

    @staticmethod
    def Exception(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except (Exception, FileNotFoundError, ValueError, NameError, TypeError) as e:
                return e
        return wrapper
    
    @Exception
    def document_parser(self, document) -> str:
        if document.endswith('.docx'):
            self.doc = Document(document)
            text: str = ''
            for para in self.doc.paragraphs:
                text += para.text
            return text
        elif document.endswith('.pdf'):
            self.PDF = PdfReader(document)
            text = ''
            for page in range(len(self.PDF.pages)):
                text += self.PDF.pages[page].extract_text()
            return text
        elif document.endswith('.txt'):
            with open(document, "r", encoding="utf-8") as file:
                text = file.read()
            return text
        else:
            return 'Invalid File Format'
    @Exception
    def data_preprocessing(self, document: str):
        for token in word_tokenize(document):
            if token not in self.stopwords and token not in self.punctuations and len(token) > 1:
                sub_tokens = re.split(r'[_/\\\-\|]', token)
                for sub_token in sub_tokens:
                    if sub_token not in self.stopwords and len(sub_token) > 1:
                        yield self.Lemmatizer.lemmatize(sub_token.lower())

    @Exception
    def get_score(self, RESUME: list, JD: list):
        # Combine tokens into strings
        CORPUS = [' '.join(RESUME), ' '.join(JD)]
        TFIDF_MATRIX = self.Vectorizer.fit_transform(CORPUS)
        SIMILARITY = cosine_similarity(TFIDF_MATRIX[0], TFIDF_MATRIX[1])
        return round(abs(SIMILARITY[0][0] * 100),2)

    def main(self) -> Exception:
        RESUME = "Resume.pdf"
        JD = "JD.txt"
        RESUME = self.document_parser(RESUME)
        JD = self.document_parser(JD)
        RESUME = list(self.data_preprocessing(RESUME))
        JD = list(self.data_preprocessing(JD))
        return self.get_score(RESUME, JD)

if __name__ == "__main__":
    obj = RelvanceScore()
    score = obj.main()
    print(score)

9.44


In [10]:
from functools import wraps
from docx import Document
from PyPDF2 import PdfReader
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

class RelevanceScore:
    def __init__(self) -> None:
        self.punctuations = string.punctuation + string.digits + '’' + '“' + '”'
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer()

    @staticmethod
    def exception_handler(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except (Exception, FileNotFoundError, ValueError, NameError, TypeError) as e:
                return str(e)  # Return error message as string
        return wrapper
    
    @exception_handler
    def document_parser(self, document_path) -> str:
        if document_path.endswith('.docx'):
            doc = Document(document_path)
            return '\n'.join([para.text for para in doc.paragraphs])
        elif document_path.endswith('.pdf'):
            pdf = PdfReader(document_path)
            return '\n'.join([page.extract_text() for page in pdf.pages if page.extract_text()])
        elif document_path.endswith('.txt'):
            with open(document_path, "r", encoding="utf-8") as file:
                return file.read()
        else:
            return 'Invalid File Format'
    
    @exception_handler
    def data_preprocessing(self, document: str):
        processed_tokens = []
        for token in word_tokenize(document):
            if token not in self.stopwords and token not in self.punctuations and len(token) > 1:
                sub_tokens = re.split(r'[_/\\\-\|]', token)
                for sub_token in sub_tokens:
                    if sub_token not in self.stopwords and len(sub_token) > 1:
                        processed_tokens.append(self.lemmatizer.lemmatize(sub_token.lower()))
        return processed_tokens

    @exception_handler
    def get_score(self, resume_tokens: list, jd_tokens: list):
        corpus = [' '.join(resume_tokens), ' '.join(jd_tokens)]
        tfidf_matrix = self.vectorizer.fit_transform(corpus)
        similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
        return round(abs(similarity[0][0] * 100), 2)

    def main(self, resume_path: str, jd_path: str):
        resume_text = self.document_parser(resume_path)
        jd_text = self.document_parser(jd_path)

        if isinstance(resume_text, str) and isinstance(jd_text, str):
            resume_tokens = self.data_preprocessing(resume_text)
            jd_tokens = self.data_preprocessing(jd_text)
            return self.get_score(resume_tokens, jd_tokens)
        return "Error in processing documents."

# File paths (update these paths based on your Jupyter Notebook environment)
resume_path = "Resume.pdf"  # Update with the actual resume file path
jd_path = "JD.txt"  # Update with the actual job description file path

# Run the Relevance Score Calculation
obj = RelevanceScore()
score = obj.main(resume_path, jd_path)
print(f"Resume Relevance Score: {score}%")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Resume Relevance Score: 9.44%
