In [38]:
# Model 1 point = 1
import os
import pandas as pd
import PyPDF2 as pdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def extract_text_from_pdf(pdf_path):
    try:
        pdf_reader = pdf.PdfReader(pdf_path)
        return pdf_reader.pages[0].extract_text()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def calculate_cosine_similarity(given_text, pdf_text):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([given_text, pdf_text])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

def main(folder_path, given_text):
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

    # Initialize a dictionary to store cosine similarity scores
    similarity_scores = {}

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)
        similarity = calculate_cosine_similarity(given_text, pdf_text)
        similarity_scores[pdf_file] = similarity

    # Sort the scores in descending order
    sorted_scores = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    # Print the top 20 matches
    print("Top 20 Resumes:")
    for i, (resume, score) in enumerate(sorted_scores[:20], start=1):
        print(f"{i}. {resume} (Cosine Similarity: {score:.4f})")

if __name__ == "__main__":
    folder_path = "C:/Users/chakr/OneDrive/Desktop/resumes"
    given_text =  """As a Data Scientist at Zyient.io, you will play a pivotal role in driving data-driven solutions within our Application Team. Leveraging your expertise in AI and ML, you will work on developing and implementing advanced models and algorithms to extract valuable insights from data, enhance OCR capabilities, and revolutionize processes in the Insurance and Accounts Receivable sectors.


Responsibilities:

You’ll be working on AI platforms, which are a set of products responsible for building, training & deploying ML models for extraction & interpretation of semi & unstructured documents using NLP & Computer vision.
You’ll be working on ML model development as well as building automated training pipelines that deliver high-performance models on a given dataset.
You’ll have to work with application team to understand their data extraction requirements that can be solved using ML.
Educate SMEs on ensuring high-quality data annotations & help them to validate your ML models.
Take part in building APIs around your model for production & ensure that it is able to deliver expected accuracies & throughput requirements.


Skillset: 



Must have Minimum 2 (for DS-1) and 4 (for DS-2) years of Data Science Experience
Strong theoretical & practical knowledge of ML model development, hyper-parameter tuning & production deployment.
Strong experience in building models using libraries like Tensorflow, Pytorch
Good experience in writing code in Python (3.x)
Understanding of well-known architecture/algorithms in NLP like Transformers, LSTMs & GRUs
Experience in fine-tuning pre-trained models like BERT, and ELECTRA for downstream tasks such as NER, Classification, etc.
Understanding of Object detection using libraries like Yolo or TF object detection API
Knowledge of standard packaging & deployment solutions like Tensorflow Model Server, MLflow, or ONNX.
Practical knowledge of libraries like Numpy, Pandas & Spacy
Practical knowledge of building RESTful APIs around your model using Fast/Flask API
Strong Understanding on MLOPs - Life cycle of Machine Learning Models
"""
    main(folder_path, given_text)


Top 20 Resumes:
1. 2 Pager Resume.pdf (Cosine Similarity: 0.3455)
2. my resume.pdf (Cosine Similarity: 0.3142)
3. Manish Chaudhary CV.pdf (Cosine Similarity: 0.1966)
4. SouravResume  - 0-2 (2).pdf (Cosine Similarity: 0.1625)
5. SOUVICK GHOSH.pdf (Cosine Similarity: 0.1166)


In [39]:
# Model 2
import os
import pandas as pd
import PyPDF2 as pdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk

# Download NLTK stop words (if not already downloaded)
nltk.download("stopwords")

def extract_text_from_pdf(pdf_path):
    try:
        pdf_reader = pdf.PdfReader(pdf_path)
        return pdf_reader.pages[0].extract_text()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def calculate_cosine_similarity(given_text, pdf_text):
    stop_words = set(stopwords.words("english"))

    # Remove stop words from both texts
    given_text_cleaned = " ".join([word for word in given_text.lower().split() if word not in stop_words])
    pdf_text_cleaned = " ".join([word for word in pdf_text.lower().split() if word not in stop_words])

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([given_text_cleaned, pdf_text_cleaned])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

def main(folder_path, given_text):
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

    # Initialize a dictionary to store cosine similarity scores
    similarity_scores = {}

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)
        similarity = calculate_cosine_similarity(given_text, pdf_text)
        similarity_scores[pdf_file] = similarity

    # Sort the scores in descending order
    sorted_scores = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    # Print the top 20 matches
    print("Top 20 Resumes:")
    for i, (resume, score) in enumerate(sorted_scores[:20], start=1):
        print(f"{i}. {resume} (Cosine Similarity: {score:.4f})")

if __name__ == "__main__":
    folder_path = "C:/Users/chakr/OneDrive/Desktop/resumes"
    given_text = """As a Data Scientist at Zyient.io, you will play a pivotal role in driving data-driven solutions within our Application Team. Leveraging your expertise in AI and ML, you will work on developing and implementing advanced models and algorithms to extract valuable insights from data, enhance OCR capabilities, and revolutionize processes in the Insurance and Accounts Receivable sectors.


Responsibilities:

You’ll be working on AI platforms, which are a set of products responsible for building, training & deploying ML models for extraction & interpretation of semi & unstructured documents using NLP & Computer vision.
You’ll be working on ML model development as well as building automated training pipelines that deliver high-performance models on a given dataset.
You’ll have to work with application team to understand their data extraction requirements that can be solved using ML.
Educate SMEs on ensuring high-quality data annotations & help them to validate your ML models.
Take part in building APIs around your model for production & ensure that it is able to deliver expected accuracies & throughput requirements.


Skillset: 



Must have Minimum 2 (for DS-1) and 4 (for DS-2) years of Data Science Experience
Strong theoretical & practical knowledge of ML model development, hyper-parameter tuning & production deployment.
Strong experience in building models using libraries like Tensorflow, Pytorch
Good experience in writing code in Python (3.x)
Understanding of well-known architecture/algorithms in NLP like Transformers, LSTMs & GRUs
Experience in fine-tuning pre-trained models like BERT, and ELECTRA for downstream tasks such as NER, Classification, etc.
Understanding of Object detection using libraries like Yolo or TF object detection API
Knowledge of standard packaging & deployment solutions like Tensorflow Model Server, MLflow, or ONNX.
Practical knowledge of libraries like Numpy, Pandas & Spacy
Practical knowledge of building RESTful APIs around your model using Fast/Flask API
Strong Understanding on MLOPs - Life cycle of Machine Learning Models
"""
    main(folder_path, given_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chakr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 20 Resumes:
1. my resume.pdf (Cosine Similarity: 0.1583)
2. 2 Pager Resume.pdf (Cosine Similarity: 0.1286)
3. Manish Chaudhary CV.pdf (Cosine Similarity: 0.0442)
4. SouravResume  - 0-2 (2).pdf (Cosine Similarity: 0.0436)
5. SOUVICK GHOSH.pdf (Cosine Similarity: 0.0327)


In [40]:
# Model 3
import os
import pandas as pd
import PyPDF2 as pdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download NLTK stop words (if not already downloaded)
nltk.download("stopwords")

def extract_text_from_pdf(pdf_path):
    try:
        pdf_reader = pdf.PdfReader(pdf_path)
        return pdf_reader.pages[0].extract_text()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""

def preprocess_text(text):
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Remove stop words and perform stemming
    cleaned_tokens = [stemmer.stem(word) for word in text.lower().split() if word not in stop_words]
    return " ".join(cleaned_tokens)

def calculate_cosine_similarity(given_text, pdf_text):
    given_text_cleaned = preprocess_text(given_text)
    pdf_text_cleaned = preprocess_text(pdf_text)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([given_text_cleaned, pdf_text_cleaned])
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

def main(folder_path, given_text):
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

    similarity_scores = {}

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        pdf_text = extract_text_from_pdf(pdf_path)
        similarity = calculate_cosine_similarity(given_text, pdf_text)
        similarity_scores[pdf_file] = similarity

    sorted_scores = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

    print("Top 20 Resumes:")
    for i, (resume, score) in enumerate(sorted_scores[:20], start=1):
        print(f"{i}. {resume} (Cosine Similarity: {score:.4f})")

if __name__ == "__main__":
    folder_path = "C:/Users/chakr/OneDrive/Desktop/resumes"
    given_text = """
  As a Data Scientist at Zyient.io, you will play a pivotal role in driving data-driven solutions within our Application Team. Leveraging your expertise in AI and ML, you will work on developing and implementing advanced models and algorithms to extract valuable insights from data, enhance OCR capabilities, and revolutionize processes in the Insurance and Accounts Receivable sectors.


Responsibilities:

You’ll be working on AI platforms, which are a set of products responsible for building, training & deploying ML models for extraction & interpretation of semi & unstructured documents using NLP & Computer vision.
You’ll be working on ML model development as well as building automated training pipelines that deliver high-performance models on a given dataset.
You’ll have to work with application team to understand their data extraction requirements that can be solved using ML.
Educate SMEs on ensuring high-quality data annotations & help them to validate your ML models.
Take part in building APIs around your model for production & ensure that it is able to deliver expected accuracies & throughput requirements.


Skillset: 



Must have Minimum 2 (for DS-1) and 4 (for DS-2) years of Data Science Experience
Strong theoretical & practical knowledge of ML model development, hyper-parameter tuning & production deployment.
Strong experience in building models using libraries like Tensorflow, Pytorch
Good experience in writing code in Python (3.x)
Understanding of well-known architecture/algorithms in NLP like Transformers, LSTMs & GRUs
Experience in fine-tuning pre-trained models like BERT, and ELECTRA for downstream tasks such as NER, Classification, etc.
Understanding of Object detection using libraries like Yolo or TF object detection API
Knowledge of standard packaging & deployment solutions like Tensorflow Model Server, MLflow, or ONNX.
Practical knowledge of libraries like Numpy, Pandas & Spacy
Practical knowledge of building RESTful APIs around your model using Fast/Flask API
Strong Understanding on MLOPs - Life cycle of Machine Learning Models

"""
    main(folder_path, given_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chakr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 20 Resumes:
1. my resume.pdf (Cosine Similarity: 0.2061)
2. 2 Pager Resume.pdf (Cosine Similarity: 0.1711)
3. SouravResume  - 0-2 (2).pdf (Cosine Similarity: 0.0876)
4. Manish Chaudhary CV.pdf (Cosine Similarity: 0.0661)
5. SOUVICK GHOSH.pdf (Cosine Similarity: 0.0489)
