


 # Install libraries



In [87]:
!pip install PyPDF2
!pip install pdfplumber
!pip install datasets



# Total Implementation

In [86]:
# importing libraries
import os
import numpy as np
import pandas as pd
import pdfplumber
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = []
        for page in pdf.pages:
            text.append(page.extract_text())
        return ''.join(text).replace("\n", "")

# Tokenize text using NLTK
def tokenize_text(text):
    text = text.lower()
    return word_tokenize(text)

# Path to the folder containing 500 plus PDF files
pdf_folder = "/content/drive/MyDrive/AIMLIntern/archive/data/ARTS"

# Load the job descriptions dataset
dataset = load_dataset("jacob-hugging-face/job-descriptions")

# Fetch job descriptions and company names
num_job_descriptions = 15
job_descriptions = dataset["train"]["model_response"][:num_job_descriptions]
company_names = dataset["train"]["company_name"][:num_job_descriptions]

# Initialize a dictionary to store match percentages for each company
company_match_percentages = {company_name: [] for company_name in company_names}

# Initialize a list to store PDF file names
pdf_files = []

# Loop through PDF files in the folder
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        CV_Clear = extract_text_from_pdf(pdf_path)
        CV_Clear_tokens = tokenize_text(CV_Clear)

        # Calculate cosine similarity with each job description
        for company_name, job_description in zip(company_names, job_descriptions):
            job_description_tokens = tokenize_text(job_description)
            Match_Test = [' '.join(CV_Clear_tokens), ' '.join(job_description_tokens)]  # Tokenized text
            cv = CountVectorizer()
            count_matrix = cv.fit_transform(Match_Test)
            similarity = cosine_similarity(count_matrix)[0][1] * 100
            company_match_percentages[company_name].append(similarity)

        # Store the PDF file name
        pdf_files.append(pdf_file)

# Initialize a dictionary to store company match percentages and file names
data = {
    'Pdf Number': pdf_files
}

# Add match percentages for each company
for company_name in company_names:
    data[company_name] = company_match_percentages[company_name]

# Create a DataFrame to store the results
result_df = pd.DataFrame(data)

# Add a new column 'Max Similarity' to store the maximum similarity for each row (PDF)
result_df['Max Similarity'] = result_df[company_names].max(axis=1)

# Sort the DataFrame by the maximum similarity in descending order
result_df = result_df.sort_values(by='Max Similarity', ascending=False)

# Display the top 5 rows
result_df.head(5)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Pdf Number,Google,Apple,Netflix,Robert Half,TrackFive,DesignUps,"Equisolve, Inc.",Zander Insurance Agency,Tuff,General Dynamics Information Technology,Sony Music Entertainment,Snapshot Interactive,Deloitte,Themesoft Inc,Western Governors University,Max Similarity
425,85417107.pdf,47.159385,45.288673,62.589512,47.353852,66.524989,25.462773,49.134534,57.719431,56.723195,49.620544,60.339628,59.89738,18.808328,48.106005,47.248289,66.524989
516,46260230.pdf,47.007837,47.930283,60.510046,48.458915,66.408619,27.014949,51.697496,60.916997,55.066937,50.931163,62.904356,60.67721,21.430401,51.068152,49.683064,66.408619
471,80162314.pdf,45.72987,50.379063,60.307217,46.012194,66.211771,26.814636,50.013074,59.403131,53.896926,50.619024,61.302933,58.632765,20.935624,47.990752,51.470695,66.211771
274,91635250.pdf,47.055854,43.843357,59.163207,46.208714,65.630477,26.019413,49.279313,58.135446,54.126571,48.532375,61.4998,57.949448,17.812696,49.972325,44.790595,65.630477
292,99244405.pdf,48.812511,50.529458,63.878417,50.811163,65.61277,26.788484,52.764039,59.093836,56.763843,52.021984,63.866656,61.522683,23.067872,49.839037,50.198133,65.61277
