In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import PyPDF2
import re
import numpy as np

In [2]:
# Load the pre-trained tokenizer published with the 'bert-base-uncased' checkpoint.
# encode_resume uses it to turn resume text into the token ids fed to the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Load the pre-trained BERT encoder with a sequence-classification head on top.
# num_labels must equal the number of entries in job_titles (3): with the library
# default of 2 output classes the third job title could never be predicted.
# NOTE: the classification head is not part of the 'bert-base-uncased' checkpoint
# (see the "not initialized from the model checkpoint" warning printed by this cell),
# so its weights are random and predictions are arbitrary until the model is fine-tuned.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Candidate job titles, plus one integer label per title (1, 2, 3 in list order).
# NOTE(review): classify_resume indexes job_titles with the model's predicted class id,
# and labels is not read by any visible cell - confirm whether it should be 0-based.
job_titles = [
    'Data Scientist',
    'Software Engineer',
    'Marketing Manager',
]
labels = list(range(1, len(job_titles) + 1))

In [5]:
# Function to run a resume text through BERT and return the classification logits
def encode_resume(resume_text, max_length=None):
    """Tokenize ``resume_text`` and return the model's raw classification logits.

    Args:
        resume_text: The resume as a single string.
        max_length: Maximum number of tokens (special tokens included) passed to
            the model. Defaults to the model's ``max_position_embeddings``; longer
            inputs are truncated, because feeding BERT more tokens than it has
            position embeddings for makes the forward pass fail.

    Returns:
        The first element of the model output (the logits) for a batch of size 1,
        computed without gradient tracking.
    """
    if max_length is None:
        max_length = model.config.max_position_embeddings
    token_ids = tokenizer.encode(
        resume_text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(input_ids)
    return outputs[0]

In [6]:
# Function to pick the job title the model scores highest for a resume
def classify_resume(resume_text):
    """Return the entry of ``job_titles`` chosen by the model for ``resume_text``.

    The logits returned by ``encode_resume`` are turned into probabilities with a
    softmax over dim 1; the index of the largest probability selects the title.
    """
    class_probs = torch.softmax(encode_resume(resume_text), dim=1)
    best_index = torch.argmax(class_probs, dim=1).item()
    return job_titles[best_index]

In [8]:
# Open the resume PDF in binary mode. A forward slash is used as the separator because
# it works on both Windows and POSIX; a backslash here would form the invalid escape
# sequence '\P' unless the literal were a raw string.
pdf = open('resume/PU.pdf', 'rb')  # resume input; closed explicitly with pdf.close() later

In [9]:
# Wrap the open file object in a PyPDF2 reader; later cells read its metadata and pages.
pdfReader = PyPDF2.PdfReader(pdf)

In [10]:
pdfReader.metadata # document information stored in the PDF (author, creator, producer, dates)

{'/Author': 'Prateek Kala',
 '/Creator': 'MicrosoftÂ® Word 2016',
 '/CreationDate': "D:20221205042833+00'00'",
 '/Producer': 'www.ilovepdf.com',
 '/ModDate': 'D:20221205042833Z'}

In [11]:
# Number of pages in the PDF; x also bounds the page-extraction loop further down.
x = len(pdfReader.pages)
print("Number of pages in the document:", x)

Number of pages in the document: 1


In [12]:
# Accumulator for the extracted resume text; starts as a single space and the
# text of each page is appended to it.
text = " "

In [13]:
# Extract the text of every page using the reader that was already created, so the
# PDF is parsed once instead of once per page.
if pdfReader.is_encrypted:
    pdfReader.decrypt('')  # try the empty password
# Rebuild text from scratch (leading space included) so that re-running this cell
# does not append the same pages a second time. `or ""` guards against a page for
# which extract_text() yields no text.
text = " " + "".join(page.extract_text() or "" for page in pdfReader.pages)

In [14]:
pdf.close() # release the file handle that was opened for the resume PDF

## Pre-Processing:

In [15]:
def clean_resume_text(raw_text):
    """Return ``raw_text`` reduced to ASCII letters separated by single spaces.

    Steps, in order: drop URLs, stand-alone ``RT`` / ``cc`` tokens, hashtags and
    ``@``-mentions; replace ASCII punctuation with spaces; delete digits; replace
    non-ASCII characters with spaces; collapse whitespace runs to one space.

    All patterns are raw strings so that ``\\S``, ``\\s`` and ``\\]`` reach the regex
    engine without relying on Python's handling of invalid escape sequences.
    """
    cleaned = re.sub(r'http\S+\s*', ' ', raw_text)  # remove URLs
    # Word boundaries keep ordinary words that merely contain "cc" or "RT"
    # (e.g. "accuracy", "success") intact.
    cleaned = re.sub(r'\b(?:RT|cc)\b', ' ', cleaned)  # remove RT and cc
    cleaned = re.sub(r'#\S+', '', cleaned)  # remove hashtags
    # Also strips everything from the "@" onwards in e-mail addresses.
    cleaned = re.sub(r'@\S+', '  ', cleaned)  # remove mentions
    cleaned = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleaned)  # remove punctuation
    cleaned = re.sub(r'[0-9]+', '', cleaned)  # remove digits
    cleaned = re.sub(r'[^\x00-\x7f]', ' ', cleaned)  # replace non-ASCII characters
    cleaned = re.sub(r'\s+', ' ', cleaned)  # remove extra whitespace
    return cleaned


text = clean_resume_text(text)

In [16]:
# Display the cleaned resume text (note: the output contains personal details from the resume).
text

' CAREER OBJECTIVE Training and Certifications Jobs and Responsibilities COMPUTER SKILLS ACADEMIC BACKGROUND PRASHANT UPADHYAY Phone Email prashantupadhyay Prashant is a career oriented professional with decent communication and interpersonal skills who knows how to make sense of data and translate it into actionable insights He is familiar with gathering cleaning and organizing data for the use of technical and non technical personnel He is seeking a challenging position in a growth oriented organization w here he can effectively contribute through his skills and abilities Certifications Institution Google May Certificate Google Data analytics professional certification Institution LinkedIn Jul Certificate Tableau Essential Training Zummit Infolabs Nov Present Jr Data Scientist Writing code in Python Tensorflow Keras projects related to CNN GAN and RNN for the Hospitality or Financial domain Used Jupyter Collab notebooks for Machine Learning Deep Learning problems Yoshops com Jun Aug 

In [17]:
# Example usage: classify the cleaned resume text produced by the cells above.
# Alternative hard-coded input for a quick check without a PDF:
# resume_text = "John Doe is a highly skilled data scientist with 5 years of experience in the field. He has a PhD in Computer Science and has worked on several data-driven projects. John has strong programming skills in Python and is proficient in using data visualization tools such as Matplotlib and Seaborn. He is also familiar with machine learning algorithms and has experience working with large datasets."
predicted_label = classify_resume(text)
# NOTE(review): the from_pretrained warning shows that some classifier weights were not
# loaded from the checkpoint, so the printed title is not a reliable prediction
# until the model has been fine-tuned.
print("Predicted job title:", predicted_label)  # e.g. "Predicted job title: Data Scientist" in the recorded run

Predicted job title: Data Scientist
