<a href="https://colab.research.google.com/github/KarthikAlagarsamy/Resume-Semantic-Search/blob/main/Karthik_HealTether.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Resume Semantic Search with BERT Embeddings**

In [4]:
# Install necessary packages
!pip install gradio
!pip install transformers
!pip install PyPDF2
!pip install bs4
!pip install requests

import io
from urllib.parse import urlparse

import gradio as gr
import PyPDF2
import requests
import torch
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel



In [5]:
# Initialize the BERT model and tokenizer
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
model_name = "bert-base-uncased"

# Load the tokenizer first, then the encoder, both from the same checkpoint.
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

In [6]:
# Function to extract text from a PDF URL
def extract_text_from_pdf_url(pdf_url, timeout=30):
    """Download a PDF and return the concatenated text of all of its pages.

    Args:
        pdf_url: URL of the PDF document to download.
        timeout: Seconds to wait for the server; passed straight to ``requests.get``
            so a stalled server cannot block the caller forever.

    Returns:
        The extracted text (possibly an empty string), or ``None`` when the
        server answers with a status code other than 200.

    Raises:
        Whatever ``requests.get`` or ``PyPDF2.PdfReader`` raise (network errors,
        timeouts, malformed PDFs); these are not swallowed here.
    """
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch {pdf_url}")
        return None

    # Parse straight from memory instead of a fixed "pdf_file" on disk: a shared
    # file name lets simultaneous requests overwrite each other's download and
    # leaves a stray file behind in the working directory.
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))

    # Treat a page that yields no text as an empty string so the join never fails.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [7]:
# Function to fetch resume text from a web link (HTML)
def fetch_resume_from_url(url, timeout=30):
    """Download an HTML page and return its visible text.

    Args:
        url: Address of the web page holding the resume.
        timeout: Seconds to wait for the server; passed straight to ``requests.get``
            so a stalled server cannot block the caller forever.

    Returns:
        The text produced by BeautifulSoup's ``get_text()`` for the page, or
        ``None`` when the server answers with a status code other than 200.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts); these
        are not swallowed here.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()

In [8]:
# Function to generate embeddings using BERT
def generate_embeddings(text):
    """Encode ``text`` with the module-level BERT model and mean-pool the tokens.

    The text is tokenized with truncation at 512 tokens, run through the model
    without gradient tracking, and the last hidden state is averaged over
    dimension 1 before the leading dimension is squeezed away.

    Returns:
        A NumPy array holding the pooled embedding.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    return pooled.squeeze(0).numpy()

In [9]:
# Function to preprocess resume text into different sections
def preprocess_resume(text):
    """Split resume text into work-experience, education and skills sections.

    Each section starts at the earliest case-insensitive occurrence of any of
    its heading keywords and runs until the next detected section heading (or
    the end of the text). Sections are located independently, so a missing
    heading only leaves that one section empty, and the headings may appear in
    any order.

    Args:
        text: Full resume text. Anything that is not a non-empty string yields
            three empty sections.

    Returns:
        A dict with the keys ``'work_experience'``, ``'education'`` and
        ``'skills'``; the value is ``''`` for a section that was not found.
    """
    section_keywords = {
        'work_experience': ['work experience', 'professional experience', 'employment history'],
        'education': ['education', 'academic background', 'educational qualifications'],
        'skills': ['skills', 'technical skills', 'skills summary'],
    }
    sections = {name: '' for name in section_keywords}
    if not isinstance(text, str) or not text:
        return sections

    text_lower = text.lower()

    # Earliest hit per section; sections without any hit are simply left out
    # (calling min() on an empty sequence would raise ValueError).
    starts = {}
    for name, keywords in section_keywords.items():
        hits = [pos for pos in map(text_lower.find, keywords) if pos != -1]
        if hits:
            starts[name] = min(hits)

    # Walk the detected headings in document order so each section ends where
    # the next one begins, whatever order the resume uses.
    ordered = sorted(starts.items(), key=lambda item: item[1])
    for index, (name, start) in enumerate(ordered):
        end = ordered[index + 1][1] if index + 1 < len(ordered) else len(text)
        sections[name] = text[start:end]

    return sections


In [10]:
# Define the search function for Gradio using BERT embeddings
def _is_pdf_url(resume_url):
    """Return True when the URL's path ends in ``.pdf``, ignoring case, query string and fragment."""
    return urlparse(resume_url).path.lower().endswith('.pdf')


def _embed_resume(resume_url):
    """Fetch one resume and return its embedding, or ``None`` if no text could be obtained."""
    if _is_pdf_url(resume_url):
        resume_text = extract_text_from_pdf_url(resume_url)
    else:
        resume_text = fetch_resume_from_url(resume_url)
    if not resume_text:
        return None

    sections = preprocess_resume(resume_text)
    section_texts = [
        sections[key]
        for key in ('work_experience', 'education', 'skills')
        if sections[key].strip()
    ]
    # Embedding an empty section would only dilute the average; when no section
    # was detected at all, fall back to the whole resume text.
    if not section_texts:
        section_texts = [resume_text]

    vectors = [generate_embeddings(section) for section in section_texts]
    return sum(vectors) / len(vectors)


def search_resumes(queries, resume_urls):
    """Rank resumes against one or more queries by embedding cosine similarity.

    Args:
        queries: Newline-separated queries / job descriptions; blank lines are skipped.
        resume_urls: Newline-separated resume URLs (PDF or HTML); blank lines are skipped.

    Returns:
        A text report. Every (query, resume) pair is listed with its similarity,
        highest first, followed by the resumes that could not be fetched or
        processed. If anything unexpected goes wrong the report is a single
        ``"Error occurred: ..."`` line.
    """
    try:
        query_list = [line.strip() for line in queries.split('\n') if line.strip()]
        if not query_list:
            return ""
        url_list = [line.strip() for line in resume_urls.split('\n') if line.strip()]

        # Resume embeddings do not depend on the query, so download and embed
        # each resume once instead of once per query.
        embedded_resumes = []
        failed_urls = []
        for resume_url in url_list:
            try:
                resume_embedding = _embed_resume(resume_url)
            except Exception as exc:
                # One unreachable or malformed resume must not abort the search.
                print(f"Failed to process {resume_url}: {exc}")
                resume_embedding = None

            if resume_embedding is None:
                failed_urls.append(resume_url)
            else:
                embedded_resumes.append((resume_url, resume_embedding))

        results = []
        for query in query_list:
            query_embedding = generate_embeddings(query)
            for resume_url, resume_embedding in embedded_resumes:
                similarity = cosine_similarity([query_embedding], [resume_embedding])[0][0]
                results.append({
                    'query': query,
                    'resume_url': resume_url,
                    'similarity': similarity
                })

        # Only successful matches carry a similarity, so only they are sorted.
        results.sort(key=lambda result: result['similarity'], reverse=True)

        report = []
        for result in results:
            report.append(f"Query: {result['query']}\n")
            report.append(f"Resume URL: {result['resume_url']}\n")
            report.append(f"Similarity: {result['similarity']}\n\n")
        for resume_url in failed_urls:
            report.append(f"Resume URL: {resume_url}\n")
            report.append("Failed to fetch and process the resume.\n\n")
        top_resumes = "".join(report)
    except Exception as e:
        top_resumes = f"Error occurred: {str(e)}\n"

    return top_resumes

In [11]:
# Gradio Frontend interface for resume semantic search
# Two free-text inputs (the query and the list of resume URLs) and one text output.
query_box = gr.Textbox(label="Query", placeholder="Enter your job description/query here", lines=2)
urls_box = gr.Textbox(label="Resume URLs", placeholder="Enter resume URLs here", lines=5)
results_box = gr.Textbox(label="Top Resumes")

# Wire the search function to the widgets.
iface = gr.Interface(
    title="Resume Semantic Search with BERT Embeddings",
    fn=search_resumes,
    inputs=[query_box, urls_box],
    outputs=results_box,
)

# Start the web app.
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://920887ccb39c7d5b41.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




# **Sample resume URLs for testing in the Gradio interface**

https://www.ischool.berkeley.edu/system/files/resume/resume_juanjosecarin.pdf

https://yunlongjiao.github.io/resume/resume.pdf

# **Sample Job Description**

Hiring for AI developer with 5-6 years of experience in the e-commerce vertical