<a href="https://colab.research.google.com/github/NandhiniAnne/nandhiniresumechatbot/blob/main/Resume_Chatbot_%26_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import fitz  # PyMuPDF
import json
import os
import asyncio
import aiohttp
# SECURITY: never commit a real API key to source control. The key is read from
# the GEMINI_API_KEY environment variable; when the variable is unset the
# placeholder below is used, which the "missing key" checks in this file
# compare against.
# NOTE(review): the key that used to be hard-coded on this line is exposed in
# the repository history and should be revoked.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "PASTE_YOUR_NEW_API_KEY_HERE")
# --- 1. Data Loading and Preprocessing ---
def load_and_parse_resume_by_line(pdf_path, filename):
    """Load one PDF resume and split its text into heading-delimited sections.

    The text of every page is concatenated and walked line by line. A line
    whose stripped, upper-cased form equals one of the known section headings
    starts a new chunk; text that appears before the first heading forms its
    own chunk. Blank lines are dropped.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        filename: Label stored under ``'source'`` in every returned chunk.

    Returns:
        A list of ``{'source': filename, 'content': text}`` dicts, one per
        section whose text is longer than 20 characters. An empty list is
        returned when the PDF cannot be read.
    """
    try:
        # The context manager closes the document even if extraction fails,
        # so the file handle is not leaked.
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text() for page in doc)
    except Exception as e:  # best-effort: one bad file must not abort a batch
        print(f"Error reading PDF {filename}: {e}")
        return []

    # A set gives O(1) membership tests for the per-line heading check.
    headings = frozenset((
        'OBJECTIVE', 'EDUCATION', 'SKILLS', 'PROJECTS', 'CERTIFICATIONS',
        'EXTRACURRICULAR ACTIVITIES', 'ADDITIONAL INFORMATION', 'WORK EXPERIENCE',
        'EXPERIENCE', 'EMPLOYMENT HISTORY', 'PUBLICATIONS', 'SUMMARY', 'CONTACT',
    ))
    chunks = []
    current_chunk_lines = []

    for line in full_text.split('\n'):
        cleaned_line = line.strip()
        if cleaned_line.upper() in headings:
            # A heading closes whatever was being collected and opens a new chunk.
            if current_chunk_lines:
                chunks.append("\n".join(current_chunk_lines).strip())
            current_chunk_lines = [cleaned_line]
        elif cleaned_line:
            current_chunk_lines.append(cleaned_line)

    # Flush the trailing section.
    if current_chunk_lines:
        chunks.append("\n".join(current_chunk_lines).strip())

    # Very short chunks (e.g. a heading with no body) are discarded.
    return [{'source': filename, 'content': chunk} for chunk in chunks if len(chunk) > 20]

# --- 2. Semantic Embedding and Indexing ---
def build_semantic_index(chunks_with_source, model):
    """Embed every chunk's text and store the vectors in a flat L2 FAISS index.

    Args:
        chunks_with_source: Chunk dicts; the text is read from each ``'content'`` key.
        model: Embedding model whose ``encode`` is called with
            ``convert_to_tensor=True``.

    Returns:
        The populated ``faiss.IndexFlatL2``, or ``None`` when there are no chunks.
    """
    if chunks_with_source:
        texts = [entry['content'] for entry in chunks_with_source]
        vectors = model.encode(texts, convert_to_tensor=True)
        # Index dimensionality is taken from the embedding width.
        dimension = vectors.shape[1]
        flat_index = faiss.IndexFlatL2(dimension)
        # FAISS consumes host-side numpy arrays, not tensors.
        flat_index.add(vectors.cpu().numpy())
        return flat_index
    return None

# --- 3. Semantic Search ---
def search(query, model, index, all_chunks_with_source, top_k=3):
    """Return the chunk dicts closest to ``query`` in the index.

    Args:
        query: Free-text query to embed.
        model: Embedding model; ``encode`` is called with a one-element list.
        index: Index object exposing ``search(embeddings, k)``, or ``None``.
        all_chunks_with_source: Chunk dicts in the same order they were indexed.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk dicts, best match first. Empty when there is no
        index or no chunks.
    """
    if index is None or not all_chunks_with_source:
        return []

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with -1, which Python indexing would silently map to the LAST chunk.
    k = min(top_k, len(all_chunks_with_source))
    if k <= 0:
        return []

    query_embedding = model.encode([query])
    _distances, indices = index.search(query_embedding, k)

    total = len(all_chunks_with_source)
    # Defensive filter in case the index still reports padding/out-of-range ids.
    return [all_chunks_with_source[i] for i in indices[0] if 0 <= i < total]

# --- 4. Question Answering with Gemini API ---
async def answer_question_with_gemini(question, context_chunks):
    """Answer a question about the resumes via the Gemini generateContent REST API.

    Args:
        question: The user's question text.
        context_chunks: Chunk dicts with ``'source'`` and ``'content'`` keys
            that are pasted into the prompt as context.

    Returns:
        The text of the first part of the first candidate, or a
        human-readable status/error message. HTTP and network failures are
        reported through the returned string rather than raised.
    """
    api_key = GEMINI_API_KEY
    if not api_key or api_key == "PASTE_YOUR_NEW_API_KEY_HERE":
        return "ERROR: Gemini API key is missing. Please paste your key at the top of the python script."

    if not context_chunks:
        return "No relevant information found across the uploaded resumes."

    context = "\n\n---\n\n".join([f"CONTEXT from resume '{c['source']}':\n{c['content']}" for c in context_chunks])
    system_prompt = "You are an expert HR assistant. Answer questions strictly based on the provided resume context. If information is missing, state that clearly. For lists like skills, use bullet points."
    user_prompt = f"{context}\n\nQUESTION: {question}"

    # The key travels in a header rather than the query string so it cannot
    # leak through URLs embedded in exception messages or logs.
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-05-20:generateContent"
    headers = {'Content-Type': 'application/json', 'x-goog-api-key': api_key}
    payload = {
        "contents": [{"parts": [{"text": user_prompt}]}],
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "generationConfig": {"temperature": 0.1, "topP": 0.9, "maxOutputTokens": 800}
    }

    max_retries = 3
    delay = 1
    # Rate limiting and transient server-side failures are worth retrying;
    # every other non-200 status is treated as a permanent error.
    retryable_statuses = {429, 500, 502, 503, 504}

    async with aiohttp.ClientSession() as session:
        for attempt in range(max_retries):
            will_retry = attempt < max_retries - 1
            try:
                async with session.post(api_url, json=payload, headers=headers) as response:
                    if response.status == 200:
                        result = await response.json()
                        # `or` fallbacks also cover keys that are present but
                        # hold an empty list / null, which would otherwise
                        # raise IndexError or AttributeError.
                        candidates = result.get('candidates') or [{}]
                        candidate = candidates[0] or {}
                        parts = (candidate.get('content') or {}).get('parts') or [{}]
                        text = (parts[0] or {}).get('text')
                        if text:
                            return text
                        return f"AI returned an empty response. Reason: {candidate.get('finishReason', 'Unknown Error')}"

                    if response.status not in retryable_statuses:
                        error_text = await response.text()
                        print(f"API Error: {response.status} - {error_text}")
                        return f"Error: API request failed with status: {response.status}. Check your API key and billing."

                    if will_retry:
                        if response.status == 429:
                            print(f"Rate limited. Retrying in {delay}s...")
                        else:
                            print(f"Server error {response.status}. Retrying in {delay}s...")
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                print(f"Network Error (Attempt {attempt+1}/{max_retries}): {e}")

            # Exponential backoff between attempts; no pointless wait after the last one.
            if will_retry:
                await asyncio.sleep(delay)
                delay *= 2

    return "Sorry, the AI service is currently unavailable. Please try again later."


# --- Main Application Logic & Gradio UI ---
class MultiResumeChatbot:
    """Holds the embedding model, the parsed resume chunks and their FAISS index."""

    def __init__(self):
        """Load the sentence-embedding model and start with an empty index."""
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Chunk dicts ({'source', 'content'}) in the order they were indexed.
        self.all_chunks_with_source = []
        # None until index_resumes() succeeds.
        self.faiss_index = None
        print("Model loaded.")

    def index_resumes(self, file_paths):
        """Parse the uploaded PDFs and rebuild the semantic index from scratch.

        Args:
            file_paths: Uploaded files, either path strings / path-like
                objects or objects exposing the path as ``.name``.

        Returns:
            A status message describing the outcome.
        """
        if not file_paths: return "Please upload at least one resume file."
        # Reset BOTH pieces of state together: leaving an old index behind
        # while the chunk list is empty would make later queries index into
        # an empty list.
        self.all_chunks_with_source = []
        self.faiss_index = None
        for uploaded in file_paths:
            if isinstance(uploaded, (str, os.PathLike)):
                path = os.fspath(uploaded)
            else:
                path = uploaded.name
            filename = os.path.basename(path)
            print(f"Parsing '{filename}'...")
            chunks = load_and_parse_resume_by_line(path, filename)
            self.all_chunks_with_source.extend(chunks)
        if not self.all_chunks_with_source:
             return "Could not parse any text from the provided files."
        print(f"Building index for {len(self.all_chunks_with_source)} chunks...")
        self.faiss_index = build_semantic_index(self.all_chunks_with_source, self.embedding_model)
        return f"Successfully indexed {len(file_paths)} resumes. Ready for questions."

    async def query(self, question):
        """Retrieve the chunks most relevant to ``question`` and ask Gemini.

        Returns:
            The answer text, or a prompt to index resumes first when no
            index exists yet.
        """
        if self.faiss_index is None:
            return "Please upload and index resumes first."
        relevant_chunks = search(question, self.embedding_model, self.faiss_index, self.all_chunks_with_source)
        return await answer_question_with_gemini(question, relevant_chunks)

def create_chatbot_interface():
    """Build the Gradio Blocks UI for uploading resumes and chatting about them.

    The left column holds the file uploader, the index button and a status
    line; the right column holds the chat history, the question box and a
    clear button. The question box starts disabled and is enabled once an
    index exists.

    Returns:
        The ``gr.Blocks`` app; call ``.launch()`` on it to serve the UI.
    """
    chatbot_instance = MultiResumeChatbot()
    with gr.Blocks(theme=gr.themes.Soft(), title="Multi-Resume Chatbot") as demo:
        gr.Markdown("# AI-Powered Multi-Resume Chatbot (Gemini Edition)")
        gr.Markdown("Upload one or more resumes, click 'Index Resumes', then ask your questions.")
        with gr.Row():
            with gr.Column(scale=1):
                file_uploader = gr.File(label="Upload Resumes", file_count="multiple", file_types=[".pdf"])
                index_button = gr.Button("Index Resumes", variant="primary")
                status_output = gr.Markdown(value="*No resumes indexed yet.*")
            with gr.Column(scale=2):
                # NOTE(review): history is handled as [user, bot] pairs; newer
                # Gradio releases warn about this format - confirm the
                # targeted Gradio version before migrating.
                chatbot = gr.Chatbot(label="Chat History", height=550)
                msg = gr.Textbox(label="Your Question", placeholder="e.g., Compare the skills of the candidates...", interactive=False)
                clear = gr.Button("Clear Chat")

        def handle_indexing(files):
            """Index the uploaded files and unlock the question box on success."""
            if not files:
                return "Please upload files first.", gr.Textbox(interactive=False)
            status = chatbot_instance.index_resumes(files)
            # Only unlock the question box when an index is actually available;
            # a failed parse should not invite questions.
            has_index = chatbot_instance.faiss_index is not None
            return status, gr.Textbox(interactive=has_index)

        async def user(user_message, history):
            """Append the question to the history and clear the input box."""
            history = history or []
            # Ignore blank submissions instead of sending an empty question.
            if not user_message or not user_message.strip():
                return "", history
            return "", history + [[user_message, None]]

        async def bot(history):
            """Fill in the answer for the last, still unanswered question."""
            # Nothing pending: empty chat, or the last turn already has a reply
            # (this happens when `user` ignored a blank submission).
            if not history or history[-1][1] is not None:
                return history
            user_message = history[-1][0]
            bot_message = await chatbot_instance.query(user_message)
            history[-1][1] = bot_message
            return history

        index_button.click(handle_indexing, inputs=[file_uploader], outputs=[status_output, msg])
        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, chatbot, chatbot)

        # Reset both the chat history and the question box.
        def clear_chat(): return None, ""
        clear.click(clear_chat, None, [chatbot, msg], queue=False)
    return demo

if __name__ == "__main__":
    # Warn loudly when the placeholder key has not been replaced.
    if GEMINI_API_KEY == "PASTE_YOUR_NEW_API_KEY_HERE":
        banner = "=" * 50
        print("\n".join((
            banner,
            "WARNING: You haven't added your API key to the script yet!",
            "Please edit the 'multi_resume_chatbot.py' file and paste your key.",
            banner,
        )))
    # Build and serve the UI; any failure is reported as a single line.
    try:
        interface = create_chatbot_interface()
        interface.launch(share=True, debug=True)
    except Exception as e:
        print(f"An error occurred: {e}")



Loading embedding model...
Model loaded.


  chatbot = gr.Chatbot(label="Chat History", height=550)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://57376791d522c50545.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Parsing 'Nandhini_Resume.pdf'...
Building index for 8 chunks...
Parsing 'Nandhini_Resume.pdf'...
Building index for 8 chunks...
Parsing 'Nandhini_Resume.pdf'...
Parsing 'Surya_Vardhan_Resume_new1.pdf'...
Building index for 16 chunks...
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://57376791d522c50545.gradio.live
