<a href="https://colab.research.google.com/github/NandhiniAnne/nandhiniresumechatbot/blob/main/Resume_Chatbot_%26_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import asyncio
import json
import os
import re
import time
import urllib.error
import urllib.request

import faiss
import fitz  # PyMuPDF
import gradio as gr
from sentence_transformers import SentenceTransformer

# --- 1. Data Loading and Preprocessing ---
def load_and_parse_resume_by_line(pdf_path):
    """
    Load a PDF resume and split its text into sections keyed on known headings.

    The text of every page is extracted with PyMuPDF and scanned line by line.
    A line whose stripped, upper-cased text exactly equals one of the known
    section headings starts a new chunk; every other non-blank line is appended
    to the chunk currently being built. Lines that appear before the first
    heading form their own leading chunk.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of section strings (heading line first, lines joined by "\\n").
        Chunks of 20 characters or fewer are discarded. An empty list is
        returned if the PDF cannot be opened or read; the error is printed,
        not raised.
    """
    try:
        # The context manager closes the document even if text extraction fails.
        with fitz.open(pdf_path) as doc:
            # Join pages with a newline so the last line of one page can never
            # fuse with the first line (possibly a heading) of the next page.
            full_text = "\n".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
        return []

    headings = {
        'OBJECTIVE', 'EDUCATION', 'SKILLS', 'PROJECTS', 'CERTIFICATIONS',
        'EXTRACURRICULAR ACTIVITIES', 'ADDITIONAL INFORMATION', 'WORK EXPERIENCE',
        'EXPERIENCE', 'EMPLOYMENT HISTORY', 'PUBLICATIONS', 'SUMMARY'
    }
    chunks = []
    current_chunk_lines = []

    for line in full_text.split('\n'):
        cleaned_line = line.strip()
        if cleaned_line.upper() in headings:
            # A heading closes whatever chunk was in progress and opens a new one.
            if current_chunk_lines:
                chunks.append("\n".join(current_chunk_lines))
            current_chunk_lines = [cleaned_line]
        elif cleaned_line:
            current_chunk_lines.append(cleaned_line)

    # Flush the final section.
    if current_chunk_lines:
        chunks.append("\n".join(current_chunk_lines))

    # Drop fragments too short to be a meaningful section.
    return [chunk for chunk in chunks if len(chunk) > 20]

# --- 2. Semantic Embedding and Indexing ---
def build_semantic_index(text_chunks, model):
    """
    Embed the given text chunks and load them into a new FAISS IndexFlatL2.

    Args:
        text_chunks: Sequence of strings to embed, in the order they should be
            stored in the index.
        model: Object exposing ``encode(texts, convert_to_tensor=True)`` whose
            result has a 2-D ``shape`` and supports ``.cpu().numpy()``.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance; row ``i`` of the index
        corresponds to ``text_chunks[i]``.
    """
    chunk_vectors = model.encode(text_chunks, convert_to_tensor=True)
    # The second axis of the embedding matrix is the vector dimensionality.
    flat_index = faiss.IndexFlatL2(chunk_vectors.shape[1])
    # FAISS consumes NumPy arrays, so bring the tensor back to host memory first.
    flat_index.add(chunk_vectors.cpu().numpy())
    return flat_index

# --- 3. Semantic Search (top-k retrieval for context) ---
def search(query, model, index, text_chunks, top_k=3):
    """
    Return the text chunks whose embeddings are nearest to the query.

    Args:
        query: The user's question.
        model: Object exposing ``encode(list_of_str)``; used to embed the query.
        index: FAISS index whose rows line up with ``text_chunks``.
        text_chunks: The chunk strings that were indexed, in index order.
        top_k: Maximum number of chunks to return (default 3, to give the
            answering step richer context than a single hit).

    Returns:
        Up to ``top_k`` chunks, in the order the index returns them. Fewer are
        returned when fewer chunks exist; an empty list when there is nothing
        to search or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(text_chunks))
    if k <= 0:
        return []

    query_embedding = model.encode([query])
    _distances, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with -1; without this guard Python's
    # negative indexing would silently return the last chunk as a bogus hit.
    return [text_chunks[i] for i in indices[0] if 0 <= i < len(text_chunks)]

# --- 4. Question Answering (via the Gemini API) ---
def _extract_gemini_text(response_body):
    """Return the concatenated text parts of the first candidate in a generateContent reply, or None."""
    try:
        result = json.loads(response_body)
    except ValueError:
        return None
    if not isinstance(result, dict):
        return None

    candidates = result.get('candidates') or []
    if not candidates or not isinstance(candidates[0], dict):
        return None

    content = candidates[0].get('content')
    if not isinstance(content, dict):
        return None

    parts = content.get('parts') or []
    text = "".join(part.get('text') or "" for part in parts if isinstance(part, dict))
    return text or None


async def answer_question_with_gemini(question, context_chunks):
    """
    Answer a question about the resume with the Gemini 2.5 Flash model.

    The retrieved resume sections are joined into a single context block and
    sent, together with the question and an HR-assistant system instruction,
    to the Gemini ``generateContent`` REST endpoint. The blocking HTTP call is
    run in the event loop's default executor and back-off waits use
    ``asyncio.sleep`` so the loop is never blocked.

    The API key is read from the ``GEMINI_API_KEY`` environment variable
    (falling back to ``GOOGLE_API_KEY``).

    Args:
        question: The user's question.
        context_chunks: Resume sections to ground the answer in.

    Returns:
        The model's answer text, or a human-readable message when there is no
        context, no API key, the reply has no text, the API returns a non-429
        error status, or all retries are exhausted. This function does not
        raise for network/API failures.
    """
    if not context_chunks:
        return "I couldn't find any relevant information in the resume."

    # Read the key from the environment instead of hard-coding it in the notebook.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return ("Error: No Gemini API key is configured. "
                "Set the GEMINI_API_KEY environment variable and try again.")

    context = "\n\n---\n\n".join(context_chunks)

    # A more sophisticated prompt for the powerful LLM
    system_prompt = """
    You are an expert HR assistant. Your task is to answer questions about a candidate's resume accurately and concisely.
    - Base your answer STRICTLY on the provided resume context.
    - If the information is not present in the context, state that clearly. Do not make assumptions.
    - For lists (like skills or projects), present them clearly, preferably using bullet points.
    - Keep answers directly relevant to the user's question.
    """

    user_prompt = f"""
    CONTEXT:
    {context}

    QUESTION:
    {question}
    """

    # NOTE(review): this is a dated preview model ID -- confirm it is still served.
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-05-20:generateContent"

    payload = {
        "contents": [{"parts": [{"text": user_prompt}]}],
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "generationConfig": {
            "temperature": 0.2,
            "topP": 0.9,
            "maxOutputTokens": 500,
        }
    }
    request_body = json.dumps(payload).encode("utf-8")
    # The key travels in a header rather than the URL so it cannot leak via logged URLs.
    request_headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    }

    def _post_once():
        """Blocking POST; returns (status, body_text) for success and HTTP-error replies alike."""
        request = urllib.request.Request(
            api_url, data=request_body, headers=request_headers, method="POST"
        )
        try:
            with urllib.request.urlopen(request, timeout=30) as response:
                return response.status, response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as http_error:
            # urllib raises on non-2xx; convert back into a status/body pair.
            return http_error.code, http_error.read().decode("utf-8", errors="replace")

    loop = asyncio.get_running_loop()

    # API call with exponential backoff for robustness
    max_retries = 5
    delay = 1
    for attempt in range(max_retries):
        try:
            status, body_text = await loop.run_in_executor(None, _post_once)
        except Exception as e:
            # Best-effort: any transport-level failure (DNS, timeout, reset) is retried.
            print(f"Fetch Error (Attempt {attempt+1}/{max_retries}): {e}")
        else:
            if 200 <= status < 300:
                answer = _extract_gemini_text(body_text)
                if answer:
                    return answer
                return "Sorry, I received an unexpected response from the AI. Please try again."

            print(f"API Error (Attempt {attempt+1}/{max_retries}): {status} - {body_text}")
            if status != 429:  # Only rate-limit errors are worth retrying
                return f"Error: Could not get an answer from the AI. Status: {status}"

        # Back off before the next attempt; no point waiting after the last one.
        if attempt < max_retries - 1:
            await asyncio.sleep(delay)
            delay *= 2  # Exponential backoff

    return "Sorry, the AI service is currently unavailable after multiple retries. Please try again later."


# --- Main Application Logic (async query path) ---
class ResumeChatbot:
    """
    Question answering over a single resume PDF.

    Construction parses the resume into sections and indexes them; ``query``
    retrieves the sections closest to a question and asks Gemini to answer
    from them.

    Attributes:
        embedding_model: The 'all-MiniLM-L6-v2' SentenceTransformer.
        text_chunks: Resume sections from ``load_and_parse_resume_by_line``.
        faiss_index: Index over ``text_chunks`` from ``build_semantic_index``.
    """

    def __init__(self, resume_path):
        """
        Load the embedding model, parse the resume and build the search index.

        Args:
            resume_path: Path of the resume PDF.

        Raises:
            ValueError: If no text sections could be extracted from the PDF.
        """
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("Model loaded.")

        print("Processing resume with precision parser...")
        sections = load_and_parse_resume_by_line(resume_path)
        self.text_chunks = sections
        if len(sections) == 0:
            raise ValueError(f"Could not extract any text sections from {resume_path}")

        self.faiss_index = build_semantic_index(sections, self.embedding_model)
        section_count = len(sections)
        print(f"Resume indexed into {section_count} sections.")

    async def query(self, question):
        """
        Answer one question about the resume.

        Args:
            question: The user's question.

        Returns:
            The answer string produced by ``answer_question_with_gemini``.
        """
        print(f"\nReceived question: {question}")

        # Retrieval step: find the sections closest to the question.
        hits = search(question, self.embedding_model, self.faiss_index, self.text_chunks)
        print(f"Found {len(hits)} relevant section(s)...")

        # Generation step: let the LLM answer from those sections only.
        reply = await answer_question_with_gemini(question, hits)
        print(f"Generated answer: {reply}")
        return reply

# --- Gradio Web Interface (async event handlers) ---
def create_chatbot_interface(chatbot_instance):
    """
    Build the Gradio Blocks UI for chatting with a resume.

    Submitting the textbox first appends the question to the chat history and
    clears the textbox, then asks ``chatbot_instance`` for the answer and fills
    it into that same history entry. The button empties the chat.

    Args:
        chatbot_instance: Object exposing an awaitable ``query(question)``.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="Resume Chatbot") as demo:
        gr.Markdown(
            """
            # AI-Powered Resume Chatbot (Gemini Edition)
            Ask any question about the resume for an accurate, synthesized answer.
            """
        )
        # NOTE(review): history is kept as [user, bot] pairs; the saved cell
        # output shows a warning pointing at this gr.Chatbot(...) call --
        # presumably about that pair format. Confirm against the Gradio version in use.
        chat_history = gr.Chatbot(label="Chat History", height=500)
        question_box = gr.Textbox(label="Your Question", placeholder="e.g., Summarize their fraud detection project")
        clear_button = gr.Button("Clear Chat")

        async def user(user_message, history):
            # Record the question with an empty answer slot and blank the textbox.
            pending_turn = [user_message, None]
            return "", history + [pending_turn]

        async def bot(history):
            # Fill the answer slot of the most recent turn in place.
            latest_turn = history[-1]
            latest_turn[1] = await chatbot_instance.query(latest_turn[0])
            return history

        submit_event = question_box.submit(
            user, [question_box, chat_history], [question_box, chat_history], queue=False
        )
        submit_event.then(bot, chat_history, chat_history)
        clear_button.click(lambda: None, None, chat_history, queue=False)

    return demo

if __name__ == "__main__":
    resume_file_path = "Nandhini_Resume.pdf"

    # load_and_parse_resume_by_line() catches PDF-open errors and returns [],
    # so a missing file would never reach an `except FileNotFoundError` here;
    # check for the file explicitly so the user still gets the specific hint.
    if not os.path.isfile(resume_file_path):
        print(f"Error: The file '{resume_file_path}' was not found.")
        print("Please make sure your resume PDF is in the same directory as this script.")
    else:
        try:
            # NOTE: Gradio uses 'asyncio' under the hood, so we don't need to manage the event loop.
            chatbot_app = ResumeChatbot(resume_file_path)
            interface = create_chatbot_interface(chatbot_app)
            interface.launch(share=True)
        except Exception as e:
            # Covers an unparseable resume (ValueError) as well as model-loading
            # or launch failures; report instead of dumping a traceback.
            print(f"An error occurred: {e}")










Loading embedding model...
Model loaded.
Processing resume with precision parser...
Resume indexed into 8 sections.


  chatbot = gr.Chatbot(label="Chat History", height=500)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1f40c9e37784f19ad1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [7]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.4
