<a href="https://colab.research.google.com/github/NandhiniAnne/nandhiniresumechatbot/blob/main/Resume_Chatbot_%26_Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import asyncio
import html
import json
import os

import aiohttp
import faiss
import fitz  # PyMuPDF
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# --- Load API Key from .env file ---
# load_dotenv() looks for a .env file in the project folder and loads its
# entries, after which the key is read like any other environment variable.
# GEMINI_API_KEY is None when the variable is not set.
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# --- Define JSON Schemas for Structured Output ---
def _field(kind, description=None):
    """Build one schema property of type ``kind`` with an optional description."""
    spec = {"type": kind}
    if description is not None:
        spec["description"] = description
    return spec


# Shape the AI must follow for a candidate summary: a name, a free-text
# summary, and a list of skill strings. All three keys are required.
SUMMARY_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "name": _field("STRING"),
        "summary": _field("STRING"),
        "top_skills": {"type": "ARRAY", "items": _field("STRING")},
    },
    "required": ["name", "summary", "top_skills"],
}

# Shape the AI must follow for a job match: the candidate's name, an integer
# score, and a short rationale. All three keys are required.
MATCH_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "candidate_name": _field("STRING"),
        "match_score": _field(
            "INTEGER",
            "A score from 0 to 100 representing how well the candidate matches the job description.",
        ),
        "rationale": _field(
            "STRING",
            "A brief, one or two sentence explanation for the score.",
        ),
    },
    "required": ["candidate_name", "match_score", "rationale"],
}


# --- 1. Data Loading and Preprocessing ---
def parse_resume_text(full_text, filename):
    """Split a resume's raw text into one chunk per section.

    A line starts a new section when, stripped and upper-cased, it equals one
    of the known section titles. Everything before the first such line (the
    header / contact details) forms its own leading chunk. Blank lines are
    dropped and every kept line is stripped.

    Args:
        full_text: The complete text of the resume.
        filename: Stored as the ``'source'`` of every chunk.

    Returns:
        A list of ``{'source': filename, 'content': text}`` dicts in document
        order, keeping only chunks whose content is longer than 20 characters.
    """
    section_titles = {
        'OBJECTIVE', 'EDUCATION', 'SKILLS', 'PROJECTS', 'CERTIFICATIONS',
        'EXTRACURRICULAR ACTIVITIES', 'ADDITIONAL INFORMATION', 'WORK EXPERIENCE',
        'EXPERIENCE', 'EMPLOYMENT HISTORY', 'PUBLICATIONS', 'SUMMARY', 'CONTACT',
    }

    # groups[0] collects the header lines; each heading opens a new group
    # that begins with the heading line itself.
    groups = [[]]
    for raw_line in full_text.split('\n'):
        text = raw_line.strip()
        if text.upper() in section_titles:
            groups.append([text])
        elif text:
            groups[-1].append(text)

    sections = (
        {'source': filename, 'content': "\n".join(group).strip()}
        for group in groups
        if group
    )
    # Very short chunks (e.g. a heading with almost no body) are discarded.
    return [section for section in sections if len(section['content']) > 20]

# --- 2. Semantic Embedding and Indexing ---
def build_semantic_index(chunks_with_source, model):
    """Embed every chunk's text and load the vectors into a FAISS index.

    Args:
        chunks_with_source: Dicts that each carry the text under ``'content'``.
        model: Object whose ``encode(list_of_texts)`` returns a 2-D array
            with one row per text.

    Returns:
        ``(index, texts)``: a ``faiss.IndexFlatL2`` holding the embeddings
        and the list of embedded texts in the same order, or ``(None, [])``
        when there is nothing to index.
    """
    if not chunks_with_source:
        return None, []

    passages = [entry['content'] for entry in chunks_with_source]
    vectors = model.encode(passages)

    # The index is sized from the embedding width; vectors are added as float32.
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(np.array(vectors).astype('float32'))
    return flat_index, passages

# --- 3. Semantic Search ---
def search(query, model, index, all_chunks_with_source, top_k=3):
    """Return the chunks whose embeddings are nearest to ``query``.

    Args:
        query: Free-text question to embed.
        model: Object whose ``encode(list_of_texts)`` returns one vector per text.
        index: FAISS-style index exposing ``ntotal`` and ``search(vectors, k)``,
            or None when nothing has been indexed yet.
        all_chunks_with_source: Chunk dicts in the same order in which their
            embeddings were added to ``index``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunk dicts in the order reported by the index; an
        empty list when the index is missing or empty, or ``top_k`` is not
        positive.
    """
    if index is None or index.ntotal == 0 or top_k <= 0:
        return []

    # Asking for more neighbours than there are vectors only produces padding.
    k = min(top_k, index.ntotal)
    query_vector = np.array(model.encode([query])).astype('float32')
    _distances, indices = index.search(query_vector, k)

    # FAISS fills missing result slots with -1. A negative value must be
    # rejected explicitly, otherwise Python's negative indexing would
    # silently return the last chunk as a bogus hit.
    chunk_count = len(all_chunks_with_source)
    return [
        all_chunks_with_source[int(i)]
        for i in indices[0]
        if 0 <= i < chunk_count
    ]

# --- 4. Gemini API Communication ---
async def call_gemini_api(system_prompt, user_prompt, json_schema=None):
    """Send one prompt to the Gemini ``generateContent`` endpoint.

    Args:
        system_prompt: Text sent as the request's ``systemInstruction``.
        user_prompt: Text sent as the single user content part.
        json_schema: Optional response schema. When given, the request asks
            for ``application/json`` output constrained by this schema.

    Returns:
        ``{"text": ...}`` with the generated text on success, otherwise
        ``{"error": ...}`` describing what went wrong. Network and API
        failures are reported through the returned dict, never raised.
    """
    if not GEMINI_API_KEY:
        return {"error": "API key not found. Please check your .env file and ensure it is in the same folder as the script."}

    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-05-20:generateContent"
    # The key travels in a header rather than the query string so it cannot
    # leak through URLs embedded in error messages or logs.
    headers = {'Content-Type': 'application/json', 'x-goog-api-key': GEMINI_API_KEY}

    generation_config = {"temperature": 0.2, "topP": 0.9, "maxOutputTokens": 1024}
    if json_schema:
        generation_config["responseMimeType"] = "application/json"
        generation_config["responseSchema"] = json_schema

    payload = {
        "contents": [{"parts": [{"text": user_prompt}]}],
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "generationConfig": generation_config
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(api_url, json=payload, headers=headers) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return {"error": f"API Error {response.status}: {error_text}"}
                result = await response.json()
    except asyncio.TimeoutError:
        # Checked before ClientError: a timeout is not necessarily a ClientError.
        return {"error": "Network Error: the request to the Gemini API timed out."}
    except aiohttp.ClientError as e:
        return {"error": f"Network Error: {e}"}
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        return {"error": f"API returned a malformed response: {e}"}

    if not isinstance(result, dict):
        return {"error": "AI returned an empty response. Reason: Unknown"}

    # "candidates" and "parts" may be missing *or* present but empty (for
    # example when the prompt is blocked), so never index into them blindly.
    candidates = result.get('candidates') or [{}]
    candidate = candidates[0] or {}
    parts = (candidate.get('content') or {}).get('parts') or []
    for part in parts:
        if isinstance(part, dict) and 'text' in part:
            return {"text": part['text']}

    reason = (
        candidate.get('finishReason')
        or (result.get('promptFeedback') or {}).get('blockReason')
        or 'Unknown'
    )
    return {"error": f"AI returned an empty response. Reason: {reason}"}

# --- Main Application Logic & Gradio UI ---
class RecruitingAssistant:
    """State and Gradio callbacks for the resume assistant.

    Attributes (all reset by ``analyze_and_index_resumes``):
        embedding_model: SentenceTransformer used to embed chunks and queries.
        all_chunks_with_source: Chunk dicts (``'source'``, ``'content'``) for
            every parsed resume, in index order.
        full_resumes: Maps a resume's file name to its full extracted text.
        summaries: Maps a file name to the parsed summary dict, or to
            ``{"error": ...}`` when summarising failed.
        faiss_index: Index over ``all_chunks_with_source``, or None before
            indexing.
    """

    # Inline style shared by every summary / match / error card.
    _CARD_STYLE = (
        "border: 1px solid #ddd; padding: 15px; margin-bottom: 15px; "
        "border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1);"
    )

    def __init__(self):
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.all_chunks_with_source = []
        self.full_resumes = {}
        self.summaries = {}
        self.faiss_index = None
        print("Model loaded.")

    @staticmethod
    def _parse_json_object(text):
        """Decode ``text`` as a JSON object; return None if it is not one."""
        try:
            data = json.loads(text)
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _coerce_score(value):
        """Turn a model-supplied score into an int clamped to 0..100 (0 if unusable)."""
        try:
            score = int(float(value))
        except (TypeError, ValueError, OverflowError):
            return 0
        return max(0, min(100, score))

    @staticmethod
    def _render_error_card(filename, message):
        """Render the HTML card shown when a resume could not be summarised."""
        return f"""
                <div style='{RecruitingAssistant._CARD_STYLE}'>
                    <h3 style='margin:0 0 10px 0;'>{html.escape(str(filename))}</h3>
                    <p style='color: red;'><strong>Error:</strong> {html.escape(str(message))}</p>
                </div>
                """

    @staticmethod
    def _render_summary_card(filename, summary_data):
        """Render the HTML card for one parsed candidate summary."""
        skills = summary_data.get('top_skills') or []
        if not isinstance(skills, (list, tuple)):
            # A bare string would otherwise be rendered one character per badge.
            skills = [skills]
        skills_html = "".join(
            "<span style='background-color: #e0e7ff; color: #4338ca; padding: 3px 8px; "
            f"border-radius: 12px; font-size: 0.9em; margin: 2px;'>{html.escape(str(skill))}</span>"
            for skill in skills
        )
        name = html.escape(str(summary_data.get('name', 'N/A')))
        summary = html.escape(str(summary_data.get('summary', 'No summary available.')))
        return f"""
                    <div style='{RecruitingAssistant._CARD_STYLE}'>
                        <h3 style='margin:0 0 10px 0;'>{name} <span style='font-size:0.8em; color:#666;'>({html.escape(str(filename))})</span></h3>
                        <p style='margin:0 0 10px 0;'>{summary}</p>
                        <div style='display: flex; flex-wrap: wrap; gap: 5px;'>{skills_html}</div>
                    </div>
                    """

    @staticmethod
    def _render_match_card(match):
        """Render the HTML card (name, score bar, rationale) for one job match."""
        # The score is interpolated into a CSS width, so it must be a clamped int.
        score = RecruitingAssistant._coerce_score(match.get('match_score', 0))
        color = "#dc2626" if score < 50 else ("#f59e0b" if score < 75 else "#16a34a")
        name = html.escape(str(match.get('candidate_name', 'N/A')))
        rationale = html.escape(str(match.get('rationale', 'No rationale provided.')))
        return f"""
            <div style='{RecruitingAssistant._CARD_STYLE}'>
                <div style='display: flex; justify-content: space-between; align-items: center;'>
                    <h3 style='margin:0;'>{name}</h3>
                    <div style='text-align: right;'>
                        <span style='font-size: 1.2em; font-weight: bold; color: {color};'>{score}/100</span>
                        <div style='width: 100px; background-color: #e5e7eb; border-radius: 5px; margin-top: 5px;'><div style='width: {score}%; height: 8px; background-color: {color}; border-radius: 5px;'></div></div>
                    </div>
                </div>
                <p style='margin-top: 10px; font-style: italic; color: #333;'><strong>Rationale:</strong> {rationale}</p>
            </div>
            """

    async def analyze_and_index_resumes(self, file_paths):
        """Extract, index and summarise the uploaded PDF resumes.

        Replaces any previously indexed resumes.

        Args:
            file_paths: Uploaded files, either path strings or objects whose
                ``.name`` attribute is the path.

        Returns:
            ``(status_text, summary_html_component, accordion_component)``
            for the three outputs wired to the index button.
        """
        if not file_paths:
            return "Please upload at least one resume.", gr.HTML(visible=False), gr.Accordion(visible=False)

        self.all_chunks_with_source = []
        self.full_resumes = {}
        self.summaries = {}
        # Drop the old index too, so a failed re-index cannot leave a stale
        # index paired with an empty chunk list.
        self.faiss_index = None

        skipped = 0
        for temp_file in file_paths:
            path = getattr(temp_file, "name", temp_file)
            filename = os.path.basename(path)
            try:
                doc = fitz.open(path)
                try:
                    full_text = "".join(page.get_text() for page in doc)
                finally:
                    doc.close()  # release the file handle even if extraction fails
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole batch.
                print(f"Error processing {filename}: {e}")
                skipped += 1
                continue
            self.full_resumes[filename] = full_text
            self.all_chunks_with_source.extend(parse_resume_text(full_text, filename))

        if not self.all_chunks_with_source:
            return "Could not parse any text.", gr.HTML(visible=False), gr.Accordion(visible=False)

        self.faiss_index, _ = build_semantic_index(self.all_chunks_with_source, self.embedding_model)

        # --- Generate Summaries ---
        system_prompt = "You are an expert resume analyzer. Extract the candidate's name, a brief summary, and their top 5 skills based on the provided text."
        summary_tasks = [
            call_gemini_api(
                system_prompt,
                f"Analyze the following resume text for '{filename}' and provide a summary.\n\n{text}",
                SUMMARY_SCHEMA,
            )
            for filename, text in self.full_resumes.items()
        ]
        results = await asyncio.gather(*summary_tasks)

        cards = []
        # gather() preserves task order, so results line up with the dict's keys.
        for filename, result in zip(self.full_resumes, results):
            if "error" in result:
                self.summaries[filename] = {"error": result["error"]}
                cards.append(self._render_error_card(filename, result["error"]))
                continue
            summary_data = self._parse_json_object(result.get('text'))
            if summary_data is None:
                message = "Invalid JSON response from AI."
                self.summaries[filename] = {"error": message}
                # Show the failure instead of silently omitting the resume.
                cards.append(self._render_error_card(filename, message))
                continue
            self.summaries[filename] = summary_data
            cards.append(self._render_summary_card(filename, summary_data))

        summary_html = "<div>" + "".join(cards) + "</div>"

        status = f"Indexed {len(self.full_resumes)} resumes."
        if skipped:
            status += f" Skipped {skipped} file(s) that could not be read."
        return status, gr.HTML(summary_html, visible=True), gr.Accordion(label="Candidate Summaries", open=True, visible=True)

    async def chat_query(self, question, history):
        """Answer a chat question from the most relevant resume chunks.

        Args:
            question: The user's message.
            history: Prior chat turns as passed by the chat UI; not used.

        Returns:
            The model's answer, or a plain-text message when nothing is
            indexed, nothing relevant is found, or the API call fails.
        """
        if self.faiss_index is None:
            return "Please upload and index resumes first."
        relevant_chunks = search(question, self.embedding_model, self.faiss_index, self.all_chunks_with_source)
        if not relevant_chunks:
            return "No relevant information found."

        context = "\n\n---\n\n".join([f"CONTEXT from resume '{c['source']}':\n{c['content']}" for c in relevant_chunks])
        user_prompt = f"{context}\n\nQUESTION: {question}"
        system_prompt = "You are an expert HR assistant. Answer questions strictly based on the provided resume context. If information is missing, state that clearly. For lists like skills, use bullet points."

        result = await call_gemini_api(system_prompt, user_prompt)
        return result.get('text', result.get('error', 'An unknown error occurred.'))

    async def match_job_description(self, job_description):
        """Score every indexed resume against a job description.

        Args:
            job_description: The job description text.

        Returns:
            ``(status_text, results_html_component)`` with one card per
            scored candidate, sorted by score from highest to lowest.
        """
        if not self.full_resumes:
            return "Please upload and analyze resumes first.", gr.HTML(visible=False)
        if not job_description or not job_description.strip():
            return "Please paste a job description first.", gr.HTML(visible=False)

        system_prompt = "You are an expert recruiter. Analyze the resume against the job description and return a match score and rationale in JSON format."
        match_tasks = [
            call_gemini_api(
                system_prompt,
                f"Job Description:\n{job_description}\n\n---\n\nCandidate Resume ('{filename}'):\n{text}",
                MATCH_SCHEMA,
            )
            for filename, text in self.full_resumes.items()
        ]
        results = await asyncio.gather(*match_tasks)

        matches = []
        failed = 0
        for result in results:
            match_data = None if "error" in result else self._parse_json_object(result.get('text'))
            if match_data is None:
                failed += 1
                continue
            matches.append(match_data)

        # Sort by score descending; coercion keeps a non-integer score from
        # breaking the comparison.
        matches.sort(key=lambda m: self._coerce_score(m.get('match_score', 0)), reverse=True)

        match_html = "<div>" + "".join(self._render_match_card(m) for m in matches) + "</div>"

        status = f"Matching complete for {len(matches)} candidates."
        if failed:
            status += f" {failed} resume(s) could not be scored."
        return status, gr.HTML(match_html, visible=True)


def create_ui():
    """Build the Gradio Blocks app and wire it to a new RecruitingAssistant.

    The layout is a two-column row: the left column holds the resume
    uploader, the index button, a status line and a (initially hidden)
    summaries accordion; the right column holds two tabs, a chat interface
    and a job-description matcher.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # One assistant instance backs every callback registered below.
    assistant = RecruitingAssistant()
    with gr.Blocks(theme=gr.themes.Soft(), title="AI Recruiting Assistant") as demo:
        gr.Markdown("# AI Recruiting Assistant")
        gr.Markdown("Upload resumes, get instant summaries, and match candidates to job descriptions.")

        with gr.Row():
            # Left column: upload, indexing and candidate summaries.
            with gr.Column(scale=1):
                file_uploader = gr.File(label="Upload Resumes (PDFs)", file_count="multiple", file_types=[".pdf"])
                index_button = gr.Button("Analyze & Index Resumes", variant="primary")
                status_output = gr.Markdown(value="*No resumes indexed yet.*")

                # Created hidden; the index callback returns an Accordion update that reveals it.
                with gr.Accordion("Candidate Summaries", open=False, visible=False) as summary_accordion:
                    summary_display = gr.HTML()

            # Right column: chat and job matching, one tab each.
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("Chat with Resumes"):
                        # The returned interface object is not referenced again in this function.
                        chat_interface = gr.ChatInterface(assistant.chat_query, chatbot=gr.Chatbot(height=550), textbox=gr.Textbox(placeholder="Ask about skills, experience, or compare candidates..."))
                    with gr.TabItem("Job Matcher"):
                        with gr.Column():
                            job_desc_box = gr.Textbox(lines=10, label="Job Description", placeholder="Paste the full job description here...")
                            match_button = gr.Button("Match to Job Description", variant="primary")
                            match_status = gr.Markdown("*Matching results will appear below.*")
                            match_results_display = gr.HTML(visible=False)

        # The callback's three return values map, in order, onto these three outputs.
        index_button.click(
            assistant.analyze_and_index_resumes,
            inputs=[file_uploader],
            outputs=[status_output, summary_display, summary_accordion]
        )

        # The callback's two return values map, in order, onto these two outputs.
        match_button.click(
            assistant.match_job_description,
            inputs=[job_desc_box],
            outputs=[match_status, match_results_display]
        )

    return demo

if __name__ == "__main__":
    # A missing key, or one shorter than 10 characters, triggers the setup
    # hint below. The app is still started either way.
    key_is_plausible = bool(GEMINI_API_KEY) and len(GEMINI_API_KEY) >= 10
    if not key_is_plausible:
        rule = "=" * 60
        warning_lines = (
            rule,
            "--- WARNING: GEMINI_API_KEY NOT FOUND OR INVALID ---",
            "Please create a file named '.env' in the same folder as this script.",
            "In that file, add one line: ",
            'GEMINI_API_KEY="YOUR_ACTUAL_API_KEY_HERE"',
            "You can get a key from Google AI Studio.",
            rule,
        )
        for message in warning_lines:
            print(message)

    app = create_ui()
    app.launch(share=True, debug=True)




Loading embedding model...
Model loaded.


  chat_interface = gr.ChatInterface(assistant.chat_query, chatbot=gr.Chatbot(height=550), textbox=gr.Textbox(placeholder="Ask about skills, experience, or compare candidates..."))


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://3cef7c38b133b1b5c6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3cef7c38b133b1b5c6.gradio.live
