<a href="https://colab.research.google.com/github/Manish7512/Resume-analyser-and-job-profiler/blob/main/resumeanalyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain_google_genai langchain-community huggingface_hub PyPDF2 langchain-huggingface faiss-cpu gradio

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.6-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain_google_genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4

In [None]:
import os
import gradio as gr
from dotenv import load_dotenv
from PyPDF2 import PdfReader
# import tempfile
# import shutil

from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.memory import ConversationBufferMemory
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser

from google.colab import userdata

# Load environment variables
# load_dotenv()

class ResumeAnalyzer:
    def __init__(self):
        self.vectorstore = None
        self.conversation_chain = None
        self.memory = None
        self.processed_files = []

    def extract_pdf_text(self, pdf_files):
        """Extract text from uploaded PDF files"""
        if not pdf_files:
            return ""

        text = ""
        self.processed_files = []

        for pdf_file in pdf_files:
            try:
                # Handle file path (Gradio returns file paths as strings)
                pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
                self.processed_files.append(os.path.basename(pdf_path))

                pdf_reader = PdfReader(pdf_path)
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            except Exception as e:
                print(f"Error processing {pdf_path}: {str(e)}")
                continue

        return text

    def create_text_chunks(self, text):
        """Split text into chunks for processing"""
        if not text.strip():
            return []

        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )

        chunks = text_splitter.split_text(text)
        return chunks

    def create_vectorstore(self, text_chunks):
        """Create a FAISS vector store from text chunks.

        Args:
            text_chunks: List of strings to embed and index.

        Returns:
            The FAISS store built from the chunks, or ``None`` when there are
            no chunks or when embedding/indexing raised (the error is printed).
        """
        if not text_chunks:
            return None

        try:
            # Prefer the GPU, but fall back to the CPU so that a runtime without
            # CUDA still produces an index instead of failing outright.
            try:
                import torch
                device = "cuda" if torch.cuda.is_available() else "cpu"
            except ImportError:
                device = "cpu"

            embeddings = HuggingFaceEmbeddings(
                model_name="hkunlp/instructor-xl",
                model_kwargs={"device": device}
            )

            return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        except Exception as e:
            print(f"Error creating vectorstore: {str(e)}")
            return None

    def setup_conversation_chain(self, vectorstore):
        """Setup the conversation chain with Gemini LLM"""
        if not vectorstore:
            return None

        try:
            # Initialize Gemini
            llm = ChatGoogleGenerativeAI(
                model="gemini-2.5-flash",  # Updated model name
                temperature=0.7,
            )

            # Memory
            self.memory = ConversationBufferMemory(
                memory_key="chat_history",
                return_messages=True
            )

            # Prompt template
            prompt = ChatPromptTemplate.from_messages([
                ("system", """You are an AI assistant for a resume analyzer system.
You MUST ONLY answer questions related to resume analysis, job profiling, and candidate evaluation based on the uploaded resumes.

STRICT RULES:
1. ONLY respond to queries about:
   - Finding candidates for specific job roles
   - Analyzing skills and qualifications from resumes
   - Comparing candidates for positions
   - Extracting contact information from resumes
   - Summarizing candidate profiles
   - Job-related questions about the uploaded resumes

2. If asked about ANYTHING else (history, general knowledge, unrelated topics, etc.), respond with:
   "I can only help with resume analysis and job profiling based on the uploaded resumes. Please ask questions about finding candidates, analyzing skills, or job-related queries."

3. IMPORTANT: If the context shows "No relevant resume information found", it means no candidates in the database match the query. In this case, respond with:
   "❌ No candidates found matching your criteria. This could mean:
   • No resumes in the database match the specified skills/role
   • The job title or skills mentioned aren't present in the uploaded resumes
   • Try broadening your search criteria or using different keywords

   Consider rephrasing your query or checking if the relevant resumes were properly uploaded."

4. For valid resume-related queries with relevant context, provide:
   - Full name
   - Email address (if available)
   - LinkedIn profile link (if available)
   - Phone number (if available)
   - A concise summary of their qualifications and experience
   - Key skills that match the job requirements
   - Years of experience (if mentioned)

5. Present information in a clear, organized format. If contact information is not available, mention "Not provided" for those fields.

6. Never make up or hallucinate information about candidates. Only use information explicitly provided in the context.

Context from uploaded resumes: {context}"""),
                MessagesPlaceholder(variable_name="chat_history"),
                ("human", "{question}")
            ])

            # Helper functions
            def format_docs(docs):
                if not docs:
                    return "No relevant resume information found."

                # Check if documents have meaningful content
                meaningful_docs = []
                for doc in docs:
                    if doc.page_content and len(doc.page_content.strip()) > 10:
                        meaningful_docs.append(doc)

                if not meaningful_docs:
                    return "No relevant resume information found."

                return "\n\n".join(doc.page_content for doc in meaningful_docs)

            def get_chat_history(inputs):
                return self.memory.chat_memory.messages if self.memory else []

            def enhanced_retriever(query):
                """Enhanced retriever with similarity threshold checking"""
                try:
                    # Perform similarity search with scores
                    docs_with_scores = vectorstore.similarity_search_with_score(query, k=5)

                    # Filter documents based on similarity threshold
                    # Lower scores indicate higher similarity in FAISS
                    similarity_threshold = 1.5  # Adjust based on your needs

                    relevant_docs = []
                    for doc, score in docs_with_scores:
                        if score < similarity_threshold:  # Lower score = more similar
                            relevant_docs.append(doc)

                    # If no documents meet the threshold, return empty list
                    if not relevant_docs:
                        return []

                    return relevant_docs

                except Exception as e:
                    print(f"Retrieval error: {e}")
                    return []

            # Create the chain
            rag_chain = (
                {
                    "context": RunnableLambda(enhanced_retriever) | format_docs,
                    "question": RunnablePassthrough(),
                    "chat_history": RunnableLambda(get_chat_history)
                }
                | prompt
                | llm
                | StrOutputParser()
            )

            # Wrapper to handle memory
            def conversation_with_memory(question):
                try:
                    response = rag_chain.invoke(question)
                    # Save to memory
                    if self.memory:
                        self.memory.chat_memory.add_user_message(question)
                        self.memory.chat_memory.add_ai_message(response)
                    return response
                except Exception as e:
                    return f"Error processing query: {str(e)}"

            return conversation_with_memory

        except Exception as e:
            print(f"Error setting up conversation chain: {str(e)}")
            return None

    def process_resumes(self, pdf_files, progress=gr.Progress()):
        """Process uploaded resume PDFs"""
        if not pdf_files:
            return "❌ No files uploaded. Please upload PDF resumes.", ""

        try:
            progress(0.1, desc="Extracting text from PDFs...")

            # Extract text from PDFs
            raw_text = self.extract_pdf_text(pdf_files)

            if not raw_text.strip():
                return "❌ No text could be extracted from the uploaded PDFs.", ""

            progress(0.3, desc="Creating text chunks...")

            # Create text chunks
            text_chunks = self.create_text_chunks(raw_text)

            if not text_chunks:
                return "❌ Could not create text chunks from the extracted text.", ""

            progress(0.6, desc="Creating vector database...")

            # Create vector store
            self.vectorstore = self.create_vectorstore(text_chunks)

            if not self.vectorstore:
                return "❌ Failed to create vector database.", ""

            progress(0.8, desc="Setting up AI conversation chain...")

            # Setup conversation chain
            self.conversation_chain = self.setup_conversation_chain(self.vectorstore)

            if not self.conversation_chain:
                return "❌ Failed to setup AI conversation chain.", ""

            progress(1.0, desc="Processing complete!")

            success_msg = f"""✅ **Processing Complete!**

📄 **Files Processed:** {len(self.processed_files)}
📝 **Text Chunks Created:** {len(text_chunks)}
🔍 **Vector Database:** Ready
🤖 **AI System:** Initialized

**Processed Files:**
{chr(10).join(f"• {file}" for file in self.processed_files)}

You can now query for job profiles using the chat interface below."""

            return success_msg, ""

        except Exception as e:
            return f"❌ Error processing resumes: {str(e)}", ""

    def chat_with_system(self, message, history):
        """Handle chat interactions with context validation"""
        if not self.conversation_chain:
            return history + [(message, "❌ Please upload and process resumes first.")], ""

        if not message.strip():
            return history, ""

        # Check if the question is resume/job-related
        if not self._is_resume_related_query(message):
            response = "I can only help with resume analysis and job profiling based on the uploaded resumes. Please ask questions about finding candidates, analyzing skills, or job-related queries."
            history.append((message, response))
            return history, ""

        try:
            # Get response from the conversation chain
            response = self.conversation_chain(message)

            # Update chat history
            history.append((message, response))

            return history, ""

        except Exception as e:
            error_msg = f"Error: {str(e)}"
            history.append((message, error_msg))
            return history, ""

    def _is_resume_related_query(self, query):
        """Check if the query is related to resume analysis or job profiling"""
        query_lower = query.lower()

        # Keywords that indicate resume/job-related queries
        resume_keywords = [
            'candidate', 'candidates', 'resume', 'resumes', 'job', 'position', 'role',
            'skill', 'skills', 'experience', 'qualification', 'qualifications',
            'developer', 'engineer', 'manager', 'analyst', 'designer', 'consultant',
            'hire', 'hiring', 'recruit', 'recruitment', 'interview', 'profile',
            'background', 'expertise', 'competency', 'competencies', 'ability',
            'python', 'java', 'javascript', 'react', 'node', 'sql', 'database',
            'frontend','degree', 'certification', 'portfolio', 'github',
            'linkedin', 'contact', 'email', 'phone', 'name', 'find', 'search',
            'best', 'suitable', 'match', 'fit', 'senior', 'junior', 'entry level',
            'years of experience', 'cv', 'curriculum vitae'
        ]

        # Check if any resume-related keywords are present
        return any(keyword in query_lower for keyword in resume_keywords)

# Module-level analyzer instance. The Gradio event handlers registered in
# create_interface() are bound methods of this object, so everything it holds
# (vector store, chat chain, memory) lives in this single instance.
analyzer = ResumeAnalyzer()

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the resume analyzer.

    The interface has three tabs: upload/processing, chat, and an about page.
    Event handlers are bound to the module-level ``analyzer`` instance.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(
        title="Resume Analyzer & Job Profiler",
        theme=gr.themes.Soft(),
        css="""
        .header { text-align: center; margin-bottom: 20px; }
        .status-box { padding: 15px; border-radius: 10px; margin: 10px 0; }
        .upload-area { border: 2px dashed #ccc; padding: 20px; border-radius: 10px; }
        """
    ) as demo:

        gr.HTML("""
        <div class="header">
            <h1>🎯 Resume Analyzer & Job Profiler</h1>
            <p>Upload resumes and find the best candidates for any job profile using AI</p>
        </div>
        """)

        with gr.Tab("📤 Upload & Process Resumes"):
            gr.HTML("""
            <!-- <div style="background: #f0f8ff; padding: 15px; border-radius: 10px; margin-bottom: 20px;"> -->
            <div style="padding: 15px; border-radius: 10px; margin-bottom: 20px;">
                <h3>Step 1: Upload Resume PDFs</h3>
                <p>Upload multiple PDF resumes to build your candidate database. The system will extract text and create a searchable vector database.</p>
            </div>
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    file_upload = gr.File(
                        label="Upload Resume PDFs",
                        file_count="multiple",
                        file_types=[".pdf"],
                        interactive=True
                    )

                    process_btn = gr.Button(
                        "🚀 Process Resumes",
                        variant="primary",
                        size="lg"
                    )

                with gr.Column(scale=1):
                    gr.HTML("""
                    <!-- <div style="background: #fff3cd; padding: 15px; border-radius: 10px;"> -->
                    <div style="padding: 15px; border-radius: 10px;">
                        <h4>📋 Requirements</h4>
                        <ul>
                            <li>PDF format only</li>
                            <li>Text-based PDFs (not scanned images)</li>
                            <li>Multiple files supported</li>
                            <li>Processing may take a few minutes</li>
                        </ul>
                    </div>
                    """)

            # The status text returned by analyzer.process_resumes is Markdown
            # (bold markers, newline-separated lines), so it needs a Markdown
            # component; an HTML component would show the raw asterisks.
            status_output = gr.Markdown(label="Processing Status")

        with gr.Tab("💬 Query Candidates"):
            gr.HTML("""
            <!-- <div style="background: #f0fff0; padding: 15px; border-radius: 10px; margin-bottom: 20px;"> -->
            <div style="padding: 15px; border-radius: 10px; margin-bottom: 20px;">
                <h3>Step 2: Find the Best Candidates</h3>
                <p>Ask questions about job profiles to find the most suitable candidates from your uploaded resumes.</p>
            </div>
            """)

            chatbot = gr.Chatbot(
                label="AI Resume Analyzer",
                height=500,
                placeholder="Process resumes first, then start chatting..."
            )

            with gr.Row():
                msg_input = gr.Textbox(
                    label="Your Query",
                    placeholder="e.g., 'Who are the best candidates for a senior Python developer position?'",
                    lines=2,
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            gr.Examples(
                examples=[
                    "Who are the best candidates for a software engineer position?",
                    "Find candidates with React.js and Node.js experience",
                    "Show me candidates suitable for a data scientist role",
                    "Who has the most experience in machine learning?",
                    "Find candidates with project management experience",
                    "Show me candidates with both frontend and backend skills"
                ],
                inputs=msg_input,
                label="Example Queries"
            )

        with gr.Tab("ℹ️ About"):
            gr.HTML("""
            <div style="padding: 20px;">
                <h2>About Resume Analyzer & Job Profiler</h2>

                <h3>🔧 Technology Stack</h3>
                <ul>
                    <li><strong>LangChain:</strong> Framework for building AI applications</li>
                    <li><strong>FAISS:</strong> Vector database for similarity search</li>
                    <li><strong>Google Gemini AI:</strong> Advanced language model</li>
                    <li><strong>HuggingFace Embeddings:</strong> Text embedding generation</li>
                    <li><strong>Gradio:</strong> Web interface framework</li>
                </ul>

                <h3>📋 How It Works</h3>
                <ol>
                    <li><strong>Upload:</strong> Upload multiple PDF resumes</li>
                    <li><strong>Process:</strong> System extracts text and creates vector embeddings</li>
                    <li><strong>Query:</strong> Ask for candidates matching specific job profiles</li>
                    <li><strong>Results:</strong> AI analyzes and returns best matching candidates</li>
                </ol>

                <h3>🎯 Use Cases</h3>
                <ul>
                    <li>HR recruitment and candidate screening</li>
                    <li>Talent acquisition for specific roles</li>
                    <li>Resume database management</li>
                    <li>Quick candidate profiling</li>
                </ul>

                <h3>⚙️ Setup Requirements</h3>
                <p>Make sure you have the following API keys configured:</p>
                <ul>
                    <li><code>GOOGLE_API_KEY</code> - For Gemini AI</li>
                    <li><code>HUGGINGFACEHUB_API_TOKEN</code> - For embeddings</li>
                </ul>
            </div>
            """)

        # Event handlers
        # Processing writes the status text and sends "" to the query box.
        process_btn.click(
            fn=analyzer.process_resumes,
            inputs=[file_upload],
            outputs=[status_output, msg_input],
            show_progress=True
        )

        # Clicking "Send" and submitting the textbox both run the same chat
        # handler, which returns the new transcript and clears the query box.
        send_btn.click(
            fn=analyzer.chat_with_system,
            inputs=[msg_input, chatbot],
            outputs=[chatbot, msg_input]
        )

        msg_input.submit(
            fn=analyzer.chat_with_system,
            inputs=[msg_input, chatbot],
            outputs=[chatbot, msg_input]
        )

    return demo

# Launch the application
if __name__ == "__main__":
    # Copy the API keys from the Colab secret store into the process
    # environment. A missing or inaccessible secret must not abort start-up
    # here: it is reported by the missing-variable check below instead.
    required_vars = ["GOOGLE_API_KEY", "HUGGINGFACEHUB_API_TOKEN"]
    for var in required_vars:
        try:
            secret = userdata.get(var)
        except Exception as e:
            print(f"⚠️  Could not read Colab secret {var}: {e}")
            secret = None
        # os.environ only accepts strings, so never assign a missing value.
        if secret:
            os.environ[var] = secret

    # Check for required environment variables
    missing_vars = [var for var in required_vars if not os.getenv(var)]

    if missing_vars:
        print(f"⚠️  Missing environment variables: {', '.join(missing_vars)}")
        print("Please set these variables in your .env file or environment")

    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=True,
    )

  chatbot = gr.Chatbot(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://685210afc1359e0d54.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

  self.memory = ConversationBufferMemory(
