In [5]:
# Install third-party packages for this notebook.
# NOTE(review): rank_bm25 and langchain_community are not imported by any visible cell,
# while sentence_transformers and nltk are imported below but not installed here —
# presumably they are preinstalled in the runtime; confirm and trim/extend this list.
!pip install faiss-cpu rank_bm25 langchain_community torch transformers pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Authenticate with the Hugging Face Hub; the CLI prompts for an access token interactively.
# NOTE(review): presumably needed to download the google/gemma-3-4b-it checkpoint loaded
# later in this notebook — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `rag_app` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-auth

In [7]:
import random
import re

import faiss
import nltk
import numpy as np
import pdfplumber
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [26]:
# Fetch the NLTK resources needed by nltk.word_tokenize and nltk.pos_tag, which
# extract_keywords calls.
# NOTE(review): both the plain and the *_tab / *_eng resource names are requested —
# presumably to cover different NLTK releases; confirm which ones are actually required.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [9]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file using pdfplumber.

    The text of each page is collected and the non-empty pages are joined
    with a newline, so the last word of one page never fuses with the first
    word of the next page.

    Args:
        pdf_path (str): Path to the CV PDF file.

    Returns:
        str: Extracted text, or an empty string if the file cannot be read
            or contains no extractable text (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Pages without extractable text yield a falsy value; normalise to "".
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        text = "\n".join(page_text for page_text in page_texts if page_text)
        if not text.strip():
            raise ValueError("No text extracted from PDF")
        return text
    except Exception as e:
        # Best-effort by design: callers treat "" as "no CV available".
        print(f"Error extracting text from PDF: {e}")
        return ""  # Fallback to empty string

In [10]:
def create_knowledge_base(cv_text, job_des):
    """
    Creates a knowledge base by splitting CV and job description into sentences.

    A sentence boundary is a period followed by any whitespace (space, tab or
    line break), so sentences that end at the end of a line are separated too.
    Chunks are only split and stripped, never rewritten, so every chunk is
    still a verbatim substring of the text it came from.

    Args:
        cv_text (str): Text extracted from the CV.
        job_des (str): Job description text.

    Returns:
        list: List of non-empty, stripped text chunks (CV chunks first,
            then job description chunks).
    """
    sentence_boundary = re.compile(r"\.\s+")
    pieces = sentence_boundary.split(cv_text) + sentence_boundary.split(job_des)
    return [piece.strip() for piece in pieces if piece.strip()]

In [11]:
def embed_knowledge_base(knowledge_base):
    """
    Encode every chunk of the knowledge base with a SentenceTransformer model.

    A new 'all-MiniLM-L6-v2' SentenceTransformer is instantiated on each call
    and returned so callers can embed queries with the same model.

    Args:
        knowledge_base (list): List of text chunks.

    Returns:
        tuple: (embeddings, knowledge_base, embedder) — the encoder output
            requested with convert_to_numpy=True, the unchanged input list,
            and the model instance.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = model.encode(knowledge_base, convert_to_numpy=True)
    return vectors, knowledge_base, model

In [12]:
def build_faiss_index(embeddings):
    """
    Create a flat L2 FAISS index and load all embeddings into it.

    Args:
        embeddings (np.ndarray): Embedded text chunks; the index dimension is
            taken from the second axis (``embeddings.shape[1]``).

    Returns:
        faiss.IndexFlatL2: Index containing every row of ``embeddings``.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

In [13]:
# Step 3: Retrieval Function
def retrieve_relevant_chunks(query, index, knowledge_base, embedder, k=2):
    """
    Retrieves the most relevant chunks based on the query.

    Args:
        query (str): Query text (initial or candidate response).
        index (faiss.IndexFlatL2): FAISS index.
        knowledge_base (list): List of text chunks, in the same order as the
            vectors stored in ``index``.
        embedder (SentenceTransformer): Embedding model.
        k (int): Maximum number of chunks to retrieve.

    Returns:
        list: Retrieved relevant chunks, in the order the index returned
            them. Fewer than ``k`` chunks are returned when the index holds
            fewer than ``k`` vectors; the list is empty when the knowledge
            base is empty or ``k`` is not positive.
    """
    if k <= 0 or not knowledge_base:
        return []

    query_embedding = embedder.encode([query], convert_to_numpy=True)
    _distances, indices = index.search(query_embedding, k)

    # FAISS pads the result with -1 when it has fewer than k neighbours.
    # Indexing a Python list with -1 would silently return the LAST chunk,
    # so out-of-range ids are dropped instead of being looked up.
    chunk_count = len(knowledge_base)
    return [knowledge_base[idx] for idx in indices[0] if 0 <= idx < chunk_count]

In [14]:
# Step 4: Load Gemma-3-4b-it Model with Pipeline
def load_gemma_model():
    """
    Build a text-generation pipeline around the "google/gemma-3-4b-it" checkpoint.

    The tokenizer is loaded first, then the causal-LM weights (bfloat16,
    ``device_map="auto"``); a progress message is printed after each step.
    The pipeline is configured for sampling with at most 60 new tokens and
    a temperature of 0.7.

    Returns:
        pipeline: Text generation pipeline.
    """
    checkpoint = "google/gemma-3-4b-it"

    gemma_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    print("Tokenizer loaded")

    gemma_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Model loaded with device_map='auto'")

    # Generation defaults applied to every call of the returned pipeline.
    generation_settings = {
        "max_new_tokens": 60,
        "temperature": 0.7,
        "do_sample": True,
    }
    return pipeline(
        "text-generation",
        model=gemma_model,
        tokenizer=gemma_tokenizer,
        **generation_settings,
    )

In [15]:
# Step 5: Dynamic Prompt and Question Generation
def generate_dynamic_prompt(cv_chunks, job_chunks, keyword, prev_question=None, prev_response=None):
    """
    Assemble the interviewer prompt from retrieved context and a focus keyword.

    The prompt always starts with the role instruction followed by the CV and
    job description chunks (one chunk per line). When both a previous question
    and a previous response are supplied, a follow-up instruction that quotes
    them is appended; otherwise an initial-question instruction is appended.

    Args:
        cv_chunks (list): CV-related retrieved chunks.
        job_chunks (list): Job description-related retrieved chunks.
        keyword (str): Focus keyword for the question.
        prev_question (str, optional): Previous question asked.
        prev_response (str, optional): Candidate's previous response.

    Returns:
        str: Generated prompt.
    """
    cv_section = "\n".join(cv_chunks)
    job_section = "\n".join(job_chunks)

    # Shared preamble: role instruction plus both context sections.
    parts = [
        "You are an AI interviewer generating questions based on a candidate’s CV and job description.\n",
        "CV Information:\n", cv_section, "\n",
        "Job Description Information:\n", job_section, "\n",
    ]

    is_follow_up = bool(prev_question and prev_response)
    if not is_follow_up:
        parts += [
            "Based on the CV and job description, generate an initial interview question ",
            "focusing on ", keyword, " to assess the candidate’s fit for the role.\n",
        ]
    else:
        parts += [
            "Previous Question: ", prev_question, "\n",
            "Candidate’s Response: ", prev_response, "\n",
            "Based on the CV, job description, and the candidate’s previous response, ",
            "generate a relevant follow-up question focusing on ", keyword,
            " to assess their fit for the role.\n",
        ]
    return "".join(parts)

In [16]:
def generate_question(prompt, llm_pipeline):
    """
    Ask the text-generation pipeline for a question and return its text.

    Only the newly generated text is requested (``return_full_text=False``).
    If the pipeline raises, or the stripped output is empty, a generic
    fallback question is returned instead; errors are printed, not raised.

    Args:
        prompt (str): Input prompt for the model.
        llm_pipeline (pipeline): Text generation pipeline.

    Returns:
        str: Generated question.
    """
    fallback_question = "Tell me about your experience."
    try:
        generations = llm_pipeline(prompt, return_full_text=False)
        question_text = generations[0]["generated_text"].strip()
        if question_text:
            return question_text
        return fallback_question
    except Exception as e:
        print(f"Error generating question: {e}")
        return fallback_question

In [17]:
# Helper Function for Keyword Extraction
def extract_keywords(text, top_n=10):
    """
    Extracts the top N keywords (nouns) from the text using NLTK.

    The text is lower-cased, tokenized and POS-tagged; tokens whose tag starts
    with 'NN' are counted and the most frequent ones are returned. Tokens that
    contain no letter or digit at all (quotes, bullets, brackets, ...) are
    discarded even when the tagger labels them as nouns, so punctuation can
    never be selected as an interview keyword.

    Args:
        text (str): Input text.
        top_n (int): Number of keywords to extract.

    Returns:
        list: List of top keywords, most frequent first.
    """
    words = nltk.word_tokenize(text.lower())
    tagged = nltk.pos_tag(words)
    nouns = [
        word
        for word, pos in tagged
        # Keep hyphenated/slashed terms ("scikit-learn", "a/b") but drop pure punctuation.
        if pos.startswith('NN') and any(ch.isalnum() for ch in word)
    ]
    freq_dist = nltk.FreqDist(nouns)
    return [word for word, _ in freq_dist.most_common(top_n)]

In [18]:
# Main Application Logic with Conversational Flow
def run_interview_bot(cv_path, job_des):
    """
    Runs the AI interviewer bot, asking questions based on CV and job description.

    Setup: extracts the CV text, extracts up to 20 keywords from the job
    description, builds and embeds the knowledge base, builds the FAISS index
    and loads the language model. If the CV text, the keyword list or the
    knowledge base is empty, a message is printed and the function returns
    before any model is loaded.

    Loop: each turn retrieves the chunks closest to the candidate's previous
    response (or to a generic query on the first turn), picks a focus keyword,
    generates a question, prints it and reads the candidate's answer from
    stdin. The loop ends when the answer is "exit" (case-insensitive,
    surrounding whitespace ignored).

    Args:
        cv_path (str): Path to the CV PDF file.
        job_des (str): Job description text.
    """
    # Extract CV text
    cv_text = extract_text_from_pdf(cv_path)
    if not cv_text:
        print("Cannot proceed without CV text.")
        return

    # Extract keywords from job description
    job_keywords = extract_keywords(job_des, top_n=20)
    if not job_keywords:
        print("No keywords extracted from job description.")
        return

    # Create and embed knowledge base
    knowledge_base = create_knowledge_base(cv_text, job_des)
    if not knowledge_base:
        print("Knowledge base is empty. Cannot proceed.")
        return

    embeddings, knowledge_base, embedder = embed_knowledge_base(knowledge_base)
    index = build_faiss_index(embeddings)
    llm_pipeline = load_gemma_model()

    # Initialize conversation variables
    prev_question = None
    prev_response = None

    # Welcome message
    print("Welcome to the AI Interviewer. I will ask you questions based on your CV and the job description.")
    print("Type 'exit' at any time to end the interview.")

    # Interview loop
    while True:
        # Set query based on whether it's the first question or a follow-up.
        # An empty previous response also falls back to the generic query.
        if prev_response:
            query = prev_response
        else:
            query = "Assess candidate’s fit for the role"

        # Retrieve relevant chunks
        retrieved_chunks = retrieve_relevant_chunks(query, index, knowledge_base, embedder)
        chunk_text = " ".join(retrieved_chunks).lower()

        # Select a keyword: prefer one present in retrieved chunks
        # (substring match against the lower-cased chunk text).
        present_keywords = [kw for kw in job_keywords if kw in chunk_text]
        if present_keywords:
            keyword = random.choice(present_keywords)
        else:
            keyword = random.choice(job_keywords)

        # Categorize retrieved chunks by the source text that contains them
        cv_chunks = [chunk for chunk in retrieved_chunks if chunk in cv_text]
        job_chunks = [chunk for chunk in retrieved_chunks if chunk in job_des]

        # Generate and ask question
        prompt = generate_dynamic_prompt(cv_chunks, job_chunks, keyword, prev_question, prev_response)
        question = generate_question(prompt, llm_pipeline)

        print(f"Generated Question: {question}")
        prev_response = input("Candidate Response (or type 'exit' to stop): ")

        # Check for exit condition; ignore stray whitespace so "exit " or
        # " Exit" ends the interview instead of being treated as an answer.
        if prev_response.strip().lower() == "exit":
            print("Thank you for participating. The interview has ended.")
            break

        prev_question = question

In [22]:
# Path of the CV PDF handed to run_interview_bot.
# NOTE(review): absolute, runtime-specific path — adjust when running elsewhere.
cv_path = "/content/NavnishPandey_cv (1).pdf"

# Job description text handed to run_interview_bot; it is used both for keyword
# extraction and as part of the retrieval knowledge base.
job_des = """
    Key Responsibilities
Collect, clean, and preprocess data from diverse sources.
Design, implement, and deploy machine learning models to solve real-world problems.
Analyze large datasets to discover trends, patterns, and insights.
Collaborate with cross-functional teams to define and solve business problems using data.
Communicate findings through dashboards, reports, and presentations.
Continuously monitor and improve the performance of deployed models.
Required Qualifications
Bachelor’s or Master’s degree in Data Science, Computer Science, Statistics, or a related field.
Strong programming skills in Python or R.
Hands-on experience with machine learning libraries (e.g., Scikit-learn, TensorFlow, PyTorch).
Proficiency in data manipulation using Pandas, NumPy, SQL.
Familiarity with data visualization tools (e.g., Matplotlib, Seaborn, Tableau, Power BI).
Experience in statistical modeling and A/B testing.
Preferred Qualifications
Experience working with cloud platforms (AWS, GCP, or Azure).
Knowledge of NLP, time series, or computer vision is a plus.
Exposure to MLOps tools and practices.
Experience with big data tools like Spark, Hadoop is a bonus.
"""



In [25]:
# Start the interactive interview: loads the models, then asks questions and reads
# answers from stdin until the candidate types 'exit'.
run_interview_bot(cv_path, job_des)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Tokenizer loaded


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device set to use cuda:0


Model loaded with device_map='auto'
Welcome to the AI Interviewer. I will ask you questions based on your CV and the job description.
Type 'exit' at any time to end the interview.
Generated Question: Question: “Based on your recent certificate program in Data Science, specifically the 'Statistics Essential for Analytics' certificate, can you describe a statistical concept you learned that you believe would be particularly valuable in this role, and how you might apply it to a real-world data analysis problem?”
Candidate Response (or type 'exit' to stop): During the 'Statistics Essentials for Analytics' course, one concept that stood out to me was hypothesis testing, particularly A/B testing, which is highly relevant to this role.  A/B testing is a statistical method used to compare two versions of a variable to determine which one performs better. I learned how to set up null and alternative hypotheses, calculate p-values, and interpret statistical significance — skills that directly s