In [1]:
# Install required packages. `-q` keeps pip quiet and `-U` upgrades anything already installed.
# Restart the kernel afterwards if pip reports that updated packages require it.
%pip install -qU langchain langgraph langgraph-swarm langchain-google-genai google-generativeai langchain_community faiss-cpu tavily-python google-cloud-speech sounddevice scipy pdfminer.six python-dotenv langchain-openai numpy pandas pytesseract openpyxl langchain-ollama


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 1. Setup and Imports

Install necessary libraries and import required modules.


In [2]:
# Install required packages (uncomment if needed)
#%pip install -qU langchain langgraph langgraph-swarm langchain-google-genai langchain_community faiss-cpu tavily-python google-cloud-speech sounddevice scipy pdfminer.six python-dotenv langchain-openai

import os
import json
import re

import uuid
import numpy as np
import pandas as pd
import sounddevice as sd
import scipy.io.wavfile as wav
from typing import List, Dict, Any, Optional, TypedDict
import pytesseract

# Replace Ollama with Google Generative AI (Gemini)
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
# from langchain_ollama import ChatOllama # Removed unused import
# from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# Use pydantic.v1 for compatibility as suggested by the warning
from pydantic.v1 import BaseModel, Field 
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage, ToolMessage
from langchain_core.tools import tool

from langgraph.checkpoint.memory import InMemorySaver
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode, tools_condition, create_react_agent
from langgraph_swarm import create_handoff_tool, create_swarm
# Removed unused langgraph_swarm imports

from google.cloud import speech
from pdfminer.high_level import extract_text
from dotenv import load_dotenv

# Read API keys (Tavily, Google, ...) from a local .env file into the environment
load_dotenv()

# --- Configuration ---
# One FAISS index location per corpus, all under ./faiss
FAISS_RESUME_PATH = "./faiss/resume_embeddings"
FAISS_JD_PATH = "./faiss/jd_embeddings"
FAISS_RUBRIC_PATH = "./faiss/rubric_embeddings"
FAISS_KNOWLEDGE_PATH = "./faiss/knowledge_embeddings"

# Make sure the parent directory of every index exists before anything is saved
for _faiss_index_path in (FAISS_RESUME_PATH, FAISS_JD_PATH, FAISS_RUBRIC_PATH, FAISS_KNOWLEDGE_PATH):
    os.makedirs(os.path.dirname(_faiss_index_path), exist_ok=True)

# Key made available by load_dotenv(); None when GOOGLE_API_KEY is not set
google_api_key = os.environ.get("GOOGLE_API_KEY")

# Chat model (Gemini), with the API key passed explicitly.
# To list the models the key can use:
#   import google.generativeai as genai
#   genai.configure(api_key=google_api_key)
#   for m in genai.list_models():
#       print(m.name, m.supported_generation_methods)
gemini_model = ChatGoogleGenerativeAI(
    google_api_key=google_api_key,  # from .env
    model="gemini-2.0-flash",  # Must be valid in your project
    temperature=0.1,
    convert_system_message_to_human=True,
)
# Local alternatives that were tried previously:
#   gemini_model = ChatOllama(model="llama3.1:latest")
#   llm = ChatOllama(model="llama3", temperature=0.1)

# Embeddings model (Gemini), again with the API key passed explicitly
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=google_api_key,
    model="models/embedding-001",
)
# Local alternative: embeddings = OllamaEmbeddings(model="nomic-embed-text")

# In-memory checkpointer for graph state
checkpointer = InMemorySaver()

  from .autonotebook import tqdm as notebook_tqdm


Okay, I will update the interview-unlocked.ipynb notebook to implement the agentic workflow described in design.md using `langgraph` and `langgraph_swarm`.

Made changes.

OK. I have populated the interview-unlocked.ipynb notebook with the Python code to implement the multi-agent interview preparation system based on your design.md file and using LangGraph.

The notebook now contains the following sections:

1.  **Setup and Imports:** Installs necessary packages (commented out by default) and imports all required libraries and modules. It also sets up configurations like FAISS paths and initializes the LLM and embedding models.
2.  **Tool Definitions:** Implements all the tools required by the agents, such as text extraction, embedding generation, vector retrieval, web search, LeetCode retrieval (placeholder), audio recording/transcription, and evaluation helper tools (placeholders).
3.  **Agent Definitions:** Defines the `InterviewState` TypedDict to manage the data flow between agents. It includes the prompts for each agent (Preprocessing, Knowledge, Planner, Question, Evaluation) derived from design.md and sets up the agent nodes using LangGraph's `ToolNode` and a helper function.
4.  **Graph Definition:** Constructs the `StateGraph`, adds nodes for each agent and its tools, defines the sequential edges connecting them (Preprocessing -> Knowledge -> Planner -> Question -> Record Answer -> Evaluate -> END), and compiles the graph. It also includes code to optionally visualize the graph structure.
5.  **Execution and Interaction:** Provides an example of how to run the graph. You'll need to **modify the `resume_file` and `jd_file` variables** to point to your actual resume and job description files. It then runs the graph asynchronously, streams the events (like tool calls and LLM outputs), and finally prints the key outputs extracted from the final state (study plan, suggested LeetCode, generated question, transcribed answer, and evaluation feedback).

Please review the notebook, especially the placeholder file paths in the "Execution and Interaction" section, and ensure your environment (API keys, Ollama server if used, Google Cloud credentials) is set up correctly before running the cells.


# Interview Unlocked: Agentic Interview Preparation System

This notebook implements a multi-agent system using LangGraph and LangGraph Swarm to help users prepare for job interviews. It follows the design outlined in `design.md`.


In [3]:
# --- Tool Implementations ---

@tool
def extract_text_with_ocr(file_path: str) -> str:
    """
    Extracts text from a file. Uses pdfminer.six for PDF files and reads any
    other file as plain UTF-8 text. OCR for image-based PDFs is not implemented
    yet: if pdfminer.six fails, an error message is returned instead.

    Args:
        file_path: Path to the file to read (e.g. a resume or job description).

    Returns:
        The extracted text, or a human-readable error message. Failures are
        reported as strings rather than raised.
    """
    # Check up front so that a missing PDF is reported as "not found" instead of
    # being swallowed by the pdfminer handler below and reported as an OCR problem.
    if not os.path.exists(file_path):
        return f"Error: File not found at {file_path}"

    try:
        if not file_path.lower().endswith('.pdf'):
            # Handle non-PDF files as plain text
            print(f"Reading text file: {file_path}")
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()

        print(f"Extracting text from PDF: {file_path}")
        try:
            # Use pdfminer.six for direct text extraction
            text = extract_text(file_path)
        except FileNotFoundError:
            # File vanished between the check above and the read; let the
            # outer handler produce the standard "not found" message.
            raise
        except Exception as e_pdfminer:
            print(f"pdfminer.six failed: {e_pdfminer}. Falling back to OCR if possible.")
            try:
                pytesseract.get_tesseract_version()  # Check if Tesseract is available
            except Exception as e_ocr_check:
                return f"Error extracting text with pdfminer.six: {e_pdfminer}. Tesseract for OCR fallback not found: {e_ocr_check}"
            # A real OCR fallback would need pdf2image (convert_from_path) to render
            # pages and pytesseract.image_to_string on each page; not wired in yet.
            return f"Error extracting text with pdfminer.six: {e_pdfminer}. OCR fallback not fully implemented without pdf2image."

        print("PDF text extraction with pdfminer.six finished.")
        return text.strip()
    except FileNotFoundError:
        return f"Error: File not found at {file_path}"
    except Exception as e:
        return f"An unexpected error occurred while processing {file_path}: {e}"

@tool
def generate_resume_embeddings_and_save(text: str) -> str:
    """Generates embeddings for the Resume text and saves/updates the FAISS Resume index.

    Returns a status message; failures are reported as an error message instead of being raised.
    """
    index_path = FAISS_RESUME_PATH  # Use the specific path
    # Nothing useful to embed: report it instead of calling the embedding model.
    if not text or not text.strip():
        return "Error generating/saving Resume embeddings: input text is empty"
    try:
        texts = [text]  # FAISS expects a list
        if os.path.exists(index_path):
            # Index already on disk: load it and append the new document.
            vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
            vectorstore.add_texts(texts)
        else:
            # First document: build a fresh index.
            vectorstore = FAISS.from_texts(texts, embeddings)
        vectorstore.save_local(index_path)
        return f"Resume Embeddings generated and saved to {index_path}"
    except Exception as e:
        return f"Error generating/saving Resume embeddings: {e}"

@tool
def generate_jd_embeddings_and_save(text: str) -> str:
    """Generates embeddings for the Job Description text and saves/updates the FAISS JD index.

    Returns a status message; failures are reported as an error message instead of being raised.
    """
    index_path = FAISS_JD_PATH  # Use the specific path
    # Nothing useful to embed: report it instead of calling the embedding model.
    if not text or not text.strip():
        return "Error generating/saving JD embeddings: input text is empty"
    try:
        texts = [text]  # FAISS expects a list
        if os.path.exists(index_path):
            # Index already on disk: load it and append the new document.
            vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
            vectorstore.add_texts(texts)
        else:
            # First document: build a fresh index.
            vectorstore = FAISS.from_texts(texts, embeddings)
        vectorstore.save_local(index_path)
        return f"JD Embeddings generated and saved to {index_path}"
    except Exception as e:
        return f"Error generating/saving JD embeddings: {e}"

@tool
def generate_knowledge_embeddings_and_save(text: str) -> str:
    """Generates embeddings for the Knowledge text (e.g. web search results) and saves/updates the FAISS Knowledge index.

    Returns a status message; failures are reported as an error message instead of being raised.
    """
    index_path = FAISS_KNOWLEDGE_PATH  # Use the specific path
    # Nothing useful to embed: report it instead of calling the embedding model.
    if not text or not text.strip():
        return "Error generating/saving Knowledge embeddings: input text is empty"
    try:
        texts = [text]  # FAISS expects a list
        if os.path.exists(index_path):
            # Index already on disk: load it and append the new document.
            vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
            vectorstore.add_texts(texts)
        else:
            # First document: build a fresh index.
            vectorstore = FAISS.from_texts(texts, embeddings)
        vectorstore.save_local(index_path)
        return f"Knowledge Embeddings generated and saved to {index_path}"
    except Exception as e:
        return f"Error generating/saving Knowledge embeddings: {e}"


@tool
def generate_rubric_embeddings_and_save(text: str) -> str:
    """Generates embeddings for the Rubric text and saves/updates the FAISS Rubric index.

    Returns a status message; failures are reported as an error message instead of being raised.
    """
    index_path = FAISS_RUBRIC_PATH  # Use the specific path
    # Nothing useful to embed: report it instead of calling the embedding model.
    if not text or not text.strip():
        return "Error generating/saving Rubric embeddings: input text is empty"
    try:
        texts = [text]  # FAISS expects a list
        if os.path.exists(index_path):
            # Index already on disk: load it and append the new document.
            vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
            vectorstore.add_texts(texts)
        else:
            # First document: build a fresh index.
            vectorstore = FAISS.from_texts(texts, embeddings)
        vectorstore.save_local(index_path)
        return f"Rubric Embeddings generated and saved to {index_path}"
    except Exception as e:
        return f"Error generating/saving Rubric embeddings: {e}"

@tool
def retrieve_resume_embeddings_from_vector_db(query: str, k: int = 3) -> List[str]:
    """Retrieves relevant documents from the resume FAISS index."""
    # The docstring is kept as-is: the @tool decorator uses it as the tool description.
    # Returns the page contents of the top-k matches, or a one-item list holding a
    # status/error message when the index is missing or the lookup fails.
    try:
        if os.path.exists(FAISS_RESUME_PATH):
            store = FAISS.load_local(FAISS_RESUME_PATH, embeddings, allow_dangerous_deserialization=True)
            contents = []
            for hit in store.similarity_search(query, k=k):
                contents.append(hit.page_content)
            return contents
        return ["Resume vector index not found."]
    except Exception as err:
        return [f"Error retrieving from resume vector DB: {err}"]
    
@tool
def retrieve_jd_embeddings_from_vector_db(query: str, k: int = 3) -> List[str]:
    """Retrieves relevant documents from the JD FAISS index."""
    # The docstring is kept as-is: the @tool decorator uses it as the tool description.
    # Returns the page contents of the top-k matches, or a one-item list holding a
    # status/error message when the index is missing or the lookup fails.
    try:
        if os.path.exists(FAISS_JD_PATH):
            store = FAISS.load_local(FAISS_JD_PATH, embeddings, allow_dangerous_deserialization=True)
            contents = []
            for hit in store.similarity_search(query, k=k):
                contents.append(hit.page_content)
            return contents
        return ["JD vector index not found."]
    except Exception as err:
        return [f"Error retrieving from JD vector DB: {err}"]

@tool
def retrieve_knowledge_embeddings_from_vector_db(query: str, k: int = 3) -> List[str]:
    """Retrieves relevant documents from the Knowledge FAISS index."""
    # The docstring is kept as-is: the @tool decorator uses it as the tool description.
    # Returns the page contents of the top-k matches, or a one-item list holding a
    # status/error message when the index is missing or the lookup fails.
    try:
        if os.path.exists(FAISS_KNOWLEDGE_PATH):
            store = FAISS.load_local(FAISS_KNOWLEDGE_PATH, embeddings, allow_dangerous_deserialization=True)
            contents = []
            for hit in store.similarity_search(query, k=k):
                contents.append(hit.page_content)
            return contents
        return ["Knowledge vector index not found."]
    except Exception as err:
        return [f"Error retrieving from Knowledge vector DB: {err}"]

@tool
def retrieve_rubric_embeddings_from_vector_db(query: str, k: int = 3) -> List[str]:
    """Retrieves relevant documents from the Rubric FAISS index."""
    # The docstring is kept as-is: the @tool decorator uses it as the tool description.
    # Returns the page contents of the top-k matches, or a one-item list holding a
    # status/error message when the index is missing or the lookup fails.
    try:
        if os.path.exists(FAISS_RUBRIC_PATH):
            store = FAISS.load_local(FAISS_RUBRIC_PATH, embeddings, allow_dangerous_deserialization=True)
            contents = []
            for hit in store.similarity_search(query, k=k):
                contents.append(hit.page_content)
            return contents
        return ["Rubric vector index not found."]
    except Exception as err:
        return [f"Error retrieving from Rubric vector DB: {err}"]


# Re-read .env so this cell also works when run on its own
load_dotenv()

# The Tavily key must come from the environment; fail early with a clear message if it is absent
tavily_api_key = os.environ.get("TAVILY_API_KEY")
if not tavily_api_key:
    raise ValueError("TAVILY_API_KEY not found in environment variables. Please ensure it is set in your .env file.")

# LangChain's built-in Tavily search tool, configured with the key and a cap of 10 results
tavily_tool = TavilySearchResults(max_results=10, tavily_api_key=tavily_api_key)


# @tool
# def web_retrieval_tavily_search() -> str:
#     """Used for searching the web for relevant discussion threads about the company"""
#     # Load environment variables from .env file
#     load_dotenv()

#     # Retrieve the API key from environment variables
#     tavily_api_key = os.getenv("TAVILY_API_KEY")

#     # Check if the API key was loaded
#     if not tavily_api_key:
#         raise ValueError("TAVILY_API_KEY not found in environment variables. Please ensure it is set in your .env file.")

#     # Tavily Search Tool (already integrated in LangChain)
#     # Pass the API key during initialization
#     tavily_tool = TavilySearchResults(
#         tavily_api_key=tavily_api_key,
#         max_results=20
#     )

#     print(tavily_tool)
#     # results = tavily_tool.invoke("Amazon system design interview ")

@tool
def company_leetcode_problem_retriever(company: str, role_keywords: Optional[List[str]] = None) -> List[str]:
    """
    Retrieves suggested LeetCode questions for a specific company by reading
    from the './Leetcode-company-problem-set.xlsx' file. Each company's
    questions are expected to be in a sheet named after the company (case-insensitive).
    Questions are assumed to be listed in the first column (A) starting from the first row (A1).
    The role_keywords parameter is currently unused but available for future filtering.
    """
    excel_path = './Leetcode-company-problem-set.xlsx'
    default_questions = ["Reverse Linked List", "Valid Parentheses", "Coin Change"]  # Default if company not found

    print(f"Fetching LeetCode questions for {company} from {excel_path}...")

    try:
        # Check if file exists first
        if not os.path.exists(excel_path):
            print(f"Error: Excel file not found at {excel_path}. Returning default questions.")
            return default_questions

        # Ignore case and surrounding whitespace when matching the sheet name.
        wanted_sheet = company.strip().lower()

        # Open the workbook once and close it deterministically; the same handle
        # serves both the sheet-name lookup and the sheet read.
        with pd.ExcelFile(excel_path) as workbook:
            target_sheet = next(
                (name for name in workbook.sheet_names if str(name).strip().lower() == wanted_sheet),
                None,
            )
            if target_sheet is None:
                print(f"No specific sheet found for '{company}'. Returning default questions.")
                return default_questions
            # No header row: questions start at A1.
            df = workbook.parse(target_sheet, header=None)

        if df.empty:
            print(f"Sheet '{target_sheet}' for {company} found but is empty or has no columns.")
            return default_questions

        # Questions are in the first column (index 0); blank cells are skipped.
        questions = df.iloc[:, 0].dropna().astype(str).tolist()
        if not questions:
            print(f"Sheet '{target_sheet}' for {company} found, but the first column is empty or contains only NaN values.")
            return default_questions

        print(f"Found {len(questions)} questions for {company} in sheet '{target_sheet}'.")
        return questions

    except FileNotFoundError:  # Should be caught by os.path.exists, but kept for robustness
        print(f"Error: Excel file not found at {excel_path}. Returning default questions.")
        return default_questions
    except Exception as e:
        # Best effort: any read/parse problem falls back to the default list.
        print(f"An error occurred while reading the Excel file for {company}: {e}. Returning default questions.")
        return default_questions


@tool
def record_and_transcribe_audio(duration: int = 15, fs: int = 16000) -> str:
    """Records audio from the microphone for a specified duration and transcribes it using Google Cloud Speech-to-Text."""
    # Args: duration is the recording length in seconds, fs the sample rate in Hz.
    # Returns the transcript, "[No speech detected]", or an error message (never raises).
    import tempfile  # function-scope import: not part of the notebook's import cell

    print(f"Recording audio for {duration} seconds... Speak now!")
    # Use the platform temp directory instead of a hard-coded /tmp.
    audio_file = os.path.join(tempfile.gettempdir(), f"interview_answer_{uuid.uuid4()}.wav")
    try:
        # Record audio
        recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
        sd.wait()
        # Clip before scaling so out-of-range samples cannot wrap around in int16, then save
        recording_int16 = np.int16(np.clip(recording, -1.0, 1.0) * 32767)
        wav.write(audio_file, fs, recording_int16)
        print("Audio recorded.")

        # Transcribe audio
        print("Transcribing audio...")
        client = speech.SpeechClient()  # Assumes GOOGLE_APPLICATION_CREDENTIALS is set
        with open(audio_file, "rb") as f:
            content = f.read()
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=fs,
            language_code="en-US",
            enable_automatic_punctuation=True
        )
        response = client.recognize(config=config, audio=audio)

        if not response.results:
            print("Transcription failed: No speech detected.")
            return "[No speech detected]"

        transcript = " ".join([result.alternatives[0].transcript for result in response.results])
        print(f"Transcription complete: {transcript}")
        return transcript.strip()
    except Exception as e:
        error_msg = f"Error during audio recording or transcription: {e}"
        print(error_msg)
        return error_msg
    finally:
        # Single cleanup point: runs on success, on "no speech", and on errors.
        if os.path.exists(audio_file):
            os.remove(audio_file)


# TODO Implement the below tools

@tool
def generate_ideal_answer(question: str, company_tag: Optional[str] = None) -> str:
    """Generates an ideal answer to the question (simulated by LLM call)."""
    # Stub: no model is called yet. The returned text only echoes the inputs so the
    # surrounding workflow can run end to end until the real implementation lands.
    placeholder_answer = f"[Placeholder: Ideal answer generation for '{question}' considering company '{company_tag}']"
    return placeholder_answer

@tool
def rewrite_candidate_answer(question: str, candidate_answer: str) -> str:
    """Rewrites the candidate's answer for improvement (simulated by LLM call)."""
    # Stub: `candidate_answer` is accepted but not used yet; only the question is echoed back.
    placeholder_rewrite = f"[Placeholder: Rewritten version of answer for '{question}']"
    return placeholder_rewrite

@tool
def critique_and_advise(question: str, candidate_answer: str, ideal_answer: str, company_tag: Optional[str] = None) -> str:
    """Provides critique and advice based on the answers (simulated by LLM call)."""
    # Stub: both answers are accepted but not used yet; the result echoes the question and company.
    placeholder_critique = f"[Placeholder: Critique for answer to '{question}' considering company '{company_tag}']"
    return placeholder_critique

## 3. Agent Definitions

Define the state, prompts, and nodes for each agent.


In [4]:
# # --- Agent State ---
# class InterviewState(TypedDict):
#     messages: List[Any] # Stores the conversation history
#     user_resume_path: Optional[str]
#     user_jd_path: Optional[str]
#     user_resume_text: Optional[str]
#     user_jd_text: Optional[str]
#     clean_resume: Optional[str]
#     clean_jd: Optional[str]
#     company_name: Optional[str]
#     knowledge_output: Optional[Dict[str, Any]] # Output from Knowledge Agent
#     planner_output: Optional[Dict[str, Any]] # Output from Planner Agent (study plan, etc.)
#     preferred_question_type: Optional[str]
#     generated_question: Optional[str]
#     candidate_answer: Optional[str]
#     evaluation_output: Optional[Dict[str, Any]] # Output from Evaluation Agent
#     current_agent: str # Tracks which agent's turn it is

# --- Agent Prompts (from design.md) ---

# System prompt for the preprocessing agent. Tool names and index paths must match
# the tools defined in this notebook, because the model calls tools by these names.
preprocessing_prompt = """
You are PreprocessingAgent, a specialist in structuring resume and job-description data for downstream analysis.
You will be given paths to the user's resume and the job description. Parse these files and extract the text.
Generate embeddings for both the resume and job description, saving them to the specified FAISS paths.
For all other tasks, handover to the next agent after performing your functions.

1. Call `extract_text_with_ocr` for both the resume file (`user_resume_path`) and the job description file (`user_jd_path`).
2. Clean the extracted text: Remove headers, footers, duplicate whitespace, and decorative lines. Store these as `clean_resume` and `clean_jd`.
   *(Self-correction: Labeling sections like [CONTACT...] is complex and better handled by downstream agents if needed. Focus on cleaning and embedding.)*
3. Extract the 'COMPANY NAME' and the job title from the cleaned job description (`clean_jd`).
4. Call `generate_resume_embeddings_and_save` **once with `clean_resume`**; it saves to ./faiss/resume_embeddings.
5. Call `generate_jd_embeddings_and_save` **once with `clean_jd`**; it saves to ./faiss/jd_embeddings.
6. Hand over the extracted Company Name and Job Title to the knowledge agent for retrieving company-specific knowledge from the web.
"""

# System prompt for the knowledge agent. The embedding tool is referred to by its
# real name so the model does not attempt to call a tool that does not exist.
knowledge_prompt = """
You are the Knowledge Agent in a multi-agent interview preparation system. Your role is to ONLY extract real-world, subjective expectations about how top companies evaluate technical candidates during interviews using the job description as a reference. 
You must infer the evaluation rubric, communication expectations, common patterns, and failure modes using a web search tool. You must also generate the knowledge embeddings using the results from your web search.  
For all other tasks, handover to the next agent after performing your functions.

## Tools Available:

### 1. Tavily Tool
- Performs web search queries across Reddit, Glassdoor, Blind, Medium, and other platforms.
- Returns concise snippets and links, not full articles.
- Snippets often contain behavioral signals or reflections from interviewees.

### 2. retrieve_jd_embeddings_from_vector_db
- Retrieve details relevant to the Job Description from the JD FAISS index.

### 3. generate_knowledge_embeddings_and_save tool
- Use the results of your web search to create embeddings.
- Save them to a FAISS index at `"./faiss/knowledge_embeddings"`.

---

## Inputs:
- The company and the job title to analyze (e.g., "Uber").

---

## Search Strategy:
Construct at least 4 queries using variations like:
- coding interview expectations site:reddit.com
- behavioral interview rubric site:glassdoor.com
- system design interview site:blind.com

---

## Goals:
1. Use the Tavily Tool to collect search result snippets about the company’s interview process.
2. For each snippet, analyze and infer themes such as:
   - Ownership
   - Tradeoff Thinking
   - Structured Reasoning
   - Handling Ambiguity
   - Communication Style
3. Cluster the themes and summarize them as a list of `inferred_rubric` items.
4. From the language and tone of the snippets, extract 3–5 actionable `communication_tips`.
5. Pass the collected snippets (with their source links) as a single text string to `generate_knowledge_embeddings_and_save` to persist them in FAISS.
6. Hand over the output to the planner agent for generating the study plan, and all other tasks.

---

## Output Format:
Return a JSON object in the format:
```json
{
  "company": "XYZ",
  "inferred_rubric": [
    {
      "theme": "Ownership",
      "evidence": "Multiple Reddit users reported being asked how they would proactively handle fallback strategies.",
      "discussion_reference": "https://www.reddit.com/r/csMajors/comments/abc123"
    },
    {
      "theme": "Tradeoff Thinking",
      "evidence": "Glassdoor snippets emphasize discussing time-space tradeoffs during implementation decisions.",
      "discussion_reference": "https://www.glassdoor.com/Interview/XYZ-Interview-Questions.htm"
    }
  ],
  "communication_tips": [
    "Narrate your approach clearly before coding.",
    "Always explain tradeoffs when discussing solutions.",
    "Discuss scalability and edge case assumptions upfront."
  ]
}
```
"""

# System prompt for the planner agent. Tool names match the tools defined in this
# notebook, index paths match the FAISS_* constants, and the JSON example is
# enclosed in a properly closed code fence.
planner_prompt = """
You are the Planner Agent, orchestrating interview preparation. You are only responsible for generating a study plan, curating a list of relevant leetcode problems and inferring the company-specific evaluation rubric. 
For all other tasks, handover to the next agent after performing your functions.


**Inputs You Have Access To (implicitly via state or tools)**:
- Parsed Resume Context (from ./faiss/resume_embeddings)
- Parsed Job Description Context (from ./faiss/jd_embeddings)
- Knowledge Agent Output: `inferred_rubric` and `communication_tips` for the company.
- User Preference: `preferred_question_type` (e.g., 'technical', 'behavioral').

**Your Responsibilities**:
1.  **Synthesize**: Briefly analyze the alignment between resume, JD, and company insights.
2.  **Generate Study Plan**: Create a concise, actionable study plan (markdown format).
3.  **Suggest LeetCode**: Call `company_leetcode_problem_retriever` for the company.
4.  **Present Insights**: Format and include the `inferred_rubric` and `communication_tips` in your output.
5.  **Embed Insights**: Call `generate_rubric_embeddings_and_save` with the combined text of the rubric and tips; it saves to ./faiss/rubric_embeddings.
6.  **Prepare for Question Agent**: Note the `preferred_question_type` for the next step.
7.  Hand over the output to the question agent.

**Output Format**:
Return **only** a JSON object like this:
```json
{
  "study_plan": "<Markdown formatted study plan>",
  "suggested_leetcode": ["<LeetCode Q1>", "<LeetCode Q2>"],
  "company_insights_display": {
    "company": "...",
    "inferred_rubric": [ ... ],
    "communication_tips": [ ... ]
  },
  "embedding_status": "<Status message from generate_rubric_embeddings_and_save>",
  "next_action": "Proceed to generate a question."
}
```
"""

# System prompt for the question agent. It refers to the retrieval tools by the
# names they are defined under in this notebook and uses the ./faiss index paths.
question_prompt = """
You are the Question Agent. Your goal is to generate a single, relevant, open-ended interview question.

For all other tasks, handover to the next agent after performing your functions.

**Inputs You Have Access To (implicitly via state or tools)**:
- Question Type Requested:
- Company: 
- Contextual Data (via the retrieval tools, from ./faiss/resume_embeddings, ./faiss/jd_embeddings and ./faiss/rubric_embeddings)

**Your Task**:
1. Call the retrieval tools available to you (`retrieve_resume_embeddings_from_vector_db`, `retrieve_jd_embeddings_from_vector_db`, `retrieve_rubric_embeddings_from_vector_db`) using relevant queries (e.g., job title, key skills, company name, question type) to gather context from the resume, JD, and rubric indices.
2. Synthesize the retrieved context.
3. Generate **one** interview question of the requested type that is:
    - Tailored to the company.
    - Relevant to the job description and candidate's likely experience.
    - Aligned with the company's inferred rubric/culture (if available).
    - Clear, professional, and open-ended.
4. Output the question to the user in a JSON format. This signals completion.

**Output Format**:
Return **only** a JSON object like this:
```json
{
  "question": "<The generated interview question>"
}
```
"""

# System prompt for the evaluation agent.
# NOTE(review): step 1 now names `retrieve_rubric_embeddings_from_vector_db`, the
# rubric retrieval tool defined in this notebook; the previous name
# `retrieve_rubric_snippets` has no visible definition. Confirm against the
# evaluation agent's tool list.
evaluation_prompt = """
You are EvaluationFeedbackAgent, a senior interview coach.


For all other tasks, handover to the next agent after performing your functions.


**Given**:
- question:
- candidate_answer: 
- company_tag: 
- rubric_index_path: 

**Do the following**:
1. Call `retrieve_rubric_embeddings_from_vector_db` with a query built from the `question` and `company_tag` to get relevant evaluation criteria.
2. Call `generate_ideal_answer` for the `question` and `company_tag`.
3. Call `rewrite_candidate_answer` for the `question` and `candidate_answer`.
4. Call `critique_and_advise` using all inputs.
   - The critique must highlight strengths, list missed elements (e.g., complexity, STAR method), suggest improvements, use bullet points, and bold key terms.

**Output Format**:
Return **only** this JSON:
```json
{
  "ideal_answer": "<Output from generate_ideal_answer>",
  "improved_answer": "<Output from rewrite_candidate_answer>",
  "detailed_feedback": "<Output from critique_and_advise>"
}
```
"""


# # Create nodes
# preprocess_tools = [extract_text_with_ocr, generate_resume_embeddings_and_save, generate_jd_embeddings_and_save]
# preprocess_agent_node = create_agent_node(preprocessing_prompt, preprocess_tools)

# knowledge_tools = [tavily_tool, generate_resume_embeddings_and_save, generate_jd_embeddings_and_save]
# knowledge_agent_node = create_agent_node(knowledge_prompt, knowledge_tools)

# planner_tools = [company_leetcode_problem_retriever, generate_resume_embeddings_and_save, generate_jd_embeddings_and_save, generate_rubric_embeddings_and_save]
# planner_agent_node = create_agent_node(planner_prompt, planner_tools)

# question_tools = [retrieve_jd_embeddings_from_vector_db,retrieve_rubric_embeddings_from_vector_db]
# question_agent_node = create_agent_node(question_prompt, question_tools)

# evaluation_tools = [retrieve_resume_embeddings_from_vector_db,retrieve_jd_embeddings_from_vector_db,retrieve_rubric_embeddings_from_vector_db,retrieve_rubric_snippets, generate_ideal_answer, rewrite_candidate_answer, critique_and_advise]
# evaluation_agent_node = create_agent_node(evaluation_prompt, evaluation_tools)

def record_audio(duration=5, fs=16000):
    """Record mono audio with sounddevice and save it as a 16-bit WAV file.

    Args:
        duration: Recording length in seconds.
        fs: Sample rate in Hz, used both for recording and for the WAV header.

    Returns:
        Path of the WAV file written to the platform temp directory. The
        caller is responsible for deleting it.
    """
    import tempfile  # function-scope import: not part of the notebook's import cell

    print("🎙️ Speak now...")
    # Use the platform temp directory instead of a hard-coded /tmp.
    audio_file = os.path.join(tempfile.gettempdir(), f"test_audio_{uuid.uuid4()}.wav")

    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()  # block until the recording has finished
    # Clip before scaling so out-of-range samples cannot wrap around in int16.
    recording = np.int16(np.clip(recording, -1.0, 1.0) * 32767)
    wav.write(audio_file, fs, recording)
    print("✅ Audio recorded and saved.")
    return audio_file

def transcribe_audio(file_path, sample_rate_hertz=16000, language_code="en-US"):
  """Transcribe a mono LINEAR16 audio file with Google Cloud Speech-to-Text.

  Args:
    file_path: Path of the audio file to send for recognition.
    sample_rate_hertz: Sample rate declared to the recognizer. The default of
      16000 matches the default `fs` of `record_audio`; pass the same value
      that was used for recording if it differs.
    language_code: Language tag of the speech to recognize.

  Returns:
    The stripped transcript built from the first alternative of every result,
    or the placeholder "[No speech detected]" when nothing was recognized.
  """
  client = speech.SpeechClient()

  with open(file_path, "rb") as audio_file:
    content = audio_file.read()

  audio = speech.RecognitionAudio(content=content)
  config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=sample_rate_hertz,
    audio_channel_count=1,
    language_code=language_code,
    enable_automatic_punctuation=True,
  )

  response = client.recognize(config=config, audio=audio)

  # Keep the top hypothesis of each result; a result without alternatives is
  # skipped instead of raising IndexError.
  pieces = [
    result.alternatives[0].transcript
    for result in response.results
    if result.alternatives
  ]
  transcript = " ".join(pieces).strip()
  return transcript or "[No speech detected]"

# Custom record agent that records audio and transcribes it
async def record_agent(state, config):
  """
    Records audio from the user and transcribes it into text using Google Cloud Speech-to-Text.

    Args:
      state: Current graph state. It is read-only here; nothing is written into it.
      config: Run configuration passed by the graph (unused).

    Returns:
      A state update holding the transcript under `candidate_answer` and as a
      user message, so the answer is available in the conversation history.
  """
  print("🎧 Running record agent...")
  # NOTE: record_audio and transcribe_audio are blocking calls.
  audio_path = record_audio()
  try:
    transcript = transcribe_audio(audio_path)
  finally:
    # Remove the temporary recording even when transcription raises.
    os.remove(audio_path)
  # The graph only applies what a node returns; assigning into `state` would
  # be discarded, so the transcript is handed back as part of the update.
  return {
    "candidate_answer": transcript,
    "messages": [{"role": "user", "content": transcript}],
  }

from langgraph.prebuilt import ToolNode
from langgraph.graph import StateGraph, MessagesState, START, END


def _run_record_agent(state, config):
  """Synchronous graph-node wrapper around the async `record_agent`."""
  import asyncio
  from concurrent.futures import ThreadPoolExecutor

  # Drive the coroutine on its own event loop in a worker thread. This works
  # whether or not the calling thread already has a loop running (a notebook
  # kernel typically does), and lets the graph be run with invoke().
  with ThreadPoolExecutor(max_workers=1) as pool:
    return pool.submit(asyncio.run, record_agent(state, config)).result()


# `record_agent` is a node function (it takes `state, config`), not a tool the
# LLM calls. ToolNode only executes tool calls found on an AI message, so a
# handoff to a ToolNode-wrapped `record_agent` never runs the function. Host it
# in a one-node compiled graph named "record_agent" so it can be routed to
# like the other agents.
_record_graph = StateGraph(MessagesState)
_record_graph.add_node("record", _run_record_agent)
_record_graph.add_edge(START, "record")
_record_graph.add_edge("record", END)
record_agent_node = _record_graph.compile(name="record_agent")

# Creating Agent Nodes
# Tools given to the preprocess agent; the last entry is a handoff tool that
# targets 'knowledge_agent'.
preprocess_tools = [
    extract_text_with_ocr,
    generate_resume_embeddings_and_save,
    generate_jd_embeddings_and_save,
    create_handoff_tool(
        agent_name="knowledge_agent",
        description="Hand over to Knowledge Agent for web search, rubric inference and generating knowledge embeddings",
    ),
]
preprocess_agent_node = create_react_agent(
    model=gemini_model,
    tools=preprocess_tools,
    prompt=preprocessing_prompt,
    name="preprocess_agent",
)

# Tools given to the knowledge agent; the last entry is a handoff tool that
# targets 'planner_agent'.
knowledge_tools = [
    tavily_tool,
    retrieve_jd_embeddings_from_vector_db,
    generate_knowledge_embeddings_and_save,
    create_handoff_tool(agent_name="planner_agent"),
]
knowledge_agent_node = create_react_agent(
    model=gemini_model,
    tools=knowledge_tools,
    prompt=knowledge_prompt,
    name="knowledge_agent",
)

# Tools given to the planner agent; the last entry is a handoff tool that
# targets 'question_agent'.
planner_tools = [
    company_leetcode_problem_retriever,
    retrieve_resume_embeddings_from_vector_db,
    retrieve_jd_embeddings_from_vector_db,
    retrieve_knowledge_embeddings_from_vector_db,
    generate_rubric_embeddings_and_save,
    create_handoff_tool(agent_name="question_agent"),
]
planner_agent_node = create_react_agent(
    model=gemini_model,
    tools=planner_tools,
    prompt=planner_prompt,
    name="planner_agent",
)

# Tools given to the question agent; the last entry is a handoff tool that
# targets 'record_agent'.
question_tools = [
    retrieve_resume_embeddings_from_vector_db,
    retrieve_jd_embeddings_from_vector_db,
    retrieve_knowledge_embeddings_from_vector_db,
    create_handoff_tool(
        agent_name="record_agent",
        description="Hand over to Record Agent to capture user answer.",
    ),
]
question_agent_node = create_react_agent(
    model=gemini_model,
    tools=question_tools,
    prompt=question_prompt,
    name="question_agent",
)

# Tools given to the evaluation agent. This list contains no handoff tool.
evaluation_tools = [
    retrieve_resume_embeddings_from_vector_db,
    retrieve_jd_embeddings_from_vector_db,
    retrieve_rubric_embeddings_from_vector_db,
    generate_ideal_answer,
    rewrite_candidate_answer,
    critique_and_advise,
]
evaluation_agent_node = create_react_agent(
    model=gemini_model,
    tools=evaluation_tools,
    prompt=evaluation_prompt,
    name="evaluation_agent",
)


# Assemble the swarm. An agent can only be activated if it is listed here, so
# `evaluation_agent_node` is registered alongside the others.
# NOTE(review): no agent's tool list contains a handoff tool targeting
# 'evaluation_agent' — confirm how the flow is meant to reach it.
workflow = create_swarm(
    agents=[
        preprocess_agent_node,
        knowledge_agent_node,
        planner_agent_node,
        question_agent_node,
        record_agent_node,
        evaluation_agent_node,
    ],
    default_active_agent='preprocess_agent'
)

# Compile with the checkpointer so state is kept per thread_id between turns.
graph = workflow.compile(checkpointer=checkpointer)

# Run configuration: every call that passes this config shares thread 1.
config = {"configurable": {"thread_id": 1}}

# First turn: a single user message that names the resume / JD files and asks
# for a study plan, LeetCode problems and a practice question.
turn_1 = graph.invoke(
    {
        "messages": [
            {
                "role": "user",
                "content": "The file path to my resume and jd is ./Mandar_Burande_Resume.pdf and ./amazon-jd.txt. Give me a study plan for cracking an interview with this company and also a list of popular leetcode problems for this company. Based on my experience, skills and projects from my resume, suggest me a question that will help me improve my skills for this company.",
            },
        ],
    },
    config,
)

# Full returned state first, then only the last message.
print(turn_1)
print(turn_1["messages"][-1])





Extracting text from PDF: ./Mandar_Burande_Resume.pdfReading text file: ./amazon-jd.txt

PDF text extraction with pdfminer.six finished.




{'messages': [HumanMessage(content='The file path to my resume and jd is ./Mandar_Burande_Resume.pdf and ./amazon-jd.txt. Give me a study plan for cracking an interview with this company and also a list of popular leetcode problems for this company. Based on my experience, skills and projects from my resume, suggest me a question that will help me improve my skills for this company.', additional_kwargs={}, response_metadata={}, id='201cfc7f-6a56-409f-a662-36e287045e90'), AIMessage(content='', additional_kwargs={'function_call': {'name': 'extract_text_with_ocr', 'arguments': '{"file_path": "./amazon-jd.txt"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, name='preprocess_agent', id='run-ab9a70a2-a591-4964-a0da-8d7e3987efdc-0', tool_calls=[{'name': 'extract_text_with_ocr', 'args': {'file_path': './Mandar_Burande_Resume.pdf'}, 'id': '8e5a4c85-6543-490e-bfb2-2b177f7f7cef', 'type': 'tool_call'}, {'name': '

## 4. Graph Definition

Define the workflow connecting the agents.


In [5]:
# # --- Graph Construction ---
# workflow = StateGraph(InterviewState)

# # # Add nodes for each agent and their tools
# workflow.add_node("preprocess", preprocess_agent_node)
# # workflow.add_node("preprocess_tools", preprocess_tool_node)
# workflow.add_node("knowledge", knowledge_agent_node)
# # workflow.add_node("knowledge_tools", knowledge_tool_node)
# workflow.add_node("planner", planner_agent_node)
# # workflow.add_node("planner_tools", planner_tool_node)
# workflow.add_node("question", question_agent_node)
# # workflow.add_node("question_tools", question_tool_node)
# workflow.add_node("record_answer", record_and_transcribe_audio) # Direct tool call node
# workflow.add_node("evaluate", evaluation_agent_node)
# # workflow.add_node("evaluate_tools", evaluation_tool_node)

# # Define edges
# workflow.set_entry_point("preprocess")

# # Preprocessing Agent Logic
# # workflow.add_edge("preprocess", "preprocess_tools")
# # workflow.add_conditional_edges(
# #     "preprocess_tools",
# #     tools_condition,
# #     {"continue": "knowledge", END: END} # If tool call needed, loop back via tools_condition, else go to knowledge
# # )
# workflow.add_edge("preprocess", "knowledge")
# workflow.add_edge("knowledge", "planner")
# workflow.add_edge("planner", "question")
# workflow.add_edge("question", "record_answer")
# workflow.add_edge("record_answer", "evaluate")

# # Knowledge Agent Logic
# # workflow.add_edge("knowledge", "knowledge_tools")
# # workflow.add_conditional_edges(
# #     "knowledge_tools",
# #     tools_condition,
# #     {"continue": "planner", END: END}
# # )

# # # Planner Agent Logic
# # workflow.add_edge("planner", "planner_tools")
# # workflow.add_conditional_edges(
# #     "planner_tools",
# #     tools_condition,
# #     {"continue": "question", END: END}
# # )

# # # Question Agent Logic
# # workflow.add_edge("question", "question_tools")
# # workflow.add_conditional_edges(
# #     "question_tools",
# #     tools_condition,
# #     {"continue": "record_answer", END: END} # After question is generated, record answer
# # )

# # # Record Answer Node
# # workflow.add_edge("record_answer", "evaluate") # After recording, go to evaluation

# # # Evaluation Agent Logic
# # workflow.add_edge("evaluate", "evaluate_tools")
# # workflow.add_conditional_edges(
# #     "evaluate_tools",
# #     tools_condition,
# #     {"continue": END, END: END} # End after evaluation
# # )

# # Compile the graph
# # graph = workflow.compile(checkpointer=checkpointer)

# # print("Graph compiled successfully!")
# # # Optional: Visualize the graph
# # try:
# #     from IPython.display import Image, display
# #     display(Image(graph.get_graph().draw_mermaid_png()))
# # except Exception as e:
# #     print(f"Could not display graph: {e}. Make sure graphviz and mermaid are installed/configured.")

## 5. Execution and Interaction

Run the graph with user inputs.


In [6]:
# import asyncio
# import pprint

# # --- Execution ---

# # IMPORTANT: Set the path to your Google Cloud credentials file
# # This is needed for the record_and_transcribe_audio tool
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key.json" # Replace with the actual path to your key.json

# # --- User Inputs ---
# # !!! IMPORTANT: Replace these with the actual paths to your files !!!
# resume_file = os.path.join(os.getcwd(), "Mandar Burande_Resume.pdf") # e.g., "/path/to/your/resume.pdf"
# jd_file = os.path.join(os.getcwd(), "amazon-jd.txt") #"/path/to/your/job_description.txt"
# user_preferred_question_type = "technical" # Options: "technical", "behavioral", "system design", "debugging/problem-solving"

# # Create dummy files if they don't exist for the example run
# if not os.path.exists(resume_file):
#     with open(resume_file, "w") as f:
#         f.write("Sample Resume Content: Python Developer with 5 years experience in web development and data analysis.")
# if not os.path.exists(jd_file):
#      with open(jd_file, "w") as f:
#         f.write("Sample Job Description: Looking for a Senior Software Engineer at Google. Requires strong Python skills, experience with distributed systems, and cloud platforms.")

# # Define the initial state to start the graph
# initial_state = {
#     "messages": [],
#     "user_resume_path": resume_file,
#     "user_jd_path": jd_file,
#     "preferred_question_type": user_preferred_question_type,
#     "current_agent": "preprocess" # Start with the preprocessing agent
# }

# # Configuration for the graph run (e.g., unique thread ID)
# config = {"configurable": {"thread_id": "interview-prep-thread-1"}}

# async def run_graph():
#     final_state = None
#     print("--- Starting Interview Prep Workflow ---")
#     print(f"Resume: {resume_file}")
#     print(f"Job Description: {jd_file}")
#     print(f"Preferred Question Type: {user_preferred_question_type}")
#     print("-" * 30)

#     async for event in graph.astream_events(initial_state, config, version="v1"):
#         kind = event["event"]
#         tags = event.get("tags", [])
#         if kind == "on_chat_model_stream":
#             content = event["data"]["chunk"].content
#             if content:
#                 # Print LLM tokens as they arrive
#                 print(content, end="|")
#         elif kind == "on_tool_start":
#             print(f"\n--- Calling Tool: {event['name']} ---")
#             print(f"   Args: {event['data'].get('input')}")
#         elif kind == "on_tool_end":
#             print(f"--- Tool Result: {event['name']} ---")
#             print(f"   Output: {event['data'].get('output')}")
#             print("-" * 30)
#         elif kind == "on_chain_end":
#              # Check if it's the end of a specific agent node run
#             if event["name"] in ["preprocess", "knowledge", "planner", "question", "evaluate"]:
#                  print(f"\n--- Finished Agent: {event['name']} ---")
#                  # pprint.pprint(event['data'].get('output'), indent=2) # Print agent output if needed
#                  print("-" * 30)


#         # Track the final state
#         if kind == "on_graph_end":
#             final_state = event['data']['output']


#     print("\n--- Workflow Complete ---")

#     if final_state:
#         print("\n--- Final Results ---")
#         # Extract and print key information from the final state
#         planner_output = final_state.get('planner_output', {})
#         evaluation_output = final_state.get('evaluation_output', {})

#         print("\n**Study Plan:**")
#         print(planner_output.get('study_plan', 'Not generated.'))

#         print("\n**Suggested LeetCode:**")
#         pprint.pprint(planner_output.get('suggested_leetcode', 'Not generated.'))

#         print("\n**Company Insights:**")
#         pprint.pprint(planner_output.get('company_insights_display', 'Not generated.'))

#         print(f"\n**Generated Question ({final_state.get('preferred_question_type', 'N/A')}):**")
#         print(final_state.get('generated_question', 'Not generated.'))

#         print("\n**Your Transcribed Answer:**")
#         # The actual transcribed answer isn't directly stored in the state by the tool node,
#         # but it was passed to the evaluation agent. We print the placeholder for clarity.
#         # In a real UI, you'd capture the output of the 'record_answer' node.
#         print(final_state.get('candidate_answer', '[Answer was recorded and passed to evaluation]'))


#         print("\n**Evaluation Feedback:**")
#         print("\n*Ideal Answer (Placeholder):*")
#         print(evaluation_output.get('ideal_answer', 'Not generated.'))
#         print("\n*Improved Answer (Placeholder):*")
#         print(evaluation_output.get('improved_answer', 'Not generated.'))
#         print("\n*Detailed Feedback (Placeholder):*")
#         print(evaluation_output.get('detailed_feedback', 'Not generated.'))
#     else:
#         print("Workflow did not complete successfully or final state not captured.")

# # Run the asynchronous function
# import traceback

# try:
#     await run_graph()
# except Exception as e:
#     print("\nAn error occurred during graph execution:")
#     traceback.print_exc()

