# Modular AI Interview Agent with crewAI
This notebook demonstrates a refactor of the Interview Agent system using the crewAI framework for agent/task orchestration.

## Steps
1. **Setup & Imports**
2. **LLM Initialization**
3. **Define Agents (Resume Parser, Interviewer, Evaluator)**
4. **Define Tasks**
5. **Crew Orchestration**
6. **Run Example Workflow**


In [1]:
# 1. Setup & Imports
from crewai import Agent, Task, Crew, LLM
from dotenv import load_dotenv
import os

# Pull API keys and other settings from the environment / .env file.
load_dotenv()

# 2. LLM Initialization
# A single LLM handle, passed as `llm=` to the agents defined in the cells below.
llm = LLM(model="groq/gemma2-9b-it", temperature=1.0)

# Smoke test: send one prompt through the model and echo the reply.
test_again = llm.call("Tell me about APJ Abdul Kalam in one sentence")
print(test_again)


A.P.J. Abdul Kalam, the "Missile Man of India," was a visionary scientist and the 11th President of India, known for his contributions to India's space and defense programs and his inspiring leadership. 





## Resume Parser

In [None]:
# 3. Define Resume Parser Agent and Task
from pydantic import BaseModel, Field, field_validator
from typing import Optional, Union, List
from typing import Optional
from crewai import Crew

# Candidate profile model (simplified for demo).
#
# The section fields accept either a plain string or a list; list values are
# flattened to a single string by `join_list_fields` below.
class CandidateProfile(BaseModel):
    name: str = Field(description="Candidate's full name")
    email: str = Field(description="Candidate's email address")
    experience: Union[str, List] = Field(description="experience")
    projects: Union[str, List] = Field(description="projects")
    education: Union[str, List] = Field(description="education")
    skills: Union[str, List] = Field(description="skills")
    extracurricular: Union[str, List] = Field(description="extracurricular")
    # Optional[...] alone does not make a field optional in pydantic v2; an
    # explicit default is needed so resumes without these sections validate.
    achievements: Optional[Union[str, List]] = Field(default=None, description="achievements")
    certifications: Optional[Union[str, List]] = Field(default=None, description="certifications")
    resume: str = Field(description="Resume text or summary")

    @field_validator(
        "experience", "projects", "education", "skills", "extracurricular", "achievements", "certifications"
    )
    @classmethod
    def join_list_fields(cls, v, info):
        # Strings (and None for the optional sections) pass through unchanged.
        if not isinstance(v, list):
            return v
        # A list made up entirely of dicts is joined with "; " so the
        # individual entries stay distinguishable; any other list uses ", ".
        separator = "; " if all(isinstance(i, dict) for i in v) else ", "
        return separator.join(str(i) for i in v)

# Resume Parser agent: reads resume text and extracts candidate details.
resume_parser_agent = Agent(
    llm=llm,
    role="Resume Parser",
    goal="Extract structured candidate information from resume text",
    backstory="You are an expert at reading resumes and extracting key candidate details for interview preparation.",
)

# Small inline sample resume used to exercise the parser.
resume_text = "\n".join(
    [
        "John Doe",
        "john@example.com",
        "Experience: 5 years in Python, AI",
        "Skills: Python, Machine Learning, Data Science",
    ]
)

# Parsing task: the resume text follows the instruction on its own line.
parse_resume_task = Task(
    description=f"Parse the following resume and extract name, email, experience, and skills as JSON:\n{resume_text}",
    expected_output="A JSON object with keys: name, email, experience, skills.",
    agent=resume_parser_agent,
)

# One agent, one task; kickoff() runs the workflow and returns its output.
crew = Crew(tasks=[parse_resume_task], agents=[resume_parser_agent])
results = crew.kickoff()
print(results)

```json
{
  "name": "John Doe",
  "email": "john@example.com",
  "experience": "5 years in Python, AI",
  "skills": ["Python", "Machine Learning", "Data Science"]
}
```


## Agent 1: Resume Parser (PDF / DOCX / plain-text input)

In [3]:
from PyPDF2 import PdfReader
from docx import Document

def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*, concatenated in page order.

    A page whose ``extract_text()`` result is falsy (e.g. ``None``)
    contributes an empty string.
    """
    pdf = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in pdf.pages)

def extract_text_from_docx(file_path: str) -> str:
    """Return the paragraph texts of the .docx file at *file_path*, one per line."""
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def get_resume_text(resume_input: str, filetype: str = "text") -> str:
    """Return resume text for *resume_input*, interpreted according to *filetype*.

    Args:
        resume_input: A file path when *filetype* is ``"pdf"`` or ``"docx"``;
            otherwise the resume text itself.
        filetype: ``"pdf"``, ``"docx"``, or anything else for plain text.

    Returns:
        The extracted text, or *resume_input* unchanged for plain text.
    """
    if filetype == "pdf":
        return extract_text_from_pdf(resume_input)
    if filetype == "docx":
        return extract_text_from_docx(resume_input)
    # Any other filetype: the input is assumed to already be the resume text.
    return resume_input

# Example usage: resume_input is a path for "pdf"/"docx", or the raw resume
# text itself when filetype is "text".
filetype = "pdf"  # or "docx" or "text"
resume_input = "D:/Python/AI Agent Project/Interview Agent/notebooks/Aditya Lalchandani Resume.pdf"
resume_text = get_resume_text(resume_input, filetype)

# Full-profile parsing task: the extracted resume text follows the instruction.
parse_resume_task = Task(
    description=(
        "Parse the following resume and extract name, email, experience, projects, education, skills, extracurricular, achievements, certifications as JSON:\n"
        f"{resume_text}"
    ),
    expected_output="A JSON object with keys: name, email, experience, projects, education, skills, extracurricular, achievements, certifications.",
    agent=resume_parser_agent,
)

# Run the parser on its own and keep the output for the question generator.
crew = Crew(tasks=[parse_resume_task], agents=[resume_parser_agent])
results = crew.kickoff()
print(results)

{
  "name": "Aditya Lalchandani",
  "email": "aditya.lalchandani17@gmail.com",
  "experience": [
    {
      "title": "British Airways Job Simulation on Forage",
      "company": "British Airways",
      "dates": "January 2025",
      "description": "Virtual Internship\n•Cleaned and preprocessed over 50,000 airline booking records, engineered new features and removed 1,100+ outliers using\nZ-score filtering to improve data quality for predictive modeling.\n•Developed and tuned XGBoost classification models, achieving a recall improvement of 18% for booking completion\nprediction on test data through advanced feature engineering and hyperparameter optimization.\n•Visualized and analyzed the importance of 40+ features, identifying purchase lead as the top predictor and providing\nactionable insights to optimize flight scheduling and marketing strategies."
    }
  ],
  "projects": [
    {
      "title": "Flight Price Predictor",
      "description": "•Created an end-to-end flight price pr

## Agent 2: Interview Question Generator

In [4]:
from pydantic import BaseModel, Field
from typing import List
from crewai import Agent, Task, Crew
import json

# Schema for one generated interview question plus the candidate's reply.
class InterviewQuestion(BaseModel):
    # Required: the question text.
    question: str = Field(..., description="The interview question text")
    # Defaults to an empty string when no answer is supplied.
    answer: str = Field("", description="The answer to the question")

# Inputs describing the role being interviewed for.
job_position = "Data Scientist"
job_description = (
    "We are seeking a Data Scientist with strong experience in Python, machine learning, and data analysis. "
    "The candidate should be able to build predictive models, work with large datasets, and communicate insights effectively."
)
num_questions = 5

# The resume parser's output is interpolated into the prompt below as text.
candidate_profile = results  # or extract the relevant dict if needed

# Interview Question Generator agent.
question_agent = Agent(
    llm=llm,
    role="Interview Question Generator",
    goal="Generate technical, behavioral, situational interview questions based on candidate profile and job description",
    backstory="You are an expert interviewer who creates relevant questions for candidates based on the job they are applying for.You have a good understanding of what makes a good candidate. You are very good at asking questions that get to the point.",
)

# Question task: role, job description, and candidate profile are combined
# into a single prompt that asks for a JSON list back.
question_task = Task(
    description="".join(
        [
            f"You are preparing for an interview for the position of '{job_position}'.\n",
            f"Job Description: {job_description}\n",
            f"Based on the following candidate profile, generate {num_questions} technical interview questions relevant to their experience, skills, and projects, and the job description above.The goal is to assess the candidate's skills, experience, and suitability for the role ",
            "Return ONLY a JSON list of objects with the key 'question'.\n",
            f"Candidate Profile:\n{candidate_profile}",
        ]
    ),
    expected_output="A JSON list of objects with the key 'question'.",
    agent=question_agent,
)

crew = Crew(tasks=[question_task], agents=[question_agent])

questions_output = crew.kickoff()

# --- Robustly extract the JSON string from CrewOutput ---
def extract_json_from_output(output):
    """Return a JSON-ish string taken from *output*.

    The ``result`` attribute is checked first, then ``output``. The first one
    that holds a ``str`` is returned as-is; a ``list`` or ``dict`` is
    serialized with ``json.dumps``. If neither attribute yields such a value,
    ``str(output)`` is returned.
    """
    for attr_name in ("result", "output"):
        if not hasattr(output, attr_name):
            continue
        value = getattr(output, attr_name)
        if isinstance(value, str):
            return value
        if isinstance(value, (list, dict)):
            return json.dumps(value)
    # Neither attribute was usable: fall back to the string form.
    return str(output)

# Parse and validate the questions using Pydantic
import re

# Bound up front so the export at the bottom of this cell still works (as an
# empty list) when parsing fails, instead of raising NameError.
parsed_questions = []
try:
    questions_json = extract_json_from_output(questions_output)
    # Keep only the span from the first "[" to the last "]" so that extra text
    # around the JSON list (prose, code fences) does not break json.loads.
    # In a raw string the brackets take a single backslash.
    match = re.search(r'\[.*\]', questions_json, re.DOTALL)
    if match:
        questions_json = match.group(0)
    parsed_questions = [InterviewQuestion(**q) for q in json.loads(questions_json)]
    # Ask each question interactively and record the typed answer on the model.
    for q in parsed_questions:
        print(f"Q: {q.question}")
        q.answer = input("Your answer: ")
except Exception as e:
    # Best-effort notebook cell: report the problem and show the raw output.
    print("Error parsing questions:", e)
    print(questions_output)

# Save questions and answers for next agent
answered_questions = [q.model_dump() for q in parsed_questions]


Q: You mentioned improving recall in your British Airways project. Can you elaborate on the steps you took to achieve this 18% improvement, including the specific feature engineering techniques and hyperparameter tuning strategies used?
Q: Your Flight Price Predictor project highlights your experience with modular ML pipelines. Can you describe how you structured your pipeline and the benefits of this approach, particularly in terms of maintainability and scalability?
Q: In your Song Recommender System, you mention using both collaborative and content-based filtering. Can you explain the rationale behind this hybrid approach and how you balanced the strengths of each method to create a more robust recommendation system?
Q: Your experience with handling large-scale music datasets using Dask is impressive. Can you provide a concrete example of a data processing challenge you encountered while working with these datasets and how you leveraged Dask to overcome it efficiently?
Q: You've uti

In [5]:
# Notebook echo: display the collected question/answer dicts.
answered_questions

[{'question': 'You mentioned improving recall in your British Airways project. Can you elaborate on the steps you took to achieve this 18% improvement, including the specific feature engineering techniques and hyperparameter tuning strategies used?',
  'answer': 'ddd'},
 {'question': 'Your Flight Price Predictor project highlights your experience with modular ML pipelines. Can you describe how you structured your pipeline and the benefits of this approach, particularly in terms of maintainability and scalability?',
  'answer': 'cx'},
 {'question': 'In your Song Recommender System, you mention using both collaborative and content-based filtering. Can you explain the rationale behind this hybrid approach and how you balanced the strengths of each method to create a more robust recommendation system?',
  'answer': 'sx'},
 {'question': 'Your experience with handling large-scale music datasets using Dask is impressive. Can you provide a concrete example of a data processing challenge you en

In [6]:
from pydantic import BaseModel, Field
from typing import List
from crewai import Agent, Task, Crew
import json

# Schema for the evaluation of a single question/answer pair; all four fields
# are required.
class AnswerFeedback(BaseModel):
    question: str = Field(..., description="The interview question text")
    answer: str = Field(..., description="The candidate's answer")
    score: float = Field(..., description="Score for the answer (0-10)")
    comments: str = Field(..., description="Feedback comments")

# Answer Evaluation agent: scores and comments on the candidate's answers.
evaluation_agent = Agent(
    llm=llm,
    role="Answer Evaluation Agent",
    goal="Evaluate candidate answers and provide feedback using LLM",
    backstory="You are an expert technical interviewer who scores and comments on candidate answers.",
)

# Serialize the collected Q&A pairs once; ensure_ascii=False keeps non-ASCII
# characters unescaped in the prompt.
qa_pairs_json = json.dumps(answered_questions, ensure_ascii=False)

# Evaluation task: instructions followed by the serialized Q&A pairs.
feedback_task = Task(
    description="".join(
        [
            "You are an expert technical interviewer. For each question and answer pair below, provide a score (0-10) and a short feedback comment. ",
            "Return ONLY a JSON list of objects with keys: question, answer, score, comments.\n",
            f"Q&A Pairs: {qa_pairs_json}",
        ]
    ),
    expected_output="A JSON list of objects with keys: question, answer, score, comments.",
    agent=evaluation_agent,
)

crew = Crew(tasks=[feedback_task], agents=[evaluation_agent])

feedback_output = crew.kickoff()

def extract_json_from_output(output):
    """Pull a JSON string out of *output*, trying ``result`` then ``output``.

    A ``str`` value is returned unchanged and a ``list``/``dict`` value is
    dumped with ``json.dumps``. When neither attribute provides one of those,
    the result is ``str(output)``.
    """

    def _as_json_text(value):
        # Return the JSON text for a usable value, or None if it is unusable.
        if isinstance(value, str):
            return value
        if isinstance(value, (list, dict)):
            return json.dumps(value)
        return None

    if hasattr(output, "result"):
        text = _as_json_text(output.result)
        if text is not None:
            return text
    if hasattr(output, "output"):
        text = _as_json_text(output.output)
        if text is not None:
            return text
    return str(output)

# Parse and validate feedback
import re

# Bound up front so the export at the bottom of this cell still works (as an
# empty list) when parsing fails, instead of raising NameError.
parsed_feedback = []
try:
    feedback_json = extract_json_from_output(feedback_output)
    # Keep only the span from the first "[" to the last "]" so that extra text
    # around the JSON list does not break json.loads.
    match = re.search(r'\[.*\]', feedback_json, re.DOTALL)
    if match:
        feedback_json = match.group(0)
    parsed_feedback = [AnswerFeedback(**f) for f in json.loads(feedback_json)]
    for f in parsed_feedback:
        print(f"Q: {f.question}\nA: {f.answer}\nScore: {f.score}\nComments: {f.comments}\n---")
except Exception as e:
    # Best-effort notebook cell: report the problem and show the raw output.
    print("Error parsing feedback:", e)
    print(feedback_output)

# Save feedback for further use
feedback_results = [f.model_dump() for f in parsed_feedback]

Q: You mentioned improving recall in your British Airways project. Can you elaborate on the steps you took to achieve this 18% improvement, including the specific feature engineering techniques and hyperparameter tuning strategies used?
A: ddd
Score: 0.0
Comments: The answer is incomplete and doesn't provide any relevant information about the British Airways project. To earn a higher score, the candidate should describe the specific feature engineering techniques used (e.g., one-hot encoding, TF-IDF) and how hyperparameter tuning strategies (e.g., grid search, random search) were employed to optimize recall.
---
Q: Your Flight Price Predictor project highlights your experience with modular ML pipelines. Can you describe how you structured your pipeline and the benefits of this approach, particularly in terms of maintainability and scalability?
A: cx
Score: 0.0
Comments: The answer is incomplete and doesn't provide any details about the structured pipeline or its benefits. A strong answ

In [None]:
from PyPDF2 import PdfReader
from docx import Document

def extract_text_from_pdf(file_path: str) -> str:
    """Read the PDF at *file_path* and return all page text joined together.

    Pages with no extractable text (a falsy ``extract_text()`` result) are
    treated as empty strings.
    """
    page_texts = [page.extract_text() or "" for page in PdfReader(file_path).pages]
    return "".join(page_texts)

def extract_text_from_docx(file_path: str) -> str:
    """Read the .docx file at *file_path* and return its paragraphs separated by newlines."""
    lines = []
    for paragraph in Document(file_path).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def get_resume_text(resume_input: str, filetype: str = "text") -> str:
    """Resolve *resume_input* to resume text.

    For ``filetype`` ``"pdf"`` or ``"docx"`` the input is a file path and the
    matching extractor is called on it; for any other value the input is
    returned unchanged as plain text.
    """
    if filetype not in ("pdf", "docx"):
        return resume_input  # Assume plain text
    extractor = extract_text_from_pdf if filetype == "pdf" else extract_text_from_docx
    return extractor(resume_input)

# Example usage: resume_input is a path for "pdf"/"docx", or the raw resume
# text itself when filetype is "text".
resume_input = "D:/Python/AI Agent Project/Interview Agent/Roshni Sarda Resume.pdf"
filetype = "pdf"  # or "docx" or "text"
resume_text = get_resume_text(resume_input, filetype)
parse_resume_task = Task(
    description="Parse the following resume and extract name, email, experience, projects, education,skills, extracurricular, achievements, certifications as JSON:\n" + resume_text,
    agent=resume_parser_agent,
    # Lists every key the description asks for; previously only four were
    # named, which contradicted the instruction above.
    expected_output="A JSON object with keys: name, email, experience, projects, education, skills, extracurricular, achievements, certifications."
)

# Single-agent crew that runs only the parsing task.
crew = Crew(
    agents=[resume_parser_agent],
    tasks=[parse_resume_task]
)

results = crew.kickoff()
print(results)