#**CV Analysis Chatbot**
Create a CV analysis system that can process multiple CV documents (PDF) and word(.docx), extract information using OCR, and
provide a chatbot interface for querying the extracted information.

##Document Processing

* Set up a system to handle both PDF and Word document formats
* Implement OCR
* Create a robust parsing system that can extract:
   * Personal Information
   * Education History
   * Work Experience
   * Skills
   * Projects
   * Certifications

In [1]:
#Mount the google drive as I kept all the files in gdrive for easy availability
from google.colab import drive
drive.mount('/content/drive')

# Set the folder path where CVs are stored in your Google Drive
cv_folder_path = "/content/drive/My Drive/CV Analysis/CVs"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


###Download required libraries

In [2]:
!pip install pdfminer.six PyMuPDF pdf2image pytesseract opencv-python pillow numpy
!pip install pdfplumber python-docx pytest
!apt-get install poppler-utils
!sudo apt install tesseract-ocr
!sudo apt install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.


In [3]:
#import required libraries

import os #For interacting with the file system (listing files, handling paths).
import fitz  # PyMuPDF to read text from PDFs.
import pytesseract #OCR library to extract text from images.
import numpy as np
import cv2 #(OpenCV): Image processing for OCR (converting scanned PDFs to text).
from pdf2image import convert_from_path #Converts PDF pages into images (for OCR).
from PIL import Image
from docx import Document #Extracts text from Microsoft Word documents.
import json #Handling structured data storage in JSON format.
import re #regular expression
import pandas as pd

In [4]:
#Extract data from the word (.docx) documents using python-docx

def extract_text_from_word(docx_path):
    doc = Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text.strip()


In [5]:
#To check the imported pdf's are either editable pdf or scanned image pdf

def is_pdf_scanned(pdf_path):
    doc = fitz.open(pdf_path)
    text = "".join([page.get_text("text") for page in doc])
    return len(text.strip()) == 0  # If no text is found, it's a scanned PDF
#checks if the extracted text is empty. If len(text.strip()) == 0, it returns True, indicating that the PDF is scanned and requires OCR for text extraction.



In [6]:
#To extract data from editable pdf using PyMuPDF

def extract_text_from_pdf(pdf_path):

    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc]) # page.get_text("text") iterate through each pdf and join in next line to keep the correct formating
    return text.strip()


In [7]:
#Convert the scanned image pdf to image to conduct the OCR

#This returns a list of image file in image_paths
def convert_pdf_to_images(pdf_path):

    images = convert_from_path(pdf_path)
    image_paths = []

    for i, img in enumerate(images):
        image_path = f"page_{i}.png" #gives page name as page_0, page_1 ...
        img.save(image_path, "PNG")
        image_paths.append(image_path)

    return image_paths


In [8]:
#Image extracted in the previous line is not giving proper ocr output.
#The image is enhanced by implementing Gaussian Blur, and Binarization

def enhance_image(image_path):

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Apply Gaussian Blur to remove noise
    blurred = cv2.GaussianBlur(image, (5, 5), 0)

    # Apply Adaptive Thresholding (Binarization)
    enhanced = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 11, 2)

    # Save and return the enhanced image path
    enhanced_image_path = "enhanced_" + image_path
    cv2.imwrite(enhanced_image_path, enhanced)
    return enhanced_image_path


In [9]:
#Extracts text from an image using Tesseract OCR.

def extract_text_from_image(image_path):

    return pytesseract.image_to_string(Image.open(image_path))


Combining all the above calling definitions here

In [10]:
def get_file_type(file_path):
    """
    Determines the file type based on the file extension.
    """
    if file_path.lower().endswith(".docx"): # .lower() to convert to lowercase ensure case-insensitive matching
        return "word"
    elif file_path.lower().endswith(".pdf"):
        return "pdf"
    else:
        return None  # Or raise an exception if you prefer
def process_cv(file_path):
    """
    Processes a given CV document:
    - Detects if it's Word, Editable PDF, or Scanned PDF.
    - Extracts text accordingly.
    """
    file_type = get_file_type(file_path) #calls the get_file_type to figure out word or pdf
    extracted_text = ""

    print(f"Processing: {file_path}")

    if file_type == "word":
        print("Word document detected. Extracting text...")
        extracted_text = extract_text_from_word(file_path) #uses function extract_text_from_word

    elif file_type == "pdf":
        if is_pdf_scanned(file_path): #check for scanned pdf or editable pdf
            print("Scanned PDF detected. Applying OCR...")

            # Convert PDF to images
            images = convert_pdf_to_images(file_path) #uses function convert_pdf_to_images

            # Enhance images and extract text
            for img_path in images:
                enhanced_img = enhance_image(img_path)
                extracted_text += extract_text_from_image(enhanced_img) + "\n"

        else:
            print("Editable PDF detected. Extracting text...")
            extracted_text = extract_text_from_pdf(file_path) #uses function extract_text_from_pdf

    else:
        print("Unsupported file type.") # if its neither pdf nor word

    return extracted_text


In [11]:
# List all files in the gdrive CVs folder
#os.listdir(cv_folder_path) gets list of all files in that folder
# f.endswith((".pdf", ".docx")) select only file ends with .pdf or .docx
cv_files = [os.path.join(cv_folder_path, f) for f in os.listdir(cv_folder_path) if f.endswith((".pdf", ".docx"))]

# Process each CV
cv_texts = {}
# empty dictinory to store key value pairs
# keys will be the path and values will be the data

for cv_file in cv_files:
    extracted_text = process_cv(cv_file)
    cv_texts[cv_file] = extracted_text
    print(f"\nExtracted Text from {cv_file}:\n", extracted_text[:500], "\n...")


Processing: /content/drive/My Drive/CV Analysis/CVs/File (2).pdf
Scanned PDF detected. Applying OCR...

Extracted Text from /content/drive/My Drive/CV Analysis/CVs/File (2).pdf:
 OTS
10 f2022555-0120)

o Chicago; Ittinoiss US:

in linkedin.convresumekraft}

SKILLS,

—_—_

°o Gir

°

o Gita

fee C++ programming:
o Cimiininmii

EDUCATION,

aSSSSnnBpnmn@™

Bachelor of Engineering:
am Thee
‘Sep.2010=Jun 2014

San Jose State University,
(ETE

 

 

RACHEL FRANK

 

PYTHON DEVELOPER

 

SUMMARY

{ama python developer, | am interested in Automation using python. |have’
Automotive domain knownledge and | have experience designing Test
Automation Frameworks. | like to build smart 
...
Processing: /content/drive/My Drive/CV Analysis/CVs/File (4).pdf
Scanned PDF detected. Applying OCR...

Extracted Text from /content/drive/My Drive/CV Analysis/CVs/File (4).pdf:
 WORK’
EXPERIENCE!

——————_

STRENGTHS

 

Nadin Friedberger

Phone number: :555-555-5555
Email address: hello@kickresume.com

Dynamic Ce

In [12]:
!pip install fuzzywuzzy
"""fuzzywuzzy is a library that makes it easy to work with text
   that might have slight variations or misspellings. It helps
   you compare strings and find similarities between them. This
   is often referred to as "fuzzy matching". """



'fuzzywuzzy is a library that makes it easy to work with text\n   that might have slight variations or misspellings. It helps\n   you compare strings and find similarities between them. This\n   is often referred to as "fuzzy matching". '

In [13]:
import spacy
from fuzzywuzzy import process  # This is needed for fuzzy matching


# Load NLP model
nlp = spacy.load("en_core_web_sm")

"""Natural Language Processing (NLP) in Python. spacy is used for a wide range of text analysis tasks,
   such as tokenization, part-of-speech tagging, named entity recognition, and more. """



'Natural Language Processing (NLP) in Python. spacy is used for a wide range of text analysis tasks,\n   such as tokenization, part-of-speech tagging, named entity recognition, and more. '

In [14]:
def extract_name(text):
    """
    Extracts name using the first line as a fallback. If that fails, uses NLP to find a PERSON entity.
    """
    # Assume the first line is the name
    lines = text.strip().split("\n")
    if lines and 2 <= len(lines[0].split()) <= 4:
        return lines[0].strip()


#if the first case failed then the second approach is by using NLP

    # Use NLP to find a PERSON entity
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text.strip()

    return "Not Found"


In [15]:
def extract_email(text):
    """
    Extracts email from the given text using regex.
    """
    email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)
    return email_match.group() if email_match else "Not Found"
#structured pattern that can be efficiently matched using regex. email has structred pattern

In [16]:
def extract_phone(text):
    """
    Extracts phone number from the given text using regex.
    """
    phone_match = re.search(r"\(?\+?[0-9]*\)?[-.\s]?[0-9]+[-.\s]?[0-9]+[-.\s]?[0-9]+", text)
    return phone_match.group() if phone_match else "Not Found"


In [17]:
def extract_personal_info(text):
    """
    Extracts personal details such as Name, Email, and Phone using NLP & regex.
    """
    personal_info = {
        "Name": extract_name(text),
        "Email": extract_email(text),
        "Phone": extract_phone(text)
    }

    return personal_info


In [18]:
def extract_education(text):
    """
    Extracts Education details
    """
    education_keywords = ["university", "college", "bachelor", "master", "phd", "degree", "b.sc", "m.sc", "mba", "bachelor’s", "master’s"]
    education_info = []

    for line in text.split("\n"):
        for keyword in education_keywords:
            if keyword in line.lower():
                education_info.append(line.strip())

    return education_info if education_info else ["Not Found"]

In [19]:
def extract_experience(text):
    """
    Extracts Work Experience details
    """
    experience_keywords = ["experience", "work", "intern", "company", "position", "role", "employer", "responsibilities"]
    experience_info = []

    for line in text.split("\n"):
        for keyword in experience_keywords:
            if keyword in line.lower():
                experience_info.append(line.strip())

    return experience_info if experience_info else ["Not Found"]

In [20]:
def extract_skills(text):
    """
    Extracts skills using NLP (spaCy) and fuzzy matching against a predefined list.
    """
    # Predefined skill list
    skills_list = [
        "Python", "Java", "C++", "Machine Learning", "Deep Learning", "Data Science", "SQL",
        "TensorFlow", "PyTorch", "Django", "Flask", "Kubernetes", "Docker", "AWS", "React",
        "Angular", "Natural Language Processing", "Data Analysis", "Cybersecurity", "CI/CD",
        "Software Testing", "REST APIs", "Microservices", "Git", "Cloud Computing", "ETL",
    # Soft Skills
        "Communication", "Leadership", "Problem Solving", "Time Management", "Critical Thinking",
        "Collaboration", "Decision Making", "Creativity", "Adaptability", "Empathy",
        "Teamwork", "Emotional Intelligence", "Conflict Resolution"
    ]

    # Extract words using NLP
    doc = nlp(text)
    extracted_skills = set()

    # Use NER (Named Entity Recognition) for Skill Extraction
    for token in doc:
        if token.pos_ in ["NOUN", "PROPN"]:  # Many skills are nouns (e.g., "Python", "communication") Part-of-Speech filtering
            best_match, score = process.extractOne(token.text, skills_list)  # finds best match of tokens to the skills list and gives similarity score
            if score > 80:  # If similarity is high, add to extracted skills
                extracted_skills.add(best_match)

    return list(extracted_skills) if extracted_skills else ["Not Found"]


In [21]:
def extract_projects(text):
    """
    Extracts Project details from CVs.
    """
    project_keywords = ["project", "developed", "built", "created", "designed"]
    project_info = []

    for line in text.split("\n"):
        for keyword in project_keywords:
            if keyword in line.lower():
                project_info.append(line.strip())

    return project_info if project_info else ["Not Found"]

In [22]:
def extract_certifications(text):
    """
    Extracts Certifications from the CV.
    """
    certification_keywords = ["certified", "certificate", "course"]
    certification_info = []

    for line in text.split("\n"):
        for keyword in certification_keywords:
            if keyword in line.lower():
                certification_info.append(line.strip())

    return certification_info if certification_info else ["Not Found"]

In [23]:
import json

# extracted information from the CVs and stores it in a structured JSON format.
json_data = []

for file, text in cv_texts.items(): #file = path to the cv file and text =extracted content

#Extract the data from each file and store into below defined variables
    personal_info = extract_personal_info(text)
    education = extract_education(text)
    experience = extract_experience(text)
    skills = extract_skills(text)
    projects = extract_projects(text)
    certifications = extract_certifications(text)

#Create a dictionary representing the structured data of one CV
    json_data.append({
        "file_name": file,
        "personal_info": {
            "name": personal_info["Name"],
            "email": personal_info["Email"],
            "phone": personal_info["Phone"],
        },
        "education": education,
        "experience": experience,
        "skills": skills,
        "projects": projects,
        "certifications": certifications
    })

# Save to JSON file
json_file_path = "/content//drive/My Drive/CV Analysis/structured_cv_data.json"
with open(json_file_path, "w") as json_file:
    json.dump(json_data, json_file, indent=4)



In [24]:
json_data

[{'file_name': '/content/drive/My Drive/CV Analysis/CVs/File (2).pdf',
  'personal_info': {'name': 'Gir',
   'email': 'Not Found',
   'phone': '2022555-0120'},
  'education': ['Bachelor of Engineering:',
   'San Jose State University,',
   'httpsvigithub.com/simple-stockpredictions/blob/master/Linear_models-',
   'CardFraud/olob/master/Credit_card_prediction.ipynb'],
  'experience': ['Automotive domain knownledge and | have experience designing Test',
   'Automation Frameworks. | like to build smart application using Al and',
   'EXPERIENCE',
   '.© Working as a Python developer in Test-suite Scheduler Application. |',
   '© Developed the recovery framework for Test Bench Crashes which has',
   '. Developed Selenium python based automation framework for enabling',
   '* Developed the Test Automation framework for Android-Auto,',
   'C, Unux internals, micro processors, socket programming'],
  'skills': ['Data Analysis',
   'Kubernetes',
   'C++',
   'Machine Learning',
   'Software Tes

In [25]:
#To display it in dataframe

import pandas as pd
# Load JSON data
with open(json_file_path, "r") as json_file:
    json_data = json.load(json_file)

# Convert JSON to DataFrame and normalize nested fields
df_json = pd.json_normalize(json_data)

# Reorder Columns: Personal Info First
column_order = [
    "personal_info.name",
    "personal_info.email",
    "personal_info.phone",
    "education",
    "experience",
    "skills",
    "projects",
    "certifications"
]

# Keep only available columns (some may be missing)
df_json = df_json[[col for col in column_order if col in df_json.columns]]

df_json

Unnamed: 0,personal_info.name,personal_info.email,personal_info.phone,education,experience,skills,projects,certifications
0,Gir,Not Found,2022555-0120,"[Bachelor of Engineering:, San Jose State Univ...",[Automotive domain knownledge and | have exper...,"[Data Analysis, Kubernetes, C++, Machine Learn...","[Senior Project Engineer -Jan 2018 - Prosent, ...",[Not Found]
1,Nadin Friedberger,hello@kickresume.com,555-555-5555,[Not Found],"[WORK’, EXPERIENCE!, Dynamic Certified Python ...","[Adaptability, Critical Thinking, SQL, C++, Te...",[‘» Developed web application back end compone...,[Dynamic Certified Python Developer with 3 yea...
2,JOHN GONALEZ,John1652@gmail.com,(123) 456-7890,"[University of Chicago, University of Pittsburgh]",[Experlenced Python developer with extensive O...,"[Django, SQL, C++, Teamwork, Software Testing,...",[‘© Built extensive test coverage for all new ...,[Not Found]
3,Alexander Taylor,Not Found,2019-2020,"[Master of Science in Computer Science, Univer...",[Enthusiastic Python Developer with over two y...,"[Natural Language Processing, Critical Thinkin...",[Worked on HTML JavaScript and Python developm...,[Aspediatized course conducted by Coursera tha...
4,GIULIA GONZALEZ,Not Found,(123,"[University of Chicago, University of Pittsburgh]","[WORK EXPERIENCE, WORK EXPERIENCE, * Worked on...","[Adaptability, Django, SQL, C++, Teamwork, Sof...","[Using Selenium, built out a unit testingintra...",[developer for applications for amath course]
5,Contact Details,diannkupha@email.com,(938) 032 7933,"[© Bachelor of Science in, University of Calif...",[Experienced Python Developer with St. years o...,"[ETL, Django, SQL, C++, Flask, Software Testin...","[Python Developed, ‘e Developed and maintained...",[Not Found]


##LLM Integration
 * Integrate with an LLM API (OpenAI GPT, Claude, or similar)
 * Implement proper prompt engineering for CV analysis
 * Create a system to structure and store the extracted information
 * Handle API rate limiting and errors gracefully

###Why Use Mistral AI for LLM Integration in This Project?
* Mistral AI was chosen for LLM integration because of its cost-effectiveness, speed, and strong natural language processing (NLP) capabilities.

* Compared to other models like GPT-4, Mistral AI offers lower costs and faster responses, making it ideal for high-volume resume processing.

* API-friendly nature allows seamless integration into the existing pipeline, enabling automated resume parsing, candidate ranking, and job matching.

However, Mistral AI has some limitations.

* Tt may sometimes generate inaccurate or generalized outputs when dealing with highly technical resumes.

* The free-tier API has rate limits, which could impact scalability if processing a large dataset.

In [26]:
!pip install --upgrade mistralai



In [27]:
!pip install requests
#Mistral AI's Python SDK (MistralClient) was deprecated, and direct API calls were needed.



###When & Why Do We Use requests?
* The official Mistral AI Python SDK was deprecated and returned errors.
Instead of using MistralClient, we sent raw HTTP requests using requests.post().
* requests allows us to handle API rate limits manually (time.sleep(1)).
* We can retry failed API calls when rate limits (429 Too Many Requests) occur.
* When We Need to Handle API Authentication & Headers


In [28]:
from mistralai import Mistral
import time
import requests #to make http request
#client = MistralClient(api_key=MISTRAL_API_KEY) not working with colab so using google request
# Set up Mistral API Key (
MISTRAL_API_KEY = "C06b4SMJRLj2yGBfBa3jdSqc9dM7lRgq"
MISTRAL_API_URL = "https://api.mistral.ai/v1/chat/completions" #This is where we send requests to interact with the Mistral AI language model

HEADERS = {
    "Authorization": f"Bearer {MISTRAL_API_KEY}", #contains authentication tokens
    "Content-Type": "application/json" #this header tells the API that JSON format data has been sent
}


In [29]:
def analyze_cv_with_mistral(cv_data):
    """
    Sends extracted CV data to Mistral AI using a direct API request.
    Handles API errors & rate limits.
    """

    #promt engineering gives instruction to the Mistral AI model, guiding it on how to analyze the CV.
    prompt = f"""
    You are an AI HR assistant specializing in professional resume analysis.

    **Analyze the candidate's CV data and provide:**

    1. **Candidate Strengths**
       - List technical & soft skills with proficiency levels (Beginner, Intermediate, Advanced).
       - Highlight major achievements and experience.

    2. **Areas for Improvement**
       - Identify missing information (e.g., missing experience, certifications, or skills).
       - Suggest improvements to enhance job prospects.

    3. **Education Comparison**
       - Provide an assessment of the candidate’s education level.
       - Compare against industry standards for relevant job roles.

    4. **Experience Match**
       - Identify industries where the candidate has experience.
       - Match years of experience with common job requirements.

    5. **Recommended Job Roles & Suitability**
       - Suggest **job titles** based on the candidate’s skills and experience.
       - Rate suitability as **High, Medium, or Low match** for each suggested role.

    **Candidate CV Data:**
    {json.dumps(cv_data, indent=4)}

    **Response Format:**
    ---
     **Candidate Strengths:**
    - **Technical Skills:** Python (Advanced), Machine Learning (Intermediate), SQL (Advanced)
    - **Soft Skills:** Communication (Strong), Teamwork (Excellent)
    - **Experience Highlights:** Developed an ML-based fraud detection system, 5+ years in backend development.

     **Areas for Improvement:**
    - Missing details: Work experience lacks specific achievements.
    - Suggested improvements: Add certifications in Cloud Computing (AWS/Azure).

     **Education Assessment:**
    - Degree: Bachelor’s in Computer Science
    - Meets job requirements? **Yes**
    - Suggested improvements: Specialization in AI recommended.

     **Industry Experience Match:**
    - Relevant Industry: Finance, Technology
    - Years of Experience: 2+ years
    - Meets requirements? **Yes**

     **Recommended Job Roles & Suitability:**
    - **Senior Data Engineer** - **High Match**
    - **Machine Learning Engineer** - **Medium Match **
    - **Cloud Engineer** - **Low Match ** (Needs cloud certification)
    """

    # Prepare API Payload
    payload = {
        "model": "mistral-small", # "mistral-small" for better insights
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.8,  # Adjust creativity level (0.0 = strict, 1.0 = creative)
        "max_tokens": 500  # Adjust response length
    }


#try...except block: This handles potential errors during the API request.

    try:
        # Send API Request
        response = requests.post(MISTRAL_API_URL, headers=HEADERS, json=payload)

        #Check Response Status
        if response.status_code == 200: #status code 200 means request successful
            return response.json()["choices"][0]["message"]["content"].strip()
        else:
            return f" API Error: {response.status_code} - {response.text}"

    except Exception as e:
        return f" Error: {str(e)}"


In [None]:
# Load the extracted JSON data
json_file_path = "/content//drive/My Drive/CV Analysis/structured_cv_data.json"

with open(json_file_path, "r") as json_file:
    extracted_cvs = json.load(json_file)

# Add AI insights to each CV and save the results
for cv in extracted_cvs:
    print(f"\n **Analyzing CV for {cv['personal_info']['name']}**...\n") #using f string to print which candidates cv is analysing now

    # Handle API rate limits by adding a short delay
    time.sleep(1)  #To prevent the programme from sending too many API requests

    insights = analyze_cv_with_mistral(cv) #call the function
    cv["mistral_insights"] = insights  #  Store insights inside each CV object

# Save the updated JSON with AI insights
json_output_path = "/content//drive/My Drive/CV Analysis/structured_cv_with_insights.json"

with open(json_output_path, "w") as json_file: #w write r read
    json.dump(extracted_cvs, json_file, indent=4)

print(f" AI-analyzed CV data saved at {json_output_path}")



 **Analyzing CV for Gir**...


 **Analyzing CV for Nadin Friedberger**...


 **Analyzing CV for JOHN GONALEZ**...


 **Analyzing CV for Alexander Taylor**...



In [None]:
extracted_cvs

In [None]:

# Print AI-generated insights for each candidate
for cv in extracted_cvs:
    print(f"\n **Candidate: {cv['personal_info']['name']}**")
    print(" AI Insights:\n")
    print(cv.get("mistral_insights", "No insights found."))
    print("\n" + "="*100 + "\n")  # Separator for readability


##Query System
* Develop a chatbot interface for querying CV information
* Implement natural language understanding for queries
* Create a context management system for follow-up questions
* Support common query types:
    * Finding candidates with specific skills
    * Comparing education levels
    * Searching for experience in specific industries
    * Identifying matching candidates for job requirements

###Implementation of Query System  

* Gradio chatbot interface for querying CVs
* Natural Language Understanding (NLU)
* Context memory for follow-up questions
* Supports common query types:

  * Finding candidates with specific skills
  * Comparing education levels
  * Searching for experience in specific industries
  * Identifying matching candidates for job roles

In [None]:
!pip install gradio #install gradio library

In [None]:
#Convert JSON to DataFrame for easy searching
df = pd.DataFrame(extracted_cvs)

#convert everything to lowercase
df = df.applymap(lambda x:
                 [str(item).lower().strip() for item in x] if isinstance(x, list)
                 else str(x).lower().strip() if isinstance(x, str)
                 else x)

# processed DataFrame
print(df.head())


###Implement Natural Language Understanding (NLU)
Implemented using a keyword-based classification system to understand user queries, we classify them into four main categories:
* Finding candidates with specific skills
* Comparing education levels
* Searching for experience in specific industries
* Identifying matching candidates for job roles

In [None]:
import gradio as gr

chat_context = {} # Context memory (stores last query type for follow-up questions)

def classify_query(query, session_id="default"):
    """
    Uses NLU to classify user queries:
    - "skills" → Find candidates with specific skills
    - "education" → Compare education levels
    - "experience" → Search for experience in industries
    - "job_match" → Find candidates for job roles
    - "certification" → Find candidates with specific certifications
    - "follow-up" → Handles follow-up questions
    """

    query = query.lower().strip()

    # Detect follow-up queries
    if query in ["what about", "how about", "and", "any others?"]:
        return "follow-up"

    # Check for Skills Queries
    skill_keywords = ["skill", "knows", "expertise", "proficient in", "good at", "familiar with","know"]
    if any(word in query for word in skill_keywords):
        return "skills"

    # Check for Education Queries
    elif any(word in query for word in ["degree", "education", "qualification", "bachelor", "master",
                                        "certification","graduated","graduate","hold","holding"]):
        return "education"

    # Check for Experience Queries
    elif any(word in query for word in ["experience", "industry", "worked in", "background in","worked on","as","experience","background"]):
        return "experience"

    # Check for Job Match Queries
    elif any(word in query for word in ["match", "job", "fit", "role", "position","matching","closely matches"]):
        return "job_match"

    # Check for Certificate Queries
    elif any(word in query for word in ["certification", "certified", "credential", "accredited"]):
        return "certification"
    return "unknown"


In [None]:
print(classify_query("Find candidates with Python skills"))  # Expected Output: "skills"
print(classify_query("Which candidates have a Master’s degree?"))  #Expected Output: "education"
print(classify_query("Find candidates with experience in AI?"))  #Expected Output: "experience"


###Implement Query Processing (Search CV Data)
Once the query is classified, the chatbot searches for relevant candidates.
* Searches the CV dataset (df) for matching candidates.
* Handles different query types and returns relevant results.
* Stores previous query type for follow-ups.

In [None]:
def query_cv_info(query, session_id="default"):
    """
    Processes user queries and searches CV data.
    Uses NLU classification and context memory for follow-up questions.
    """

    global chat_context  # Stores previous queries for follow-up questions

    # Classify the query
    query_type = classify_query(query, session_id)

    # 🔹 Handle Follow-Up Queries
    if query_type == "follow-up":
        if session_id in chat_context:
            query_type = chat_context[session_id]  # Use last query type
        else:
            return " No previous query found. Please ask a full question."

    # Store query context (for follow-up questions)
    chat_context[session_id] = query_type

    print(f"Processing Query Type: {query_type}")


    # Handle Skills Queries
    if query_type == "skills":
        skill_match = re.search(r"(?:know|proficient in|knowledge of|expert in|good at|familiar with|skilled in|experienced in|using)\s+([a-zA-Z+#]+)", query.lower())
        skill = skill_match.group(1).lower() if skill_match else query.lower().replace("find candidates with", "").replace("skills", "").strip()

        print(f" Searching for candidates with skill: {skill}")

        matching_candidates = df[df["skills"].apply(lambda skills: skill in skills)]
        if matching_candidates.empty:
            return f" No candidates found with skill: {skill}"
        else:
            return matching_candidates[["personal_info", "skills"]].to_dict(orient="records")


    # Education match Queries
    elif query_type == "education":
        return df[["personal_info", "education"]].to_dict(orient="records")


    # Experience match Queries
    elif query_type == "experience":
        industry = query.split("experience in")[-1].strip().lower()
        matching_candidates = df[df["experience"].apply(lambda exp: any(industry in e.lower() for e in exp))]

        if matching_candidates.empty:
            return f" No candidates found with experience in: {industry}"
        else:
            return matching_candidates[["personal_info", "experience"]].to_dict(orient="records")


    # Certification Queries
    elif query_type == "certification":
        cert_match = re.search(r"(?:certified|certification in|credential in)\s+([a-zA-Z+#]+)", query.lower())
        certificate = cert_match.group(1).lower() if cert_match else query.lower().replace("find candidates with", "").replace("certifications", "").strip()

        print(f" Searching for candidates with : {certificate}")

        matching_candidates = df[df["certifications"].apply(lambda certifications: certificate in certifications)]
        if matching_candidates.empty:
            return f" No candidates found with certifications: {certificate}"
        else:
            return matching_candidates[["personal_info", "certifications"]].to_dict(orient="records")


    # Job match Queries
    elif query_type == "job_match":
        return df[["personal_info", "mistral_insights"]].to_dict(orient="records")

    else:
        return " I didn't understand that. Try asking about skills, education, or experience."


In [None]:
print(query_cv_info("Find candidates with Python skills"))  # Expected Output: List of candidates with Python
print(query_cv_info("Do any candidates know java?"))  # Expected Output: List of Java-skilled candidates


In [None]:
#Chatbot userinterface using gradio library
def chatbot_interface(user_query, session_id="default"):
    """
    Chatbot function that interacts with users and fetches query results.
    """

    #print(f" User Query: {user_query}")  # Debugging
    response = query_cv_info(user_query, session_id)
    #print(f" Chatbot Response: {response}")  #Debugging

    return response if isinstance(response, str) else json.dumps(response, indent=4)

# Create Gradio chatbot interface
chatbot = gr.Interface(
    fn=chatbot_interface,
    inputs=gr.Textbox(label="Ask about candidates (e.g., 'Find candidates with Python skills')"),
    outputs="text",
    title="HireGenius🚀 AI-Powered Candidate Finder",
    description="Ask about candidates' skills, education, experience, or job matches. Supports follow-up questions!"
)

# Launch the chatbot
chatbot.launch(share=True)
