In [2]:
import os
from pathlib import Path
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import FileReadTool, DirectoryReadTool
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_community.embeddings import OllamaEmbeddings
import PyPDF2
import docx
import re
import json
from typing import Dict, List
import uuid
import time

# Configuration - UPDATED FOR YOUR SYSTEM
DESKTOP_PATH = r"C:\Users\annen\Desktop\resumes"  # Your resume folder
QDRANT_URL = "http://localhost:6333"
COLLECTION_NAME = "resumes_collection"
OLLAMA_MODEL = "mistral:latest"
OLLAMA_BASE_URL = "http://localhost:11434"
EMBEDDING_MODEL = "nomic-embed-text:latest"  # Faster, smaller embedding model

# Initialize Qdrant client
qdrant_client = QdrantClient(url=QDRANT_URL)

# Delete existing collection if it exists (to fix dimension mismatch)
try:
    qdrant_client.delete_collection(collection_name=COLLECTION_NAME)
    print(f"✓ Deleted old collection: {COLLECTION_NAME}")
except:
    pass

# Initialize Ollama embeddings using a faster embedding model
print(f"Initializing embeddings with {EMBEDDING_MODEL}...")
print("Note: If you don't have nomic-embed-text, run: ollama pull nomic-embed-text")
embeddings = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=OLLAMA_BASE_URL
)
# Get actual vector size
test_emb = embeddings.embed_query("test")
VECTOR_SIZE = len(test_emb)
print(f"✓ Using {EMBEDDING_MODEL} embeddings (vector size: {VECTOR_SIZE})")

# Initialize Qdrant collection
try:
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )
    print(f"✓ Created collection: {COLLECTION_NAME} with vector size: {VECTOR_SIZE}")
except Exception as e:
    print(f"Collection already exists: {COLLECTION_NAME}")


def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from PDF file"""
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text()
            return text
    except Exception as e:
        return f"Error reading PDF: {str(e)}"


def extract_text_from_docx(file_path: str) -> str:
    """Extract text from DOCX file"""
    try:
        doc = docx.Document(file_path)
        text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
        return text
    except Exception as e:
        return f"Error reading DOCX: {str(e)}"


def extract_resume_info(text: str, filename: str) -> Dict:
    """Extract structured information from resume text"""
    
    # Clean text
    text = text.replace('\x00', '')  # Remove null characters
    
    # Extract email
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    emails = re.findall(email_pattern, text)
    email = emails[0] if emails else "Not found"
    
    # Extract phone - multiple patterns to catch different formats
    phone_patterns = [
        r'(?:Mob|Mobile|Phone|Tel|Cell)[\s#:]*([+]?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',  # Mob#832-803-7434
        r'\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',  # (123) 456-7890 or +1-123-456-7890
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',  # 123-456-7890 or 123.456.7890
        r'\b\d{10}\b',  # 1234567890
    ]
    
    phone = "Not found"
    for pattern in phone_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            phone = matches[0] if isinstance(matches[0], str) else ''.join(matches[0])
            # Clean up phone number
            phone = re.sub(r'[^\d+\-\s()]', '', phone).strip()
            break
    
    # Extract name - improved logic
    name = "Not found"
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    
    # Method 1: Look for name near the top (first 10 lines)
    for line in lines[:10]:
        # Skip lines that are clearly not names
        if any(skip in line.lower() for skip in ['objective', 'email', 'phone', 'address', 'po box', 'summary', '>', '#']):
            continue
        # Name should be 2-5 words, mostly letters
        words = line.split()
        if 2 <= len(words) <= 5 and all(word[0].isupper() for word in words[:3] if word):
            # Remove any trailing commas, credentials, etc.
            name = re.sub(r',.*', '', line)  # Remove everything after comma
            name = re.sub(r'\s+(MBA|MS|PMP|CISSP|PhD|BSc|BA|MSc).*', '', name, flags=re.IGNORECASE)
            break
    
    # Method 2: If still not found, try to extract from email
    if name == "Not found" and email != "Not found":
        email_name = email.split('@')[0]
        name = email_name.replace('.', ' ').replace('_', ' ').title()
    
    # Method 3: Use filename as fallback
    if name == "Not found":
        name = filename.replace('.pdf', '').replace('.docx', '').replace('_', ' ').title()
    
    # Extract skills (look for skills section with better parsing)
    skills = []
    text_lower = text.lower()
    
    # Find skills section
    skills_keywords = ['skills', 'technical skills', 'core competencies', 'technologies', 'expertise']
    skills_idx = -1
    for keyword in skills_keywords:
        idx = text_lower.find(keyword)
        if idx != -1:
            skills_idx = idx
            break
    
    if skills_idx != -1:
        # Get text after skills keyword (next 1000 chars)
        skills_section = text[skills_idx:skills_idx+1000].lower()
        
        # Expanded tech skills list
        tech_skills = [
            'python', 'java', 'javascript', 'react', 'sql', 'aws', 'azure', 'gcp',
            'docker', 'kubernetes', 'git', 'machine learning', 'ai', 'ml',
            'node', 'nodejs', 'typescript', 'html', 'css', 'angular', 'vue',
            'mongodb', 'postgresql', 'mysql', 'redis', 'django', 'flask',
            'tensorflow', 'pytorch', 'scikit-learn', 'pandas', 'numpy',
            'c++', 'c#', '.net', 'spring', 'hibernate', 'rest api', 'microservices',
            'jenkins', 'ci/cd', 'terraform', 'ansible', 'linux', 'windows',
            'agile', 'scrum', 'jira', 'confluence', 'devops', 'cloud',
            'pci', 'hipaa', 'iso 27001', 'cissp', 'pmp', 'cism', 'cisa',
            'network security', 'cybersecurity', 'penetration testing', 'vulnerability',
            'firewall', 'cisco', 'checkpoint', 'splunk', 'siem', 'sox', 'gdpr'
        ]
        
        for skill in tech_skills:
            if skill in skills_section:
                skills.append(skill.title())
    
    # Remove duplicates and sort
    skills = sorted(list(set(skills)))
    
    # Extract experience years
    exp_patterns = [
        r'(\d+)\+?\s*years?\s+(?:of\s+)?experience',
        r'experience[:\s]+(\d+)\+?\s*years?'
    ]
    experience_years = "Not specified"
    for pattern in exp_patterns:
        match = re.search(pattern, text_lower)
        if match:
            experience_years = f"{match.group(1)} years"
            break
    
    resume_data = {
        "filename": filename,
        "name": name,
        "email": email,
        "phone": phone,
        "skills": skills,
        "experience_years": experience_years,
        "full_text": text[:2000],  # Store first 2000 chars for debugging
        "text_length": len(text)
    }
    
    return resume_data


def process_resumes_and_store():
    """Process all resumes from Desktop and store in Qdrant"""
    
    resume_files = []
    
    # Get all PDF and DOCX files from Desktop
    for file in os.listdir(DESKTOP_PATH):
        if file.endswith(('.pdf', '.docx')):
            resume_files.append(os.path.join(DESKTOP_PATH, file))
    
    print(f"Found {len(resume_files)} resume files on Desktop")
    
    processed_resumes = []
    points = []
    
    for file_path in resume_files:
        filename = os.path.basename(file_path)
        print(f"Processing: {filename}")
        
        # Extract text based on file type
        if file_path.endswith('.pdf'):
            text = extract_text_from_pdf(file_path)
        else:
            text = extract_text_from_docx(file_path)
        
        if "Error" not in text:
            # Extract structured info
            resume_info = extract_resume_info(text, filename)
            processed_resumes.append(resume_info)
            
            # Create embedding with shorter text to speed up processing
            embedding_text = f"Name: {resume_info['name']}\nEmail: {resume_info['email']}\nSkills: {', '.join(resume_info['skills'])}"
            
            print(f"  Creating embedding for {filename}...")
            try:
                embedding = embeddings.embed_query(embedding_text)
                time.sleep(0.2)  # Small delay between embeddings
            except Exception as e:
                print(f"  ⚠ Error creating embedding: {e}")
                continue
            
            # Create point for Qdrant
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=resume_info
            )
            points.append(point)
    
    # Upload to Qdrant
    if points:
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=points
        )
        print(f"\n✓ Successfully stored {len(points)} resumes in Qdrant!")
    
    return processed_resumes


def query_resumes(query: str):
    """Query resumes using natural language"""
    
    # Create embedding for the query
    query_embedding = embeddings.embed_query(query)
    
    # Search in Qdrant
    search_results = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_embedding,
        limit=5
    )
    
    if not search_results:
        return "No matching resumes found."
    
    # Create LLM using CrewAI's LLM wrapper for Ollama with longer timeout
    llm = LLM(
        model=f"ollama/{OLLAMA_MODEL}",
        base_url=OLLAMA_BASE_URL,
        timeout=300,  # 5 minutes timeout
        temperature=0.3,
        max_tokens=500  # Shorter responses to be faster
    )
    
    # Create agent
    query_agent = Agent(
        role="Resume Query Assistant",
        goal="Answer questions about candidate resumes accurately and conversationally",
        backstory="You are an expert HR assistant who helps find information from candidate resumes.",
        llm=llm,
        verbose=True
    )
    
    # Prepare context from search results (keep it concise)
    context = "Relevant resumes:\n\n"
    for i, result in enumerate(search_results[:3], 1):  # Only use top 3 results
        resume = result.payload
        context += f"{i}. {resume.get('name', 'N/A')}\n"
        context += f"   Email: {resume.get('email', 'N/A')}\n"
        context += f"   Phone: {resume.get('phone', 'N/A')}\n"
        context += f"   Skills: {', '.join(resume.get('skills', []))}\n"
        context += f"   Experience: {resume.get('experience_years', 'Not specified')}\n\n"
    
    # Create task with concise prompt
    query_task = Task(
        description=f"""Answer this query briefly: "{query}"

{context}

Provide a short, direct answer (2-3 sentences max) with the specific information requested.""",
        expected_output="Brief answer to the query",
        agent=query_agent
    )
    
    # Create crew and execute with shorter max iterations
    crew = Crew(
        agents=[query_agent],
        tasks=[query_task],
        process=Process.sequential,
        verbose=True,
        max_iterations=3  # Limit iterations to avoid long processing
    )
    
    result = crew.kickoff()
    return result


def main():
    """Main function to run the resume intelligence system"""
    
    print("=" * 60)
    print("RESUME INTELLIGENCE SYSTEM")
    print("=" * 60)
    
    # Step 1: Process and store resumes
    print("\n[1] Processing resumes from Desktop...")
    processed = process_resumes_and_store()
    
    print(f"\nProcessed {len(processed)} resumes:")
    for resume in processed:
        print(f"  - {resume['name']}")
        print(f"    Email: {resume['email']}")
        print(f"    Phone: {resume['phone']}")
        print(f"    Skills: {', '.join(resume['skills'][:5]) if resume['skills'] else 'None'}")
        print()
    
    # Step 2: Interactive query mode
    print("\n" + "=" * 60)
    print("QUERY MODE - Ask questions about the resumes!")
    print("=" * 60)
    print("Examples:")
    print("  - What is Pranay Reddy's email?")
    print("  - Find candidates with Python skills")
    print("  - Show me personal details of John Doe")
    print("  - Type 'exit' to quit\n")
    
    while True:
        query = input("\n🔍 Your question: ").strip()
        
        if query.lower() in ['exit', 'quit', 'q']:
            print("Goodbye!")
            break
        
        if not query:
            continue
        
        print("\n🤖 Searching and analyzing...")
        answer = query_resumes(query)
        print(f"\n💡 Answer:\n{answer}\n")
        print("-" * 60)


if __name__ == "__main__":
    main()

INFO:httpx:HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: DELETE http://localhost:6333/collections/resumes_collection "HTTP/1.1 200 OK"


✓ Deleted old collection: resumes_collection
Initializing embeddings with nomic-embed-text:latest...
Note: If you don't have nomic-embed-text, run: ollama pull nomic-embed-text
✓ Using nomic-embed-text:latest embeddings (vector size: 768)


INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/resumes_collection "HTTP/1.1 200 OK"


✓ Created collection: resumes_collection with vector size: 768
RESUME INTELLIGENCE SYSTEM

[1] Processing resumes from Desktop...
Found 12 resume files on Desktop
Processing: Ahmad Qasem-Resume.pdf
  Creating embedding for Ahmad Qasem-Resume.pdf...
Processing: Ashok Kumar.docx
  Creating embedding for Ashok Kumar.docx...
Processing: Dexter Nigel Ramkissoon.docx
  Creating embedding for Dexter Nigel Ramkissoon.docx...
Processing: Donald Belvin.docx
  Creating embedding for Donald Belvin.docx...
Processing: Jumoke-Adekanmi-Web-Developer-2025-03-21.pdf
  Creating embedding for Jumoke-Adekanmi-Web-Developer-2025-03-21.pdf...
Processing: Kiran N. Penmetcha's Profile.pdf
  Creating embedding for Kiran N. Penmetcha's Profile.pdf...
Processing: Mahesh_Bolikonda (1).pdf
  Creating embedding for Mahesh_Bolikonda (1).pdf...
Processing: Mutchie.docx
  Creating embedding for Mutchie.docx...
Processing: PRANAY REDDY_DE_Resume.pdf
  Creating embedding for PRANAY REDDY_DE_Resume.pdf...
Processing: Res

INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/resumes_collection/points?wait=true "HTTP/1.1 200 OK"



✓ Successfully stored 12 resumes in Qdrant!

Processed 12 resumes:
  - Ahmad Elsheikhq
    Email: ahmad.elsheikhq@gmail.com
    Phone: Not found
    Skills: Agile, Ai

  - Project History
    Email: Not found
    Phone: Not found
    Skills: .Net, Ai, Angular, C#, Ml

  - Dexternigel
    Email: dexternigel@gmail.com
    Phone: Not found
    Skills: Ai, Checkpoint, Cisco, Firewall, Hipaa

  - Donald Belvin
    Email: donald0099@gmail.com
    Phone: Not found
    Skills: Agile, Ai, Cloud, Firewall, Ml

  - Jumokea
    Email: JumokeA@gmail.com
    Phone: 212-363-0754
    Skills: Ai, Aws, Azure, C#, Css

  - Kiran N. Penmetcha
    Email: KPenmetcha@gmail.com
    Phone: 626-736-0525
    Skills: Agile, Ai, Git, Ml

  - Getmaheshb
    Email: getmaheshb@gmail.com
    Phone: Not found
    Skills: Pci

  - WILLIAM E. MUTCHIE
    Email: jivaniwilliam@hotmail.com
    Phone: Not found
    Skills: Cisco, Git, Html, Java, Linux

  - PRANA Y REDD Y
    Email: pranayreddy9799@gmail.com
    Phone: 443-

  search_results = qdrant_client.search(
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/resumes_collection/points/search "HTTP/1.1 200 OK"


[92m13:50:41 - LiteLLM:INFO[0m: utils.py:3258 - 
LiteLLM completion() model= mistral:latest; provider = ollama


INFO:LiteLLM:
LiteLLM completion() model= mistral:latest; provider = ollama


Output()

INFO:httpx:HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 500 Internal Server Error"


Output()

APIConnectionError: litellm.APIConnectionError: OllamaException - {"error":"model requires more system memory (4.9 GiB) than is available (4.5 GiB)"}