In [None]:
# Install required packages
!pip install pandas numpy PyPDF2 scikit-learn spacy nltk tensorflow

# Import all necessary libraries
import os
import json
import uuid
import random
import pandas as pd
import numpy as np
import PyPDF2  # Correct import (not PyPDF2)
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import spacy
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model, save_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
!pip install matplotlib seaborn networkx
# Download required NLTK data
# First, let's download the regular punkt data
import nltk
import os
import shutil

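# Download the standard punkt tokenizer
# NOTE(review): no nltk.download('punkt') call follows this comment - confirm whether it was removed on purpose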


# Create the necessary directory structure for punkt_tab
nltk_data_path = nltk.data.path[0]  # Get the first NLTK data path
punkt_path = os.path.join(nltk_data_path, 'tokenizers', 'punkt', 'english')
punkt_tab_dir = os.path.join(nltk_data_path, 'tokenizers', 'punkt_tab', 'english')

# Create the directory if it doesn't exist
os.makedirs(punkt_tab_dir, exist_ok=True)

# The specific files needed are collocations.tab and sent_starters.txt
files_to_copy = ['collocations.tab', 'sent_starters.txt']

for file in files_to_copy:
    source_file = os.path.join(punkt_path, file)
    target_file = os.path.join(punkt_tab_dir, file)

    if os.path.exists(source_file):
        # Prefer the file from the regular punkt data when it is available
        shutil.copy2(source_file, target_file)
        print(f"Copied {file} to {punkt_tab_dir}")
    elif os.path.exists(target_file):
        # Never truncate a punkt_tab file that is already in place: opening it
        # with mode 'w' would wipe its contents.
        print(f"Kept existing {file} in {punkt_tab_dir}")
    else:
        # Create an empty placeholder only when nothing exists yet
        with open(target_file, 'w', encoding='utf-8'):
            pass
        print(f"Created empty {file} in {punkt_tab_dir}")

print("Setup complete for punkt_tab resources")
# Initialize spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Catching only that keeps unrelated failures (and Ctrl-C) visible.
    import subprocess
    import sys
    # check_call surfaces a failed download right away instead of letting the
    # retry below fail with a less helpful message.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load('en_core_web_sm')

# Initialize Porter Stemmer
stemmer = PorterStemmer()

In [None]:
def _pick_job_specific_skills(job_title, skill_rules):
    """
    Returns up to two extra skills from the first rule whose keyword occurs in the title.

    skill_rules is an ordered list of (title_keywords, skills) pairs; later
    rules are ignored once one matches (mirrors an if/elif chain).
    """
    for title_keywords, skills in skill_rules:
        if any(keyword in job_title for keyword in title_keywords):
            return random.sample(skills, min(2, len(skills)))
    return []


def _generate_career_entries(count, first_id, career_type, profiles, skill_rules,
                             describe, growth_range, pools):
    """
    Builds `count` random career rows for one audience (career type).

    The order of the random draws is fixed (title, industry, stage, skill
    counts, skill samples, job-specific skills, salary, company size, remote
    option, growth) so that a seeded run is reproducible.
    """
    stages = list(profiles)
    entries = []

    for job_id in range(first_id, first_id + count):
        job_title = random.choice(pools['job_titles'])
        industry = random.choice(pools['industries'])
        career_stage = random.choice(stages)
        profile = profiles[career_stage]

        # Draw both counts before sampling to keep the draw order stable
        num_tech_skills = random.randint(*profile['tech_skills'])
        num_soft_skills = random.randint(*profile['soft_skills'])
        tech_skills = random.sample(pools['technical_skills'], num_tech_skills)
        soft_skills = random.sample(pools['soft_skills'], num_soft_skills)
        tech_skills.extend(_pick_job_specific_skills(job_title, skill_rules))

        # salary_range = (min low, min high, spread low, spread high)
        low, high, spread_low, spread_high = profile['salary_range']
        min_salary = random.randint(low, high)
        max_salary = min_salary + random.randint(spread_low, spread_high)

        entries.append({
            'job_id': job_id,
            'job_title': job_title,
            'industry': industry,
            'career_stage': career_stage,
            'career_type': career_type,
            'required_skills': '|'.join(tech_skills + soft_skills),
            'min_salary': min_salary,
            'max_salary': max_salary,
            'job_description': describe(tech_skills, soft_skills),
            'company_size': random.choice(['Small', 'Medium', 'Large', 'Enterprise']),
            'remote_options': random.choice(['Remote', 'Hybrid', 'On-site']),
            'growth_potential': random.randint(*growth_range),
        })

    return entries


def generate_career_database(output_path='career_database.csv', num_new_seekers=40,
                             num_transitioners=60, seed=None):
    """
    Generates a CSV file with synthetic career data for engineering graduates.

    Args:
        output_path: Destination of the CSV file.
        num_new_seekers: Number of rows generated for new job seekers.
        num_transitioners: Number of rows generated for career transitioners.
        seed: Optional seed for the `random` module; pass a value to get a
            reproducible database. None leaves the generator state untouched.

    Returns:
        pandas.DataFrame with one row per generated career (also written to
        output_path). job_id values start at 1 and are consecutive.
    """
    if seed is not None:
        random.seed(seed)

    pools = {
        # Common career paths after engineering degrees
        'job_titles': [
            'Software Engineer', 'Senior Software Engineer', 'Frontend Developer', 'Backend Developer',
            'Full-Stack Developer', 'Mobile App Developer', 'Data Scientist', 'Data Analyst',
            'Machine Learning Engineer', 'AI Researcher', 'DevOps Engineer', 'Cloud Architect',
            'Systems Administrator', 'Network Engineer', 'Security Engineer', 'QA Engineer',
            'Test Automation Engineer', 'Product Manager', 'Project Manager', 'Scrum Master',
            'Business Analyst', 'Technical Consultant', 'Solutions Architect', 'Database Administrator',
            'Technical Writer', 'UX/UI Designer', 'Game Developer', 'Blockchain Developer',
            'IoT Developer', 'AR/VR Developer'
        ],
        # Common industries
        'industries': [
            'Technology', 'Healthcare', 'Finance', 'Education', 'E-commerce',
            'Manufacturing', 'Telecommunications', 'Entertainment', 'Consulting',
            'Energy', 'Transportation', 'Government', 'Defense', 'Retail'
        ],
        # Technical skills pool
        'technical_skills': [
            'Python', 'Java', 'JavaScript', 'C++', 'C#', 'Go', 'Rust', 'Swift', 'Kotlin',
            'TypeScript', 'PHP', 'Ruby', 'SQL', 'HTML', 'CSS', 'React', 'Angular', 'Vue',
            'Node.js', 'Django', 'Flask', 'Spring', 'ASP.NET', 'Express.js', 'Ruby on Rails',
            'TensorFlow', 'PyTorch', 'scikit-learn', 'Pandas', 'NumPy', 'Docker', 'Kubernetes',
            'AWS', 'Azure', 'GCP', 'Git', 'CI/CD', 'Jenkins', 'Terraform', 'Ansible',
            'Hadoop', 'Spark', 'MongoDB', 'PostgreSQL', 'MySQL', 'Redis', 'Elasticsearch',
            'GraphQL', 'REST API', 'Microservices', 'Serverless', 'Linux', 'Blockchain',
            'IoT', 'AR/VR', 'Unity', 'Unreal Engine', 'WebGL', 'Mobile Development'
        ],
        # Soft skills pool
        'soft_skills': [
            'Communication', 'Leadership', 'Teamwork', 'Problem Solving', 'Critical Thinking',
            'Time Management', 'Adaptability', 'Creativity', 'Attention to Detail', 'Negotiation',
            'Conflict Resolution', 'Emotional Intelligence', 'Presentation', 'Client Management',
            'Mentoring', 'Decision Making', 'Organization', 'Delegation', 'Strategic Thinking',
            'Research', 'Analytical Skills'
        ],
    }

    # New job seekers typically need fewer specialized skills
    new_seeker_profiles = {
        'entry-level': {
            'tech_skills': (3, 6),
            'soft_skills': (2, 4),
            'salary_range': (50000, 70000, 10000, 20000)
        },
        'early-career': {
            'tech_skills': (4, 7),
            'soft_skills': (3, 5),
            'salary_range': (65000, 85000, 10000, 20000)
        }
    }

    # Career transitioners often bring more diverse skills but might lack some core technical skills
    transitioner_profiles = {
        'entry-level': {
            'tech_skills': (2, 5),
            'soft_skills': (4, 7),
            'salary_range': (55000, 75000, 10000, 20000)
        },
        'early-career': {
            'tech_skills': (4, 8),
            'soft_skills': (5, 8),
            'salary_range': (70000, 90000, 15000, 25000)
        },
        'mid-career': {
            'tech_skills': (5, 9),
            'soft_skills': (6, 9),
            'salary_range': (85000, 115000, 20000, 30000)
        },
        'experienced': {
            'tech_skills': (6, 10),
            'soft_skills': (7, 10),
            'salary_range': (110000, 140000, 25000, 40000)
        }
    }

    # Ordered (title keywords, extra skills) rules; first matching rule wins
    new_seeker_rules = [
        (('Software', 'Developer'), ['Programming', 'Software Development', 'Debugging']),
        (('Data',), ['Statistics', 'Data Visualization', 'Data Analysis']),
    ]
    transitioner_rules = [
        (('Manager', 'Consultant'), ['Project Management', 'Stakeholder Management', 'Strategic Planning']),
        (('Architect',), ['System Design', 'Architecture Patterns', 'Technical Leadership']),
    ]

    def describe_new_seeker(tech_skills, soft_skills):
        return f"Entry-level position suitable for graduates with minimal experience. Requires proficiency in {', '.join(tech_skills[:3])}. Strong {', '.join(soft_skills[:2])} skills are essential. This role provides excellent opportunities for growth and skill development."

    def describe_transitioner(tech_skills, soft_skills):
        return f"Position ideal for professionals transitioning from related fields. Values transferrable skills like {', '.join(soft_skills[:3])}. Technical requirements include {', '.join(tech_skills[:3])}. Previous experience in similar domains can substitute for some technical requirements."

    # New job seekers get a high growth potential (7-10)
    careers = _generate_career_entries(
        num_new_seekers, 1, 'new_seeker', new_seeker_profiles, new_seeker_rules,
        describe_new_seeker, (7, 10), pools)

    # Transitioners get a slightly lower average growth potential (5-9);
    # their ids continue right after the new-seeker rows.
    careers.extend(_generate_career_entries(
        num_transitioners, len(careers) + 1, 'transitioner', transitioner_profiles,
        transitioner_rules, describe_transitioner, (5, 9), pools))

    # Create DataFrame and save to CSV
    careers_df = pd.DataFrame(careers)
    careers_df.to_csv(output_path, index=False)
    print(f"Generated career database with {len(careers)} entries")
    return careers_df

In [None]:
def validate_pdf(file_path):
    """
    Checks that file_path points to a readable PDF of acceptable size.

    Returns {"valid": True} on success, otherwise
    {"valid": False, "error": <reason>}. Any exception raised along the way
    is reported through the "error" field instead of propagating.
    """
    size_limit = 10 * 1024 * 1024  # 10MB

    def reject(reason):
        return {"valid": False, "error": reason}

    try:
        # The file must exist on disk
        if not os.path.exists(file_path):
            return reject("File does not exist")

        # Only the extension is inspected here, not the content
        has_pdf_suffix = file_path.lower().endswith('.pdf')
        if not has_pdf_suffix:
            return reject("File is not a PDF")

        # Refuse oversized uploads
        if os.path.getsize(file_path) > size_limit:
            return reject("File too large (max 10MB)")

        # Parsing the file is the real validity check
        with open(file_path, 'rb') as handle:
            page_count = len(PyPDF2.PdfReader(handle).pages)
            if page_count < 1:
                return reject("PDF has no pages")
    except Exception as exc:
        return reject(str(exc))

    return {"valid": True}

def extract_text_from_pdf(file_path):
    """
    Extracts text from a PDF file, page by page, with simple section detection.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        dict with:
          "full_text": text of all pages, each page followed by one space,
          "text_by_page": list with the text of each page,
          "sections": list of {"title", "content", "page"} dicts, where
              "page" is the 0-based page index the section was found on.

    Raises:
        Whatever open() or PyPDF2 raise for unreadable/corrupt files.
    """
    header_keywords = ('experience', 'education', 'skills', 'certification',
                       'projects', 'summary', 'objective')
    page_texts = []
    sections = []

    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)

        for page_index, page in enumerate(pdf_reader.pages):
            # Guard against pages without extractable text (e.g. scanned
            # images) so the string handling below never sees None.
            page_text = page.extract_text() or ""
            page_texts.append(page_text)

            # Simple section detection: blank lines separate candidate blocks
            for block in page_text.split('\n\n'):
                if not block.strip():
                    continue

                # Drop leading whitespace so a block that starts with a stray
                # newline still exposes its heading as the first line.
                first_line, _, remainder = block.lstrip().partition('\n')
                heading = first_line.lower()
                if any(keyword in heading for keyword in header_keywords):
                    sections.append({
                        "title": first_line.strip(),
                        "content": remainder,
                        "page": page_index
                    })

    return {
        # join instead of repeated += keeps this linear in the document size
        "full_text": "".join(text + " " for text in page_texts),
        "text_by_page": page_texts,
        "sections": sections
    }

In [None]:
def recognize_document_structure(text_data):
    """
    Analyzes the resume structure to identify different sections.

    Args:
        text_data: dict with "sections" (list of {"title", "content", "page"})
            and "full_text", as produced by extract_text_from_pdf.

    Returns:
        dict with:
          "structured_sections": list of {"title", "content", "type", "page"},
          "contact_info": result of extract_contact_info on the full text.
    """
    # Define common section keywords
    section_patterns = {
        'contactInfo': ['contact', 'phone', 'email', 'address', 'linkedin'],
        'summary': ['summary', 'objective', 'profile', 'about'],
        'experience': ['experience', 'work', 'employment', 'job history', 'professional'],
        'education': ['education', 'academic', 'qualification', 'degree', 'university', 'college'],
        'skills': ['skills', 'expertise', 'technical', 'competencies', 'proficiencies'],
        'projects': ['projects', 'portfolio', 'works'],
        'certifications': ['certifications', 'certificates', 'licenses', 'credentials']
    }

    structured_sections = []

    # Classify the sections found during text extraction by their title
    for section in text_data["sections"]:
        title_lower = section["title"].lower()
        section_type = next(
            (type_name for type_name, keywords in section_patterns.items()
             if any(keyword in title_lower for keyword in keywords)),
            "other"
        )

        structured_sections.append({
            "title": section["title"],
            "content": section["content"],
            "type": section_type,
            "page": section["page"]
        })

    # If no sections were found, try to extract them from full text
    if not structured_sections:
        full_text = text_data["full_text"]

        for type_name, keywords in section_patterns.items():
            alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
            # The leading \b stops keywords from matching inside longer words
            # ("network " must not start a 'work' section).
            heading_re = re.compile(rf'\b({alternatives})(?:\s|:|\n)', re.IGNORECASE)

            for match in heading_re.finditer(full_text):
                start_pos = match.start()
                # A section runs until the next heading of the SAME type, or
                # to the end of the text. Searching with a start offset
                # (instead of slicing) keeps the word-boundary context intact.
                next_match = heading_re.search(full_text, start_pos + 1)
                end_pos = next_match.start() if next_match else len(full_text)

                section_title = full_text[start_pos:match.end()].strip()
                section_content = full_text[start_pos + len(section_title):end_pos].strip()

                structured_sections.append({
                    "title": section_title,
                    "content": section_content,
                    "type": type_name,
                    "page": 0  # We don't know the page number in this case
                })

    # Extract contact information
    contact_info = extract_contact_info(text_data["full_text"])

    return {
        "structured_sections": structured_sections,
        "contact_info": contact_info
    }

def extract_contact_info(text):
    """
    Extracts contact information from resume text.

    Args:
        text: Raw resume text.

    Returns:
        dict with the keys "email", "phone", "linkedin", "website" and
        "location"; each value is the first match found or None.
    """
    contact_info = {
        "email": None,
        "phone": None,
        "linkedin": None,
        "website": None,
        "location": None
    }

    # Email regex ([A-Za-z], not [A-Z|a-z]: a '|' inside a class is literal)
    email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    email_match = re.search(email_regex, text)
    if email_match:
        contact_info["email"] = email_match.group(0)

    # Phone regex
    phone_regex = r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
    phone_match = re.search(phone_regex, text)
    if phone_match:
        contact_info["phone"] = phone_match.group(0)

    # LinkedIn regex (host names are case-insensitive, e.g. "LinkedIn.com/in/...")
    linkedin_regex = r'(?:linkedin\.com\/in\/)[a-zA-Z0-9_-]+'
    linkedin_match = re.search(linkedin_regex, text, re.IGNORECASE)
    if linkedin_match:
        contact_info["linkedin"] = linkedin_match.group(0)

    # Website regex. Email addresses are blanked out first, otherwise pieces
    # such as "jane.doe" or "example.com" would be reported as the website.
    website_regex = r'(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?'
    text_without_emails = re.sub(email_regex, ' ', text)
    websites = [url for url in re.findall(website_regex, text_without_emails)
                if 'linkedin' not in url.lower()]
    if websites:
        contact_info["website"] = websites[0]

    # Location - city, state format common in resumes
    location_regex = r'\b[A-Z][a-z]+(?:[\s-][A-Z][a-z]+)*,\s*[A-Z]{2}\b'
    location_match = re.search(location_regex, text)
    if location_match:
        contact_info["location"] = location_match.group(0)

    return contact_info

In [None]:
def extract_skills(structured_data):
    """
    Collects the skills of a resume, grouped by category.

    Explicit skills come from the first "skills" section, implicit and
    inferred skills from the first "experience" section. The result maps each
    category name (plus "inferred") to a list of skill entries.
    """
    # Simplified taxonomy; a real implementation would use a skills database
    skill_taxonomy = load_skill_taxonomy()

    category_names = ("technical", "soft", "languages", "tools",
                      "methodologies", "domain", "inferred")
    extracted_skills = {name: [] for name in category_names}

    def first_section_of(kind):
        # First section of the requested type, or None when there is none
        candidates = (s for s in structured_data["structured_sections"]
                      if s["type"] == kind)
        return next(candidates, None)

    skills_section = first_section_of("skills")
    experience_section = first_section_of("experience")

    # Explicit skills: everything listed in the skills section
    if skills_section:
        listed = extract_explicit_skills(skills_section["content"], skill_taxonomy)
        for skill_name in listed:
            category = categorize_skill(skill_name, skill_taxonomy)
            if category not in extracted_skills:
                continue
            extracted_skills[category].append({
                "name": skill_name,
                "source": "explicit",
                "confidence": 0.95
            })

    # Implicit and inferred skills: derived from the experience description
    if experience_section:
        experience_text = experience_section["content"]

        for found in extract_implicit_skills(experience_text, skill_taxonomy):
            skill_name = found["skill"]
            category = found["category"]
            confidence = found["confidence"]

            # Skip anything already recorded under any category
            wanted = skill_name.lower()
            already_listed = any(
                entry["name"].lower() == wanted
                for entries in extracted_skills.values()
                for entry in entries
            )
            if already_listed or category not in extracted_skills:
                continue

            extracted_skills[category].append({
                "name": skill_name,
                "source": "implicit",
                "confidence": confidence
            })

        # Rule-based inference replaces the (empty) "inferred" bucket
        extracted_skills["inferred"] = infer_skills_from_experience(
            experience_text, skill_taxonomy)

    return extracted_skills

def load_skill_taxonomy():
    """
    Builds the (simplified, hard-coded) skill taxonomy.

    Returns a dict with:
      "categories": category name -> list of skill names,
      "all_skills": lower-cased skill name -> category name.
    """
    taxonomy = {}

    taxonomy["technical"] = [
        "Python", "Java", "JavaScript", "C++", "C#", "SQL", "HTML", "CSS",
        "React", "Angular", "Vue", "Node.js", "Django", "Flask", "Spring",
        "Machine Learning", "Deep Learning", "NLP", "Computer Vision",
        "Data Analysis", "Statistics", "Algorithms", "Data Structures",
    ]
    taxonomy["soft"] = [
        "Communication", "Leadership", "Teamwork", "Problem Solving",
        "Critical Thinking", "Time Management", "Adaptability", "Creativity",
        "Attention to Detail", "Presentation", "Negotiation", "Mentoring",
    ]
    taxonomy["languages"] = [
        "English", "Spanish", "French", "German", "Chinese", "Japanese",
        "Russian", "Portuguese", "Arabic", "Hindi",
    ]
    taxonomy["tools"] = [
        "Git", "Docker", "Kubernetes", "AWS", "Azure", "GCP", "JIRA",
        "Confluence", "Slack", "MS Office", "Photoshop", "Figma", "Tableau",
        "Power BI", "Excel", "Jenkins", "Travis CI", "CircleCI",
    ]
    taxonomy["methodologies"] = [
        "Agile", "Scrum", "Kanban", "Waterfall", "DevOps", "TDD", "BDD",
        "CI/CD", "Lean", "Six Sigma", "Design Thinking", "OOP", "Functional Programming",
    ]
    taxonomy["domain"] = [
        "Finance", "Healthcare", "Education", "E-commerce", "Gaming",
        "Social Media", "Cybersecurity", "Blockchain", "IoT", "Telecommunications",
        "Logistics", "Manufacturing", "Retail", "Energy", "Transportation",
    ]

    # Reverse index used for direct lookups by lower-cased skill name
    lookup = {
        skill.lower(): category
        for category, skills in taxonomy.items()
        for skill in skills
    }

    return {"categories": taxonomy, "all_skills": lookup}

def extract_explicit_skills(text, skill_taxonomy):
    """
    Extracts explicit skills from the text of a skills section.

    Args:
        text: Content of the skills section.
        skill_taxonomy: dict with "all_skills" (lower-cased skill name ->
            category) and, optionally, "categories" (category -> list of
            canonically spelled skill names).

    Returns:
        List of the taxonomy skills found in the text, without duplicates, in
        taxonomy order. Names use the canonical spelling from "categories"
        ("SQL", "JavaScript", "CI/CD"); skills missing there fall back to
        word-wise capitalization.
    """
    # Lower-cased name -> canonical spelling
    canonical_names = {
        skill.lower(): skill
        for skills in skill_taxonomy.get("categories", {}).values()
        for skill in skills
    }

    # Convert text to lowercase for case-insensitive matching
    text_lower = text.lower()

    found = []
    for skill_name in skill_taxonomy["all_skills"]:
        # Lookarounds instead of \b: \b never matches after a skill that ends
        # in a non-word character ("c++", "c#"), so those were missed before.
        # This also covers items separated by commas, bullets or slashes.
        pattern = r'(?<!\w)' + re.escape(skill_name) + r'(?!\w)'
        if not re.search(pattern, text_lower):
            continue

        fallback = ' '.join(word.capitalize() for word in skill_name.split())
        found.append(canonical_names.get(skill_name, fallback))

    # Remove duplicates while keeping a deterministic order
    return list(dict.fromkeys(found))

def extract_implicit_skills(text, skill_taxonomy):
    """
    Extracts implicit skills from experience descriptions.

    For every verb, the 1-4 tokens that follow it are compared against each
    taxonomy skill with calculate_skill_similarity; matches above 0.8 are
    kept.

    Args:
        text: Experience description text.
        skill_taxonomy: dict with "all_skills" (lower-cased skill name ->
            category) and, optionally, "categories" (category -> list of
            canonically spelled skill names).

    Returns:
        List of {"skill", "confidence", "category"} dicts, one per skill
        (highest confidence wins). "skill" uses the canonical taxonomy
        spelling when available.
    """
    # str.title() mangles names such as "sql" -> "Sql" or "node.js" ->
    # "Node.Js", so prefer the spelling stored in the taxonomy.
    canonical_names = {
        skill.lower(): skill
        for skills in skill_taxonomy.get("categories", {}).values()
        for skill in skills
    }

    # Best match per skill, keyed by lower-cased name (insertion-ordered)
    best_by_skill = {}

    # Process text with spaCy for sentence splitting
    doc = nlp(text)

    for sent in doc.sents:
        # Each sentence is parsed on its own so token.i is sentence-relative
        sentence_doc = nlp(sent.text)

        for token in sentence_doc:
            if token.pos_ != "VERB":
                continue

            # Look ahead up to 4 tokens after the verb
            for width in range(1, 5):
                if token.i + width >= len(sentence_doc):
                    break

                # Candidate phrase: the `width` tokens right after the verb
                potential_skill = sentence_doc[token.i + 1:token.i + width + 1].text.lower()

                for skill_name, category in skill_taxonomy["all_skills"].items():
                    similarity = calculate_skill_similarity(potential_skill, skill_name)
                    if similarity <= 0.8:  # Threshold for skill match
                        continue

                    confidence = similarity * 0.8  # Lower confidence for implicit skills
                    skill_key = skill_name.lower()
                    current = best_by_skill.get(skill_key)
                    if current is None or confidence > current["confidence"]:
                        best_by_skill[skill_key] = {
                            "skill": canonical_names.get(skill_key, skill_name.title()),
                            "confidence": confidence,
                            "category": category
                        }

    return list(best_by_skill.values())

def calculate_skill_similarity(text1, text2):
    """
    Calculate the Jaccard similarity between the token sets of two skill texts.

    Tokenization uses the spaCy tokenizer (not NLTK's word_tokenize) and is
    case-insensitive.

    Args:
        text1: First skill text.
        text2: Second skill text.

    Returns:
        float in [0, 1]; 0.0 when either text has no tokens.
    """
    # Only tokens are needed, so run the tokenizer alone (make_doc) instead of
    # the full pipeline (tagger, parser, NER): this function is called once
    # per candidate phrase and taxonomy skill.
    tokens1 = {token.text.lower() for token in nlp.make_doc(text1)}
    tokens2 = {token.text.lower() for token in nlp.make_doc(text2)}

    # Handle empty sets
    if not tokens1 or not tokens2:
        return 0.0

    # Jaccard similarity: |intersection| / |union|
    return len(tokens1 & tokens2) / len(tokens1 | tokens2)

def categorize_skill(skill_name, skill_taxonomy):
    """
    Returns the taxonomy category for a skill name.

    A skill listed in the taxonomy gets its own category. Otherwise the
    category of the most similar taxonomy skill is used, with "technical" as
    the default when nothing is similar at all.
    """
    needle = skill_name.lower()
    known_skills = skill_taxonomy["all_skills"]

    # Direct hit: the skill is part of the taxonomy
    if needle in known_skills:
        return known_skills[needle]

    # Fallback: first category holding the best-scoring similar skill
    winner, top_score = "technical", 0
    for category, members in skill_taxonomy["categories"].items():
        for member in members:
            score = calculate_skill_similarity(needle, member.lower())
            if score > top_score:
                winner, top_score = category, score

    return winner

def infer_skills_from_experience(text, skill_taxonomy):
    """
    Infers additional skills based on experience descriptions.

    A simplified rule-based approach: a rule fires when one of its keywords
    occurs in the text as a whole word (an optional plural "s" is accepted).

    Args:
        text: Experience description text.
        skill_taxonomy: Currently unused; kept for interface compatibility.

    Returns:
        List of {"name", "source", "confidence", "category"} dicts, one per
        rule that fired, in rule order.
    """
    # In a real implementation, this would use a knowledge graph or ML model

    # Example inference rules (simplified)
    inference_rules = [
        {
            "keywords": ["managed", "team", "teams", "led", "supervised", "directed"],
            "skill": "Leadership",
            "category": "soft",
            "confidence": 0.7
        },
        {
            "keywords": ["developed", "created", "built", "implemented", "coded"],
            "skill": "Software Development",
            "category": "technical",
            "confidence": 0.6
        },
        {
            "keywords": ["analyzed", "data", "insights", "metrics", "statistics"],
            "skill": "Data Analysis",
            "category": "technical",
            "confidence": 0.6
        },
        {
            "keywords": ["design", "designed", "ui", "ux", "interface", "wireframes"],
            "skill": "UI/UX Design",
            "category": "technical",
            "confidence": 0.6
        },
        {
            "keywords": ["agile", "sprint", "scrum", "kanban", "jira"],
            "skill": "Agile Methodology",
            "category": "methodologies",
            "confidence": 0.8
        }
    ]

    inferred_skills = []
    seen_names = set()
    text_lower = text.lower()

    for rule in inference_rules:
        # Whole-word matching: plain substring tests fire on "led" inside
        # "handled" or "ui" inside "quickly", inferring skills from noise.
        alternatives = '|'.join(re.escape(keyword) for keyword in rule["keywords"])
        if not re.search(rf'\b(?:{alternatives})s?\b', text_lower):
            continue

        # Report every skill at most once
        skill_key = rule["skill"].lower()
        if skill_key in seen_names:
            continue
        seen_names.add(skill_key)

        inferred_skills.append({
            "name": rule["skill"],
            "source": "inferred",
            "confidence": rule["confidence"],
            "category": rule["category"]
        })

    return inferred_skills

In [None]:
def contextualize_experience(structured_data):
    """
    Analyzes and contextualizes work experience.

    The first "experience" section is split into job entries on blank lines;
    title, company and date range are pulled out of each entry with regular
    expressions and the remaining lines become the description.

    Args:
        structured_data: dict with "structured_sections", a list of section
            dicts carrying at least "type" and "content".

    Returns:
        List of job dicts ("title", "company", "dateRange", "description",
        "companyInfo"), sorted by end year, most recent first. Empty when
        there is no experience section.
    """
    experiences = []

    # Find experience section
    experience_section = next((s for s in structured_data["structured_sections"]
                              if s["type"] == "experience"), None)

    if not experience_section:
        return experiences

    experience_text = experience_section["content"]

    # Split text into job entries (separated by blank lines)
    job_entries = re.split(r'\n\s*\n', experience_text)
    job_entries = [entry.strip() for entry in job_entries if entry.strip()]

    # The patterns do not depend on the entry, so compile them once
    job_title_regex = re.compile(
        r'(?:\b|^)(Senior|Lead|Principal|Junior|Associate)?\s*([A-Z][a-z]+(?: [A-Z][a-z]+)*)\b')
    company_regex = re.compile(r'\bat\s+([A-Z][a-z0-9]*(?:[\s-][A-Z][a-z0-9]*)*),?\s')

    month = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*'
    separator = r'(?:to|-|–|—)'
    # "<month> <year>" followed by "<sep> [<month>] <year>", "<sep> Present"
    # or "<sep> Current". The bare "Current" alternative (no separator) is
    # kept so inputs that matched before still match.
    date_regex = re.compile(
        rf'{month}[\s,-]+\d{{4}}\s*'
        rf'(?:{separator}\s*(?:(?:{month}[\s,-]+)?\d{{4}}|Present|Current)|Current)',
        re.IGNORECASE
    )

    for entry in job_entries:
        job_title_match = job_title_regex.search(entry)
        company_match = company_regex.search(entry)
        date_match = date_regex.search(entry)

        # Fall back to placeholders for anything that could not be found
        job_title = job_title_match.group(0).strip() if job_title_match else "Unknown Position"
        company = company_match.group(1).strip() if company_match else "Unknown Company"
        date_range = date_match.group(0).strip() if date_match else "Unknown Dates"

        # Description: lines that don't contain the title, company or date
        description_lines = []
        for line in entry.split('\n'):
            if (not job_title_match or job_title not in line) and \
               (not company_match or company not in line) and \
               (not date_match or date_match.group(0) not in line):
                description_lines.append(line.strip())

        description = '\n'.join(description_lines)

        # Create job object
        job = {
            "title": job_title,
            "company": company,
            "dateRange": date_range,
            "description": description
        }

        # Enrich with company information (in a real implementation, this would use a database)
        job["companyInfo"] = {
            "industry": infer_industry(job_title, description),
            "size": "Unknown",
            "sector": "Unknown"
        }

        experiences.append(job)

    # Sort experiences by date (most recent first)
    experiences.sort(key=lambda x: extract_end_year(x["dateRange"]), reverse=True)

    return experiences

def infer_industry(title, description):
    """
    Infers industry from job title and description

    Args:
        title: Job title text.
        description: Free-text job description.

    Returns:
        The industry whose keyword list has the most hits in the combined
        text, or "Unknown" when nothing matches. Ties go to the industry
        listed first in the keyword map.
    """
    # Map of keywords to industries
    industry_keywords = {
        "Technology": ["software", "developer", "programming", "web", "app", "IT", "technical", "engineer"],
        "Healthcare": ["medical", "health", "patient", "clinical", "doctor", "nurse", "hospital"],
        "Finance": ["bank", "financial", "investment", "trading", "accounting", "finance"],
        "Education": ["school", "university", "teaching", "education", "academic", "student"],
        "Retail": ["retail", "store", "sales", "customer", "e-commerce", "shop"],
        "Manufacturing": ["manufacturing", "production", "factory", "assembly", "industrial"],
        "Media": ["media", "journalism", "publishing", "content", "news", "editor"],
        "Consulting": ["consulting", "consultant", "advisory", "strategy"]
    }

    # Combine title and description for analysis
    original_text = title + " " + description
    text = original_text.lower()

    def keyword_hit(keyword):
        # All-caps keywords are acronyms (e.g. "IT"). Lower-casing them turns
        # them into letter pairs that occur inside ordinary words ("with",
        # "writer", "items"), so match them case-sensitively as whole words.
        if keyword.isupper():
            return re.search(r'\b' + re.escape(keyword) + r'\b', original_text) is not None
        # Regular keywords keep the lenient substring match so that inflected
        # forms such as "developers" or "engineering" still count.
        return keyword.lower() in text

    # Count matches for each industry
    matches = {
        industry: sum(1 for keyword in keywords if keyword_hit(keyword))
        for industry, keywords in industry_keywords.items()
    }

    # max() returns the first maximal key, preserving the first-listed tie-break
    best_industry = max(matches, key=matches.get)
    if matches[best_industry] > 0:
        return best_industry

    return "Unknown"

def extract_end_year(date_range):
    """
    Extracts the end year from a date range string

    Returns the current year for open-ended ranges ("Present"/"Current" in
    any casing), otherwise the last four-digit number found, or 0 when the
    string contains no four-digit number.
    """
    found_years = re.findall(r'\d{4}', date_range)

    # An open-ended range ends "now", regardless of any years in the text
    still_ongoing = re.search(r'Present|Current', date_range, re.IGNORECASE)
    if still_ongoing:
        return datetime.now().year

    if not found_years:
        return 0
    return int(found_years[-1])

In [None]:
def analyze_education(structured_data):
    """
    Analyzes education information

    Args:
        structured_data: Dictionary with a "structured_sections" list whose
            items carry "type" and "content" keys.

    Returns:
        Dictionary with two lists: "education" (entries passed through
        standardize_degree) and "certifications" (dicts with "name" and
        "date").
    """
    education = []
    certifications = []

    # Find education section
    education_section = next((s for s in structured_data["structured_sections"]
                             if s["type"] == "education"), None)

    if education_section:
        education_text = education_section["content"]

        # Split into education entries (blank-line separated)
        entries = re.split(r'\n\s*\n', education_text)
        entries = [entry.strip() for entry in entries if entry.strip()]

        # Degree abbreviations are anchored on both sides: matching is
        # case-insensitive, so an unanchored "B.?A" would fire inside words
        # such as "Alabama" and "M.?S" inside "Williams".
        degree_pattern = re.compile(
            r'\b(?:(?:B\.?Sc?|B\.?A|M\.?Sc?|M\.?A|Ph\.?D|M\.?B\.?A)\.?(?![A-Za-z])'
            r'|Bachelor|Master|Doctor|Associate)[^\n]*',
            re.IGNORECASE
        )
        # Include the comma-free text before the keyword so "Stanford
        # University" is not truncated to "University".
        university_pattern = re.compile(
            r'[^,\n]*\b(?:University|College|Institute|School)\b[^\n]*',
            re.IGNORECASE
        )
        year_pattern = re.compile(r'\b\d{4}\b')
        gpa_pattern = re.compile(r'\bGPA\s*(?:of|:)?\s*(\d+\.\d+|\d+)', re.IGNORECASE)

        for entry in entries:
            degree_match = degree_pattern.search(entry)
            university_match = university_pattern.search(entry)
            date_matches = year_pattern.findall(entry)
            gpa_match = gpa_pattern.search(entry)

            # Create education entry
            education_entry = {
                "degree": degree_match.group(0).strip() if degree_match else "Unknown Degree",
                "institution": university_match.group(0).strip() if university_match else "Unknown Institution",
                "dateRange": '-'.join(date_matches) if date_matches else "Unknown Dates",
                "gpa": gpa_match.group(1) if gpa_match else None
            }

            # Standardize and enrich education entry
            education.append(standardize_degree(education_entry))

    # Find certifications section
    cert_section = next((s for s in structured_data["structured_sections"]
                        if s["type"] == "certifications"), None)

    if cert_section:
        cert_text = cert_section["content"]

        # Split into certification entries (one per line or bullet)
        cert_entries = re.split(r'\n|•|\*', cert_text)
        cert_entries = [entry.strip() for entry in cert_entries if entry.strip()]

        # "Month YYYY" or a bare year
        cert_date_pattern = re.compile(
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[\s,-]*\d{4}\b|\b\d{4}\b',
            re.IGNORECASE
        )

        for entry in cert_entries:
            date_match = cert_date_pattern.search(entry)

            # The name is the entry with every date removed
            certifications.append({
                "name": cert_date_pattern.sub('', entry).strip() if date_match else entry,
                "date": date_match.group(0).strip() if date_match else "Unknown Date"
            })

    return {
        "education": education,
        "certifications": certifications
    }

def standardize_degree(education_entry):
    """
    Standardizes degree information

    Args:
        education_entry: Dictionary with at least a "degree" string.

    Returns:
        A copy of the entry with "degree" replaced by its standard name when
        one is recognized, plus "major" (text following "in", or None) and
        "level" (from get_degree_level).
    """
    degree = education_entry["degree"]

    # Define standard degree mappings. Abbreviations must not be followed by
    # a letter; otherwise "B.?A" matches the "Ba" of "Bachelor" and "M.?A"
    # the "Ma" of "Master", mislabelling e.g. "Master of Business
    # Administration" as "Master of Arts".
    degree_patterns = [
        {"regex": r'\b(?:B\.?Sc?\.?(?![A-Za-z])|Bachelor\s+of\s+Science\b)', "standard": "Bachelor of Science"},
        {"regex": r'\b(?:B\.?A\.?(?![A-Za-z])|Bachelor\s+of\s+Arts\b)', "standard": "Bachelor of Arts"},
        {"regex": r'\b(?:M\.?Sc?\.?(?![A-Za-z])|Master\s+of\s+Science\b)', "standard": "Master of Science"},
        {"regex": r'\b(?:M\.?A\.?(?![A-Za-z])|Master\s+of\s+Arts\b)', "standard": "Master of Arts"},
        {"regex": r'\b(?:Ph\.?D\.?(?![A-Za-z])|Doctor\s+of\s+Philosophy\b)', "standard": "Doctor of Philosophy"},
        {"regex": r'\b(?:M\.?B\.?A\.?(?![A-Za-z])|Master\s+of\s+Business\s+Administration\b)', "standard": "Master of Business Administration"}
    ]

    # Find matching standard degree; unrecognized degrees are kept verbatim
    standard_degree = degree
    for pattern in degree_patterns:
        if re.search(pattern["regex"], degree, re.IGNORECASE):
            standard_degree = pattern["standard"]
            break

    # Extract major if present ("in" must be a whole word, not the tail of
    # another word such as "Berlin")
    major_regex = r'\bin\s+([A-Za-z]+(?: [A-Za-z]+)*)\b'
    major_match = re.search(major_regex, degree, re.IGNORECASE)

    # Determine degree level
    level = get_degree_level(standard_degree)

    return {
        **education_entry,
        "degree": standard_degree,
        "major": major_match.group(1) if major_match else None,
        "level": level
    }

def get_degree_level(degree):
    """
    Determines the level of a degree

    Matching is case-insensitive because the degree text may be a raw
    résumé string (e.g. all caps) rather than a standardized name.

    Returns:
        "undergraduate", "graduate", "doctorate", "associate" or "other".
        Patterns are tried in that order and the first hit wins.
    """
    level_patterns = (
        (r'Bachelor', "undergraduate"),
        (r'Master|MBA', "graduate"),
        (r'Doctor|Ph\.?D\.?', "doctorate"),
        (r'Associate', "associate"),
    )

    for pattern, level in level_patterns:
        if re.search(pattern, degree, re.IGNORECASE):
            return level

    return "other"

In [None]:
def generate_user_profile(extracted_data):
    """
    Generates a comprehensive user profile from extracted data

    Reads the "structured_data", "skills", "experiences" and
    "education_data" entries of ``extracted_data`` and returns one profile
    dictionary with a fresh UUID and creation timestamp.
    """
    structured_data = extracted_data["structured_data"]
    skills = extracted_data["skills"]
    experiences = extracted_data["experiences"]
    education_data = extracted_data["education_data"]

    # Identity and contact details
    profile = {
        "id": str(uuid.uuid4()),
        "createdAt": datetime.now().isoformat(),
        "personalInfo": structured_data["contact_info"],
    }

    # Every explicit skill category is de-duplicated; inferred skills are
    # passed through as-is
    skill_profile = {}
    for category in ("technical", "soft", "languages", "tools", "methodologies", "domain"):
        skill_profile[category] = normalize_skills(skills[category])
    skill_profile["inferred"] = skills["inferred"]
    profile["skills"] = skill_profile

    # History sections
    profile["experience"] = experiences
    profile["education"] = education_data["education"]
    profile["certifications"] = education_data["certifications"]

    # Derived career signals
    profile["careerStage"] = determine_career_stage(experiences, education_data)
    profile["careerTrajectory"] = analyze_career_trajectory(experiences)

    return profile

def normalize_skills(skills_array):
    """
    Normalizes skills by removing duplicates (case-insensitive by name)

    For each distinct lower-cased name the entry with the highest
    "confidence" is kept; on equal confidence the earlier entry wins.
    Output order follows the first appearance of each name.
    """
    best_by_name = {}

    for candidate in skills_array:
        key = candidate["name"].lower()
        incumbent = best_by_name.get(key)
        # Replace only on a strictly higher confidence
        if incumbent is None or candidate["confidence"] > incumbent["confidence"]:
            best_by_name[key] = candidate

    return [*best_by_name.values()]

def determine_career_stage(experiences, education_data):
    """
    Determines career stage based on experience

    Args:
        experiences: List of job dictionaries, each with a "dateRange" string.
        education_data: Accepted for interface compatibility; it does not
            influence the result.

    Returns:
        One of "entry-level" (< 2 years), "early-career" (< 5),
        "mid-career" (< 10), "experienced" (< 15) or "senior".
    """
    current_year = datetime.now().year

    # Calculate total years of experience
    total_years = 0

    for job in experiences:
        date_range = job["dateRange"]

        # Extract years
        years = re.findall(r'\d{4}', date_range)
        if not years:
            continue

        # An ongoing job ("2020 - Present") carries a single year, so it must
        # be detected before requiring two years; otherwise every current
        # position would contribute nothing.
        if re.search(r'Present|Current', date_range, re.IGNORECASE):
            end_year = current_year
        elif len(years) >= 2:
            end_year = int(years[-1])
        else:
            # A lone year without an end marker gives no usable duration
            continue

        # Guard against reversed ranges producing negative experience
        total_years += max(0, end_year - int(years[0]))

    # Determine career stage
    if total_years < 2:
        return "entry-level"
    elif total_years < 5:
        return "early-career"
    elif total_years < 10:
        return "mid-career"
    elif total_years < 15:
        return "experienced"
    else:
        return "senior"

def analyze_career_trajectory(experiences):
    """
    Analyzes career trajectory based on job history

    Args:
        experiences: List of job dictionaries with "title", "dateRange" and
            "companyInfo" (containing "industry"). The experience parser
            sorts these most recent first.

    Returns:
        Dictionary with "direction", "velocity" and "pattern". Fewer than
        two jobs yields the neutral stable/normal/linear result.
    """
    if len(experiences) < 2:
        return {
            "direction": "stable",
            "velocity": "normal",
            "pattern": "linear"
        }

    # Title progression compares the first title with the last, so the jobs
    # must be oldest first. Reversing before the stable sort keeps jobs with
    # equal or unknown end years in oldest-first order as well.
    chronological = sorted(reversed(experiences),
                           key=lambda exp: extract_end_year(exp["dateRange"]))

    # Analyze job titles for progression
    titles = [exp["title"] for exp in chronological]
    title_progression = analyze_title_progression(titles)

    # Analyze industry changes: N distinct industries means N - 1 changes,
    # so a career spent in one industry counts as zero changes
    industries = [exp["companyInfo"]["industry"] for exp in experiences if exp["companyInfo"]["industry"] != "Unknown"]
    industry_changes = max(len(set(industries)) - 1, 0)

    # Analyze tenure patterns
    tenures = calculate_tenures(experiences)

    return {
        "direction": title_progression["direction"],
        "velocity": determine_velocity(experiences, tenures),
        "pattern": determine_pattern(industry_changes, len(experiences))
    }

def analyze_title_progression(titles):
    """
    Analyzes job title progression

    Args:
        titles: Job titles; the first element is compared with the last, so
            callers should pass them oldest first.

    Returns:
        Dictionary with "direction" ("upward", "downward" or "stable") and
        "magnitude" (absolute score difference). An empty list is "stable".
    """
    # Define career level keywords
    level_keywords = {
        "entry": ["assistant", "junior", "intern", "trainee"],
        "mid": ["associate", "analyst", "specialist", "developer"],
        "senior": ["senior", "lead", "principal", "manager", "head", "chief", "director", "vp"]
    }

    # Precedence: explicit entry markers first, then senior markers, then the
    # mid-level role nouns. Checking "mid" before "senior" would score
    # "Senior Developer" the same as "Developer".
    level_scores = (("entry", 1), ("senior", 3), ("mid", 2))

    if not titles:
        return {"direction": "stable", "magnitude": 0}

    def score_title(title):
        title_lower = title.lower()
        for level, level_score in level_scores:
            for keyword in level_keywords[level]:
                # Whole-word match so "intern" does not fire on "international"
                if re.search(r'\b' + re.escape(keyword) + r'\b', title_lower):
                    return level_score
        return 1.5  # No keyword found: between entry and mid

    # Score each title
    scores = [score_title(title) for title in titles]

    # Calculate progression
    first_score = scores[0]
    last_score = scores[-1]

    if last_score > first_score:
        return {"direction": "upward", "magnitude": last_score - first_score}
    elif last_score < first_score:
        return {"direction": "downward", "magnitude": first_score - last_score}
    else:
        return {"direction": "stable", "magnitude": 0}

def calculate_tenures(experiences):
    """
    Calculates job tenures in years

    Args:
        experiences: List of job dictionaries, each with a "dateRange" string.

    Returns:
        One tenure per job, in input order. Ongoing jobs ("Present" or
        "Current", any casing) run to the current year; jobs whose duration
        cannot be determined default to 1.
    """
    current_year = datetime.now().year
    tenures = []

    for job in experiences:
        date_range = job["dateRange"]

        # Extract years
        years = re.findall(r'\d{4}', date_range)
        ongoing = re.search(r'Present|Current', date_range, re.IGNORECASE)

        if years and ongoing:
            # "2020 - Present" contains only one year, so this case must not
            # be gated on finding two years
            tenures.append(max(0, current_year - int(years[0])))
        elif len(years) >= 2:
            tenures.append(max(0, int(years[-1]) - int(years[0])))
        else:
            tenures.append(1)  # Default if unable to determine

    return tenures

def determine_velocity(experiences, tenures):
    """
    Determines career velocity from the average job tenure

    ``experiences`` is not consulted; only ``tenures`` (years per job) is.
    Returns "rapid" below 1.5 years on average, "normal" below 3 years,
    "steady" otherwise, and "normal" when no tenures are given.
    """
    if not tenures:
        return "normal"

    mean_tenure = sum(tenures) / len(tenures)

    if mean_tenure < 1.5:
        return "rapid"
    return "normal" if mean_tenure < 3 else "steady"

def determine_pattern(industry_changes, job_count):
    """
    Determines career pattern from the ratio of industry changes to jobs

    Returns "unknown" when there are no jobs, "diverse" for a ratio above
    0.5, "exploratory" above 0.2, and "specialized" otherwise.
    """
    if job_count == 0:
        return "unknown"

    ratio = industry_changes / job_count

    if ratio > 0.5:
        return "diverse"
    return "exploratory" if ratio > 0.2 else "specialized"

In [None]:
class CareerRecommender:
    """
    Career recommendation system with deep learning capabilities
    """
    def __init__(self, database_path='career_database.csv', model_path='career_model.h5'):
        """
        Set up the recommender: career database, skill vectors and ML model

        Args:
            database_path: CSV file holding the career database; it is
                generated via generate_career_database when missing.
            model_path: File the recommendation model is loaded from and
                saved to.
        """
        # Reuse the CSV on disk when present, otherwise build the database
        if os.path.exists(database_path):
            self.career_data = pd.read_csv(database_path)
        else:
            self.career_data = generate_career_database(database_path)

        # Derive skill lists and TF-IDF vectors used for matching
        self._preprocess_career_data()

        # The path must be set before loading, since the loader reads it
        self.model_path = model_path
        self.model = self._load_or_create_model()

        # (feature vector, normalized score) pairs collected by add_feedback
        self.feedback_data = []

    def _preprocess_career_data(self):
        """
        Derive the skill columns and TF-IDF vectors used for matching

        Adds "skills_list" and "skills_text" columns to ``self.career_data``
        and sets ``self.vectorizer`` and ``self.skill_vectors``.
        """
        raw_skills = self.career_data['required_skills']

        # "a|b|c" -> ["a", "b", "c"]
        self.career_data['skills_list'] = raw_skills.apply(lambda packed: packed.split('|'))

        # "a|b|c" -> "a b c", the text form fed to the vectorizer
        self.career_data['skills_text'] = raw_skills.apply(lambda packed: packed.replace('|', ' '))

        # Fit TF-IDF on the careers' skill text; user skills are later
        # transformed with the same vectorizer
        self.vectorizer = TfidfVectorizer()
        self.skill_vectors = self.vectorizer.fit_transform(self.career_data['skills_text'])

    def _load_or_create_model(self):
        """
        Return the model stored at ``self.model_path``, or a fresh one

        A new model is created when the file does not exist, and also as a
        fallback when anything in the load/create attempt raises.
        """
        try:
            if not os.path.exists(self.model_path):
                print("Creating new career recommendation model")
                return self._create_model()

            print(f"Loading existing career recommendation model from {self.model_path}")
            return load_model(self.model_path)
        except Exception as e:
            # Best-effort: report the failure and fall back to a new model
            print(f"Error loading model: {str(e)}. Creating new model.")
            return self._create_model()

    def _create_model(self):
        """
        Build and compile an untrained neural network for match-score enhancement

        The network takes a 7-value feature vector and outputs a single
        sigmoid score. It is compiled with Adam (learning rate 0.001), MSE
        loss and an MAE metric, and returned without any training.
        """
        # Simple MLP model to adjust skill match scores based on career stage and type
        model = Sequential()
        model.add(Dense(128, activation='relu', input_shape=(7,)))  # Input: skill match, career stage values, is_new_seeker
        model.add(Dropout(0.3))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  # Output: enhanced match score

        optimizer = Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

        # No real user data is available here, so the model stays untrained
        return model

    def match_careers(self, user_profile, career_type="new_seeker", top_n=10):
        """
        Match the user profile with potential careers

        Args:
            user_profile: A dictionary containing the user's skills, experience, and preferences
            career_type: 'new_seeker' or 'transitioner'; any other value disables the career-type filter
            top_n: Number of top matches to return

        Returns:
            Dictionary with 'careerRecommendations' (one entry per matched
            career, best first) and 'learningPaths' (resources keyed by
            missing skill)
        """
        # Collect every skill name, lower-cased, across all categories
        # (explicit and inferred)
        profile_skills = user_profile['skills']
        user_skills = set()
        for skill_category in ['technical', 'soft', 'tools', 'methodologies', 'domain', 'languages', 'inferred']:
            if skill_category in profile_skills:
                user_skills.update(skill['name'].lower() for skill in profile_skills[skill_category])

        # Create text representation of user skills for vectorization
        # (sorted only to make the text reproducible)
        user_skills_text = ' '.join(sorted(user_skills))

        # Vectorize user skills and score them against every career
        user_vector = self.vectorizer.transform([user_skills_text])
        similarity_scores = cosine_similarity(user_vector, self.skill_vectors).flatten()

        # Add similarity scores to a private copy of the career data
        career_matches = self.career_data.copy()
        career_matches['similarity'] = similarity_scores

        # Filter by career type if specified
        if career_type in ["new_seeker", "transitioner"]:
            career_matches = career_matches[career_matches['career_type'] == career_type]

        # Filter by career stage if available
        if 'careerStage' in user_profile:
            # Each user stage admits its own stage and the one below it
            stage_mapping = {
                'entry-level': ['entry-level'],
                'early-career': ['entry-level', 'early-career'],
                'mid-career': ['early-career', 'mid-career'],
                'experienced': ['mid-career', 'experienced'],
                'senior': ['experienced', 'senior']
            }

            # Unrecognized stages fall back to allowing every stage
            appropriate_stages = stage_mapping.get(
                user_profile['careerStage'],
                ['entry-level', 'early-career', 'mid-career', 'experienced', 'senior']
            )

            career_matches = career_matches[career_matches['career_stage'].isin(appropriate_stages)]

        # Enhance match scores and rank
        if len(career_matches) > 0:
            enhanced_scores = self._enhance_match_scores(
                career_matches,
                user_profile,
                career_type
            )
            # assign() returns a new frame; writing a column directly into the
            # filtered frame triggers pandas' SettingWithCopyWarning
            career_matches = career_matches.assign(enhanced_similarity=enhanced_scores)
            career_matches = career_matches.sort_values('enhanced_similarity', ascending=False)
        else:
            # Nothing to enhance; keep the frame ordered by base similarity
            career_matches = career_matches.sort_values('similarity', ascending=False)

        # Get top N matches
        top_matches = career_matches.head(top_n)

        # Calculate skill gaps for each match
        recommendations = []
        all_skill_gaps = set()

        for _, match in top_matches.iterrows():
            # Get required skills for this career
            required_skills = {skill.lower() for skill in match['skills_list']}

            # Skills the career needs that the user does not have
            skill_gaps = required_skills - user_skills
            skills_matched = len(required_skills) - len(skill_gaps)

            # Calculate match percentage (a career with no requirements is a full match)
            if required_skills:
                match_percentage = skills_matched / len(required_skills) * 100
            else:
                match_percentage = 100

            # Blend: 70% raw skill coverage, up to 30 points from the enhanced score
            if 'enhanced_similarity' in match:
                match_percentage = 0.7 * match_percentage + 30 * match['enhanced_similarity']

            # Sorted so the output does not depend on set iteration order
            gap_names = sorted(skill.title() for skill in skill_gaps)

            # Add to recommendations
            recommendations.append({
                'jobTitle': match['job_title'],
                'industry': match['industry'],
                'matchPercentage': round(match_percentage, 1),
                'skillsMatched': skills_matched,
                'totalSkillsRequired': len(required_skills),
                'skillGaps': gap_names,
                'minSalary': int(match['min_salary']),
                'maxSalary': int(match['max_salary']),
                'remoteOptions': match['remote_options'],
                'growthPotential': int(match['growth_potential']),
                'careerType': match['career_type'],
                'jobId': int(match['job_id'])
            })

            # Add to overall skill gaps
            all_skill_gaps.update(gap_names)

        # Generate learning recommendations for skill gaps (stable key order)
        learning_paths = self._generate_learning_paths(sorted(all_skill_gaps))

        return {
            'careerRecommendations': recommendations,
            'learningPaths': learning_paths
        }

    def _enhance_match_scores(self, career_matches, user_profile, career_type):
        """
        Use the deep learning model to enhance match scores based on additional factors
        """
        # Extract career stage as numeric value
        stage_values = {
            'entry-level': 0.0,
            'early-career': 0.25,
            'mid-career': 0.5,
            'experienced': 0.75,
            'senior': 1.0
        }

        user_stage_value = stage_values.get(user_profile.get('careerStage', 'entry-level'), 0.0)

        # Prepare input features for model
        features = []
        for _, row in career_matches.iterrows():
            # Create feature vector for each job match
            is_new_seeker = 1.0 if career_type == "new_seeker" else 0.0
            job_stage_value = stage_values.get(row['career_stage'], 0.0)

            # Extract career trajectory and experience data
            trajectory_data = user_profile.get('careerTrajectory', {})
            direction_value = 0.5  # neutral
            if trajectory_data.get('direction') == 'upward':
                direction_value = 1.0
            elif trajectory_data.get('direction') == 'downward':
                direction_value = 0.0

            pattern_value = 0.5  # default
            if trajectory_data.get('pattern') == 'specialized':
                pattern_value = 1.0
            elif trajectory_data.get('pattern') == 'diverse':
                pattern_value = 0.0

            # Create feature vector: [similarity, user_stage, job_stage, stage_diff, is_new_seeker, direction, pattern]
            feature = [
                row['similarity'],  # Base similarity score
                user_stage_value,   # User career stage
                job_stage_value,    # Job career stage
                abs(user_stage_value - job_stage_value),  # Stage difference
                is_new_seeker,      # If user is new job seeker
                direction_value,    # Career trajectory direction
                pattern_value       # Career pattern
            ]

            features.append(feature)

        # If we have features, predict enhanced scores
        if features:
            features_array = np.array(features)

            # If model is untrained, blend features in a simple way
            # When we have real feedback data, we'd use predictions from the trained model
            enhanced_scores = features_array[:, 0] * 0.7 + \
                            (1 - features_array[:, 3]) * 0.2 + \
                            (features_array[:, 4] if career_type == "new_seeker" else (1 - features_array[:, 4])) * 0.1

            return enhanced_scores

        # Fallback to original similarity scores
        return career_matches['similarity'].values

    def add_feedback(self, user_profile, job_id, feedback_score):
        """
        Add user feedback for reinforcement learning

        Args:
            user_profile: User profile dictionary
            job_id: ID of the job that received feedback
            feedback_score: Score from 1-5 indicating how good the match was
        """
        # Get job details
        job = self.career_data[self.career_data['job_id'] == job_id].iloc[0] if any(self.career_data['job_id'] == job_id) else None

        if job is not None:
            # Extract features for this feedback instance
            stage_values = {
                'entry-level': 0.0,
                'early-career': 0.25,
                'mid-career': 0.5,
                'experienced': 0.75,
                'senior': 1.0
            }

            user_stage_value = stage_values.get(user_profile.get('careerStage', 'entry-level'), 0.0)
            job_stage_value = stage_values.get(job['career_stage'], 0.0)
            is_new_seeker = 1.0 if job['career_type'] == "new_seeker" else 0.0

            # Extract trajectory data
            trajectory_data = user_profile.get('careerTrajectory', {})
            direction_value = 0.5
            if trajectory_data.get('direction') == 'upward':
                direction_value = 1.0
            elif trajectory_data.get('direction') == 'downward':
                direction_value = 0.0

            pattern_value = 0.5
            if trajectory_data.get('pattern') == 'specialized':
                pattern_value = 1.0
            elif trajectory_data.get('pattern') == 'diverse':
                pattern_value = 0.0

            # Calculate similarity score
            user_skills = set()
            for skill_category in ['technical', 'soft', 'tools', 'methodologies', 'domain', 'languages']:
                if skill_category in user_profile['skills']:
                    user_skills.update([skill['name'].lower() for skill in user_profile['skills'][skill_category]])

            if 'inferred' in user_profile['skills']:
                user_skills.update([skill['name'].lower() for skill in user_profile['skills']['inferred']])

            required_skills = set(job['skills_list']) if isinstance(job['skills_list'], list) else set(job['required_skills'].split('|'))
            skill_overlap = len(user_skills.intersection(required_skills))
            similarity = skill_overlap / len(required_skills) if len(required_skills) > 0 else 0

            # Create feature vector
            feature = [
                similarity,         # Base similarity score
                user_stage_value,   # User career stage
                job_stage_value,    # Job career stage
                abs(user_stage_value - job_stage_value),  # Stage difference
                is_new_seeker,      # If user is new job seeker
                direction_value,    # Career trajectory direction
                pattern_value       # Career pattern
            ]

            # Normalize feedback score to 0-1 range
            normalized_score = (feedback_score - 1) / 4.0

            # Add to feedback data
            self.feedback_data.append((feature, normalized_score))

            # Update model if we have enough data
            if len(self.feedback_data) >= 10:
                self._update_model()

    def _update_model(self):
        """
        Update the recommendation model with accumulated feedback
        """
        if len(self.feedback_data) == 0:
            return

        # Prepare training data
        X = np.array([f[0] for f in self.feedback_data])
        y = np.array([f[1] for f in self.feedback_data])

        # Train model with new data
        self.model.fit(X, y, epochs=50, batch_size=8, verbose=0)

        # Save updated model
        self.model.save(self.model_path)
        print(f"Updated recommendation model with {len(self.feedback_data)} feedback examples")

    def _generate_learning_paths(self, skill_gaps):
        """
        Generate learning recommendations for skill gaps
        """
        # In a real implementation, this would query a learning resources database
        # For this example, we'll generate sample learning resources

        learning_paths = {}

        for skill in skill_gaps:
            # Generate 2-3 learning resources for each skill
            num_resources = random.randint(2, 3)
            resources = []

            for i in range(num_resources):
                # Determine resource type
                resource_type = random.choice(['Course', 'Tutorial', 'Book', 'Workshop', 'Certification'])

                # Determine provider based on type
                if resource_type == 'Course':
                    provider = random.choice(['Coursera', 'Udemy', 'edX', 'LinkedIn Learning', 'Pluralsight'])
                elif resource_type == 'Tutorial':
                    provider = random.choice(['YouTube', 'FreeCodeCamp', 'W3Schools', 'Khan Academy', 'TutorialsPoint'])
                elif resource_type == 'Book':
                    provider = random.choice(['O\'Reilly', 'Manning', 'Packt', 'Apress', 'No Starch Press'])
                elif resource_type == 'Workshop':
                    provider = random.choice(['General Assembly', 'Codecademy', 'Bootcamp', 'University Extension'])
                else:  # Certification
                    provider = random.choice(['Microsoft', 'AWS', 'Google', 'Cisco', 'CompTIA', 'Oracle'])

                # Determine difficulty
                difficulty = random.choice(['beginner', 'intermediate', 'advanced'])

                # Determine duration based on type
                if resource_type == 'Course':
                    duration = f"{random.randint(4, 12)} weeks"
                elif resource_type == 'Tutorial':
                    duration = f"{random.randint(1, 10)} hours"
                elif resource_type == 'Book':
                    duration = f"{random.randint(10, 30)} hours"
                elif resource_type == 'Workshop':
                    duration = f"{random.randint(1, 5)} days"
                else:  # Certification
                    duration = f"{random.randint(1, 6)} months"

                # Create resource
                resource = {
                    'title': f"{skill} {resource_type}",
                    'provider': provider,
                    'type': resource_type,
                    'duration': duration,
                    'difficulty': difficulty,
                    'url': f"https://example.com/{skill.lower().replace(' ', '-')}"
                }

                resources.append(resource)

            # Sort resources by difficulty (beginner to advanced)
            difficulty_order = {'beginner': 0, 'intermediate': 1, 'advanced': 2}
            resources.sort(key=lambda x: difficulty_order[x['difficulty']])

            learning_paths[skill] = resources

        return learning_paths

# Evaluation helpers that operate on a CareerRecommender instance

def evaluate_model_ndcg(recommender, test_profiles, relevance_scores, top_k=10):
    """
    Calculate Normalized Discounted Cumulative Gain (NDCG@k) for career recommendations

    Args:
        recommender: Object whose match_careers(profile, top_n=...) returns a dict with a
                     'careerRecommendations' list of dicts carrying a 'jobId'
        test_profiles: List of user profiles (dicts with an 'id' key) for testing
        relevance_scores: Dictionary mapping profile_id to dict of {job_id: relevance_score}
                         where relevance_score is typically 0-3 (0=irrelevant, 3=perfect match)
        top_k: Number of recommendations to consider

    Returns:
        ndcg_score: Average NDCG over the profiles that have relevance judgments
                    (0 when no profile could be scored)
    """
    import numpy as np

    def discounted_gain(relevances):
        # Exponential gain (2^rel - 1) discounted by log2(rank + 1) with ranks starting
        # at 1; enumerate() is 0-based, hence the "+ 2".
        return sum((2**rel - 1) / np.log2(i + 2) for i, rel in enumerate(relevances))

    ndcg_scores = []

    for profile in test_profiles:
        profile_id = profile['id']

        # Skip profiles without relevance judgments
        if profile_id not in relevance_scores:
            continue
        judgments = relevance_scores[profile_id]

        # Get recommendations for this profile
        recommendations = recommender.match_careers(profile, top_n=top_k)

        # Enforce the cutoff on our side too: the ideal ranking below is limited to
        # top_k entries, so scoring a longer list could push NDCG above 1.
        recommended_jobs = [
            rec['jobId'] for rec in recommendations['careerRecommendations']
        ][:top_k]

        # DCG of the actual ranking (jobs without a judgment count as relevance 0)
        dcg = discounted_gain(judgments.get(job_id, 0) for job_id in recommended_jobs)

        # IDCG: the best achievable ranking of the judged jobs
        ideal_rels = sorted(judgments.values(), reverse=True)[:top_k]
        idcg = discounted_gain(ideal_rels)

        # A profile whose judgments are all zero has no achievable gain; score it 0
        ndcg_scores.append(dcg / idcg if idcg > 0 else 0)

    # Average NDCG across all scored profiles
    return sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0

def create_test_relevance_scores(test_profiles, all_job_ids, num_relevant=5):
    """
    Build synthetic relevance judgments for NDCG evaluation

    For every profile a random subset of jobs is marked relevant and each of
    those jobs receives a random grade between 1 and 3.

    Args:
        test_profiles: List of test user profiles (each with an 'id' key)
        all_job_ids: List of all job IDs
        num_relevant: Number of relevant jobs per profile (capped at len(all_job_ids))

    Returns:
        relevance_scores: Dictionary of {profile_id: {job_id: relevance_score}}
    """
    import random

    judgments_by_profile = {}

    for candidate in test_profiles:
        candidate_id = candidate['id']

        # Draw the relevant jobs first, then grade them one by one
        # (0 would mean irrelevant, so grades start at 1; 3 = perfect match)
        picked = random.sample(all_job_ids, min(num_relevant, len(all_job_ids)))
        judgments_by_profile[candidate_id] = {
            job: random.randint(1, 3) for job in picked
        }

    return judgments_by_profile


In [None]:
def generate_visualization_data(profile, recommendations):
    """
    Assemble the chart-ready data structures for all four visualizations

    Args:
        profile: User profile; its 'skills' and 'experience' entries are read here and
                 the whole profile is passed on to the industry compatibility builder
        recommendations: Recommendation result; its 'careerRecommendations' and
                         'learningPaths' entries are read

    Returns:
        Dictionary with keys 'skillRadarData', 'careerPathData',
        'learningPathwayData' and 'industryCompatibilityData'
    """
    return {
        # 1. Skill radar chart
        'skillRadarData': generate_skill_radar_data(profile['skills']),
        # 2. Career path graph (past roles -> recommended roles)
        'careerPathData': generate_career_path_data(
            profile['experience'],
            recommendations['careerRecommendations'],
        ),
        # 3. Learning pathway timeline
        'learningPathwayData': generate_learning_pathway_data(
            recommendations['learningPaths']
        ),
        # 4. Industry compatibility bars
        'industryCompatibilityData': generate_industry_compatibility_data(
            profile,
            recommendations['careerRecommendations'],
        ),
    }

def generate_visualizations(profile, recommendations):
    """
    Render every chart and write each one to a PNG in the working directory

    Args:
        profile: User profile generated from generate_user_profile()
        recommendations: Recommendations from CareerRecommender.match_careers()

    Returns:
        Dictionary mapping chart key ('skill_radar', 'career_path',
        'learning_pathway', 'industry_compatibility') to the saved file path
    """
    import matplotlib.pyplot as plt

    chart_inputs = generate_visualization_data(profile, recommendations)
    saved_files = {}

    def persist(label, figure, filename):
        # Save at print resolution, then close the figure so it does not stay open
        figure.savefig(filename, dpi=300, bbox_inches='tight')
        plt.close(figure)
        saved_files[label] = filename

    # Each chart is rendered, saved and closed before the next one is built
    persist(
        'skill_radar',
        visualize_skill_radar(chart_inputs['skillRadarData']),
        'skill_radar_chart.png',
    )
    persist(
        'career_path',
        visualize_career_path(chart_inputs['careerPathData']),
        'career_path_visualization.png',
    )
    persist(
        'learning_pathway',
        visualize_learning_pathway(chart_inputs['learningPathwayData']),
        'learning_pathway_timeline.png',
    )
    persist(
        'industry_compatibility',
        visualize_industry_compatibility(chart_inputs['industryCompatibilityData']),
        'industry_compatibility_chart.png',
    )

    print("All visualizations generated and saved:")
    for label, filename in saved_files.items():
        print(f"- {label}: {filename}")

    return saved_files

def visualize_skill_radar(skills_data):
    """
    Draw a radar (polar) chart of skill proficiency per category

    Args:
        skills_data: Data from generate_skill_radar_data(): a list of dicts with
                     'category' (axis label) and 'value' (plotted on a 0-100 scale)

    Returns:
        The matplotlib figure holding the chart
    """
    import matplotlib.pyplot as plt
    import numpy as np

    axis_labels = [entry['category'] for entry in skills_data]
    scores = [entry['value'] for entry in skills_data]
    axis_count = len(axis_labels)

    # One evenly spaced spoke per category
    spoke_angles = [idx / float(axis_count) * 2 * np.pi for idx in range(axis_count)]

    # Repeat the first point at the end so the outline closes on itself
    outline_angles = spoke_angles + spoke_angles[:1]
    outline_scores = scores + scores[:1]

    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    # Category labels sit on the spokes (without the duplicated closing point)
    plt.xticks(spoke_angles, axis_labels, color='black', size=14)

    # Outline plus translucent fill
    ax.plot(outline_angles, outline_scores, 'o-', linewidth=2, color='#4C72B0')
    ax.fill(outline_angles, outline_scores, color='#4C72B0', alpha=0.25)

    # Radial axis: percentage rings from 0 to 100
    ax.set_rlabel_position(0)
    plt.yticks([20, 40, 60, 80, 100], ["20%", "40%", "60%", "80%", "100%"], color="grey", size=12)
    plt.ylim(0, 100)

    plt.title('Skill Proficiency by Category', size=20, y=1.1, fontweight='bold')

    # Lighter look: no outer ring, faint grid
    ax.spines['polar'].set_visible(False)
    ax.grid(color='lightgray', linestyle='-', linewidth=0.5)

    return fig

def visualize_career_path(career_path_data):
    """
    Draw past roles and recommended future roles as a directed graph

    Past roles are stacked in a column on the left, future roles in a column on
    the right; 'history' edges are solid grey, 'potential' edges dashed blue.

    Args:
        career_path_data: Data from generate_career_path_data(): a dict with
                          'nodes' (each typed 'past' or 'future') and 'edges'

    Returns:
        The matplotlib figure holding the graph
    """
    import matplotlib.pyplot as plt
    import networkx as nx

    graph = nx.DiGraph()

    node_records = career_path_data['nodes']
    edge_records = career_path_data['edges']

    earlier_roles = [rec for rec in node_records if rec['type'] == 'past']
    upcoming_roles = [rec for rec in node_records if rec['type'] == 'future']

    # Every field of a node record is stored as a graph node attribute
    for rec in node_records:
        graph.add_node(rec['id'], **rec)

    for link in edge_records:
        graph.add_edge(
            link['source'],
            link['target'],
            weight=link.get('weight', 1.0),
            type=link['type'],
        )

    fig, ax = plt.subplots(figsize=(12, 8))

    # Layout: past roles at x=0 (first role on top), future roles at x=3
    earlier_count = len(earlier_roles)
    upcoming_count = len(upcoming_roles)
    layout = {}
    for row, rec in enumerate(earlier_roles):
        layout[rec['id']] = (0, earlier_count - row)
    for row, rec in enumerate(upcoming_roles):
        # NOTE(review): this spaces future nodes at (n+1)/1, (n+1)/2, ... rather than
        # evenly like the past column - confirm whether that is intended
        layout[rec['id']] = (3, (upcoming_count + 1) / (row + 1))

    earlier_ids = [rec['id'] for rec in earlier_roles]
    upcoming_ids = [rec['id'] for rec in upcoming_roles]

    # Split edges by kind so each kind gets its own line style
    solid_links = [(src, dst) for src, dst, attrs in graph.edges(data=True)
                   if attrs['type'] == 'history']
    dashed_links = [(src, dst) for src, dst, attrs in graph.edges(data=True)
                    if attrs['type'] == 'potential']

    nx.draw_networkx_edges(graph, layout, edgelist=solid_links, edge_color='gray',
                           width=2, arrowsize=15, alpha=0.7)
    nx.draw_networkx_edges(graph, layout, edgelist=dashed_links, edge_color='#4C72B0',
                           width=1.5, style='dashed', arrowsize=15, alpha=0.5)

    nx.draw_networkx_nodes(graph, layout, nodelist=earlier_ids, node_color='#FF7043',
                           node_size=1000, alpha=0.9)
    nx.draw_networkx_nodes(graph, layout, nodelist=upcoming_ids, node_color='#4CAF50',
                           node_size=1000, alpha=0.7)

    # Past labels: title + company; future labels: title + industry + match
    captions = {}
    for rec in earlier_roles:
        captions[rec['id']] = f"{rec['title']}\n{rec['company']}"
    for rec in upcoming_roles:
        captions[rec['id']] = f"{rec['title']}\n{rec['industry']}\n{rec['matchPercentage']}% match"

    nx.draw_networkx_labels(graph, layout, labels=captions, font_size=10, font_weight='bold')

    plt.title('Career Path Trajectory', size=20, fontweight='bold')
    plt.axis('off')  # The coordinates are layout-only, so hide the axes

    # Legend built from proxy artists, since the graph drawing adds none
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#FF7043',
                   markersize=15, label='Past Roles'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#4CAF50',
                   markersize=15, label='Potential Future Roles'),
        plt.Line2D([0], [0], color='gray', lw=2, label='Career History'),
        plt.Line2D([0], [0], color='#4C72B0', lw=2, linestyle='--',
                   label='Potential Paths'),
    ]
    plt.legend(handles=legend_handles,
               loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=4)

    plt.tight_layout()
    return fig

def visualize_learning_pathway(learning_pathway_data):
    """
    Create a timeline visualization of recommended learning pathways

    Each learning item becomes a horizontal bar spanning startWeek..endWeek on the
    row of its skill, coloured by resource type.

    Args:
        learning_pathway_data: Data from generate_learning_pathway_data(): a list of
                               dicts with 'skill', 'title', 'type', 'startWeek', 'endWeek'.
                               May be empty, in which case a placeholder chart is returned.

    Returns:
        The matplotlib figure holding the timeline
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches

    # Sort by start week
    learning_items = sorted(learning_pathway_data, key=lambda x: x['startWeek'])

    # Set up the plot
    fig, ax = plt.subplots(figsize=(12, 8))
    plt.title('Learning Pathway Timeline', size=20, fontweight='bold')

    if not learning_items:
        # Nothing to schedule (e.g. no skill gaps): return a placeholder figure
        # instead of failing on max() of an empty sequence further down.
        ax.text(0.5, 0.5, 'No learning resources to display',
                ha='center', va='center', fontsize=14, color='grey',
                transform=ax.transAxes)
        ax.set_xticks([])
        ax.set_yticks([])
        plt.tight_layout()
        return fig

    # Define colors for different resource types
    resource_colors = {
        'Course': '#3498db',
        'Tutorial': '#2ecc71',
        'Book': '#9b59b6',
        'Workshop': '#e74c3c',
        'Certification': '#f39c12'
    }

    # One row per skill, in order of first appearance on the timeline.
    # dict.fromkeys de-duplicates while keeping order, so the row order is stable
    # between runs (iterating a set of strings is not).
    skills = list(dict.fromkeys(item['skill'] for item in learning_items))
    skill_positions = {skill: i for i, skill in enumerate(skills)}

    # Draw timeline items
    for item in learning_items:
        start_week = item['startWeek']
        end_week = item['endWeek']
        duration = end_week - start_week
        row = skill_positions[item['skill']]

        # Bar for this learning item, centred on its skill row
        ax.add_patch(patches.Rectangle(
            (start_week, row - 0.3),
            duration,
            0.6,
            linewidth=1,
            edgecolor='black',
            facecolor=resource_colors.get(item['type'], '#7f8c8d'),
            alpha=0.7
        ))

        # Label only bars wide enough to hold the text
        if duration > 2:
            ax.text(
                start_week + duration / 2,
                row,
                item['title'],
                ha='center',
                va='center',
                fontsize=9,
                fontweight='bold',
                color='white'
            )

    # Set y-axis labels (skills)
    plt.yticks(list(skill_positions.values()), list(skill_positions.keys()))

    # Calculate max end week for x-axis
    max_end_week = max(item['endWeek'] for item in learning_items)

    # Set x-axis labels (every second week)
    week_ticks = range(0, max_end_week + 2, 2)
    plt.xticks(week_ticks, [f'Week {w}' for w in week_ticks])

    # Patches added with add_patch() are not relied on to autoscale the view, so
    # pin the limits explicitly; this keeps the first and last rows (which extend
    # 0.3 beyond their tick) and the last bar fully visible.
    ax.set_xlim(0, max_end_week + 1)
    ax.set_ylim(-0.5, len(skills) - 0.5)

    # Axis labels
    plt.xlabel('Timeline', fontsize=14)
    plt.ylabel('Skills to Develop', fontsize=14)

    # Add grid
    plt.grid(axis='x', linestyle='--', alpha=0.7)

    # Add legend for resource types
    legend_elements = [
        patches.Patch(facecolor=color, edgecolor='black', label=res_type)
        for res_type, color in resource_colors.items()
    ]

    plt.legend(handles=legend_elements, loc='upper center',
               bbox_to_anchor=(0.5, -0.1), ncol=5)

    plt.tight_layout()
    return fig

def visualize_industry_compatibility(compatibility_data):
    """
    Draw a grouped bar chart of the five most compatible industries

    Industries are ranked by 0.5 * skillMatch + 0.25 * personalityFit +
    0.25 * valuesAlignment and each of the top five gets three bars.

    Args:
        compatibility_data: Data from generate_industry_compatibility_data()

    Returns:
        The matplotlib figure holding the chart
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import seaborn as sns

    def overall_score(entry):
        return (entry['skillMatch'] * 0.5 + entry['personalityFit'] * 0.25 + entry['valuesAlignment'] * 0.25)

    # Keep only the top 5 so the chart stays readable
    leaders = sorted(compatibility_data, key=overall_score, reverse=True)[:5]

    names = [entry['industry'] for entry in leaders]
    skill_scores = [entry['skillMatch'] for entry in leaders]
    personality_scores = [entry['personalityFit'] for entry in leaders]
    values_scores = [entry['valuesAlignment'] for entry in leaders]

    fig, ax = plt.subplots(figsize=(12, 8))

    bar_width = 0.25

    # Three bars per industry, each shifted one bar width to the right
    first_slots = np.arange(len(names))
    second_slots = [x + bar_width for x in first_slots]
    third_slots = [x + bar_width for x in second_slots]

    bar_series = (
        (first_slots, skill_scores, '#3498db', 'Skill Match'),
        (second_slots, personality_scores, '#2ecc71', 'Personality Fit'),
        (third_slots, values_scores, '#9b59b6', 'Values Alignment'),
    )
    for slots, heights, fill, caption in bar_series:
        ax.bar(slots, heights, width=bar_width, color=fill,
               edgecolor='grey', label=caption)

    plt.xlabel('Industry', fontsize=14)
    plt.ylabel('Compatibility Score (%)', fontsize=14)
    plt.title('Industry Compatibility Analysis', fontsize=20, fontweight='bold')

    # Tick under the middle bar of each group
    plt.xticks([r + bar_width for r in range(len(names))], names, rotation=45, ha='right')

    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3)

    plt.grid(axis='y', linestyle='--', alpha=0.7)
    sns.despine(left=False, bottom=False)

    plt.tight_layout()
    return fig

def generate_skill_radar_data(skills):
    """
    Build the per-category averages shown on the skill radar chart

    Args:
        skills: Dict mapping profile skill keys ('technical', 'tools', ...) to lists
                of skill dicts; a skill without 'confidence' counts as 0.5

    Returns:
        List of {'category': label, 'value': average confidence as a percentage
        rounded to one decimal}, one entry per radar axis; empty or missing
        categories get 0
    """
    # Radar axis label -> key under which the profile stores those skills
    axis_sources = (
        ('Technical', 'technical'),
        ('Tools', 'tools'),
        ('Methodologies', 'methodologies'),
        ('Domain Knowledge', 'domain'),
        ('Soft Skills', 'soft'),
        ('Languages', 'languages'),
    )

    radar_points = []
    for axis_label, source_key in axis_sources:
        entries = skills.get(source_key, [])

        mean_confidence = 0
        if entries:
            total = sum(entry.get('confidence', 0.5) for entry in entries)
            mean_confidence = total / len(entries)

        radar_points.append({
            'category': axis_label,
            'value': round(mean_confidence * 100, 1),  # fraction -> percentage
        })

    return radar_points

def generate_career_path_data(experience, recommendations):
    """
    Build the node/edge structure for the career path graph

    Args:
        experience: List of past jobs (dicts with 'title', 'company',
                    'companyInfo' -> 'industry', and 'dateRange')
        recommendations: List of career recommendations; only the first three are used

    Returns:
        Dict with 'nodes' (past nodes followed by future nodes) and 'edges'
        ('history' edges between consecutive past nodes, then 'potential' edges
        from the last past node to every future node)
    """
    # One node per past job, in the order given
    history_nodes = [
        {
            'id': f'past-{idx}',
            'type': 'past',
            'title': job['title'],
            'company': job['company'],
            'industry': job['companyInfo']['industry'],
            'dateRange': job['dateRange'],
        }
        for idx, job in enumerate(experience)
    ]

    # One node per recommendation, limited to the top 3
    target_nodes = [
        {
            'id': f'future-{idx}',
            'type': 'future',
            'title': rec['jobTitle'],
            'industry': rec['industry'],
            'matchPercentage': rec['matchPercentage'],
            'skillGaps': rec['skillGaps'],
        }
        for idx, rec in enumerate(recommendations[:3])
    ]

    # Chain consecutive past jobs together
    links = [
        {'source': earlier['id'], 'target': later['id'], 'type': 'history'}
        for earlier, later in zip(history_nodes, history_nodes[1:])
    ]

    # Fan out from the last listed job to each recommended role; the edge weight
    # is the match percentage expressed as a fraction
    if history_nodes:
        anchor_id = history_nodes[-1]['id']
        links.extend(
            {
                'source': anchor_id,
                'target': target['id'],
                'type': 'potential',
                'weight': target['matchPercentage'] / 100,
            }
            for target in target_nodes
        )

    return {
        'nodes': history_nodes + target_nodes,
        'edges': links,
    }

def generate_learning_pathway_data(learning_paths):
    """
    Lay the recommended learning resources out on a sequential week timeline

    At most two resources per skill are scheduled, back to back, and one empty
    week is left after each skill.

    Args:
        learning_paths: Dict mapping skill name to a list of resource dicts with
                        'title', 'provider', 'type', 'duration', 'difficulty', 'url'

    Returns:
        List of timeline items carrying 'startWeek' and 'endWeek' alongside the
        resource details
    """
    # Unit keyword -> conversion of the first number in the duration text to weeks.
    # Checked in this order; the first keyword found in the text decides.
    unit_rules = (
        ('week', lambda amount: amount),
        ('month', lambda amount: amount * 4),
        ('day', lambda amount: max(1, amount // 5)),    # 5 days per week
        ('hour', lambda amount: max(1, amount // 20)),  # 20 hours per week
    )

    schedule = []
    week_cursor = 0

    for skill, resources in learning_paths.items():
        for slot, resource in enumerate(resources[:2]):
            duration_text = resource['duration']

            # Falls back to one week when no unit or no number is recognised
            span_weeks = 1
            for keyword, to_weeks in unit_rules:
                if keyword in duration_text:
                    number = re.search(r'(\d+)', duration_text)
                    if number:
                        span_weeks = to_weeks(int(number.group(1)))
                    break

            finish_week = week_cursor + span_weeks
            schedule.append({
                'id': f'{skill}-{slot}',
                'title': resource['title'],
                'skill': skill,
                'provider': resource['provider'],
                'type': resource['type'],
                'startWeek': week_cursor,
                'endWeek': finish_week,
                'difficulty': resource['difficulty'],
                'url': resource['url']
            })

            week_cursor = finish_week

        # Buffer week between skills
        week_cursor += 1

    return schedule

def generate_industry_compatibility_data(profile, recommendations):
    """
    Generates data for industry compatibility visualization

    Recommendations are grouped by industry and each industry gets an average
    match percentage, an optional personality fit and values alignment taken
    from the profile, and the number of distinct skill gaps.

    Args:
        profile: User profile dict. If present, 'personalityMatch' (entries with
                 'career' and 'fitScore') and 'valuesAlignment' (entries with
                 'industry' and 'alignmentScore') are used; otherwise those scores are 0
        recommendations: List of recommendation dicts with 'industry',
                         'matchPercentage' and 'skillGaps'

    Returns:
        List of dicts with 'industry', 'skillMatch', 'personalityFit',
        'valuesAlignment' and 'gapCount', sorted by the weighted overall score
        (0.5 / 0.25 / 0.25), best first. Industries with equal scores keep the
        order in which they first appear in `recommendations`.
    """
    # Group in a single pass. A dict keeps first-seen order, so the output is
    # reproducible between runs (iterating a set of industry names is not).
    recs_by_industry = {}
    for rec in recommendations:
        recs_by_industry.setdefault(rec['industry'], []).append(rec)

    compatibility_data = []

    for industry, industry_recs in recs_by_industry.items():
        # Calculate average match percentage
        avg_match = sum(rec['matchPercentage'] for rec in industry_recs) / len(industry_recs)

        # Collect the distinct skill gaps across this industry's recommendations
        skill_gaps = set()
        for rec in industry_recs:
            skill_gaps.update(rec['skillGaps'])

        # Personality fit: first entry whose career text mentions the industry name
        personality_fit = 0
        if 'personalityMatch' in profile:
            industry_lower = industry.lower()
            match = next(
                (m for m in profile['personalityMatch'] if industry_lower in m['career'].lower()),
                None,
            )
            if match:
                personality_fit = match['fitScore'] * 100

        # Values alignment: first entry for exactly this industry
        values_alignment = 0
        if 'valuesAlignment' in profile:
            match = next((a for a in profile['valuesAlignment'] if a['industry'] == industry), None)
            if match:
                values_alignment = match['alignmentScore'] * 100

        compatibility_data.append({
            'industry': industry,
            'skillMatch': round(avg_match, 1),
            'personalityFit': round(personality_fit, 1),
            'valuesAlignment': round(values_alignment, 1),
            'gapCount': len(skill_gaps)
        })

    # Sort by overall score; the sort is stable, so ties keep first-seen order
    compatibility_data.sort(key=lambda x: (
        x['skillMatch'] * 0.5 +
        x['personalityFit'] * 0.25 +
        x['valuesAlignment'] * 0.25
    ), reverse=True)

    return compatibility_data

In [None]:
def process_resume(file_path, career_type="new_seeker"):
    """
    Main function to process a resume PDF file

    Runs the pipeline in order: PDF validation, text extraction, structure
    recognition, skill / experience / education analysis, profile generation,
    career matching and chart generation. Progress is printed along the way and
    the saved charts are displayed inline through IPython.

    Args:
        file_path: Path to the resume PDF file
        career_type: "new_seeker" or "transitioner"

    Returns:
        Dictionary with keys "profile", "recommendations", "visualizationData"
        and "visualizationPaths"

    Raises:
        ValueError: If validate_pdf() reports the file as invalid
        Exception: Any error raised by a pipeline step is printed and re-raised
    """
    try:
        print("Starting resume processing...")

        # Step 1: Validate PDF
        validation_result = validate_pdf(file_path)
        if not validation_result["valid"]:
            raise ValueError(f"Invalid PDF: {validation_result['error']}")

        # Step 2: Extract text
        print("Extracting text from PDF...")
        text_data = extract_text_from_pdf(file_path)

        # Step 3: Recognize document structure
        print("Recognizing document structure...")
        structured_data = recognize_document_structure(text_data)

        # Step 4: Extract skills
        print("Extracting skills...")
        skills = extract_skills(structured_data)

        # Step 5: Contextualize experience
        print("Contextualizing experience...")
        experiences = contextualize_experience(structured_data)

        # Step 6: Analyze education
        print("Analyzing education...")
        education_data = analyze_education(structured_data)

        # Step 7: Generate profile
        print("Generating user profile...")
        profile = generate_user_profile({
            "structured_data": structured_data,
            "skills": skills,
            "experiences": experiences,
            "education_data": education_data
        })

        # Step 8-9: Generate career recommendations using our ML-enhanced recommender
        print("Generating career recommendations...")
        recommender = CareerRecommender()
        recommendations = recommender.match_careers(profile, career_type=career_type)

        # Step 10: Generate visualization data (returned to the caller as-is)
        print("Generating visualization data...")
        visualization_data = generate_visualization_data(profile, recommendations)

        # Step 11: Render the charts and save them as image files
        print("Creating visual representations...")
        visualization_paths = generate_visualizations(profile, recommendations)

        # Imported here so the module can be loaded without IPython installed
        from IPython.display import display, Image, Markdown

        # Show career recommendations first
        print("\n--- TOP CAREER RECOMMENDATIONS ---")
        for i, rec in enumerate(recommendations['careerRecommendations'][:5]):
            print(f"{i+1}. {rec['jobTitle']} ({rec['industry']}) - {rec['matchPercentage']}% match")
            print(f"   Salary range: ${rec['minSalary']:,} - ${rec['maxSalary']:,}")
            if rec['skillGaps']:
                print(f"   Skills to develop: {', '.join(rec['skillGaps'][:3])}")
            print()

        # Display all visualizations: heading followed by the saved image
        print("\n--- VISUALIZATIONS ---")
        charts = [
            ("### 1. Skill Radar Chart", 'skill_radar'),
            ("### 2. Career Path Visualization", 'career_path'),
            ("### 3. Learning Pathway Timeline", 'learning_pathway'),
            ("### 4. Industry Compatibility Chart", 'industry_compatibility'),
        ]
        for heading, path_key in charts:
            display(Markdown(heading))
            display(Image(visualization_paths[path_key]))

        # Combine all results
        result = {
            "profile": profile,
            "recommendations": recommendations,
            "visualizationData": visualization_data,
            "visualizationPaths": visualization_paths
        }
        print("Resume processing completed successfully!")
        return result

    except Exception as e:
        print(f"Error processing resume: {str(e)}")
        # Bare raise re-raises the active exception with its original traceback
        raise

In [None]:
if __name__ == "__main__":
    # Entry point: build the career database on first run, analyse an uploaded
    # resume interactively, then run a quick NDCG sanity check on the recommender.

    # The database is generated only when the CSV is not already on disk
    if not os.path.exists('career_database.csv'):
        print("Generating career database...")
        generate_career_database()

    print("Welcome to Career Nexus!")
    print("Upload your resume PDF file:")

    # Colab renders its upload widget here; the result maps file names to contents
    from google.colab import files
    uploaded = files.upload()

    if uploaded:
        # Only the first uploaded file is analysed
        resume_file = next(iter(uploaded))

        # Anything other than "1" is treated as a career transitioner
        print("\nAre you a new job seeker or transitioning to a new career?")
        print("1. New job seeker (recent graduate or entering workforce)")
        print("2. Career transitioner (changing careers or industries)")

        career_choice = input("Enter your choice (1 or 2): ")
        if career_choice == "1":
            career_type = "new_seeker"
        else:
            career_type = "transitioner"

        print(f"\nProcessing resume: {resume_file} as a {career_type.replace('_', ' ')}...")
        result = process_resume(resume_file, career_type=career_type)

        # Persist the full analysis as JSON
        output_file = 'resume_analysis_result.json'
        with open(output_file, 'w') as f:
            json.dump(result, f, indent=2)

        print(f"Analysis complete! Results saved to {output_file}")

        # Recap of the five best matches
        print("\n--- TOP CAREER RECOMMENDATIONS ---")
        top_matches = result['recommendations']['careerRecommendations'][:5]
        for i, rec in enumerate(top_matches):
            print(f"{i+1}. {rec['jobTitle']} ({rec['industry']}) - {rec['matchPercentage']}% match")
            print(f"   Salary range: ${rec['minSalary']:,} - ${rec['maxSalary']:,}")
            if rec['skillGaps']:
                print(f"   Skills to develop: {', '.join(rec['skillGaps'][:3])}")
            print()

    # Model evaluation runs whether or not a resume was uploaded
    print("\n--- MODEL EVALUATION (NDCG) ---")

    # A separate recommender instance is used for the evaluation
    eval_recommender = CareerRecommender()

    # Every job ID known to the career database
    all_job_ids = eval_recommender.career_data['job_id'].tolist()

    # Five identical minimal profiles (only the id differs) keep the evaluation fast
    from datetime import datetime
    test_profiles = [
        {
            "id": f"test_{i}",
            "createdAt": datetime.now().isoformat(),
            "careerStage": "entry-level",
            "skills": {
                "technical": [{"name": "Python", "confidence": 0.8}],
                "soft": [{"name": "Communication", "confidence": 0.7}],
                "tools": [],
                "methodologies": [],
                "domain": [],
                "languages": [],
                "inferred": []
            },
            "experience": [],
            "education": []
        }
        for i in range(5)
    ]

    # Relevance judgments here are randomly generated, not real ground truth
    relevance_scores = create_test_relevance_scores(test_profiles, all_job_ids)

    # Score the recommender against those judgments
    ndcg_score = evaluate_model_ndcg(eval_recommender, test_profiles, relevance_scores)
    print(f"Model Quality (NDCG@10): {ndcg_score:.4f}")