In [None]:
# Ensure necessary libraries are installed:
!pip install spacy pymupdf matplotlib squarify numpy
!pip install language-tool-python
!pip install nltk textstat
# python -m spacy download en_core_web_sm

In [1]:
import spacy
from spacy.matcher import Matcher
from collections import Counter
import re
import matplotlib.pyplot as plt
import squarify # for treemap
import numpy as np
import fitz  # PyMuPDF
import math
import os # interact with the operating system. It is a standard library module, meaning it is included with Python and does not need to be installed separately.
import statistics
import language_tool_python
import nltk
from textstat import flesch_reading_ease
# --- Configuration ---

# Load the spaCy English pipeline once; the analysis functions in this file use
# this module-level `nlp` object.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as err:
    print("spaCy model 'en_core_web_sm' not found. Please run: python -m spacy download en_core_web_sm")
    # `exit()` is a helper the `site` module injects for interactive sessions and,
    # called without arguments, reports success (status 0).  Raising SystemExit
    # directly signals the failure with a non-zero status and needs no `site`.
    raise SystemExit(1) from err

# Vocabulary of skill names searched for in job descriptions and resumes.
# Entries are grouped by domain for readability only; the list is lowercased,
# de-duplicated and sorted right after the literal, so the grouping and order
# written here do not survive into the final KNOWN_SKILLS value.
KNOWN_SKILLS = [
    # Programming Languages
    "python", "java", "javascript", "c++", "c#", "php", "ruby", "swift", "kotlin", "scala", "go", "rust", "typescript",
    "sql", "nosql", "r", "matlab", "perl", "shell scripting", "bash",
    # Web Development
    "html", "css", "react", "react.js", "angular", "vue", "vue.js", "next.js", "nuxtjs", "nodejs", "node.js", "express", "express.js",
    "django", "flask", "rubyonrails", "asp.net", "jquery", "bootstrap", "tailwind css", "sass", "less",
    "webpack", "babel", "rest", "restful apis", "graphql", "soap", "ajax", "json", "xml", "yaml",
    # Mobile Development
    "android development", "ios development", "swiftui", "objective-c", "flutter", "react native", "xamarin", "kotlin multiplatform",
    # Databases
    "mysql", "postgresql", "mongodb", "oracle database", "sql server", "sqlite", "redis", "memcached", "cassandra", "elasticsearch",
    "dynamodb", "firebase",
    # Cloud & DevOps
    "aws", "amazon web services", "azure", "microsoft azure", "gcp", "google cloud platform", "docker", "kubernetes", "k8s",
    "jenkins", "gitlab ci", "github actions", "ansible", "terraform", "puppet", "chef", "ci/cd", "continuous integration", "continuous deployment",
    "linux", "unix", "server administration", "powershell", "serverless", "microservices", "nginx", "apache", "load balancing",
    # Data Science & ML/AI
    "machine learning", "deep learning", "nlp", "natural language processing", "computer vision", "data analysis",
    "data mining", "data visualization", "pandas", "numpy", "scipy", "scikit-learn", "sklearn", "tensorflow", "keras", "pytorch",
    "apache spark", "hadoop", "tableau", "power bi", "statistics", "big data", "etl", "data warehousing",
    # Software Engineering & Architecture
    "agile", "scrum", "kanban", "waterfall", "design patterns", "data structures", "object-oriented programming", "oop",
    "functional programming", "system design", "software architecture", "software development life cycle", "sdlc",
    "unit testing", "integration testing", "end-to-end testing", "qa", "quality assurance", "selenium", "cypress", "jest", "junit", "pytest", "tdd", "bdd",
    # Cybersecurity
    "cybersecurity", "information security", "penetration testing", "ethical hacking", "network security", "cryptography", "siem", "soc",
    # Business & Soft Skills
    "project management", "product management", "communication", "teamwork", "leadership", "problem solving", "analytical skills",
    "customer service", "technical writing", "ui/ux design", "user interface", "user experience", "figma", "adobe xd", "sketch",
    # Other Technical Skills
    "api design", "sdk development", "blockchain", "game development", "unity", "unreal engine", "virtual reality", "vr", "augmented reality", "ar",
    "embedded systems", "iot", "robotics", "photoshop", "illustrator", "autocad", "blender",
    # Marketing
    "digital marketing", "social media marketing", "content marketing", "email marketing", "paid advertising", "ppc", "search engine marketing", "sem",
    "marketing automation", "crm", "customer relationship management", "market research", "marketing strategy", "brand management", "public relations", "affiliate marketing",
    # SEO
    "search engine optimization", "seo", "seo strategy", "keyword research", "on-page seo", "off-page seo", "technical seo", "local seo", "seo analytics",
    # Branding
    "brand strategy", "brand identity", "brand messaging", "brand development", "brand awareness", "brand positioning", "brand communication",
    # Finance
    "financial analysis", "financial modeling", "budgeting", "forecasting", "accounting", "corporate finance", "investment management", "financial reporting",
    # Strategy
    "business strategy", "strategic planning", "market analysis", "competitive analysis", "growth strategy", "innovation strategy", "business development",
    # Sales
    "sales management", "sales strategy", "lead generation", "sales process", "account management", "negotiation", "sales presentations",
    "customer acquisition", "sales forecasting", "channel sales", "key account management", "solution selling", "sales operations",
    # Post-MBA Fields
    "management consulting", "operations management", "supply chain management", "human resources", "organizational development",
    "entrepreneurship", "venture capital", "private equity", "investment banking", "mergers and acquisitions", "real estate",
    "healthcare management", "technology management", "international business", "corporate social responsibility", "sustainability"
]
# Normalise the vocabulary: lowercase every entry, drop duplicates, sort alphabetically.
KNOWN_SKILLS = sorted(list(set(skill.lower() for skill in KNOWN_SKILLS)))


# Keywords for Job Description Skill Importance
# Maps a word or phrase to a weight.  rate_skills_from_job_description adds the
# weight of every phrase found (as a whole word/phrase) in each sentence that
# mentions a skill to that skill's raw importance score, so several phrases in
# one sentence accumulate.
IMPORTANCE_KEYWORDS = {
    # High importance (3.0 - 2.5): Non-negotiable or critical skills
    "must": 3.0,
    "required": 3.0,
    "essential": 3.0,
    "critical": 3.0,
    "mandatory": 3.0,
    "vital": 3.0,
    "expert": 2.8,
    "proficient": 2.5,

    # Moderate importance (2.0 - 1.1): Important but not always mandatory
    "strong": 2.0,
    "demonstrated": 1.8,
    "experienced": 1.6,
    "preferred": 1.5,
    "desired": 1.5,
    "highly valued": 1.5,
    "significant": 1.3,
    "recommended": 1.1,

    # Low importance (1.0 - 0.1): Nice-to-have or supplementary skills
    "familiar": 0.8,
    "knowledge of": 0.6,
    "basic": 0.5,
    "exposure to": 0.4,
    "awareness of": 0.3,
    "develop": 0.5,
    "design": 0.5,
    "implement": 0.5,
    "work with": 0.4,
    "understanding of": 0.3,
    "introductory": 0.2,
    "optional": 0.1
}

# Keywords for Resume Skill Proficiency
# Maps a phrase to a weight.  analyze_resume_for_candidate passes this table to
# calculate_skill_weights_resume, which adds the weight of every phrase found in
# each resume sentence that mentions a skill to that skill's raw score.
# NOTE(review): some keys overlap ("strong experience" also contains "strong"),
# so a sentence containing the longer phrase receives both weights.
PROFICIENCY_MODIFIERS = {
    # High proficiency (3.0 - 2.5): Expert or near-expert level
    "expert in": 3.0,
    "mastery": 3.0,
    "specialized in": 2.8,
    "advanced": 2.8,
    "proficient in": 2.5,
    "highly skilled": 2.5,

    # Moderate proficiency (2.0 - 1.1): Competent to strong skills
    "strong experience": 2.0,
    "lead": 2.0,
    "managed": 1.8,
    "developed": 1.8,
    "experienced in": 1.6,
    "skilled in": 1.5,
    "competent in": 1.5,
    "strong": 1.5,
    "certified in": 1.4,
    "trained in": 1.3,
    "practiced in": 1.2,
    "experience with": 1.1,

    # Low proficiency (1.0 - 0.1): Basic or limited skills
    "worked with": 1.0,
    "used": 0.9,
    "familiar with": 0.8,
    "knowledge of": 0.7,
    "exposure to": 0.6,
    "basic understanding of": 0.5,
    "introductory": 0.4,
    "beginner": 0.3,
    "learning": 0.2,
    "aware of": 0.1
}


# Degree keywords recognised in job descriptions and resumes.  The two source
# lists are kept as-is and merged below into one lowercased, de-duplicated,
# alphabetically sorted list.
DEGREES_S1 = [
    # Bachelor level
    "bachelor", "bachelors", "b.sc", "bsc", "b.e", "be", "b.tech", "btech",
    # Master level
    "master", "masters", "m.sc", "msc", "m.e", "me", "m.tech", "mtech",
    # Other
    "mba", "phd", "doctorate", "associate", "diploma",
]
DEGREES_S2 = [
    # Bachelor level
    "bachelor", "b.tech", "btech", "b.e", "b.sc", "ba", "bca",
    # Master level
    "master", "m.tech", "mtech", "m.e", "m.sc", "ma", "mca",
    # Other
    "phd", "doctorate", "mba", "pgdm", "diploma",
]
COMBINED_DEGREES = sorted({degree.lower() for degree in (*DEGREES_S1, *DEGREES_S2)})

# Degree ranking: rank 1 is the highest level, larger numbers are lower levels.
# Each tier lists every keyword spelling that shares the same rank.
_DEGREE_TIERS = (
    (1, ("phd", "doctorate")),
    (2, ("master", "masters", "m.tech", "mtech", "m.e", "me", "m.sc", "msc", "ma", "mca", "mba", "pgdm")),
    (3, ("bachelor", "bachelors", "b.tech", "btech", "b.e", "be", "b.sc", "bsc", "ba", "bca")),
    (4, ("diploma",)),
    (5, ("associate",)),
)
DEGREE_HIERARCHY = {
    degree: rank
    for rank, degrees in _DEGREE_TIERS
    for degree in degrees
}
# Rank used for any degree keyword that is missing from DEGREE_HIERARCHY.
DEFAULT_DEGREE_RANK = 10

# Regex for experience
# Case-insensitive pattern for phrases such as "5 years", "3+ yrs of experience"
# or "at least 2 years exp".  Group 1 captures the 1-2 digit number; the leading
# "minimum"/"at least", the "+", "of" and the trailing "experience"/"exp" are all optional.
# NOTE(review): not referenced by any function in the visible part of this file.
EXPERIENCE_REGEX = re.compile(r'(?:(?:minimum|at least)\s*)?(\d{1,2})\+?\s*(?:\+?\s*)?(?:years?|yrs?)\s*(?:of)?\s*(?:experience|exp)?', re.IGNORECASE)

# SWOT Analysis Thresholds
# The code that consumes these constants is not in the visible part of this file;
# the names suggest pairs of (resume proficiency, JD importance) cut-offs — TODO confirm.
# NOTE(review): the values look like points on a 0-100 scale, yet
# rate_skills_resume_proficiency currently caps resume scores at 10 — verify the scales agree.
STRENGTH_PROFICIENCY_THRESHOLD = 70 # Minimum resume proficiency for a skill to count as a strength
STRENGTH_IMPORTANCE_THRESHOLD = 60
WEAKNESS_IMPORTANCE_THRESHOLD = 60
WEAKNESS_PROFICIENCY_THRESHOLD = 30
OPPORTUNITY_PROFICIENCY_THRESHOLD = 50
OPPORTUNITY_IMPORTANCE_THRESHOLD = 40 # Lower importance for this specific JD
THREAT_JD_IMPORTANCE_THRESHOLD = 60 # Critical skill for JD
THREAT_PROFICIENCY_THRESHOLD = 10   # Very low or missing proficiency in critical skill


# --- Helper Functions ---
def preprocess_text(text: str) -> str:
    """Lowercase *text*, strip unsupported punctuation and collapse whitespace.

    Word characters, whitespace and the characters ``. - / # +`` are kept so
    that skill names such as "c++", "c#", "node.js" and "ci/cd" survive.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    lowered = text.lower()
    # Strip punctuation first: removing a character that sat between two spaces
    # ("a , b") would otherwise leave a double space behind after collapsing.
    without_punctuation = re.sub(r'[^\w\s.\-/#+]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def build_skill_matcher(nlp_vocab, skill_list: list) -> Matcher:
    """Create a spaCy Matcher with one case-insensitive pattern per skill.

    Each skill is split on whitespace into one token pattern per word and
    registered under the upper-cased skill name.
    """
    skill_matcher = Matcher(nlp_vocab)
    for skill_name in skill_list:
        words = skill_name.lower().split()
        token_pattern = [{"LOWER": word} for word in words]
        skill_matcher.add(skill_name.upper(), [token_pattern])
    return skill_matcher

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the lowercased plain text of every page of the PDF at *pdf_path*.

    Any failure while opening or reading the file is reported on stdout and
    an empty string is returned, so callers can treat "" as "no text".
    """
    try:
        # The context manager closes the document even when a page fails to
        # load; the original left the file handle open.
        with fitz.open(pdf_path) as pdf_document:
            page_texts = [page.get_text("text") for page in pdf_document]
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable resume must not abort a batch.
        print(f"Error reading PDF {pdf_path}: {e}")
        return ""
    return "".join(page_texts).lower()

def extract_degrees_from_text(text: str, degree_list: list) -> list:
    """Return the sorted, de-duplicated entries of *degree_list* found in *text*.

    The text is lowercased before searching and each entry must appear as a
    whole word (regex word boundaries on both sides).
    """
    haystack = text.lower()
    matched = {
        degree
        for degree in degree_list
        if re.search(r'\b' + re.escape(degree) + r'\b', haystack)
    }
    return sorted(matched)

def calculate_max_possible_absolute_score(jd_skill_importance_map: dict) -> float:
    """
    Return the highest absolute score a candidate could reach for this JD.

    This is the sum of the "score" value of every skill entry in the map
    (entries without a "score" key count as 0); an empty map yields 0.0.
    """
    if not jd_skill_importance_map:
        return 0.0

    importance_scores = (entry.get("score", 0) for entry in jd_skill_importance_map.values())
    # Start from 0.0 so the result is always a float.
    return sum(importance_scores, 0.0)
# --- Job Description Analysis Module ---
def extract_skills_with_context_jd(doc: spacy.tokens.Doc, skill_list: list, matcher: Matcher) -> dict:
    """Map each matched skill to the sentences that contain its matches.

    Runs *matcher* over *doc*; for every match whose lower-cased pattern name
    is in *skill_list*, the sentence span around the match is appended to that
    skill's list (one entry per match, so a sentence can appear repeatedly).
    """
    sentences_by_skill = {}
    for match_id, start, end in matcher(doc):
        # Pattern names were registered upper-cased; map back to the lowercase skill.
        skill_name = nlp.vocab.strings[match_id].lower()
        if skill_name not in skill_list:
            continue
        containing_sentence = doc[start:end].sent
        if skill_name not in sentences_by_skill:
            sentences_by_skill[skill_name] = []
        sentences_by_skill[skill_name].append(containing_sentence)
    return sentences_by_skill

def rate_skills_from_job_description(job_description_text: str) -> tuple[dict, list]:
    """
    Rate the known skills mentioned in a job description by importance.

    The text is normalised with preprocess_text, parsed with the module-level
    spaCy pipeline and searched for KNOWN_SKILLS.  A skill's raw score is its
    number of matches plus the weight of every IMPORTANCE_KEYWORDS phrase found
    in each sentence recorded for it.  Raw scores are then log-scaled so that
    the highest-scoring skill gets 100.

    Returns:
        A 2-tuple ``(rated_skills, jd_degrees)`` where rated_skills maps
        skill -> {"score": rounded 0-100 importance, "frequency": match count}
        and jd_degrees is the sorted list of COMBINED_DEGREES keywords found in
        the cleaned text.  When no known skill is found the result is
        ``({}, [])``.
    """
    cleaned_text = preprocess_text(job_description_text)
    doc = nlp(cleaned_text)

    skill_matcher = build_skill_matcher(nlp.vocab, KNOWN_SKILLS)
    skill_sentence_map = extract_skills_with_context_jd(doc, KNOWN_SKILLS, skill_matcher)

    if not skill_sentence_map:
        print("No known skills found in the job description.")
        # Same 2-tuple shape as the normal return below; the original returned
        # three values here, which breaks callers that unpack two.
        return {}, []

    # Compile each importance phrase once instead of once per sentence.
    keyword_patterns = [
        (re.compile(r'\b' + re.escape(keyword) + r'\b'), weight)
        for keyword, weight in IMPORTANCE_KEYWORDS.items()
    ]

    skill_raw_scores = {}
    for skill, sentences in skill_sentence_map.items():
        keyword_score_boost = 0
        for sentence in sentences:
            sentence_text = sentence.text.lower()
            for keyword_pattern, weight in keyword_patterns:
                if keyword_pattern.search(sentence_text):
                    keyword_score_boost += weight
        # Base score is the number of recorded matches for the skill.
        skill_raw_scores[skill] = len(sentences) + keyword_score_boost

    max_raw_score = float(max(skill_raw_scores.values(), default=1))
    if max_raw_score == 0:
        max_raw_score = 1

    # Log scaling compresses the gap between frequently and rarely mentioned skills.
    log_max = np.log1p(max_raw_score)
    rated_skills_importance = {
        skill: {
            "score": round(np.log1p(raw_score) / log_max * 100),
            "frequency": len(skill_sentence_map[skill]),
        }
        for skill, raw_score in skill_raw_scores.items()
    }

    jd_degrees_found = extract_degrees_from_text(cleaned_text, COMBINED_DEGREES)

    return rated_skills_importance, jd_degrees_found

def generate_jd_treemap(skills_dict_with_freq: dict):
    """Draw the JD skill scores as a treemap and save it as 'jd_skill_treemap.png'.

    Only skills whose "score" is greater than 0 get a tile.  Nothing is drawn
    (a message is printed instead) when the mapping is empty or every score is 0.
    """
    if not skills_dict_with_freq:
        print("No skills to plot for treemap.")
        return

    plotted = [
        (skill, data['score'])
        for skill, data in skills_dict_with_freq.items()
        if data['score'] > 0
    ]
    if not plotted:
        print("All skill scores are 0, treemap will not be generated.")
        return

    tile_labels = [f"{skill.capitalize()}\n{score}" for skill, score in plotted]
    tile_sizes = [score for _, score in plotted]
    tile_colors = plt.cm.viridis(np.linspace(0, 1, len(tile_sizes)))

    figure = plt.figure(figsize=(14, 9))
    squarify.plot(
        sizes=tile_sizes,
        label=tile_labels,
        color=tile_colors,
        alpha=0.7,
        text_kwargs={'fontsize': 10, 'wrap': True},
    )
    plt.title('Job Description: Skill Importance Treemap (0-100 Scale)', fontsize=16)
    plt.axis('off')
    try:
        plt.savefig('jd_skill_treemap.png')
        print("\nJob Description skill treemap saved as 'jd_skill_treemap.png'")
    except Exception as e:
        print(f"Error saving treemap: {e}")
    finally:
        # Always release the figure, whether or not saving succeeded.
        plt.close(figure)

# --- Resume Analysis Module ---
def identify_skills_and_frequency_resume(text: str, skill_list: list) -> Counter:
    """Count whole-word occurrences of each skill in *text* (case-insensitive).

    Returns:
        A Counter keyed by the skill exactly as given in *skill_list*; skills
        that do not occur are omitted.
    """
    found_skills_freq = Counter()
    processed_text = text.lower()
    for skill in skill_list:
        # Lookarounds instead of \b: a \b after a non-word character never
        # matches before a space or punctuation, which silently dropped skills
        # such as "c++" and "c#".  For skills that start and end with a word
        # character this is equivalent to \b...\b.
        pattern = r"(?<!\w)" + re.escape(skill.lower()) + r"(?!\w)"
        occurrences = len(re.findall(pattern, processed_text))
        if occurrences:
            found_skills_freq[skill] = occurrences
    return found_skills_freq

def calculate_skill_weights_resume(text: str, identified_skills_freq: Counter, proficiency_modifiers_dict: dict) -> dict:
    """Compute a raw weighted score for every skill found in a resume.

    A skill's score is its frequency plus the value of every modifier phrase
    found in each sentence that mentions the skill.  Sentences are obtained by
    splitting the lowercased text after ``.``, ``!`` or ``?``.

    Returns:
        A dict mapping skill -> raw score; skills with frequency 0 are skipped.
    """
    sentences = re.split(r'(?<=[\.\!\?])\s+', text.lower())

    def whole_phrase(phrase: str):
        # Lookarounds rather than \b so phrases ending in a non-word character
        # ("c++", "c#") still match before a space or punctuation.
        return re.compile(r"(?<!\w)" + re.escape(phrase) + r"(?!\w)")

    # Compile the modifier phrases once; they are reused for every skill and sentence.
    modifier_patterns = [
        (whole_phrase(modifier_keyword), modifier_value)
        for modifier_keyword, modifier_value in proficiency_modifiers_dict.items()
    ]

    weighted_skills = {}
    for skill, frequency in identified_skills_freq.items():
        if frequency == 0:
            continue
        skill_pattern = whole_phrase(skill.lower())
        modifier_sum = 0
        for sentence in sentences:
            if not skill_pattern.search(sentence):
                continue
            for modifier_pattern, modifier_value in modifier_patterns:
                if modifier_pattern.search(sentence):
                    modifier_sum += modifier_value
        weighted_skills[skill] = frequency + modifier_sum
    return weighted_skills

def rate_skills_resume_proficiency(weighted_skills: dict) -> dict:
    """Turn raw resume skill scores into proficiency ratings capped at 10.

    Returns {} for empty input and 0.0 for every skill when the highest raw
    score is 0; otherwise each raw score is returned unchanged unless it
    exceeds 10, in which case it becomes 10.

    NOTE(review): the SWOT proficiency thresholds defined in this file (70, 50,
    30, 10) read like points on a 0-100 scale, while this function never
    returns more than 10 — confirm which scale downstream code expects.
    """
    if not weighted_skills:
        return {}
    if max(weighted_skills.values()) == 0:
        return dict.fromkeys(weighted_skills, 0.0)

    proficiency = {}
    for skill, raw_score in weighted_skills.items():
        proficiency[skill] = raw_score if raw_score <= 10 else 10
    return proficiency

def analyze_resume_for_candidate(pdf_path: str, jd_skill_names: list) -> dict:
    """
    Analyze a single resume PDF and build the candidate profile.

    Args:
        pdf_path: Path of the resume PDF.
        jd_skill_names: Skill names extracted from the job description.
            Currently unused; kept so existing callers keep working.

    Returns:
        A dict with the same keys on success and on failure:
        "filename", "candidate_skills_proficiency", "skill_frequencies",
        "median_frequency", "degrees" and "error" (None on success, a message
        when no text could be extracted from the PDF).
    """
    filename = os.path.basename(pdf_path)
    print(f"\nAnalyzing resume: {filename}")
    resume_text = extract_text_from_pdf(pdf_path)
    if not resume_text:
        return {
            "filename": filename, "candidate_skills_proficiency": {},
            "degrees": [], "median_frequency": 0.0, "skill_frequencies": Counter(),
            "error": f"Could not extract text from {filename}"
        }

    skills_freq_in_resume = identify_skills_and_frequency_resume(resume_text, KNOWN_SKILLS)
    weighted_resume_skills = calculate_skill_weights_resume(resume_text, skills_freq_in_resume, PROFICIENCY_MODIFIERS)
    candidate_skills_proficiency = rate_skills_resume_proficiency(weighted_resume_skills)
    degrees_in_resume = extract_degrees_from_text(resume_text, COMBINED_DEGREES)

    # The error profile already carries "median_frequency"; provide it here too
    # so callers can read the key without checking which path produced the dict.
    if skills_freq_in_resume:
        median_frequency = float(statistics.median(skills_freq_in_resume.values()))
    else:
        median_frequency = 0.0

    print(f"  Found {len(candidate_skills_proficiency)} skills in resume.")
    print(f"  Degrees: {', '.join(degrees_in_resume) if degrees_in_resume else 'None'}")

    return {
        "filename": filename,
        "candidate_skills_proficiency": candidate_skills_proficiency,
        "skill_frequencies": skills_freq_in_resume,  # Raw frequencies of skills found in resume
        "median_frequency": median_frequency,
        "degrees": degrees_in_resume,
        "error": None
    }

# --- EDUCATIONAL QUALIFICATION CHECK
def check_educational_qualification(jd_degrees: list, candidate_degrees: list) -> tuple[bool, str]:
    """
    Decide whether the candidate's degrees satisfy the JD's minimum requirement.

    The JD's minimum is the lowest-ranked degree it mentions (largest value in
    DEGREE_HIERARCHY); the candidate is judged by their best-ranked degree
    (smallest value).  Degrees missing from the hierarchy get DEFAULT_DEGREE_RANK.

    Returns:
        (is_qualified, message).
    """
    if not jd_degrees:
        return True, "No specific degree requirement in JD."
    if not candidate_degrees:
        return False, "Candidate has no listed degrees, but JD requires: " + ", ".join(jd_degrees)

    def rank_of(degree):
        return DEGREE_HIERARCHY.get(degree, DEFAULT_DEGREE_RANK)

    required_rank = max(map(rank_of, jd_degrees))
    best_candidate_rank = min(map(rank_of, candidate_degrees))

    # Lower rank value means a higher degree.
    if best_candidate_rank > required_rank:
        return False, f"Candidate's education does not meet the minimum requirement for {', '.join(jd_degrees)}."
    if best_candidate_rank == required_rank:
        return True, "Candidate's education meets the minimum requirement."
    return True, "Candidate's education exceeds the minimum requirement."

def calculate_ats_score(candidate_profile: dict, resume_text: str, pdf_path: str) -> float:
    """
    Compute the overall ATS-friendliness score (0-100) of a resume.

    Five component scores are combined as a weighted sum: parsing 30%,
    structure 25%, content 20%, formatting 15% and keywords 10%.  The result
    is clamped to 0-100 and rounded to two decimals.  A profile with a truthy
    "error" entry, or missing/blank resume text, scores 0.0.
    """
    if candidate_profile.get("error"):
        return 0.0
    if not resume_text or not resume_text.strip():
        return 0.0

    # (component score, weight) pairs, evaluated and summed in this order.
    weighted_components = (
        (_calculate_parsing_compatibility(resume_text, pdf_path), 0.30),      # Can the ATS read the file at all?
        (_calculate_structure_quality(resume_text), 0.25),                    # Standard sections and organisation
        (_calculate_content_optimization(resume_text, candidate_profile), 0.20),  # Achievements and skills in context
        (_calculate_formatting_quality(resume_text), 0.15),                   # Clean, readable formatting
        (_calculate_keyword_density(resume_text, candidate_profile), 0.10),   # Presence of relevant keywords
    )
    combined = sum(component * weight for component, weight in weighted_components)

    clamped = min(max(combined, 0.0), 100.0)
    return round(clamped, 2)


def _calculate_parsing_compatibility(text: str, file_path: str) -> float:
    """
    Scores the resume based on how easily an Applicant Tracking System (ATS) can parse its content.
    Penalizes for non-standard file formats, problematic characters, and poor text structure.
    """
    score = 100.0 # Start with a perfect score and deduct points for issues.

    # File format check: Penalize non-standard or problematic file types.
    if file_path:
        ext = os.path.splitext(file_path)[1].lower()
        if ext == '.pdf':
            # PDFs can sometimes be image-based or have complex encoding, making parsing harder.
            score -= 5  # Minor penalty.
        elif ext not in ['.docx', '.doc', '.txt']: # .txt is generally safest.
            score -= 30  # Heavier penalty for formats like .pages, .odt, image files, etc.
    else:
        # If no file path is provided, assume raw text which might miss some file-based context.
        score -= 5

    # Character and pattern checks: Penalize characters and patterns known to cause issues for ATS.
    problematic_patterns = [
        (r'[^\x00-\x7F]', 3),      # Non-ASCII characters (e.g., unusual symbols, some foreign characters).
        (r'[\u2010-\u2015]', 2),  # Various dash types (hyphens are generally safer).
        (r'[\u2018-\u201F\u201C\u201D]', 2),  # Smart quotes (standard quotes ' " are safer).
        (r'\t{2,}', 4),           # Multiple consecutive tabs (often used for layout that breaks parsing).
        (r' {3,}', 3),            # Multiple consecutive spaces (use single spaces for separation).
        (r'[|⁄\\]{2,}', 7),       # Characters like multiple pipes or slashes used for tables/layout.
        (r'\b(_{2,}|-{3,})\b', 5) # Excessive underscores or long hyphens used as separators.
    ]

    for pattern, penalty in problematic_patterns:
        matches = len(re.findall(pattern, text))
        if matches > 0:
            score -= min(matches * penalty, 20) # Cap penalty per pattern type to avoid excessive deduction.

    # Check for text in ALL CAPS: Can be hard to read and is sometimes flagged by ATS.
    all_caps_words = sum(1 for word in text.split() if word.isupper() and len(word) > 1 and word.isalpha())
    total_words = len(text.split())
    if total_words > 0 and (all_caps_words / total_words) > 0.08: # If more than 8% of words are in ALL CAPS.
        score -= 10

    # Check for overly long lines without breaks (can indicate poor formatting for parsing).
    lines = text.split('\n')
    long_lines = sum(1 for line in lines if len(line) > 120) # Lines longer than 120 characters.
    if len(lines) > 0 and (long_lines / len(lines)) > 0.1: # If more than 10% of lines are very long.
        score -= 7

    # spaCy-based analysis for text quality (if spaCy model 'nlp' is available).
    if 'nlp' in globals() and nlp:
        try:
            # Analyze a sample of the text for efficiency, but large enough for robustness.
            doc = nlp(text[:min(len(text), 3000)]) # Analyze up to the first 3000 characters.

            # Check token count relative to text length.
            tokens = [token for token in doc if not token.is_space and not token.is_punct]
            if len(text) > 200 and len(tokens) < 50 :  # Very few useful tokens in a reasonably long text.
                score -= 25
            elif len(tokens) < 20 and len(text) > 100: # Extremely few tokens.
                 score -= 35

            # Check sentence structure.
            sentences = list(doc.sents)
            if len(sentences) < 3 and len(tokens) > 70 : # Very few sentences for a decent amount of text.
                score -= 20

            # Check average sentence length for extremes.
            if sentences:
                avg_sentence_len_tokens = sum(len([t for t in sent if not t.is_punct and not t.is_space]) for sent in sentences) / len(sentences)
                if avg_sentence_len_tokens > 35 : # Very long average sentence length can be hard to parse.
                    score -= 7
                if avg_sentence_len_tokens < 6 and len(sentences) > 10: # Many very short, possibly fragmented sentences.
                    score -= 7
        except Exception:
            # If spaCy processing fails, it might indicate issues with the text itself.
            score -= 15
    else:
        # Fallback checks if spaCy is not available.
        if len(text.split()) < 50 and len(text) > 300: # Basic word count check.
            score -= 20
        if len(re.findall(r'[.!?]\s',text)) < 2 and len(text.split()) > 70 : # Approximate sentence count.
            score -=15

    return max(score, 0.0)


def _calculate_structure_quality(text: str) -> float:
    """
    Scores the resume based on its structural organization.
    Looks for essential sections, contact information, date ranges, and clear itemization (e.g., bullet points).
    """
    score = 0.0 # Start with 0 and add points for positive structural elements.
    text_lower = text.lower() # Use lowercased text for case-insensitive matching.

    # Define essential resume sections and their corresponding keywords and point values.
    essential_sections = {
        'contact': (r'\b(contact|email|e-mail|phone|mobile|cell|linkedin|portfolio|github|address|location)\b', 20),
        'summary': (r'\b(summary|objective|profile|overview|about me|professional summary)\b', 10),
        'experience': (r'\b(experience|employment|work history|career|professional experience|positions held)\b', 25),
        'education': (r'\b(education|academic|degree|university|college|school|qualifications|certification)\b', 20),
        'skills': (r'\b(skills|technical skills|proficiencies|expertise|competencies|technologies|tools)\b', 15)
    }

    found_sections_count = 0
    for section_name, (pattern, points) in essential_sections.items():
        if re.search(pattern, text_lower):
            score += points
            found_sections_count += 1

    # Penalize if fewer than 3 critical sections (Contact, Experience, Education) are clearly identifiable.
    # This check is simplified; a more robust check would ensure these specific three are present.
    if found_sections_count < 3:
        score -= (3 - found_sections_count) * 10

    # Check for presence of clear contact information (email and phone).
    if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text): # Email pattern.
        score += 7
    else: # Penalize if no clear email is found.
        score -= 10

    if re.search(r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b', text): # Phone pattern.
        score += 7
    else: # Penalize if no clear phone number is found.
        score -= 10

    # Date ranges are crucial for experience and education timelines.
    # Improved regex for various date range formats, including "Present".
    date_range_pattern = r'(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)[\s,.]*\d{2,4}\s*[-–to]+\s*(?:Present|Current|Ongoing|Till Date|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)[\s,.]*\d{2,4}))|\b(\d{1,2}/\d{2,4}\s*[-–to]+\s*(?:Present|Current|Ongoing|Till Date|\d{1,2}/\d{2,4}))|\b(\d{4}\s*[-–to]+\s*(?:Present|Current|Ongoing|Till Date|\d{4}))'
    date_ranges_found = len(re.findall(date_range_pattern, text, re.IGNORECASE))
    if date_ranges_found > 1: # At least two distinct periods (e.g., for different jobs/degrees).
        score += 10
    elif date_ranges_found == 1:
        score += 5

    # Bullet points or clear itemization indicate good readability and structure for ATS.
    bullet_pattern = r'^\s*[•·▪▫◦‣⁃*-]\s+' # Common bullet point markers at the start of a line.
    bullet_points_found = len(re.findall(bullet_pattern, text, re.MULTILINE))
    if bullet_points_found > 3: # Several bullet points suggest well-structured descriptions.
        score += 10
    elif bullet_points_found > 0:
        score += 5
    # Alternative for structure: good use of line breaks if not using bullets.
    elif text.count('\n') > (len(text) / 70): # Heuristic: sufficient newlines suggest structured items.
        score += 5

    # spaCy-based analysis for structural entities (if 'nlp' is available).
    if 'nlp' in globals() and nlp:
        try:
            doc = nlp(text)
            entities = Counter([ent.label_ for ent in doc.ents])

            # Presence of organizations (companies, schools) and dates supports structure.
            if entities.get('ORG', 0) > 1:
                score += 5
            if entities.get('DATE', 0) > 2 and date_ranges_found == 0 : # Add points if spaCy finds dates but regex didn't
                score += 5

            # Proper nouns can indicate job titles, company names, technologies, etc.
            propn_count = sum(1 for token in doc if token.pos_ == 'PROPN' and not token.is_stop and len(token.text)>2)
            if propn_count > 8 :
                score += 5
        except Exception:
            # If spaCy fails, rely on regex-based checks.
            if re.search(r'\b\d{4}\b',text_lower) and date_ranges_found == 0: score +=3
            pass
    else:
         if re.search(r'\b\d{4}\b',text_lower) and date_ranges_found == 0: score +=3

    return min(max(score, 0.0), 100.0) # Ensure score is capped at 100.


# Past-tense action verbs that typically open resume achievement statements.
_ACTION_VERBS = frozenset((
    'achieved', 'accelerated', 'accomplished', 'acquired', 'adapted', 'administered', 'advanced', 'advised',
    'advocated', 'aligned', 'analyzed', 'applied', 'approved', 'architected', 'arranged', 'assessed', 'assisted',
    'attained', 'audited', 'augmented', 'authored', 'automated', 'balanced', 'benchmarked', 'boosted', 'briefed',
    'budgeted', 'built', 'calculated', 'calibrated', 'campaigned', 'captured', 'cataloged', 'centralized',
    'chaired', 'championed', 'changed', 'clarified', 'classified', 'coached', 'coded', 'collaborated', 'collected',
    'combined', 'communicated', 'compiled', 'completed', 'composed', 'computed', 'conceived', 'conceptualized',
    'condensed', 'conducted', 'configured', 'consolidated', 'constructed', 'consulted', 'contacted', 'contributed',
    'controlled', 'converted', 'convinced', 'coordinated', 'corrected', 'counseled', 'created', 'critiqued',
    'cultivated', 'customized', 'debugged', 'decreased', 'defined', 'delegated', 'delivered', 'demonstrated',
    'deployed', 'derived', 'designed', 'detected', 'determined', 'developed', 'devised', 'diagnosed', 'directed',
    'discovered', 'dispatched', 'distributed', 'documented', 'doubled', 'drafted', 'drove', 'earned', 'edited',
    'educated', 'effected', 'elicited', 'eliminated', 'enabled', 'encouraged', 'engineered', 'enhanced',
    'ensured', 'entertained', 'established', 'estimated', 'evaluated', 'examined', 'exceeded', 'executed',
    'expanded', 'expedited', 'explained', 'explored', 'extracted', 'fabricated', 'facilitated', 'familiarized',
    'fashioned', 'filed', 'financed', 'focused', 'forecasted', 'formalized', 'formed', 'formulated', 'fostered',
    'founded', 'fulfilled', 'funded', 'gained', 'gathered', 'generated', 'governed', 'graded', 'granted',
    'grouped', 'grew', 'guided', 'handled', 'headed', 'helped', 'hired', 'hosted', 'identified', 'illustrated',
    'implemented', 'improved', 'improvised', 'inaugurated', 'increased', 'influenced', 'informed', 'initiated',
    'innovated', 'inspected', 'inspired', 'installed', 'instituted', 'instructed', 'insured', 'integrated',
    'interacted', 'interpreted', 'interviewed', 'introduced', 'invented', 'inventoried', 'invested',
    'investigated', 'isolated', 'issued', 'joined', 'judged', 'justified', 'keyed', 'launched', 'lectured',
    'led', 'licensed', 'listened', 'lobbied', 'localized', 'located', 'logged', 'lowered', 'maintained',
    'managed', 'manipulated', 'manufactured', 'mapped', 'marketed', 'mastered', 'maximized', 'measured',
    'mediated', 'mentored', 'merged', 'met', 'minimized', 'mobilized', 'modeled', 'moderated', 'modernized',
    'modified', 'monitored', 'motivated', 'moved', 'multiplied', 'navigated', 'negotiated', 'networked',
    'observed', 'obtained', 'offered', 'operated', 'optimized', 'orchestrated', 'ordered', 'organized',
    'oriented', 'originated', 'outlined', 'overcame', 'overhauled', 'oversaw', 'owned', 'participated',
    'partnered', 'patented', 'perceived', 'performed', 'persuaded', 'phased', 'piloted', 'pioneered', 'placed',
    'planned', 'played', 'predicted', 'prepared', 'prescribed', 'presented', 'preserved', 'presided',
    'prevented', 'printed', 'prioritized', 'processed', 'procured', 'produced', 'profiled', 'programmed',
    'projected', 'promoted', 'proofread', 'proposed', 'protected', 'proved', 'provided', 'publicized',
    'published', 'purchased', 'pursued', 'qualified', 'quantified', 'queried', 'questioned', 'raised', 'ranked',
    'rated', 'reached', 'read', 'realized', 'reasoned', 'received', 'recognized', 'recommended', 'reconciled',
    'recorded', 'recruited', 'rectified', 'redesigned', 'reduced', 'refined', 'refocused', 'reformatted',
    'regulated', 'rehabilitated', 'reinforced', 'related', 'remediated', 'remodeled', 'rendered', 'renewed',
    'repaired', 'replaced', 'reported', 'represented', 'researched', 'resolved', 'responded', 'restored',
    'restructured', 'retained', 'retrieved', 'revamped', 'reversed', 'reviewed', 'revised', 'revitalized',
    'rewarded', 'routed', 'ran', 'saved', 'scanned', 'scheduled', 'screened', 'scripted', 'scrutinized',
    'searched', 'secured', 'segmented', 'selected', 'separated', 'served', 'serviced', 'set', 'settled',
    'shaped', 'shared', 'shortened', 'showcased', 'simplified', 'simulated', 'sketched', 'sold', 'solved',
    'sorted', 'sourced', 'sparked', 'specified', 'spoke', 'sponsored', 'stabilized', 'staffed', 'staged',
    'standardized', 'started', 'steered', 'stimulated', 'strategized', 'streamlined', 'strengthened',
    'stressed', 'stretched', 'structured', 'studied', 'submitted', 'substituted', 'succeeded', 'suggested',
    'summarized', 'superseded', 'supervised', 'supplied', 'supported', 'surveyed', 'synthesized',
    'systematized', 'tabulated', 'tackled', 'tailored', 'targeted', 'taught', 'teamed', 'terminated', 'tested',
    'testified', 'tracked', 'traded', 'trained', 'transacted', 'transcribed', 'transferred', 'transformed',
    'translated', 'transmitted', 'transported', 'traveled', 'treated', 'trimmed', 'tripled', 'troubleshot',
    'tutored', 'uncovered', 'underlined', 'understood', 'undertook', 'unified', 'united', 'unraveled',
    'updated', 'upgraded', 'utilized', 'validated', 'valued', 'verbalized', 'verified', 'visualized',
    'volunteered', 'weighed', 'widened', 'won', 'worked', 'wrote'
))


def _is_action_verb(token) -> bool:
    """Return True when a spaCy token is a VERB whose surface form or lemma is a listed action verb."""
    # The verb list holds inflected (past-tense) forms while spaCy lemmas are base forms
    # ("developed" -> "develop"), so the surface text has to be checked, not only the lemma.
    if token.pos_ != 'VERB':
        return False
    return token.text.lower() in _ACTION_VERBS or token.lemma_.lower() in _ACTION_VERBS


def _regex_action_verb_score(text_lower: str) -> float:
    """Score distinct action verbs present as whole words (fallback without spaCy, max 30)."""
    # A whole-word search for a purely alphabetic verb is the same as membership in the set of
    # maximal \w+ runs, so one tokenising pass replaces one regex search per verb.
    words_present = set(re.findall(r'\w+', text_lower))
    return min(len(_ACTION_VERBS & words_present) * 1.5, 30)


def _spacy_language_score(doc) -> float:
    """Score action verbs (max 30), complex sentences (max 10) and professional entities (max 15)."""
    action_verb_found_count = sum(1 for token in doc if _is_action_verb(token))

    # Verbs that open a sentence (or follow a bullet) earn extra credit as impact statements.
    impact_action_verbs = 0
    for sent in doc.sents:
        first_word = next((tok for tok in sent if not tok.is_space and not tok.is_punct), None)
        if first_word is not None and _is_action_verb(first_word):
            impact_action_verbs += 1
    language_score = min((action_verb_found_count * 1) + (impact_action_verbs * 1.5), 30)

    # Sentence complexity: sentences containing a subordinate clause.
    clause_deps = ('advcl', 'ccomp', 'xcomp', 'relcl')
    complex_sentences = sum(1 for sent in doc.sents if any(tok.dep_ in clause_deps for tok in sent))
    language_score += min(complex_sentences * 1.0, 10)

    # Named entities that usually signal professional context.
    professional_labels = ('ORG', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'MONEY', 'PERCENT',
                           'QUANTITY', 'NORP', 'GPE')
    prof_entities_count = sum(1 for ent in doc.ents if ent.label_ in professional_labels)
    language_score += min(prof_entities_count * 0.75, 15)
    return language_score


def _skill_used_in_doc(skill_lower: str, doc) -> bool:
    """Return True when a token equal to the skill sits in a verb-governed (applied) position."""
    for token in doc:
        # Exact token match: substring matching would let "java" hit "javascript" or "r" hit
        # almost every word.
        if token.text.lower() != skill_lower and token.lemma_.lower() != skill_lower:
            continue
        governed_by_verb = token.head.pos_ == 'VERB' and token.dep_ in ('dobj', 'pobj', 'agent', 'attr')
        under_verb_phrase = token.dep_ in ('pobj', 'compound') and token.head.head.pos_ == 'VERB'
        if governed_by_verb or under_verb_phrase or any(_is_action_verb(anc) for anc in token.ancestors):
            return True
    return False


def _skill_in_regex_context(skill_lower: str, text_lower: str, *, broad: bool) -> bool:
    """Return True when the skill appears in a "using <skill>" / "<skill> for ..." style phrase."""
    if broad:  # Wider cue lists, used as the secondary check on the spaCy path.
        lead_ins = r'using|with|leveraging|developing|implementing|managing|leading|applying|utilizing|on|for|in'
        follow_ups = r'for|to\s\w+|in|on|development|implementation|analysis|design'
    else:
        lead_ins = r'using|with|developing|implementing|managing|leading|applying|utilizing'
        follow_ups = r'to|for|in|on|development|implementation|analysis|design'
    skill_re = re.escape(skill_lower)
    # Lookarounds instead of \b next to the skill: \b never matches beside a symbol, which
    # silently excluded skills such as "c++" or "c#".
    preceded = rf'\b(?:{lead_ins})\s+(?:the\s+)?{skill_re}(?!\w)'
    followed = rf'(?<!\w){skill_re}\s+(?:{follow_ups})\b'
    return bool(re.search(preceded, text_lower) or re.search(followed, text_lower))


def _calculate_content_optimization(text: str, candidate_profile: dict) -> float:
    """
    Scores the resume content based on the use of action verbs, quantifiable achievements (metrics),
    and contextual skill mentions. Uses spaCy for deeper analysis when a global 'nlp' pipeline is
    available, and falls back to regex-only heuristics otherwise (or if spaCy raises).

    Args:
        text: Raw resume text.
        candidate_profile: Profile dict; only the optional "candidate_skills_proficiency" mapping
            (skill name -> proficiency) is read here.

    Returns:
        A score clamped to the range 0-100.
    """
    score = 0.0
    text_lower = text.lower()

    # Parse once and reuse the document for both the verb and the contextual-skill analysis.
    doc = None
    if 'nlp' in globals() and nlp:
        try:
            doc = nlp(text)
        except Exception:
            doc = None  # Best effort: continue with regex-only heuristics.

    # Language quality. The spaCy score is only used when it was computed completely, so a
    # failure part-way through cannot be added on top of the fallback score.
    language_score = None
    if doc is not None:
        try:
            language_score = _spacy_language_score(doc)
        except Exception:
            language_score = None
    if language_score is None:
        language_score = _regex_action_verb_score(text_lower)
    score += language_score

    # Quantifiable achievements (metrics are highly valued by ATS and recruiters).
    # The patterns overlap on purpose-built phrases, so a single phrase may count more than once.
    metrics_patterns = [
        r'\b\d+(?:\.\d+)?%?\s*(?:increase|decrease|improvement|reduction|growth|savings|efficiency)\b', # 10% increase
        r'\b(?:increased|decreased|improved|reduced|grew|generated|saved|optimized|achieved|exceeded|completed|managed|led|delivered)\s(?:by\s)?(?:over\s|approx\.?\s|more than\s|above\s)?[\$€£]?\d{1,3}(?:[,.]\d{3})*(?:\.\d+)?[%kKmMbB]?\b', # increased by 10%, saved $5k
        # Currency values like $100k, €2.5M. A lookbehind replaces the former leading \b, which
        # could never match between whitespace (or start of text) and the currency symbol.
        r'(?<!\w)[\$€£]\d{1,3}(?:[,.]\d{3})*(?:\.\d+)?[kKmMbB]?\b',
        r'\b\d+(?:\.\d+)?%?\b',  # Any standalone number, with or without a trailing % (25%, 10.5, 2019).
        r'\b\d+\+?\s*(?:years?|months?|yrs?|mos?|weeks?|days?)\b',  # Time periods like 5+ years.
        r'\b(?:over|under|above|below|approx(?:imately)?\.?|avg\.?|average|up to|exceeding)\s+\d+\b', # Comparative numbers.
        r'\b\d+\s*(?:projects?|clients?|users?|customers?|accounts?|downloads?|transactions?|countries|members|tasks|issues|features|products|reports|campaigns|events)\b', # Quantities of items.
        r'\b(?:from\s[\d\.\$€£%kKmMbB]+\s(?:to|by)\s[\d\.\$€£%kKmMbB]+)\b' # "from X to Y" improvements.
    ]
    metric_count = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in metrics_patterns)
    score += min(metric_count * 2.5, 25)

    # Skills mentioned in a contextual way (demonstrating application, not just a list).
    skills_proficiency = candidate_profile.get("candidate_skills_proficiency")
    if skills_proficiency:
        skills_lower = [skill.lower() for skill in skills_proficiency]
        contextual_skills_score = None
        if doc is not None:
            try:
                # Each skill counts once: syntactic evidence first, broad phrase regex second.
                contextual_skills_found = sum(
                    1 for skill_lower in skills_lower
                    if _skill_used_in_doc(skill_lower, doc)
                    or _skill_in_regex_context(skill_lower, text_lower, broad=True)
                )
                contextual_skills_score = min(contextual_skills_found * 1.5, 20)
            except Exception:
                contextual_skills_score = None
        if contextual_skills_score is None:
            # No spaCy (or it failed): narrower regex cues and a lower cap.
            contextual_skills_found = sum(
                1 for skill_lower in skills_lower
                if _skill_in_regex_context(skill_lower, text_lower, broad=False)
            )
            contextual_skills_score = min(contextual_skills_found * 1.0, 15)
        score += contextual_skills_score

    return min(max(score, 0.0), 100.0)


def _calculate_formatting_quality(text: str) -> float:
    """
    Scores the resume based on clean, ATS-friendly formatting, and readability.

    Starts at 100 and deducts for length, density, blank-line, wall-of-text and ALL-CAPS issues.
    When a global spaCy pipeline 'nlp' is available, sentence-length statistics add further
    deductions; when a global 'flesch_reading_ease' callable is available, the readability score
    adds a small bonus or a penalty.

    Args:
        text: Raw resume text.

    Returns:
        A score clamped to the range 0-100.
    """
    score = 100.0 # Start high and deduct for formatting issues.

    word_count = len(text.split())
    char_count = len(text)
    lines = text.split('\n')
    non_empty_lines = sum(1 for line in lines if line.strip())

    # Resume length and density checks.
    if word_count < 200: # Ideal length is often debated, but very short is usually bad.
        score -= 25
    elif word_count > 1000: # Overly verbose resumes can be problematic.
        score -= 15

    if word_count > 0 and (char_count / word_count) > 7.5: # Very long words on average.
        score -= 7
    if non_empty_lines > 0 and (char_count / non_empty_lines) > 100: # Very long lines on average.
        score -= 10

    # Excessive blank lines: total lines are more than double the content lines.
    if non_empty_lines > 0 and (len(lines) / non_empty_lines) > 2.0:
        score -= 10

    # Longest run of consecutive content lines (walls of text).
    max_consecutive_content_lines = 0
    current_consecutive_lines = 0
    for line in lines:
        if line.strip():
            current_consecutive_lines += 1
        else: # Blank line closes the current run.
            max_consecutive_content_lines = max(max_consecutive_content_lines, current_consecutive_lines)
            current_consecutive_lines = 0
    # The last run is not followed by a blank line, so account for it explicitly.
    max_consecutive_content_lines = max(max_consecutive_content_lines, current_consecutive_lines)

    if max_consecutive_content_lines > 8:
        score -= 10
    elif max_consecutive_content_lines > 6:
        score -= 5

    # Multi-word ALL-CAPS lines; penalised when they exceed 15% of content lines.
    uppercase_lines = sum(
        1 for line in lines
        if line.strip() and line.strip().isupper() and len(line.strip().split()) > 1
    )
    if non_empty_lines > 0 and (uppercase_lines / non_empty_lines) > 0.15:
        score -= 10

    # spaCy for sentence length analysis (if 'nlp' is available).
    if 'nlp' in globals() and nlp:
        try:
            doc = nlp(text)
            # Sentence length measured in tokens, ignoring punctuation and whitespace tokens.
            sentence_token_lengths = [
                len([tok for tok in sent if not tok.is_punct and not tok.is_space])
                for sent in doc.sents if sent.text.strip()
            ]
            if sentence_token_lengths:
                sentence_total = len(sentence_token_lengths)
                avg_sentence_len = sum(sentence_token_lengths) / sentence_total

                # These three checks intentionally stack for extreme averages.
                if not (8 <= avg_sentence_len <= 28):
                    score -= 5
                if avg_sentence_len > 30:
                    score -= 7
                if avg_sentence_len < 7 and sentence_total > 5: # Many very short, choppy sentences.
                    score -= 7

                # Share of very long / very short sentences, only judged on 8+ sentences.
                num_very_long_sentences = sum(1 for s_len in sentence_token_lengths if s_len > 35)
                num_very_short_sentences = sum(1 for s_len in sentence_token_lengths if s_len < 6)
                if sentence_total > 7:
                    if (num_very_long_sentences / sentence_total) > 0.25:
                        score -= 7
                    if (num_very_short_sentences / sentence_total) > 0.35 and avg_sentence_len < 10:
                        score -= 7
        except Exception:
            pass # No specific penalty if spaCy fails here; other checks cover general formatting.

    # Readability score (Flesch Reading Ease) if the 'textstat' function was imported globally.
    readability_fn = globals().get('flesch_reading_ease')
    if callable(readability_fn):
        try:
            f_score = readability_fn(text)
            if 40 <= f_score <= 70:  # Good readability.
                score += 5
            elif f_score < 30:  # Very difficult to read.
                score -= 15
            elif f_score > 75: # Potentially too simplistic for some professional contexts.
                score -= 5
        except Exception:
            pass # Readability is a best-effort signal; ignore failures of the scorer.

    # Clamp on both sides: the readability bonus could otherwise push a clean resume to 105,
    # while the sibling scoring functions all report on a 0-100 scale.
    return min(max(score, 0.0), 100.0)


def _calculate_keyword_density(text: str, candidate_profile: dict) -> float:
    """
    Scores the resume based on keyword optimization, considering skills from the candidate profile
    (up to 60 points) and general industry-relevant terms (up to 40 points). Uses lemmatization
    with spaCy for better matching when a global 'nlp' pipeline is available.

    Args:
        text: Raw resume text.
        candidate_profile: Profile dict; only the optional "candidate_skills_proficiency" mapping
            (skill name -> proficiency) is read here.

    Returns:
        A score clamped to the range 0-100; 0.0 when the text has no meaningful words.
    """
    score = 0.0
    lowered_text = text.lower()

    # Defaults (no spaCy): lowercased text and a basic \W+ tokenization.
    text_for_analysis = lowered_text
    doc_tokens_for_industry_terms = [word for word in re.split(r'\W+', lowered_text) if len(word) > 1]
    lemmas_in_use = False

    if 'nlp' in globals() and nlp:
        try:
            doc = nlp(text)
            content_tokens = [token for token in doc if not token.is_punct and not token.is_space]
            lemma_text = " ".join(token.lemma_.lower() for token in content_tokens)
            lemma_tokens = [token.lemma_.lower() for token in content_tokens if not token.is_stop]
        except Exception:
            pass # Keep the lowercased text and basic tokenization.
        else:
            # Switch to lemmas only when every step succeeded, so text and flag never disagree.
            text_for_analysis = lemma_text
            doc_tokens_for_industry_terms = lemma_tokens
            lemmas_in_use = True

    if not doc_tokens_for_industry_terms:
        return 0.0 # Nothing meaningful to score.

    def to_search_form(phrase: str) -> str:
        """Lemmatize a lowercase phrase when the resume text itself was lemmatized."""
        if not lemmas_in_use:
            return phrase
        try:
            lemmas = [token.lemma_ for token in nlp(phrase) if not token.is_punct and not token.is_space]
        except Exception:
            return phrase
        return " ".join(lemmas) if lemmas else phrase

    def whole_phrase_pattern(phrase: str) -> str:
        """Regex matching the phrase only when it is not embedded in a longer word."""
        # \b cannot anchor phrases that start or end with a symbol ("c++", "c#"): it needs a
        # word character on one side. Lookarounds behave identically for ordinary words.
        return rf'(?<!\w){re.escape(phrase)}(?!\w)'

    # Skills keyword analysis based on skills provided in the candidate's profile.
    skills_score = 0
    max_possible_skills_score = 60
    optimal_points = 7
    skills_proficiency = candidate_profile.get("candidate_skills_proficiency")
    if skills_proficiency:
        skills_to_check = list(skills_proficiency.keys())
        total_skill_points_achieved = 0
        for skill in skills_to_check:
            skill_to_search = to_search_form(skill.lower())
            occurrences = len(re.findall(whole_phrase_pattern(skill_to_search), text_for_analysis, re.IGNORECASE))
            if 1 <= occurrences <= 3:   # Optimal usage.
                total_skill_points_achieved += optimal_points
            elif occurrences == 4:      # Slightly high.
                total_skill_points_achieved += 4
            elif occurrences >= 5:      # Potential keyword stuffing.
                total_skill_points_achieved += 1

        # Normalize against the best case (every skill used optimally), then cap.
        skills_score = (total_skill_points_achieved / (len(skills_to_check) * optimal_points)) * max_possible_skills_score
        skills_score = min(skills_score, max_possible_skills_score)
    score += skills_score

    # Industry-relevant terms analysis (general professional keywords).
    # This list should be periodically updated or made domain-specific if possible.
    industry_terms = [
        'project management', 'agile methodology', 'scrum framework', 'data analysis', 'data visualization', 'business intelligence',
        'strategic planning', 'business development', 'market research', 'financial analysis', 'risk management', 'budgeting',
        'software development life cycle', 'sdlc', 'cloud computing', 'aws', 'azure', 'gcp', 'saas', 'paas', 'iaas',
        'machine learning', 'artificial intelligence', 'ai', 'natural language processing', 'nlp', 'deep learning',
        'cybersecurity', 'information security', 'network security', 'penetration testing', 'devops practices', 'ci/cd pipeline',
        'digital marketing', 'seo strategy', 'sem', 'content creation', 'social media marketing', 'ux/ui design principles',
        'customer relationship management', 'crm systems', 'salesforce', 'enterprise resource planning', 'erp systems', 'sap',
        'financial modeling', 'quantitative analysis', 'supply chain management', 'logistics operations', 'quality assurance', 'qa testing',
        'product development', 'product lifecycle', 'product roadmap', 'team leadership', 'cross-functional team', 'stakeholder engagement',
        'change management', 'process improvement', 'lean principles', 'six sigma', 'technical support', 'client relations',
        'contract negotiation', 'vendor management', 'regulatory compliance', 'data privacy', 'gdpr', 'hipaa',
        'communication skills', 'problem-solving', 'critical thinking', 'innovation', 'efficiency', 'optimization', 'scalability'
    ]

    # Count how many distinct industry terms appear (each term counts once).
    relevant_terms_found_count = sum(
        1 for term in industry_terms
        if re.search(whole_phrase_pattern(to_search_form(term)), text_for_analysis, re.IGNORECASE)
    )

    industry_score_max = 40
    score += min(relevant_terms_found_count * 1.0, industry_score_max)

    return min(max(score, 0.0), 100.0) # Ensure final score is capped.
# --- SWOT ANALYSIS ---
import math

def generate_swot_analysis(jd_skill_importance_map: dict, candidate_profile: dict) -> dict:
    """
    Builds a SWOT summary with one sentence per category.

    Candidate proficiencies are log-normalized against the candidate's own highest proficiency
    (0-100) and compared, together with each skill's JD importance score, against the
    module-level *_THRESHOLD constants.

    Args:
        jd_skill_importance_map: Skill name -> dict whose "score" entry is the JD importance.
        candidate_profile: Profile dict; reads the "candidate_skills_proficiency" mapping.

    Returns:
        Dict with keys "strengths", "weaknesses", "opportunities" and "threats", each holding a
        single-element list with the sentence for that category.
    """
    categories = ("strengths", "weaknesses", "opportunities", "threats")
    proficiencies = candidate_profile.get("candidate_skills_proficiency", {})
    if not proficiencies:
        return {category: ["No candidate proficiency data provided."] for category in categories}

    top_raw_score = max(proficiencies.values())

    def scaled(raw_score):
        # Log scale relative to the candidate's best skill; 0 when no skill is above zero.
        if top_raw_score > 0:
            return (math.log1p(raw_score) / math.log1p(top_raw_score)) * 100
        return 0

    def with_importance(skill_name, level, weight):
        return f"{skill_name.capitalize()} (Proficiency: {level:.0f}, JD Importance: {weight:.0f})"

    strong, weak, open_doors, risky = [], [], [], []

    # Candidate-side view: strengths (proficient and important) and opportunities
    # (proficient, but unimportant to or absent from the JD).
    for skill_name, raw_score in proficiencies.items():
        level = scaled(raw_score)
        if skill_name in jd_skill_importance_map:
            weight = jd_skill_importance_map[skill_name].get("score", 0)
            if level >= STRENGTH_PROFICIENCY_THRESHOLD and weight >= STRENGTH_IMPORTANCE_THRESHOLD:
                strong.append(with_importance(skill_name, level, weight))
            if level >= OPPORTUNITY_PROFICIENCY_THRESHOLD and weight < OPPORTUNITY_IMPORTANCE_THRESHOLD:
                open_doors.append(with_importance(skill_name, level, weight))
        elif level >= OPPORTUNITY_PROFICIENCY_THRESHOLD:
            open_doors.append(f"{skill_name.capitalize()} (Proficiency: {level:.0f})")

    # JD-side view: important skills the candidate is weak in (missing skills count as 0).
    for skill_name, jd_entry in jd_skill_importance_map.items():
        weight = jd_entry.get("score", 0)
        level = scaled(proficiencies.get(skill_name, 0))
        if weight >= WEAKNESS_IMPORTANCE_THRESHOLD and level < WEAKNESS_PROFICIENCY_THRESHOLD:
            weak.append(with_importance(skill_name, level, weight))
        if weight >= THREAT_JD_IMPORTANCE_THRESHOLD and level < THREAT_PROFICIENCY_THRESHOLD:
            risky.append(with_importance(skill_name, level, weight))

    def as_sentence(entries, singular_opening, plural_opening, closing):
        # One entry uses the singular opening; several are joined with "; ".
        if not entries:
            return "None identified in this category based on current criteria."
        opening = singular_opening if len(entries) == 1 else plural_opening
        return f"{opening} {'; '.join(entries)}, {closing}"

    return {
        "strengths": [as_sentence(
            strong,
            "The candidate demonstrates a key strength in",
            "The candidate demonstrates key strengths in",
            "which align strongly with the role's core requirements.",
        )],
        "weaknesses": [as_sentence(
            weak,
            "A notable weakness is observed in",
            "Notable weaknesses are observed in",
            "which may impact performance in critical areas.",
        )],
        "opportunities": [as_sentence(
            open_doors,
            "An opportunity lies in the candidate's proficiency in",
            "Opportunities lie in the candidate's proficiencies in",
            "which can be leveraged for broader roles or future growth.",
        )],
        "threats": [as_sentence(
            risky,
            "A potential threat is the lack of proficiency in",
            "Potential threats arise from lack of proficiency in",
            "which could hinder the candidate’s ability to meet role expectations.",
        )],
    }

# --- SCORING AND RANKING MODULE ---
def calculate_candidate_raw_score(jd_skill_importance_map: dict, candidate_profile: dict) -> float:
    """
    Calculates a raw composite score for a candidate (this is the absolute score).

    For every candidate skill that also appears in the JD map, (proficiency / 100) * importance
    is added to the total, so a skill held at 50% proficiency contributes more than the same
    skill held at 10%.

    Side effect: candidate_profile["matched_skill_details"] is always (re)written. It holds one
    dict per matched skill with a positive contribution, sorted by the "combined" value in
    descending order, and is an empty list when there is nothing to match.

    Args:
        jd_skill_importance_map: Skill name -> dict whose "score" entry is the JD importance.
        candidate_profile: Profile dict; reads "candidate_skills_proficiency" and is mutated
            as described above.

    Returns:
        The summed score, or 0.0 when either side has no skills.
    """
    candidate_proficiencies = candidate_profile.get("candidate_skills_proficiency", {})

    if not candidate_proficiencies or not jd_skill_importance_map:
        # Reset the details too, so callers never see a missing key or stale results
        # left over from an earlier scoring run.
        candidate_profile["matched_skill_details"] = []
        return 0.0

    candidate_skill_sum = 0.0
    matched_skill_details = []

    for skill, proficiency_score in candidate_proficiencies.items():
        if skill not in jd_skill_importance_map:
            continue
        importance_score = jd_skill_importance_map[skill].get("score", 0)
        combined_skill_score = (proficiency_score / 100.0) * importance_score
        candidate_skill_sum += combined_skill_score
        if combined_skill_score > 0:
            matched_skill_details.append({
                "skill": skill, "proficiency": proficiency_score,
                "importance": importance_score, "combined": combined_skill_score * 100
            })

    matched_skill_details.sort(key=lambda detail: detail["combined"], reverse=True)
    candidate_profile["matched_skill_details"] = matched_skill_details
    return candidate_skill_sum

# --- Main Execution ---
def _read_job_description() -> str:
    """Read a multi-line job description from stdin until a blank line or EOF."""
    jd_lines = []
    while True:
        try:
            line = input()
        except EOFError:
            break
        if line.strip():
            jd_lines.append(line)
        elif jd_lines:
            # A blank line after some content ends the input; blank lines
            # before any content are simply ignored.
            break
    return "\n".join(jd_lines).strip()


def _prompt_skill_score_adjustments(jd_skill_importance_map: dict) -> None:
    """Interactively update or add JD skill importance scores (in place)."""
    print("\n--- Manual JD Skill Score Adjustment (Optional) ---")
    print("Enter skill name and new score (e.g., 'python 95'). Press Enter without input to finish.\n")
    while True:
        adjustment = input("Enter adjustment (or press Enter to skip/finish): ").strip()
        if not adjustment:
            break

        parts = adjustment.split()
        skill_name = " ".join(parts[:-1]).lower()
        try:
            new_score = int(parts[-1])
        except ValueError:
            new_score = None

        # Reject input such as "95" (no skill name) instead of silently
        # registering a skill called "".
        if not skill_name or new_score is None:
            print("Invalid format. Use 'skill_name new_score'.")
            continue
        # Importance is on a 0-100 scale; a negative value could push a
        # candidate's raw score below zero, which the ranking step reserves
        # for resumes that failed to process.
        if not 0 <= new_score <= 100:
            print("Invalid score. Importance scores must be between 0 and 100.")
            continue

        if skill_name in jd_skill_importance_map:
            old_score = jd_skill_importance_map[skill_name]['score']
            jd_skill_importance_map[skill_name]['score'] = new_score
            print(f"Updated '{skill_name}' score from {old_score} to {new_score}")
        else:
            jd_skill_importance_map[skill_name] = {'score': new_score, 'frequency': 1}
            print(f"Added '{skill_name}' with score {new_score} (frequency set to 1)")


def _log_scaled_percentage(score: float, max_score: float) -> float:
    """Return log1p(score) / log1p(max_score) as a percentage, or 0.0 if undefined."""
    if score < 0 or max_score <= 0:
        return 0.0
    return (np.log1p(score) / np.log1p(max_score)) * 100


def _print_candidate_report(rank: int, candidate: tuple, max_absolute_score: float,
                            jd_skill_importance_map: dict, education_warning: bool = False) -> None:
    """Print the score line, top skills, degrees and SWOT for one ranked candidate."""
    filename, abs_score, norm_score, ats_score, profile = candidate

    # The displayed absolute score is relative to the best score the JD allows.
    abs_score_display = f"{_log_scaled_percentage(abs_score, max_absolute_score):.2f}"
    edu_status_msg = ""
    if education_warning:
        edu_status_msg = f" (WARNING: {profile.get('education_message', 'Does not meet educational requirements.')})"

    print(f"\n{rank}. {filename}: Absolute Score(out of 100): {abs_score_display}, Normalized Score(out of 100): {norm_score:.2f}, ATS Score(out of 100): {ats_score:.2f}{edu_status_msg}")

    if profile.get("error"):
        print(f"     Status: Error during processing - {profile.get('error')}")
        return

    top_matches = (profile.get("matched_skill_details") or [])[:3]
    if top_matches:
        match_strs = [m['skill'] for m in top_matches]
        print(f"     Top Matched Skills: {'; '.join(match_strs)}")

    # "degrees" is treated as optional here, as it is in the education check.
    degrees = profile.get("degrees") or []
    print(f"     Degrees: {', '.join(degrees) if degrees else 'N/A'}")

    if not jd_skill_importance_map:
        print("     SWOT Analysis: Skipped (No JD skill importance data available).")
        return

    swot = generate_swot_analysis(jd_skill_importance_map, profile)
    print(f"     Debug: Generating SWOT for {filename}")
    print("     SWOT Analysis:")
    print(f"       Strengths: {'; '.join(swot['strengths'])}")
    print(f"       Weaknesses: {'; '.join(swot['weaknesses'])}")
    print(f"       Opportunities: {'; '.join(swot['opportunities'])}")
    print(f"       Threats: {'; '.join(swot['threats'])}")


def main():
    """Run the interactive workflow: analyze a JD, analyze resumes, rank candidates.

    Steps:
      1. Read a job description from stdin, rate its skills, optionally let
         the user adjust the importance scores, and draw the JD treemap.
      2. Analyze up to 100 PDF resumes from a user-supplied folder and run the
         education check on each.
      3. Score every candidate, normalize the scores on a log scale relative
         to the best candidate, and print the ranking (candidates meeting the
         education criteria first, the rest in a separate section).

    Returns early (after printing a message) when no JD is given, the folder
    is invalid or contains no PDFs, or no resume could be processed.
    """
    print("--- Advanced Resume Analyzer and Candidate Ranker ---")

    # 1. Analyze Job Description
    print("\nStep 1: Analyze Job Description")
    print("Paste your job description below. End input with an empty line (or Ctrl+D/Ctrl+Z then Enter):")
    job_description = _read_job_description()

    if not job_description:
        print("No job description provided. Exiting.")
        return

    jd_skill_importance_map, jd_degrees_from_jd = rate_skills_from_job_description(job_description)

    print("\n--- Job Description Analysis ---")
    if jd_skill_importance_map:
        print("Skill Importance (0-100 scale, based on JD):")
        sorted_jd_skills = sorted(jd_skill_importance_map.items(), key=lambda item: item[1]["score"], reverse=True)
        for skill, data in sorted_jd_skills:
            print(f"- {skill.capitalize()} (mentioned {data['frequency']}x): {data['score']}")
    else:
        print("No skills identified in the job description. Median frequency for resumes will be based on an empty skill list (resulting in 0).")

    print(f"Required Degree(s) (from JD): {', '.join(jd_degrees_from_jd)}" if jd_degrees_from_jd else "Required Degree(s): Not clearly mentioned")

    _prompt_skill_score_adjustments(jd_skill_importance_map)
    # Taken after the adjustments so manually added skills are included.
    jd_skill_names_list = list(jd_skill_importance_map.keys())

    if jd_skill_importance_map:
        generate_jd_treemap(jd_skill_importance_map)

    # Maximum possible absolute score, used for the displayed percentage.
    max_absolute_score = calculate_max_possible_absolute_score(jd_skill_importance_map)

    # 2. Analyze Resumes
    print("\nStep 2: Analyze Resumes")
    resume_folder_path = input("Enter the path to the folder containing resume PDF files: ").strip()
    if not os.path.isdir(resume_folder_path):
        print("Invalid folder path. Exiting.")
        return

    pdf_files = [f for f in os.listdir(resume_folder_path) if f.lower().endswith(".pdf")]
    if not pdf_files:
        print(f"No PDF files found in '{resume_folder_path}'. Exiting.")
        return
    max_resumes = 100
    print(f"Found {len(pdf_files)} PDF files. Processing up to {max_resumes}.")

    all_candidate_profiles = []
    for filename in pdf_files[:max_resumes]:
        full_path = os.path.join(resume_folder_path, filename)
        candidate_profile = analyze_resume_for_candidate(full_path, jd_skill_names_list)
        if not candidate_profile:
            continue

        if candidate_profile.get("error"):
            print(f"  Skipping scoring for {filename} due to error: {candidate_profile['error']}")

        is_qualified, edu_message = check_educational_qualification(jd_degrees_from_jd, candidate_profile.get("degrees", []))
        candidate_profile["education_qualified"] = is_qualified
        candidate_profile["education_message"] = edu_message
        print(f"  Education Check: {edu_message}")

        all_candidate_profiles.append(candidate_profile)

    if not all_candidate_profiles:
        print("No resume data could be processed. Exiting.")
        return

    # 3. Score and Rank Candidates
    print("\nStep 3: Score and Rank Candidates")
    scored_candidates = []
    for profile in all_candidate_profiles:
        if profile.get("error"):
            # -1 marks a resume that failed to process: it is excluded from
            # the maximum below and always receives a normalized score of 0.
            absolute_raw_score = -1
            ats_score = 0.0
        else:
            resume_path = os.path.join(resume_folder_path, profile["filename"])
            resume_text = extract_text_from_pdf(resume_path)
            absolute_raw_score = calculate_candidate_raw_score(jd_skill_importance_map, profile)
            # The ATS score is independent of the JD.
            ats_score = calculate_ats_score(profile, resume_text, resume_path)

        scored_candidates.append((profile["filename"], absolute_raw_score, ats_score, profile))

    valid_abs_scores = [score for _, score, _, _ in scored_candidates if score >= 0]
    max_overall_abs_score = max(valid_abs_scores) if valid_abs_scores else 0

    # The normalized score is the log-scaled score relative to the best
    # candidate in this batch (the best candidate gets 100).
    final_ranked_candidates = [
        (
            filename,
            abs_score,
            round(_log_scaled_percentage(abs_score, max_overall_abs_score), 2),
            ats_score,
            profile,
        )
        for filename, abs_score, ats_score, profile in scored_candidates
    ]
    # Sort by normalized score (index 2 of the tuple), best first.
    final_ranked_candidates.sort(key=lambda candidate: candidate[2], reverse=True)

    print("\n--- Final Candidate Ranking ---")
    qualified_candidates = []
    unqualified_candidates = []
    for candidate in final_ranked_candidates:
        if candidate[4].get("education_qualified", True):
            qualified_candidates.append(candidate)
        else:
            unqualified_candidates.append(candidate)

    for rank, candidate in enumerate(qualified_candidates, start=1):
        _print_candidate_report(rank, candidate, max_absolute_score, jd_skill_importance_map)

    if unqualified_candidates:
        print("\n--- Resumes Not Meeting Minimum Education Criteria ---")
        for rank, candidate in enumerate(unqualified_candidates, start=1):
            _print_candidate_report(rank, candidate, max_absolute_score, jd_skill_importance_map,
                                    education_warning=True)

    print("\n--- Analysis Complete ---")

# Start the interactive analyzer only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'squarify'