# In [None]:
from transformers import pipeline
import re

# 1. Build the shared NER pipeline once, at module level.
# aggregation_strategy="simple" asks the pipeline to group sub-tokens into
# entity groups; expand_to_full_word() is still applied afterwards to widen
# each reported span to whole-word boundaries in the original text.
ner_pipeline = pipeline(
    task="ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

def expand_to_full_word(text, start, end):
    """
    Expands BERT indices to capture the full word from the original string.
    Ensures that 'Ra' becomes 'Rahul' and '##fosys' becomes 'Infosys'.
    """
    while start > 0 and text[start-1].isalnum():
        start -= 1
    while end < len(text) and text[end].isalnum():
        end += 1
    return text[start:end].strip()

# A line containing any of these is treated as the education entry.
_UNI_LINE_RE = re.compile(
    r'IIT\s+[A-Za-z]+|University|Institute of Technology|College',
    re.IGNORECASE,
)

# Leading labels such as "Education: " that are removed from a university line.
_EDU_LABEL_RE = re.compile(r'^(Education|Uni|School):\s*', re.IGNORECASE)

# Everything from the first separator / role word onwards is dropped from an
# organisation name. Role words are matched as whole words so that names like
# "International ..." or "Engineers ..." are not truncated, and an ASCII hyphen
# only counts as a separator when it touches whitespace ("Hewlett-Packard"
# stays intact, "Google - Senior Engineer" is still cut).
_ORG_CUTOFF_RE = re.compile(
    r'\s-|-\s|[|–—,]|\bat\s|\b(?:Software|Engineer|Developer|Intern|Assistant)\b',
    re.IGNORECASE,
)


def _extract_name(line, entities):
    """Return the name from the first usable PER entity on *line*, or ""."""
    for ent in entities:
        if ent['entity_group'] != 'PER':
            continue
        first = expand_to_full_word(line, ent['start'], ent['end'])
        if not first:
            continue
        # Capture a potential surname on the same line. The detected name is
        # escaped so characters such as "." or "(" are matched literally.
        with_surname = re.search(rf"{re.escape(first)}\s+([A-Z][a-z]+)", line)
        return with_surname.group() if with_surname else first
    return ""


def _clean_org(org_full):
    """Strip trailing role text / separators from an organisation span."""
    return _ORG_CUTOFF_RE.split(org_full)[0].strip()


def smart_resume_parser(resume_text):
    """
    Extract a candidate name, university and up to two companies from a resume.

    The text is processed line by line (blank lines are ignored):

    * Name: the first PER entity found on one of the first two lines,
      extended by a following capitalised word if present. If no PER entity
      is found, the first non-empty line is used.
    * University: any line matching the education patterns; a leading
      "Education:" / "Uni:" / "School:" label is removed. If several lines
      match, the last one wins.
    * Companies: ORG entities on non-university lines, cleaned of role text,
      de-duplicated so that a name contained in a longer one is dropped
      (longest names first, ties in order of first appearance).

    Args:
        resume_text: The full resume as a single string.

    Returns:
        dict with keys "Name" (str), "University" (str, "Not Found" when
        absent) and "Companies" (list of at most two str).
    """
    # Split the block of text into individual non-empty, stripped lines.
    lines = [l.strip() for l in resume_text.splitlines() if l.strip()]

    name, university, raw_companies = "", "", []

    for i, line in enumerate(lines):
        # --- UNIVERSITY (pure regex, no model needed) ---
        is_uni = _UNI_LINE_RE.search(line) is not None
        if is_uni:
            university = _EDU_LABEL_RE.sub('', line).strip()

        # The name is only looked for on the first two lines.
        wants_name = i < 2 and not name

        # University lines contribute no companies, so skip the model call
        # entirely when the name is not being searched for either.
        if is_uni and not wants_name:
            continue

        entities = ner_pipeline(line)

        # --- NAME ---
        if wants_name:
            name = _extract_name(line, entities)

        # --- COMPANIES ---
        if is_uni:
            continue
        for ent in entities:
            if ent['entity_group'] != 'ORG':
                continue
            org_full = expand_to_full_word(line, ent['start'], ent['end'])
            clean_org = _clean_org(org_full)

            # Avoid adding the candidate's own name as a company.
            if len(clean_org) > 2 and clean_org.lower() not in name.lower():
                raw_companies.append(clean_org)

    # Deduplicate companies (removes 'Google' if 'Google Cloud' is found).
    # dict.fromkeys keeps first-appearance order and list.sort is stable, so
    # the result is deterministic, unlike sorting the contents of a set().
    unique_companies = list(dict.fromkeys(raw_companies))
    unique_companies.sort(key=len, reverse=True)
    final_companies = []
    for co in unique_companies:
        if not any(co in other for other in final_companies):
            final_companies.append(co)

    return {
        "Name": name if name else (lines[0] if lines else "Not Found"),
        "University": university if university else "Not Found",
        "Companies": final_companies[:2]  # Limit to two entries
    }

# ---------------------------------------------------------
# TEST DATA
# ---------------------------------------------------------
# Sample resumes fed to smart_resume_parser() below. Continuation lines inside
# each triple-quoted string keep their leading indentation; the parser strips
# every line before using it.
resumes_to_test = [
    # Unlabelled layout; the company line carries a " - <role>" suffix.
    """Amit Sharma
    Data Scientist with 5 years of experience in AI.
    Indian Institute of Technology Delhi
    Google Cloud Platform - Senior Engineer""",

    # "Education:" label and an "Experience:" header; roles follow a comma.
    """Sita Ramakrishnan
    Education: Anna University
    Experience:
    Amazon.com, Software Development Intern
    Microsoft India, Research Assistant""",

    # Name containing an apostrophe; company with a parenthesised alias.
    """Liam O'Connor
    University of California, Berkeley
    Facebook (Meta)
    Netflix""",

    # Company names with punctuation ("J.P.", "&", parentheses) and four
    # company lines, which is more than the two companies the parser returns.
    """Neha Patil
    PES University
    J.P. Morgan Chase & Co.
    General Electric (GE) Aviation
    Amazon.com, Software Development Intern
    Microsoft India, Research Assistant"""
]

# ---------------------------------------------------------
# EXECUTION
# ---------------------------------------------------------
# Parse every sample resume and print a small report for each one.
print("=== Smart Resume Parser ===\n")

for i, text in enumerate(resumes_to_test, 1):
    result = smart_resume_parser(text)
    # One print call per resume; sep="\n" yields the same line-by-line output.
    print(
        f"RESUME {i}:",
        f"  Name:       {result['Name']}",
        f"  University: {result['University']}",
        f"  Companies:  {result['Companies']}",
        "-" * 45,
        sep="\n",
    )