In [14]:
from transformers import pipeline
import json

class ReliableResumeParser:
    def __init__(self):
        # We stick with the same model, but we will process its output much more strictly
        self.ner_pipeline = pipeline(
            "ner",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple" # This attempts to join ##tokens
        )

        # A simple "Knowledge Base" to correct the AI's mistakes
        # If the AI thinks "Python" is a Company, this list will correct it to "Skill"
        self.known_skills = {
            "Python", "Java", "C++", "SQL", "Spark", "Kafka", "Hadoop",
            "Docker", "Kubernetes", "React", "AWS", "GCP", "Azure",
            "TensorFlow", "PyTorch", "Pandas", "Linux", "Git", "PostGIS"
        }

    def parse(self, resume_text):
        print("Analyzing resume...")

        # 1. Clean the text slightly (remove excessive tabs/newlines which confuse BERT)
        clean_text = " ".join(resume_text.split())

        entities = self.ner_pipeline(clean_text)

        data = {
            "Candidate Name": None,
            "Companies": [],
            "Universities": [],
            "Locations": [],
            "Detected Skills": [],  # New Category
            "Debug_Log": []
        }

        # 2. Iterate with STRICT Logic
        for entity in entities:
            text = entity['word'].strip()
            score = entity['score']
            group = entity['entity_group']

            # RULE 1: Ignore low confidence garbage (Fixes "Dock", "Ku", "##Flow")
            if score < 0.85:
                continue

            # RULE 2: Ignore tiny tokens (artifacts like "P", "A")
            if len(text) < 2:
                continue

            # RULE 3: Check our Skills Knowledge Base
            # If the text is in our known skills list, force it to 'Skills' regardless of what AI said
            if text in self.known_skills:
                data["Detected Skills"].append(text)
                continue

            # RULE 4: Categorization
            if group == "PER":
                # Only accept names if we haven't found one yet (usually the first one is correct)
                if data["Candidate Name"] is None:
                    data["Candidate Name"] = text

            elif group == "ORG":
                if any(x in text for x in ["University", "College", "School"]):
                    data["Universities"].append(text)
                else:
                    data["Companies"].append(text)

            elif group == "LOC":
                data["Locations"].append(text)

            data["Debug_Log"].append(f"Accepted: {text} ({group}) - {score:.2f}")

        # RULE 5: Fallback for Name
        # If AI didn't find a Person, assume the first line of the original text is the name
        if data["Candidate Name"] is None:
            first_line = resume_text.strip().split('\n')[0]
            # Heuristic: If first line is short (likely a name), use it
            if len(first_line) < 30:
                data["Candidate Name"] = first_line

        # Deduplicate lists
        for key in ["Companies", "Universities", "Locations", "Detected Skills"]:
            data[key] = list(set(data[key]))

        return data

# --- TEST ---
if __name__ == "__main__":
    real_resume = """
    John Smith
    Seattle | john.smith@email.com

EXPERIENCE

Software Engineer
Microsoft | Redmond, WA
2018 - Present
- Developed new features for Azure cloud services using C# and .NET.
- Collaborated with teams in London and Tokyo to improve server uptime.

Junior Developer
Amazon | Seattle, WA
2016 - 2018
- Maintained the internal inventory system using Java and AWS.
- Fixed bugs in the checkout process.

EDUCATION

Bachelor of Science in Computer Science
PES University | Seattle, WA"""

    parser = ReliableResumeParser()
    result = parser.parse(real_resume)

    print("\n--- ðŸ“„ Parse Results ---")
    print(json.dumps(result, indent=4))

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Analyzing resume...

--- ðŸ“„ Parse Results ---
{
    "Candidate Name": "John Smith",
    "Companies": [
        "Microsoft",
        "Amazon"
    ],
    "Universities": [
        "PES University"
    ],
    "Locations": [
        "WA",
        "Seattle",
        "Tokyo",
        "London",
        "Redmond"
    ],
    "Detected Skills": [
        "Java",
        "Azure",
        "AWS"
    ],
    "Debug_Log": [
        "Accepted: John Smith (PER) - 0.90",
        "Accepted: Microsoft (ORG) - 1.00",
        "Accepted: Redmond (LOC) - 0.96",
        "Accepted: WA (LOC) - 1.00",
        "Accepted: London (LOC) - 1.00",
        "Accepted: Tokyo (LOC) - 1.00",
        "Accepted: Amazon (ORG) - 0.99",
        "Accepted: Seattle (LOC) - 1.00",
        "Accepted: WA (LOC) - 1.00",
        "Accepted: PES University (ORG) - 0.93",
        "Accepted: Seattle (LOC) - 1.00",
        "Accepted: WA (LOC) - 1.00"
    ]
}
