# **Import Libraries**

In [None]:
!pip install pdfplumber transformers torch

In [None]:
import pdfplumber
from google.colab import files
from transformers import pipeline
import re
import logging
# Suppress BERT model warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

#  **The Model**

In [None]:
# Prompt for a PDF via files.upload(); the first key of the returned mapping
# is used as the path of the file to process.
uploaded = files.upload()
pdf_path = list(uploaded)[0]

In [None]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Pages are joined with a newline so that the last line of one page does
    not run into the first line of the next (line-based parsing downstream
    relies on that boundary). A page with no extractable text contributes an
    empty string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The combined page text, or None if the file could not be opened or
        processed (the error is printed rather than raised).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        # Deliberately broad: this is the notebook's best-effort boundary and
        # callers only check for None.
        print(f"Unable to process {pdf_path}\nerror = {e}")
        return None

# Build the "ner" pipeline from the pre-trained dslim/bert-base-NER checkpoint,
# using that same checkpoint for both the model weights and the tokenizer.
nlp = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
)

# ---------------------------------------------------------------------------
# Field extractors. Each helper returns a single string, or None when the
# field cannot be found.
# ---------------------------------------------------------------------------

_PERSON_TAGS = ("B-PER", "I-PER")
_PLACE_TAGS = ("B-ORG", "I-ORG", "B-LOC", "I-LOC")
_INSTITUTION_WORDS = ("university", "college", "institute")
_CONTACT_WORDS = ("phone", "tel", "mobile", "contact")

# A candidate made only of spaces, dashes or brackets satisfies the phone
# character class, so candidates must also contain this many digits.
_MIN_PHONE_DIGITS = 7

_LABELLED_PHONE_RE = re.compile(
    r'(Phone|Tel|Mobile|Contact):?\s*([\+0-9\-\(\)\s]{8,})', re.IGNORECASE)
_BARE_PHONE_RE = re.compile(r'[\+0-9\-\(\)\s]{8,}')

# The optional "faculty of" / "at" prefix must be followed by whitespace;
# otherwise "at" would swallow the first letters of names like "Athens".
_NER_INSTITUTION_RE = re.compile(
    r'(?:(?:faculty of|at)\s+)?\s*([\w\s]+?(?:university|college|institute))\b',
    re.IGNORECASE)
_LINE_INSTITUTION_RE = re.compile(
    r'(?:(?:faculty of|at)\s+)?\s*([\w\s]+?(?:university|college|institute))'
    r'(?:,|\s*\d{4}|$)',
    re.IGNORECASE)

# Years 2000-2025 inclusive.
_YEAR_RE = re.compile(r'\b(20(?:[01][0-9]|2[0-5]))\b')


def _join_wordpieces(pieces):
    """Join token strings with spaces, gluing '##'-prefixed pieces to the previous token."""
    return " ".join(pieces).replace(" ##", "").strip()


def _extract_name(entities, text):
    """Return the first multi-word run of PER-tagged tokens, else a first-line guess."""
    run = []
    # The trailing None flushes a run that reaches the end of the entity list.
    for ent in [*entities, None]:
        if ent is not None and ent["entity"].startswith(_PERSON_TAGS):
            run.append(ent["word"])
            continue
        candidate = _join_wordpieces(run)
        if len(candidate.split()) >= 2:  # Multi-word name
            return candidate
        run = []

    # Fallback: the first non-blank line, unless it looks like a contact line.
    first_line = next(
        (ln.strip() for ln in text.split('\n') if ln.strip()), "")
    lowered = first_line.lower()
    if len(first_line.split()) >= 2 and not any(w in lowered for w in _CONTACT_WORDS):
        return first_line
    return None


def _extract_phone(text):
    """Return the first phone-like string, preferring one that follows a label."""
    for pattern, group in ((_LABELLED_PHONE_RE, 2), (_BARE_PHONE_RE, 0)):
        for match in pattern.finditer(text):
            candidate = match.group(group).strip()
            if sum(ch.isdigit() for ch in candidate) >= _MIN_PHONE_DIGITS:
                return candidate
    return None


def _extract_education_loc(entities, text):
    """Return an institution name from ORG/LOC tokens, else from keyword lines."""
    parts = []
    for ent in entities:
        if not ent["entity"].startswith(_PLACE_TAGS):
            continue
        parts.append(ent["word"])
        if any(w in ent["word"].lower() for w in _INSTITUTION_WORDS):
            joined = _join_wordpieces(parts)
            # Clean up to just the institution name when the pattern fits.
            match = _NER_INSTITUTION_RE.search(joined)
            return match.group(1).strip() if match else joined
        if len(parts) > 3:  # Reset if too many unrelated parts
            parts = []

    # Fallback: scan lines that mention an education keyword.
    for line in text.split('\n'):
        lowered = line.lower()
        if not any(w in lowered for w in (*_INSTITUTION_WORDS, "faculty")):
            continue
        match = _LINE_INSTITUTION_RE.search(line)
        if match:
            return match.group(1).strip()
        for word in _INSTITUTION_WORDS:
            pos = lowered.find(word)
            if pos != -1:
                # Clamp at 0: a negative start would slice from the end of
                # the line and return the wrong fragment.
                start = max(0, pos - 20)
                end = pos + len(word) + 20
                return line[start:end].strip()
        # Only "faculty" was present on this line; keep looking further down.
    return None


def _extract_education_year(text):
    """Return the first standalone year between 2000 and 2025 found in the text."""
    match = _YEAR_RE.search(text)
    return match.group(1) if match else None


# Process the PDF
pdf_text = extract_text_from_pdf(pdf_path)
if pdf_text is None or not pdf_text.strip():
    # No text at all (e.g. nothing extractable) is reported as a failure
    # rather than silently producing an empty results file.
    print("Failed to extract text from PDF")
else:
    # Run BERT NER on the text
    # NOTE(review): the model has a bounded input length, so tokens beyond it
    # may not be tagged for long documents - confirm against the pipeline docs.
    ner_results = nlp(pdf_text)

    # Desired entities
    desired_entities = {"NAME", "PHONE", "EDUCATION_LOC", "EDUCATION_YEAR"}

    found = {
        "NAME": _extract_name(ner_results, pdf_text),
        "PHONE": _extract_phone(pdf_text),
        "EDUCATION_LOC": _extract_education_loc(ner_results, pdf_text),
        "EDUCATION_YEAR": _extract_education_year(pdf_text),
    }
    # Same shape as before: label -> list of values, only for fields found.
    d = {label: [value] for label, value in found.items() if value}

    # Write to file (explicit encoding so non-ASCII names are written reliably)
    with open("resume_entities.txt", "w", encoding="utf-8") as f:
        for label in sorted(desired_entities):
            if label in d:
                f.write(f"{label}:\n")
                for text in dict.fromkeys(d[label]):
                    cleaned_text = text.replace('\n', '')
                    f.write(f"{cleaned_text}\n")
                f.write("\n")

    print("Entities saved to resume_entities.txt")