In [1]:
import spacy
# spaCy provides the pre-trained pipeline used for named-entity recognition (NER)

In [5]:
import re
# Import Regex

In [6]:
# Load the spaCy English pipeline once at import time so every call reuses it.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as err:
    # spacy.load raises OSError when the model package is not installed.
    # Catching only that keeps KeyboardInterrupt and unrelated failures visible
    # instead of masking them behind the "download the model" hint.
    raise RuntimeError("Run this first: python -m spacy download en_core_web_sm") from err

In [7]:
# Label patterns for the structured fields, compiled once at import time.

# "Number" is tried before "No" so "Roll Number" is not read as "No" + "umber".
# The negative lookahead stops the bare "Roll" fallback from capturing the
# label word itself (e.g. "Roll No" with no value must not yield "No").
_ROLL_NO_RE = re.compile(
    r"\bRoll\s*(?:Number|No\.?)?\s*[:\-]?\s*(?!(?:Number|No)\b)(\w+)",
    flags=re.I,
)
_CERT_ID_RE = re.compile(
    r"\b(?:Cert(?:ificate)?\s*ID|CID)\s*[:\-]?\s*(\w+)",
    flags=re.I,
)
# Requires at least one digit, so a lone "." is rejected and a trailing
# sentence period ("CGPA: 8.5.") is not swallowed into the value.
_CGPA_RE = re.compile(
    r"\b(?:CGPA|GPA|Grade|Marks)\s*[:\-]?\s*(\d*\.?\d+)",
    flags=re.I,
)
# Digit groups joined by "/" or "-"; a bare separator is never a date.
_ISSUE_DATE_RE = re.compile(
    r"\b(?:Date\s*of\s*Issue|Issued\s*on|Date)\s*[:\-]?\s*(\d+(?:[/-]\d+)*)",
    flags=re.I,
)

# spaCy entity labels that are treated as an institution name.
_INSTITUTION_LABELS = ("ORG", "FAC", "GPE")


def extract_entities_spacy_extended(text):
    """
    Extract important information from OCR text:
    Name, Roll No, Certificate ID, CGPA, Date of Issue, Institution Name

    Names, institutions and dates come from the spaCy entities of the
    module-level ``nlp`` pipeline (PERSON -> "Name"; ORG/FAC/GPE ->
    "Institution Name"; every DATE entity -> "Date of Issue"). Roll number,
    certificate ID, CGPA and a labelled issue date are additionally pulled
    out with case-insensitive regular expressions keyed on their labels.

    Args:
        text (str): OCR extracted text.

    Returns:
        dict: maps each of the six field names to a list of unique matched
        strings, in order of first appearance (spaCy matches first, then
        regex matches). A field with no match maps to an empty list.
    """
    entities = {
        "Name": [],
        "Roll No": [],
        "Certificate ID": [],
        "CGPA": [],
        "Date of Issue": [],
        "Institution Name": []
    }

    # --- Step 1: Use spaCy for names, organizations and dates ---
    doc = nlp(text)
    for ent in doc.ents:
        value = ent.text.strip()
        if not value:
            # Whitespace-only spans (e.g. OCR line breaks) carry no information.
            continue
        if ent.label_ == "PERSON":
            entities["Name"].append(value)
        elif ent.label_ in _INSTITUTION_LABELS:
            entities["Institution Name"].append(value)
        elif ent.label_ == "DATE":
            entities["Date of Issue"].append(value)

    # --- Step 2: Regex for structured fields ---
    entities["Roll No"].extend(_ROLL_NO_RE.findall(text))
    entities["Certificate ID"].extend(_CERT_ID_RE.findall(text))
    entities["CGPA"].extend(_CGPA_RE.findall(text))
    entities["Date of Issue"].extend(_ISSUE_DATE_RE.findall(text))

    # Remove duplicates while keeping first-seen order, so the output is
    # deterministic across runs (a plain set() would shuffle the values).
    return {field: list(dict.fromkeys(values)) for field, values in entities.items()}
