In [3]:
!pip install spacy



In [4]:
!python3 -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import spacy
from spacy import displacy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

nlp = spacy.load("en_core_web_lg")

In [6]:
if "ner" in nlp.pipe_names:
    ner_labels = nlp.get_pipe("ner").labels
    print("Available Entity Categories")
    print(", ".join(sorted(ner_labels)))
else:
    print("NER pipeline component not found in the loaded model.")

Available Entity Categories
CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART


In [7]:
text_set = [
    "Leo, the first U.S. pope, was elected by the world's cardinals in May to replace the late Pope Francis. He has shown a different style from his predecessor, usually preferring to speak from carefully prepared remarks and rarely off the cuff.Leo previously called for Israel to allow more humanitarian aid to enter Gaza. He made his appeal on Wednesday at the end of his weekly audience.The Israel-Hamas conflict began on October 7, 2023, when Hamas-led gunmen burst into southern Israel, killing some 1,200 people, mainly civilians, according to Israeli tallies, and taking 251 hostages.",
    "MANILA, Philippines – The trough or extension of the low pressure area (LPA) over the West Philippine Sea is bringing scattered rain to Luzon, the weather bureau said on Wednesday afternoon, August 27.As of 3 pm on Wednesday, the LPA was located 305 kilometers west of Dagupan City, Pangasinan.Its trough is causing scattered rain and thunderstorms in Metro Manila, the Ilocos Region, Cagayan Valley, the Cordillera Administrative Region, Central Luzon, and Calabarzon.",
    "The crash occurred in the foggy hills above Calabasas, California, about 30 miles northwest of downtown Los Angeles. Bryant was killed, a person familiar with the situation told The Associated Press, and a different person familiar with the case confirmed Bryant’s 13-year-old daughter Gianna also died.",
]

In [8]:
ground_truth = {
    text_set[0]: [
        ("Leo", "PERSON"),
        ("U.S.", "GPE"),
        ("pope", "TITLE"),
        ("Pope Francis", "PERSON"),
        ("May", "DATE"),
        ("Israel", "GPE"),
        ("Gaza", "GPE"),
        ("Wednesday", "DATE"),
        ("Israel-Hamas conflict", "EVENT"),
        ("October 7, 2023", "DATE"),
        ("Hamas", "ORG"),
        ("southern Israel", "GPE"),
        ("1,200", "CARDINAL"),
        ("251", "CARDINAL"),
    ],
    text_set[1]: [
        ("MANILA", "GPE"),
        ("Philippines", "GPE"),
        ("low pressure area", "WEATHER_EVENT"),
        ("West Philippine Sea", "LOC"),
        ("Luzon", "GPE"),
        ("Wednesday", "DATE"),
        ("August 27", "DATE"),
        ("3 pm", "TIME"),
        ("Dagupan City", "GPE"),
        ("Pangasinan", "GPE"),
        ("Metro Manila", "GPE"),
        ("Ilocos Region", "GPE"),
        ("Cagayan Valley", "GPE"),
        ("Cordillera Administrative Region", "GPE"),
        ("Central Luzon", "GPE"),
        ("Calabarzon", "GPE"),
    ],
    text_set[2]: [
        ("Calabasas", "GPE"),
        ("California", "GPE"),
        ("30 miles", "QUANTITY"),
        ("northwest", "DIRECTION"),
        ("Los Angeles", "GPE"),
        ("Bryant", "PERSON"),
        ("The Associated Press", "ORG"),
        ("Gianna", "PERSON"),
        ("13-year-old", "AGE"),
    ],
}


In [9]:
true_labels = []
pred_labels = []

In [10]:
for text in text_set:
    doc = nlp(text)

    if doc.ents:
        for ent in doc.ents:
            print(f"Predicted -> Text: '{ent.text}' | Label: {ent.label_} ({spacy.explain(ent.label_)})")
        displacy.render(doc, style="ent", jupyter=True)
    else:
        print("No named entities detected in the text.")

    # Convert gold entities into spans for alignment
    gold_spans = []
    for (ent_text, ent_label) in ground_truth[text]:
        start = text.find(ent_text)
        if start != -1:
            end = start + len(ent_text)
            gold_spans.append((start, end, ent_label))

    # Assign gold labels per token
    token_truth = []
    for token in doc:
        assigned_label = "O"
        for (start, end, ent_label) in gold_spans:
            if token.idx >= start and token.idx + len(token.text) <= end:
                assigned_label = ent_label
                break
        token_truth.append(assigned_label)

    # Predicted labels
    token_preds = []
    for token in doc:
        if token.ent_type_:
            token_preds.append(token.ent_type_)
        else:
            token_preds.append("O")

    true_labels.extend(token_truth)
    pred_labels.extend(token_preds)

Predicted -> Text: 'Leo' | Label: PERSON (People, including fictional)
Predicted -> Text: 'first' | Label: ORDINAL ("first", "second", etc.)
Predicted -> Text: 'U.S.' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'May' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'Francis' | Label: PERSON (People, including fictional)
Predicted -> Text: 'Leo' | Label: PERSON (People, including fictional)
Predicted -> Text: 'Israel' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Gaza' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Wednesday' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'weekly' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'Israel' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'October 7, 2023' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'Hamas' | Label: ORG (Companies, agencies, institutions, etc.)
Predicted -> Text: 'Is

Predicted -> Text: 'MANILA' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Philippines' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'LPA' | Label: ORG (Companies, agencies, institutions, etc.)
Predicted -> Text: 'the West Philippine Sea' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Luzon' | Label: LOC (Non-GPE locations, mountain ranges, bodies of water)
Predicted -> Text: 'Wednesday' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'afternoon' | Label: TIME (Times smaller than a day)
Predicted -> Text: '3 pm on' | Label: TIME (Times smaller than a day)
Predicted -> Text: 'Wednesday' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'LPA' | Label: ORG (Companies, agencies, institutions, etc.)
Predicted -> Text: '305 kilometers' | Label: QUANTITY (Measurements, as of weight or distance)
Predicted -> Text: 'Dagupan City' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Pangasinan' | L

Predicted -> Text: 'Calabasas' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'California' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'about 30 miles' | Label: QUANTITY (Measurements, as of weight or distance)
Predicted -> Text: 'Los Angeles' | Label: GPE (Countries, cities, states)
Predicted -> Text: 'Bryant' | Label: PERSON (People, including fictional)
Predicted -> Text: 'The Associated Press' | Label: ORG (Companies, agencies, institutions, etc.)
Predicted -> Text: 'Bryant' | Label: PERSON (People, including fictional)
Predicted -> Text: '13-year-old' | Label: DATE (Absolute or relative dates or periods)
Predicted -> Text: 'Gianna' | Label: PERSON (People, including fictional)


In [11]:
# --- Evaluation Results ---
print("\n--- Evaluation Metrics ---")
print("Accuracy:", accuracy_score(true_labels, pred_labels))
print("Precision:", precision_score(true_labels, pred_labels, average="weighted", zero_division=0))
print("Recall:", recall_score(true_labels, pred_labels, average="weighted", zero_division=0))
print("F1 Score:", f1_score(true_labels, pred_labels, average="weighted", zero_division=0))


--- Evaluation Metrics ---
Accuracy: 0.8185328185328186
Precision: 0.823707081310768
Recall: 0.8185328185328186
F1 Score: 0.8141001240780467


In [26]:
import spacy
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load spaCy large model
nlp = spacy.load("en_core_web_lg")

# Load your Excel file
file_path = "Almazan - WW NER Model Evaluation.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1", header=3).dropna(axis=1, how="all")

# Text samples
text_samples = {
    1: "Leo, the first U.S. pope, met with Pope Francis in May. Israel and Gaza clashed on Wednesday. Israel-Hamas conflict worsened on October 7, 2023.",
    2: "MANILA, Philippines — The West Philippine Sea dispute continues as Luzon faced tensions on Wednesday, Aug-27, at 3:00 PM. Dagupan City was also affected, with distances up to 305 kilometers reported.",
    3: "Your third article text goes here..."
}

# Extract ground truth entities
true_entities = {}
for _, row in df.iterrows():
    try:
        text_id = int(row["Article 1"])
        entity_text = str(row["Unnamed: 3"])
        label = str(row["Unnamed: 6"])
        if text_id not in true_entities:
            true_entities[text_id] = []
        true_entities[text_id].append((entity_text, label))
    except:
        continue

# Predict entities with spaCy
pred_entities = {}
for tid, text in text_samples.items():
    doc = nlp(text)
    pred_entities[tid] = [(ent.text, ent.label_) for ent in doc.ents]

# Build true vs pred label lists
true_labels = []
pred_labels = []
comparison_rows = []  # <-- will hold rows for Excel table

for tid in text_samples.keys():
    t_ents = true_entities.get(tid, [])
    p_ents = pred_entities.get(tid, [])

    # Build mapping of predicted text → label
    pred_map = {text: label for text, label in p_ents}

    for ent_text, true_label in t_ents:
        pred_label = pred_map.get(ent_text, "O")  # if spaCy missed it
        true_labels.append(true_label)
        pred_labels.append(pred_label)
        comparison_rows.append({
            "Text_ID": tid,
            "Entity_Text": ent_text,
            "True_Label": true_label,
            "Predicted_Label": pred_label
        })

# === Evaluation metrics ===
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels, average="weighted", zero_division=0)
recall = recall_score(true_labels, pred_labels, average="weighted", zero_division=0)
f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)

labels = sorted(list(set(true_labels + pred_labels)))
cm = confusion_matrix(true_labels, pred_labels, labels=labels)
class_report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)

# === Save results into NEW Excel file ===
output_file = "NER_Evaluation_Results.xlsx"
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    # Save confusion matrix
    cm_df = pd.DataFrame(cm, index=[f"True-{l}" for l in labels], columns=[f"Pred-{l}" for l in labels])
    cm_df.to_excel(writer, sheet_name="ConfusionMatrix")

    # Save metrics
    metrics_df = pd.DataFrame({
        "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
        "Score": [accuracy, precision, recall, f1]
    })
    metrics_df.to_excel(writer, sheet_name="Metrics", index=False)

    # Save classification report
    report_df = pd.DataFrame(class_report).transpose()
    report_df.to_excel(writer, sheet_name="ClassReport")

    # Save comparison table
    comp_df = pd.DataFrame(comparison_rows)
    comp_df.to_excel(writer, sheet_name="Predicted_vs_True", index=False)

print(f"✅ Results saved into {output_file}")


✅ Results saved into NER_Evaluation_Results.xlsx
