# Named Entity Recognition (NER) for News Articles

## Imports

In [1]:
import pandas as pd
import spacy
from spacy import displacy
import os
import joblib
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support
import re
from pathlib import Path

# Ensure spaCy models are downloaded
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m129.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: 

## 1. Load and Preprocess Dataset

In [10]:
# Load CoNLL-2003 dataset
def load_conll_data(file_path):
    sentences, labels = [], []
    current_sentence, current_labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '' or line.startswith('-DOCSTART-'):
                if current_sentence:
                    sentences.append(' '.join(current_sentence))
                    labels.append(current_labels)
                    current_sentence, current_labels = [], []
            else:
                token, _, _, label = line.strip().split()
                current_sentence.append(token)
                current_labels.append(label)
    if current_sentence:
        sentences.append(' '.join(current_sentence))
        labels.append(current_labels)
    return pd.DataFrame({'text': sentences, 'labels': labels})

# Load train, validation, and test datasets
train_df = load_conll_data('train.txt')
valid_df = load_conll_data('valid.txt')
test_df = load_conll_data('test.txt')

train_df.head()

Unnamed: 0,text,labels
0,EU rejects German call to boycott British lamb .,"[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]"
1,Peter Blackburn,"[B-PER, I-PER]"
2,BRUSSELS 1996-08-22,"[B-LOC, O]"
3,The European Commission said on Thursday it di...,"[O, B-ORG, I-ORG, O, O, O, O, O, O, B-MISC, O,..."
4,Germany 's representative to the European Unio...,"[B-LOC, O, O, O, O, B-ORG, I-ORG, O, O, O, B-P..."


## 2. Rule-Based NER

In [3]:
# Simple rule-based NER using regex patterns
def rule_based_ner(text):
    entities = {'PERSON': [], 'ORG': [], 'LOC': [], 'MISC': []}
    # Basic patterns for names, organizations, locations, and misc
    person_pattern = r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'
    org_pattern = r'\b[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)+\b'
    loc_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\b'

    entities['PERSON'].extend(re.findall(person_pattern, text))
    entities['ORG'].extend([org for org in re.findall(org_pattern, text) if org not in entities['PERSON']])
    entities['LOC'].extend([loc for loc in re.findall(loc_pattern, text) if loc not in entities['PERSON'] and loc not in entities['ORG']])
    entities['MISC'].extend(['German', 'British'] if 'German' in text or 'British' in text else [])

    return entities

# Apply rule-based NER to a sample
sample_text = train_df['text'].iloc[0]
rule_entities = rule_based_ner(sample_text)
print(f"Sample Text: {sample_text}")
print("Rule-Based NER Results:", rule_entities)

Sample Text: EU rejects German call to boycott British lamb .
Rule-Based NER Results: {'PERSON': [], 'ORG': [], 'LOC': ['German', 'British'], 'MISC': ['German', 'British']}


## 3. Model-Based NER with spaCy

In [12]:
# Load spaCy models
nlp_sm = spacy.load('en_core_web_sm')
nlp_lg = spacy.load('en_core_web_lg')

# Function to extract entities using spaCy
def spacy_ner(text, model):
    doc = model(text)
    entities = {'PERSON': [], 'ORG': [], 'LOC': [], 'MISC': []}
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entities['PERSON'].append(ent.text)
        elif ent.label_ == 'ORG':
            entities['ORG'].append(ent.text)
        elif ent.label_ == 'GPE':
            entities['LOC'].append(ent.text)
        elif ent.label_ == 'NORP':
            entities['MISC'].append(ent.text)
    return entities

# Apply spaCy models with progress bar
tqdm.pandas()
train_df['entities_sm'] = train_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_sm))
train_df['entities_lg'] = train_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_lg))
valid_df['entities_sm'] = valid_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_sm))
valid_df['entities_lg'] = valid_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_lg))
test_df['entities_sm'] = test_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_sm))
test_df['entities_lg'] = test_df['text'].progress_apply(lambda x: spacy_ner(x, nlp_lg))

# Display sample results
print("\n\nSmall Model NER Results (First Row, Train):")
print(train_df['entities_sm'].iloc[0])
print("\nLarge Model NER Results (First Row, Train):")
print(train_df['entities_lg'].iloc[0])

100%|██████████| 14041/14041 [01:39<00:00, 140.96it/s]
100%|██████████| 14041/14041 [01:46<00:00, 131.51it/s]
100%|██████████| 3250/3250 [00:23<00:00, 136.16it/s]
100%|██████████| 3250/3250 [00:24<00:00, 131.93it/s]
100%|██████████| 3453/3453 [00:23<00:00, 150.07it/s]
100%|██████████| 3453/3453 [00:24<00:00, 138.38it/s]



Small Model NER Results (First Row, Train):
{'PERSON': [], 'ORG': ['EU'], 'LOC': [], 'MISC': ['German', 'British']}

Large Model NER Results (First Row, Train):
{'PERSON': [], 'ORG': ['EU'], 'LOC': [], 'MISC': ['German', 'British']}





## 4. Visualize Entities with displaCy

In [5]:
# Visualize entities for a sample text
sample_doc_sm = nlp_sm(sample_text)
sample_doc_lg = nlp_lg(sample_text)

print("\nSmall Model Visualization:")
displacy.render(sample_doc_sm, style='ent', jupyter=True)

print("\nLarge Model Visualization:")
displacy.render(sample_doc_lg, style='ent', jupyter=True)


Small Model Visualization:



Large Model Visualization:


## 5. Evaluate Models

In [6]:
# Function to extract true entities from CoNLL labels
def extract_true_entities(text, labels):
    entities = {'PER': [], 'ORG': [], 'LOC': [], 'MISC': []}
    current_entity = []
    current_type = None
    for token, label in zip(text.split(), labels):
        if label.startswith('B-'):
            if current_entity:
                entities[current_type].append(' '.join(current_entity))
            current_entity = [token]
            current_type = label[2:]
        elif label.startswith('I-') and current_type == label[2:]:
            current_entity.append(token)
        else:
            if current_entity:
                entities[current_type].append(' '.join(current_entity))
                current_entity = []
                current_type = None
    if current_entity:
        entities[current_type].append(' '.join(current_entity))
    return entities

# Function to evaluate NER performance
def evaluate_ner(true_labels, pred_entities):
    y_true, y_pred = [], []
    for (text, labels), pred in zip(true_labels, pred_entities):
        true_entities = extract_true_entities(text, labels)
        for entity_type in ['PER', 'ORG', 'LOC', 'MISC']:
            pred_key = entity_type if entity_type != 'PER' else 'PERSON'
            for entity in true_entities[entity_type]:
                y_true.append(entity_type + ':' + entity)
                y_pred.append(entity_type + ':' + entity if entity in pred[pred_key] else 'O')
            for entity in pred[pred_key]:
                if entity not in true_entities[entity_type]:
                    y_true.append('O')
                    y_pred.append(entity_type + ':' + entity)

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='micro', zero_division=0)
    return precision, recall, f1

# Evaluate on validation and test sets
valid_sm_entities = valid_df['entities_sm'].tolist()
valid_lg_entities = valid_df['entities_lg'].tolist()
test_sm_entities = test_df['entities_sm'].tolist()
test_lg_entities = test_df['entities_lg'].tolist()

valid_sm_precision, valid_sm_recall, valid_sm_f1 = evaluate_ner(valid_df[['text', 'labels']].values, valid_sm_entities)
valid_lg_precision, valid_lg_recall, valid_lg_f1 = evaluate_ner(valid_df[['text', 'labels']].values, valid_lg_entities)
test_sm_precision, test_sm_recall, test_sm_f1 = evaluate_ner(test_df[['text', 'labels']].values, test_sm_entities)
test_lg_precision, test_lg_recall, test_lg_f1 = evaluate_ner(test_df[['text', 'labels']].values, test_lg_entities)

# Print evaluation results
print("\nValidation Set - Small Model Scores:")
print(f"Precision: {valid_sm_precision:.4f}, Recall: {valid_sm_recall:.4f}, F1: {valid_sm_f1:.4f}")
print("Validation Set - Large Model Scores:")
print(f"Precision: {valid_lg_precision:.4f}, Recall: {valid_lg_recall:.4f}, F1: {valid_lg_f1:.4f}")
print("\nTest Set - Small Model Scores:")
print(f"Precision: {test_sm_precision:.4f}, Recall: {test_sm_recall:.4f}, F1: {test_sm_f1:.4f}")
print("Test Set - Large Model Scores:")
print(f"Precision: {test_lg_precision:.4f}, Recall: {test_lg_recall:.4f}, F1: {test_lg_f1:.4f}")


Validation Set - Small Model Scores:
Precision: 0.4748, Recall: 0.4748, F1: 0.4748
Validation Set - Large Model Scores:
Precision: 0.5386, Recall: 0.5386, F1: 0.5386

Test Set - Small Model Scores:
Precision: 0.4351, Recall: 0.4351, F1: 0.4351
Test Set - Large Model Scores:
Precision: 0.4933, Recall: 0.4933, F1: 0.4933


## 6. Save Models

In [7]:
# Create directory for saved models
os.makedirs('saved_models', exist_ok=True)

# Save spaCy models
nlp_sm.to_disk('saved_models/en_core_web_sm')
nlp_lg.to_disk('saved_models/en_core_web_lg')

# Save entity results for inference
joblib.dump({
    'train': train_df[['text', 'entities_sm', 'entities_lg']],
    'valid': valid_df[['text', 'entities_sm', 'entities_lg']],
    'test': test_df[['text', 'entities_sm', 'entities_lg']]
}, 'saved_models/ner_results.pkl')

['saved_models/ner_results.pkl']

## 7. Inference on New Articles

In [8]:
# Load saved models
nlp_sm = spacy.load('saved_models/en_core_web_sm')
nlp_lg = spacy.load('saved_models/en_core_web_lg')

# New articles for inference
new_articles = [
    "Apple Inc. announced a new product launch in San Francisco.",
    "Elon Musk visited London to discuss Tesla's expansion plans.",
    "The United Nations held a climate conference in Paris."
]

for article in new_articles:
    # Rule-based NER
    rule_entities = rule_based_ner(article)

    # spaCy NER
    entities_sm = spacy_ner(article, nlp_sm)
    entities_lg = spacy_ner(article, nlp_lg)

    print(f"\nArticle: {article}")
    print("Rule-Based NER:", rule_entities)
    print("Small Model NER:", entities_sm)
    print("Large Model NER:", entities_lg)

    # Visualize with displaCy
    print("\nSmall Model Visualization:")
    displacy.render(nlp_sm(article), style='ent', jupyter=True)
    print("Large Model Visualization:")
    displacy.render(nlp_lg(article), style='ent', jupyter=True)


Article: Apple Inc. announced a new product launch in San Francisco.
Rule-Based NER: {'PERSON': ['Apple Inc', 'San Francisco'], 'ORG': [], 'LOC': [], 'MISC': []}
Small Model NER: {'PERSON': [], 'ORG': ['Apple Inc.'], 'LOC': ['San Francisco'], 'MISC': []}
Large Model NER: {'PERSON': [], 'ORG': ['Apple Inc.'], 'LOC': ['San Francisco'], 'MISC': []}

Small Model Visualization:


Large Model Visualization:



Article: Elon Musk visited London to discuss Tesla's expansion plans.
Rule-Based NER: {'PERSON': ['Elon Musk'], 'ORG': [], 'LOC': ['London', 'Tesla'], 'MISC': []}
Small Model NER: {'PERSON': ['Elon Musk'], 'ORG': ['Tesla'], 'LOC': ['London'], 'MISC': []}
Large Model NER: {'PERSON': ['Elon Musk'], 'ORG': ['Tesla'], 'LOC': ['London'], 'MISC': []}

Small Model Visualization:


Large Model Visualization:



Article: The United Nations held a climate conference in Paris.
Rule-Based NER: {'PERSON': ['The United'], 'ORG': ['The United Nations'], 'LOC': ['Nations', 'Paris'], 'MISC': []}
Small Model NER: {'PERSON': [], 'ORG': ['The United Nations'], 'LOC': ['Paris'], 'MISC': []}
Large Model NER: {'PERSON': [], 'ORG': ['The United Nations'], 'LOC': ['Paris'], 'MISC': []}

Small Model Visualization:


Large Model Visualization:
