In [1]:
import sys
import os

# Make the project's 'scripts' directory importable so the local modules
# imported below can be found. The path is resolved relative to the current
# working directory, so the notebook must be run from its own folder.
SCRIPTS_DIR = os.path.abspath("../scripts")
# Guard the append so re-running this cell does not pile up duplicate entries.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

# Import modules
from fetch_article import fetch_article_from_url
from ner_tools import load_ner_tools, apply_ner_tools
from comparison import compare_ner_results, display_comparison, extract_and_display_organizations

# Testing in English

In [2]:
# URL of the news article to analyze (DW, English edition).
url = "https://www.dw.com/en/volkswagen-vw-banking-on-global-sales-to-stay-ahead-of-the-mobility-curve/a-71064923"
# Language code handed to load_ner_tools further down.
# TODO: replace this hard-coded value with automatic language identification.
language = "en"

In [3]:
# Fetch article
# Download the article text for the URL configured above.
print("Fetching article...")
article_text = fetch_article_from_url(url)
# Fail fast: without this check the success message below would be printed
# even when nothing was retrieved, and the NER steps would run on empty input.
if not article_text:
    raise ValueError(f"No article text could be fetched from {url!r}")
print("Article fetched successfully!")

Fetching article...
Article fetched successfully!


In [4]:
# Load NER tools
# Build the NER tools for the configured language. The log printed under this
# cell shows a Hugging Face BERT token-classification checkpoint being loaded.
print("Loading NER tools...")
tools = load_ner_tools(language=language)
print("NER tools loaded!")

Loading NER tools...


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NER tools loaded!


In [5]:
# Apply NER tools
# Run the loaded tools on the fetched article text; the result is the input
# to compare_ner_results in the next cell.
print("Applying NER tools...")
ner_results = apply_ner_tools(article_text, tools)

Applying NER tools...


In [6]:
# Compare results
# Build the comparison object that both display helpers below take as input.
print("Comparing results...")
comparison = compare_ner_results(article_text, ner_results)

Comparing results...


In [7]:
# Display results
# Prints one row per entity with the label assigned by each tool
# ("N/A" appears where a tool has no label for that entry).
# NOTE(review): the row "Germanwith" in the output looks like two words fused
# together — possibly whitespace lost during text extraction; verify upstream.
print("Results:")
display_comparison(comparison)

Results:
Word                          spaCy                         Hugging Face (BERT)           
Germanness                    ORG                           N/A                           
China                         GPE                           I-LOC                         
Brazil                        GPE                           I-LOC                         
US                            GPE                           I-LOC                         
Volkswagen                    ORG                           I-ORG                         
VW                            ORG                           N/A                           
Germanwith                    PERSON                        N/A                           
Beetle                        GPE                           N/A                           
Golf                          GPE                           I-MISC                        
Polo                          PERSON                        I-MISC               

In [8]:
# Display tokens identified as organizations
# NOTE(review): the table printed by this cell contains the same rows as the
# full comparison above, including GPE and PERSON entries, so no filtering to
# organizations is visible — confirm the behavior of
# extract_and_display_organizations.
print("\nOrganizations identified:")
organizations = extract_and_display_organizations(comparison)


Organizations identified:
Token                         spaCy                         Hugging Face (BERT)           
Germanness                    ORG                           N/A                           
China                         GPE                           I-LOC                         
Brazil                        GPE                           I-LOC                         
US                            GPE                           I-LOC                         
Volkswagen                    ORG                           I-ORG                         
VW                            ORG                           N/A                           
Germanwith                    PERSON                        N/A                           
Beetle                        GPE                           N/A                           
Golf                          GPE                           I-MISC                        
Polo                          PERSON                        I-M

# Testing in German

In [9]:
# Define URL and language
# URL of the news article to analyze (DW, German edition).
url = "https://www.dw.com/de/was-bei-einem-tiktok-bann-in-den-usa-f%C3%BCr-nutzer-wichtig-ist/a-71289246"
# Language code handed to load_ner_tools further down.
language = "de"

In [10]:
# Fetch article
# Download the article text for the URL configured above.
print("Fetching article...")
article_text = fetch_article_from_url(url)
# Fail fast: without this check the success message below would be printed
# even when nothing was retrieved, and the NER steps would run on empty input.
if not article_text:
    raise ValueError(f"No article text could be fetched from {url!r}")
print("Article fetched successfully!")

Fetching article...
Article fetched successfully!


In [11]:
# Load NER tools
# NOTE(review): the log under this cell reports that the classifier weights of
# deepset/gbert-large are newly initialized, and the Hugging Face column in the
# results shows generic LABEL_0/LABEL_1 tags. This suggests the German
# checkpoint is not fine-tuned for NER — confirm the model choice made in
# load_ner_tools.
print("Loading NER tools...")
tools = load_ner_tools(language=language)
print("NER tools loaded!")

Loading NER tools...


config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/240k [00:00<?, ?B/s]

NER tools loaded!


In [12]:
# Apply NER tools
# Run the loaded tools on the fetched article text; the result is the input
# to compare_ner_results in the next cell.
print("Applying NER tools...")
ner_results = apply_ner_tools(article_text, tools)

Applying NER tools...


In [13]:
# Compare results
# Build the comparison object that both display helpers below take as input.
print("Comparing results...")
comparison = compare_ner_results(article_text, ner_results)

Comparing results...


In [14]:
# Display results
# Prints one row per entity with the label assigned by each tool
# ("N/A" appears where a tool has no label for that entry).
# NOTE(review): the row "KonzernBytedance" in the output looks like two words
# fused together — possibly whitespace lost during text extraction; verify
# upstream.
print("Results:")
display_comparison(comparison)

Results:
Word                          spaCy                         Hugging Face (BERT)           
TikTok                        MISC                          N/A                           
den USA                       LOC                           N/A                           
KonzernBytedance              MISC                          N/A                           
amerikanischen                MISC                          LABEL_0                       
App-Stores                    LOC                           N/A                           
USA                           LOC                           LABEL_0                       
Browser                       MISC                          LABEL_1                       
US-Regierung                  LOC                           N/A                           
chinesische                   MISC                          LABEL_1                       
Bytedance                     LOC                           N/A                  

In [15]:
# Display tokens identified as organizations
# NOTE(review): none of the rows printed by this cell carries an ORG label,
# yet every row of the full comparison is listed, so no filtering to
# organizations is visible — confirm the behavior of
# extract_and_display_organizations.
print("\nOrganizations identified:")
organizations = extract_and_display_organizations(comparison)


Organizations identified:
Token                         spaCy                         Hugging Face (BERT)           
TikTok                        MISC                          N/A                           
den USA                       LOC                           N/A                           
KonzernBytedance              MISC                          N/A                           
amerikanischen                MISC                          LABEL_0                       
App-Stores                    LOC                           N/A                           
USA                           LOC                           LABEL_0                       
Browser                       MISC                          LABEL_1                       
US-Regierung                  LOC                           N/A                           
chinesische                   MISC                          LABEL_1                       
Bytedance                     LOC                           N/A