In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch

In [3]:
# Build the aggregated token-classification ("ner") pipeline used by the first demo
ner_pipeline = pipeline(
    task="ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Device set to use cpu


Available entity types (labels)
1.  PER (Person)
2.  ORG (Organization)
3.  LOC (Location)
4.  MISC (Miscellaneous)

In [8]:
# Sentence to analyse
text = "I live in India"

# Run the pipeline on the sentence
entities = ner_pipeline(text)

# Report every entity: surface form, entity group, score, then a separator line
for entity in entities:
    print(
        f"Entity: {entity['word']}",
        f"Type: {entity['entity_group']}",
        f"Confidence: {entity['score']:.4f}",
        "-" * 30,
        sep="\n",
    )

Entity: India
Type: LOC
Confidence: 0.9998
------------------------------


In [9]:
# Checkpoint identifier shared by the tokenizer and the token-classification model
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"

# Load both from the same checkpoint (tokenizer first, then model)
tokenizer, model = (
    loader.from_pretrained(model_name)
    for loader in (AutoTokenizer, AutoModelForTokenClassification)
)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Sentence used for the manual (non-pipeline) walkthrough
text = "Google and Microsoft are competing in the AI space while Elon Musk founded SpaceX."

# Encode the sentence as PyTorch tensors, keeping the special tokens
inputs = tokenizer(
    text=text,
    add_special_tokens=True,
    return_tensors="pt",
)

In [11]:
# Show the encoded batch; as the cell's last expression it is rendered as the cell output
inputs

{'input_ids': tensor([[  101,  7986,  1105,  6998,  1132,  6259,  1107,  1103, 19016,  2000,
          1229,  2896,  1320, 19569,  5276,  1771,  4525,  3190,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
# Forward pass with gradient tracking disabled, then pick the highest-scoring
# class index along dimension 2 of the logits
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=2)

In [13]:
# Convert predictions to labels
# `label_list` is the model's id2label lookup; the next cell indexes it with
# each predicted class id to obtain the tag string.
label_list = model.config.id2label

# Token strings for the first (only) sequence in the encoded batch.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Derive the per-token class ids from `outputs` instead of overwriting
# `predictions` in terms of itself. The previous form
# (`predictions = predictions[0].tolist()`) failed when the cell was re-run,
# because `predictions` was by then already a plain list of ints.
predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

In [14]:
# Process results
# Rebuild words from their sub-word pieces ("##"-prefixed tokens are glued to
# the preceding word) and print every word whose first piece carries a
# non-"O" label. Each word is reported on its own: consecutive words with the
# same tag (for example a first and a last name) are not merged here.


def _print_entity(pieces, entity_type):
    """Print one rebuilt entity, its tag, and the separator line."""
    print(f"Entity: {''.join(pieces)}")
    print(f"Type: {entity_type}")
    print("-" * 30)


# [CLS]/[SEP] are markers added by the tokenizer, not text; they still end a
# pending word but must never be reported as an entity themselves.
boundary_tokens = {tokenizer.cls_token, tokenizer.sep_token}

current_entity = []
current_entity_type = None

for token, prediction in zip(tokens, predictions):
    if token.startswith("##"):
        # Continuation piece: only relevant when an entity word is open.
        if current_entity:
            current_entity.append(token[2:])
        continue

    # A new word starts here, so whatever entity was open is complete.
    if current_entity:
        _print_entity(current_entity, current_entity_type)
        current_entity = []

    label = label_list[prediction]
    if label != "O" and token not in boundary_tokens:
        current_entity = [token]
        current_entity_type = label

# Print final entity if exists (same format, separator included, as in the loop)
if current_entity:
    _print_entity(current_entity, current_entity_type)

Entity: Google
Type: I-ORG
------------------------------
Entity: Microsoft
Type: I-ORG
------------------------------
Entity: Elon
Type: I-PER
------------------------------
Entity: Musk
Type: I-PER
------------------------------
Entity: SpaceX
Type: I-ORG
------------------------------


In [15]:

from transformers import pipeline
import torch
import logging
from typing import List, Dict

class NERProcessor:
    """Run a Hugging Face ``"ner"`` pipeline and keep only confident entities.

    Attributes set by ``__init__``:
        confidence_threshold: Minimum ``score`` an entity needs to be returned.
        device: ``"cuda"`` when ``torch.cuda.is_available()`` is true, else ``"cpu"``.
        ner_pipeline: The callable returned by ``pipeline(...)``.
    """

    def __init__(self,
                 model_name: str = "dbmdz/bert-large-cased-finetuned-conll03-english",
                 confidence_threshold: float = 0.8):
        """Create the pipeline for ``model_name`` on the selected device.

        Args:
            model_name: Checkpoint identifier passed to ``pipeline`` as ``model``.
            confidence_threshold: Entities whose ``score`` is below this value
                are dropped by ``process_text``.

        Raises:
            Exception: Whatever pipeline creation raises; the error is logged
                (with traceback) and re-raised unchanged.
        """
        self.confidence_threshold = confidence_threshold
        try:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.ner_pipeline = pipeline("ner",
                                         model=model_name,
                                         aggregation_strategy="simple",
                                         device=self.device)
        except Exception as e:
            # logging.exception records the traceback as well as the message.
            logging.exception("Failed to initialize NER pipeline: %s", e)
            raise

    def process_text(self, text: str) -> List[Dict]:
        """Return the entities found in ``text`` that meet the confidence threshold.

        Args:
            text: The string to analyse.

        Returns:
            The pipeline's entity dicts whose ``'score'`` is greater than or
            equal to ``confidence_threshold``, in pipeline order. An empty list
            is returned when ``text`` is not a string, is empty or
            whitespace-only, or when the pipeline raises (best effort: the
            error is logged, not propagated).
        """
        # Check the type before the truthiness: objects such as arrays raise
        # on `not obj`, which would escape this method instead of being
        # reported as invalid input. Whitespace-only text carries nothing to
        # tag, so it is rejected without invoking the model.
        if not isinstance(text, str) or not text.strip():
            logging.warning("Invalid input text")
            return []

        try:
            # Get predictions
            entities = self.ner_pipeline(text)

            # Post-process results: keep entities at or above the threshold
            return [
                entity for entity in entities
                if entity['score'] >= self.confidence_threshold
            ]
        except Exception as e:
            logging.exception("Error processing text: %s", e)
            return []


if __name__ == "__main__":
    # Build the processor with its default checkpoint and threshold
    processor = NERProcessor()

    # Sample passage
    text = """
    Apple Inc. CEO Tim Cook announced new partnerships with Microsoft
    and Google during a conference in New York City. The event was also
    attended by Sundar Pichai and Satya Nadella.
    """

    # Extract the entities that pass the confidence filter
    results = processor.process_text(text)

    # Report each entity: surface form, entity group, score, then a separator
    for entity in results:
        print(
            f"Entity: {entity['word']}",
            f"Type: {entity['entity_group']}",
            f"Confidence: {entity['score']:.4f}",
            "-" * 30,
            sep="\n",
        )

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Entity: Apple Inc
Type: ORG
Confidence: 0.9995
------------------------------
Entity: Tim Cook
Type: PER
Confidence: 0.9997
------------------------------
Entity: Microsoft
Type: ORG
Confidence: 0.9996
------------------------------
Entity: Google
Type: ORG
Confidence: 0.9992
------------------------------
Entity: New York City
Type: LOC
Confidence: 0.9993
------------------------------
Entity: Sundar Pichai
Type: PER
Confidence: 0.9911
------------------------------
Entity: Satya Nadella
Type: PER
Confidence: 0.9961
------------------------------
