In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re
import torch
from transformers import (AutoModelForTokenClassification, AutoTokenizer, 
                          TrainingArguments, Trainer, DataCollatorForTokenClassification)
from datasets import Dataset

# Define Animal List & Wikipedia URLs
# ANIMAL_LIST stays a set because it is used for O(1) membership tests when labelling tokens.
ANIMAL_LIST = {'butterfly', 'cat', 'chicken', 'cow', 'dog', 'elephant', 
               'horse', 'sheep', 'spider', 'squirrel'}

# Iterate in sorted order: iterating a set of strings directly yields a different
# order from one interpreter run to the next (hash randomisation), which would make
# the scrape order -- and therefore the dataset order -- non-reproducible.
WIKI_URLS = [f"https://en.wikipedia.org/wiki/{animal.capitalize()}" for animal in sorted(ANIMAL_LIST)]

# Scrape Wikipedia for Animal Data
def scrape_wikipedia(url, timeout=10):
    """Download a web page and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        A list of strings, one per ``<p>`` element, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses so an error page is never parsed into
    # "paragraphs" that would silently end up in the training data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return [p.text for p in soup.find_all("p")]

def clean_text(text):
    """Strip numeric citation markers and collapse whitespace.

    Markers of the form ``[<digits>]`` are removed, every run of whitespace
    becomes a single space, and leading/trailing whitespace is dropped.
    """
    without_citations = re.sub(r"\[\d+\]", "", text)
    single_spaced = re.sub(r"\s+", " ", without_citations)
    return single_spaced.strip()

# Collect Sentences
# Gather the raw paragraphs of every page, keeping the order of WIKI_URLS.
animal_sentences = [paragraph for url in WIKI_URLS for paragraph in scrape_wikipedia(url)]

# Keep only paragraphs with more than five whitespace-separated words
# (the length check runs on the raw text), then clean the ones that remain.
clean_animal_sentences = []
for sent in animal_sentences:
    if len(sent.split()) > 5:
        clean_animal_sentences.append(clean_text(sent))

# Auto-labeling for NER
def annotate_sentences(sentences, animals=None):
    """Auto-label each sentence for token classification.

    Each sentence is split into word tokens and standalone punctuation tokens.
    Splitting only on whitespace would leave punctuation glued to words
    ("cat," / "dog."), so those mentions would never match the animal list and
    would be labelled O.

    Args:
        sentences: Iterable of strings to annotate.
        animals: Optional collection of lower-case animal names to tag.
            Defaults to the module-level ``ANIMAL_LIST``.

    Returns:
        A list with one dict per sentence:
        ``{"tokens": [...], "ner_tags": [...]}`` where a tag is 1 (B-ANIMAL)
        when the lower-cased token is in ``animals`` and 0 (O) otherwise.

    Note:
        Matching is exact, so inflected forms such as plurals ("cows") are
        still labelled O.
    """
    if animals is None:
        animals = ANIMAL_LIST

    dataset = []
    for sentence in sentences:
        # \w+ captures a word; [^\w\s] captures each punctuation mark on its own.
        tokens = re.findall(r"\w+|[^\w\s]", sentence)
        labels = [1 if token.lower() in animals else 0 for token in tokens]  # B-ANIMAL (1) or O (0)
        dataset.append({"tokens": tokens, "ner_tags": labels})
    return dataset

# Load Tokenizer
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Auto-label the cleaned text, then wrap the records in a Hugging Face Dataset
ner_dataset = annotate_sentences(clean_animal_sentences)
dataset = Dataset.from_list(ner_dataset)

def tokenize_and_align_labels(examples):
    """Tokenize one pre-split example and give every sub-token a label.

    ``examples["tokens"]`` is encoded as already-split words, truncated and
    padded to 128 positions. Each position whose word id is ``None`` receives
    the label -100; every other position receives the tag of the word it came
    from, looked up in ``examples["ner_tags"]``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    word_tags = examples["ner_tags"]
    aligned_labels = []
    for word_index in encoding.word_ids():
        if word_index is None:
            aligned_labels.append(-100)
        else:
            aligned_labels.append(word_tags[word_index])

    encoding["labels"] = aligned_labels
    return encoding

# Tokenize Data
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False, remove_columns=["tokens", "ner_tags"])

# Split Train & Validation Sets
# A fixed seed keeps the split identical across Restart & Run All.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# Load Pretrained Model for Token Classification
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)  # 0 = O, 1 = B-ANIMAL

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./models/ner_animals",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",  # Enable step-wise logging
    logging_steps=10,  # Log every 10 steps
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    logging_dir="./logs",  # Enables logging in TensorBoard
)

# Train the Model
data_collator = DataCollatorForTokenClassification(tokenizer)

# Train on the training split and evaluate on the held-out split. Passing the full
# tokenized_dataset for both (as before) made the validation loss a measurement on
# data the model had already been trained on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

trainer.train()
trainer.save_model("./models/ner_animals")
tokenizer.save_pretrained("./models/ner_animals")

print("Model retraining complete! Saved to ./models/ner_animals")


Map:   0%|          | 0/643 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0518,0.0542
2,0.0612,0.052853
3,0.0537,0.052452


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

def infer_animals_in_text(text, model_path="./models/ner_model"):
    """Load a trained token-classification model and extract animal mentions.

    Args:
        text: Raw input text.
        model_path: Directory holding the saved tokenizer and model.

    Returns:
        A list of decoded words, in order of appearance, for which at least one
        sub-token was predicted as label 1 ("B-ANIMAL"). A word that the
        tokenizer split into several word pieces is returned once, re-joined,
        rather than as separate "##" fragments. Special tokens are never returned.

    Note:
        Relies on ``BatchEncoding.word_ids``, which requires a fast tokenizer.
        The tokenizer and model are reloaded on every call.
    """
    # 1. Load tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    model.eval()

    # 2. Tokenize. Truncate so over-long text cannot exceed the model's maximum
    #    sequence length and crash the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits  # shape: [batch_size, seq_len, num_labels]

    # Index the single sequence explicitly instead of relying on squeeze().
    predictions = torch.argmax(logits, dim=-1)[0].tolist()
    input_ids = inputs["input_ids"][0].tolist()
    word_ids = inputs.word_ids(0)

    # 3. Group sub-token ids by source word and remember which words were flagged.
    #    Label 1 is "B-ANIMAL", label 0 is "O" (must match the training labels).
    pieces_by_word = {}   # word index -> list of token ids (insertion-ordered)
    flagged_words = set()
    for token_id, word_id, pred_label_id in zip(input_ids, word_ids, predictions):
        if word_id is None:  # special token such as [CLS] / [SEP]
            continue
        pieces_by_word.setdefault(word_id, []).append(token_id)
        if pred_label_id == 1:
            flagged_words.add(word_id)

    extracted_animals = [
        tokenizer.decode(token_ids)
        for word_id, token_ids in pieces_by_word.items()
        if word_id in flagged_words
    ]
    return extracted_animals

if __name__ == "__main__":
    # Example usage
    # Load from the directory the training cell saves to ("./models/ner_animals");
    # the previous path "./models/ner_model" is not produced by this notebook.
    test_text = "I see a cow in the distance."
    animals = infer_animals_in_text(test_text, model_path="./models/ner_animals")
    print("Extracted animals:", animals)
