In [7]:
import requests
from bs4 import BeautifulSoup
import json
import re
import torch
from transformers import (AutoModelForTokenClassification, AutoTokenizer, 
                          TrainingArguments, Trainer, DataCollatorForTokenClassification)
from datasets import Dataset

# Define Wikipedia URLs for Animal Articles
ANIMAL_LIST = {'butterfly', 'cat', 'chicken', 'cow', 'dog', 'elephant', 
               'horse', 'sheep', 'spider', 'squirrel'}

WIKI_URLS = [f"https://en.wikipedia.org/wiki/{animal.capitalize()}" for animal in ANIMAL_LIST]

# Function to Scrape Wikipedia
def scrape_wikipedia(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = [p.text for p in soup.find_all("p")]
    return paragraphs

# Scrape and Clean Text Data
def clean_text(text):
    text = re.sub(r"\[\d+\]", "", text)  # Remove citations like [1], [2]
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    return text

animal_sentences = []
for url in WIKI_URLS:
    animal_sentences.extend(scrape_wikipedia(url))

clean_animal_sentences = [clean_text(sent) for sent in animal_sentences if len(sent.split()) > 5]

# Auto-labeling for NER
def annotate_sentences(sentences):
    dataset = []
    for sentence in sentences:
        tokens = sentence.split()  
        labels = [1 if token.lower() in ANIMAL_LIST else 0 for token in tokens]  # B-ANIMAL (1) or O (0)
        
        # Ensure tokens and labels are consistently formatted
        dataset.append({"tokens": list(tokens), "ner_tags": list(labels)})
    return dataset

ner_dataset = annotate_sentences(clean_animal_sentences)

# Convert to Hugging Face Dataset Format
dataset = Dataset.from_list(ner_dataset)

# Load Tokenizer and Tokenize Data
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128
    )
    
    word_ids = tokenized_inputs.word_ids()  
    labels = [-100 if word_id is None else examples["ner_tags"][word_id] for word_id in word_ids]
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Map with `remove_columns`
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False, remove_columns=["tokens", "ner_tags"])

# Split into Train & Validation Sets
split = tokenized_dataset.train_test_split(test_size=0.2)  # 80% Train, 20% Validation
train_dataset = split["train"]
eval_dataset = split["test"]

# Load Pretrained NER Model
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)  # 2 labels: O (0) & B-ANIMAL (1)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./models/ner_animals",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    logging_dir="./logs",
)

# Initialize Trainer
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,  
)

# Train and Save Model
trainer.train()
trainer.save_model("./models/ner_animals")
tokenizer.save_pretrained("./models/ner_animals")

print("Model training complete! Saved to ./models/ner_animals")


Map:   0%|          | 0/643 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.008106
2,No log,0.003916
3,No log,0.004483
4,No log,0.004965
5,No log,0.005033


✅ Model training complete! Saved to ./models/ner_animals
