In [2]:
import praw
import json
import re
import random
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support

In [17]:
# Configuration for accessing the Reddit API
reddit = praw.Reddit(
    client_id='', 
    client_secret='', 
    user_agent='appname:v1.0 (by /u/username)',
    check_for_async=False  # Додаємо цей параметр
)

# Searching for posts related to animals 
animal_subreddits = [
    'animals', 'pets', 'wildlife', 'hen',
    'cat', 'dog', 'puppy', 'kitten',  
    'parrot', 'monkeys',
    'corgi', 'butterfly', 'chinchilla',
    'lizard', 'frog', 'horse', 'sheep', 'squirrel', 'spider', 
    'cow', 'pig', 'panda', 'elephant' 
]
posts = []

for subreddit in animal_subreddits:
    for submission in reddit.subreddit(subreddit).hot(limit=50):  # Retrieve the top 50 most popular posts 
        posts.append({
            "title": submission.title,
            "selftext": submission.selftext,
            "url": submission.url
        })

# Saving data in JSON format  
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\reddit_animal_posts.json', 'w') as f:
    json.dump(posts, f, indent=4)

print("Reddit posts saved to reddit_animal_posts.json")

Reddit posts saved to reddit_animal_posts.json


In [18]:
# Load the data
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\synthetic_animal_data.json', 'r') as f:
    synthetic_data = json.load(f)

# Data validation and formatting
formatted_data = []
for entry in synthetic_data:
    text = entry['text']
    entities = entry['entities']
    formatted_data.append({
        "text": text,
        "entities": entities
    })

# Save in the required format
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\formatted_ner_data.json', 'w') as f:
    json.dump(formatted_data, f, indent=4)

print("Formatted data saved to formatted_ner_data.json")

Formatted data saved to formatted_ner_data.json


In [19]:
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\formatted_ner_data.json', 'r') as f:
    data = json.load(f)

print(json.dumps(data[:5], indent=4))  # Display the first 5 records

[
    {
        "text": "The spider spun a web in the corner of my room.",
        "entities": [
            {
                "start": 4,
                "end": 10,
                "label": "ANIMAL"
            }
        ]
    },
    {
        "text": "A squirrel ran across the fence, holding a nut.",
        "entities": [
            {
                "start": 2,
                "end": 10,
                "label": "ANIMAL"
            }
        ]
    },
    {
        "text": "My cat loves to sleep on the windowsill all day long.",
        "entities": [
            {
                "start": 3,
                "end": 6,
                "label": "ANIMAL"
            }
        ]
    },
    {
        "text": "The dog barked loudly at the passing car.",
        "entities": [
            {
                "start": 4,
                "end": 7,
                "label": "ANIMAL"
            }
        ]
    },
    {
        "text": "A monkey was swinging from tree to tree in the jungle.",
    

In [20]:
# List of animals for NER
ANIMAL_LIST = [
    "spider", "squirrel", "cat", "dog", "monkey", "elephant", "horse", "hen", "cow", "butterfly", "sheep", "panda", "hamster", "goat", "bird", "hyena", "wolf", "fox", "rabbit", "leopard",
    "tiger", "lion", "giraffe", "zebra", "kangaroo", "bear", "deer"
]

# Function to clean the text by removing links, emojis, and unwanted characters
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Видалення URL
    text = re.sub(r'[^a-zA-Z0-9.,!?\s]', '', text)  # Видалення спецсимволів
    text = text.strip()
    return text

# Function to annotate animals in the text
def annotate_text(text, animal_list):
    entities = []
    for animal in animal_list:
        for match in re.finditer(rf'\b{animal}\b', text, re.IGNORECASE):
            entities.append({"start": match.start(), "end": match.end(), "label": "ANIMAL"})
    return {"text": text, "entities": entities} if entities else None

# Load Reddit data
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\reddit_animal_posts.json', 'r') as f:
    reddit_data = json.load(f)

# Process and annotate Reddit data
annotated_reddit_data = []
for post in reddit_data:
    text = clean_text(post.get("title", "") + " " + post.get("selftext", ""))
    annotated = annotate_text(text, ANIMAL_LIST)
    if annotated:
        annotated_reddit_data.append(annotated)

# Load synthetic data
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\synthetic_animal_data.json', 'r') as f:
    synthetic_data = json.load(f)

# Combine synthetic data and cleaned Reddit data
combined_data = synthetic_data + annotated_reddit_data

# Split data into training and testing datasets
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Save the datasets to files
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\train_ner_data.json', 'w') as f:
    json.dump(train_data, f, indent=4)

with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\test_ner_data.json', 'w') as f:
    json.dump(test_data, f, indent=4)

print("Data preparation complete. Train and test datasets are saved.")

Data preparation complete. Train and test datasets are saved.


In [1]:
# Loading annotated data
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\train_ner_data.json', 'r') as f:
    train_data = json.load(f)
with open(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\data\test_ner_data.json', 'r') as f:
    test_data = json.load(f)

# Loading the tokenizer
MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

from datasets import Dataset

def encode_data(data):
    texts = []
    labels = []
    input_ids = []
    attention_masks = []
    
    for entry in data:
        text = entry["text"]
        entities = entry["entities"]

        # Tokenization of the text
        tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_offsets_mapping=True)

        print(f"Tokenized text: {tokens}")  # Checking the tokenization result

        # Create a set of positions for each entity
        entity_positions = set((entity["start"], entity["end"]) for entity in entities)

        print(f"Entity positions: {entity_positions}")  # Output to verify start and end positions

        # Creating labels for each token
        label_ids = [1 if (start, end) in entity_positions else 0 for start, end in tokens["offset_mapping"]]

        # Adding to the corresponding lists
        input_ids.append(tokens["input_ids"])
        attention_masks.append(tokens["attention_mask"])
        texts.append(text)
        labels.append(label_ids)

    # Returning all tokenized data as a dictionary
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'labels': labels
    }

def convert_to_dataset(encodings):
    return Dataset.from_dict(encodings)

# Using the function to process your data
train_encodings = encode_data(train_data)
test_encodings = encode_data(test_data)

train_dataset = convert_to_dataset(train_encodings)
test_dataset = convert_to_dataset(test_encodings)

# Loading the model
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Setting up the Trainer
training_args = TrainingArguments(
    output_dir="./ner_model",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
)

# Function for computing metrics
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Filtering out values that are not labels (e.g., [PAD])
    true_predictions = [pred for pred, lab in zip(predictions.flatten(), labels.flatten()) if lab != -100]
    true_labels = [lab for lab in labels.flatten() if lab != -100]

    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average="binary")

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Training the model
trainer.train()

# Evaluating the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Saving the model
model.save_pretrained(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\ner_model')
tokenizer.save_pretrained(r'C:\Users\Irenchik\OneDrive\Робочий стіл\Winstars_test\task_2\ner_model\ner_model')

print("Model training complete and saved.")

Tokenized text: {'input_ids': [101, 138, 11057, 4860, 1113, 1139, 1289, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.00783,0.8,0.984615,0.882759
2,No log,0.003123,0.878378,1.0,0.935252
3,No log,0.000679,0.969697,0.984615,0.977099


Evaluation Results: {'eval_loss': 0.0006789031904190779, 'eval_precision': 0.9696969696969697, 'eval_recall': 0.9846153846153847, 'eval_f1': 0.9770992366412213, 'eval_runtime': 19.8531, 'eval_samples_per_second': 3.274, 'eval_steps_per_second': 0.453, 'epoch': 3.0}
Model training complete and saved.
