# In [1]:
# --- 1. Setup and Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  # Example
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments  # If using Transformers
import torch
from seqeval.metrics import classification_report as seqeval_report  # For seqeval
import platform
from datasets import load_dataset

# --- Optional ---
import nltk
# import spacy

# --- Set random seed for reproducibility ---
# A single seed is shared by NumPy and PyTorch so runs are repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Device selection (Apple Silicon MPS, CUDA, or CPU) ---
# `device` is the torch.device the rest of the notebook should place tensors
# and models on. Only the backend that is actually selected gets an explicit
# device-specific seed.
if platform.machine() == "arm64":  # Detect Apple Silicon
    print("Running on Apple Silicon (M1)")
    if torch.backends.mps.is_available():  # Ensure the MPS GPU backend is available
        device = torch.device("mps")
        torch.mps.manual_seed(SEED)
        print("MPS device found, using GPU")
    else:
        # CPU fallback: no accelerator was selected, so there is no
        # device-specific generator to seed here.
        device = torch.device("cpu")
        print("MPS device not found, using CPU")
else:
    if torch.cuda.is_available():
        device = torch.device("cuda")
        # Seed every visible CUDA device, not just the current one.
        torch.cuda.manual_seed_all(SEED)
    else:
        device = torch.device("cpu")
    print("Not running on Apple Silicon, using", device)

# --- 2. Data Loading and Exploration ---

# Load the PLOD-CW-25 dataset and bind each of its three splits to its own name.
dataset = load_dataset("surrey-nlp/PLOD-CW-25")
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

def extract_data(split):
    """Concatenate the per-example sequences of a split into three flat lists.

    Args:
        split: An iterable of examples, each indexable by the keys
            'tokens', 'pos_tags' and 'bio_tags'.

    Returns:
        A tuple ``(tokens, pos_tags, bio_tags)`` of lists, each holding the
        corresponding values of every example in iteration order.
    """
    field_names = ("tokens", "pos_tags", "bio_tags")
    flattened = {field: [] for field in field_names}
    # Single pass over the split so one-shot iterables are handled as well.
    for record in split:
        for field in field_names:
            flattened[field] += record[field]
    return tuple(flattened[field] for field in field_names)

# Flatten each split into flat lists of tokens, POS tags and BIO tags.
train_tokens, train_pos_tags, train_bio_tags = extract_data(train_data)
val_tokens, val_pos_tags, val_bio_tags = extract_data(validation_data)
test_tokens, test_pos_tags, test_bio_tags = extract_data(test_data)

# Basic Data Exploration: report the number of tokens in every split.
print(
    f"Number of training tokens: {len(train_tokens)}",
    f"Number of validation tokens: {len(val_tokens)}",
    f"Number of test tokens: {len(test_tokens)}",
    sep="\n",
)

# Example: Distribution of BIO tags (training split only).
bio_tag_counts = pd.Series(train_bio_tags).value_counts()
print("\nBIO Tag Distribution:\n", bio_tag_counts)

# Bar chart of the counts, labelled through the Axes returned by pandas.
ax = bio_tag_counts.plot(kind='bar')
ax.set_title('Distribution of BIO Tags')
ax.set_xlabel('BIO Tag')
ax.set_ylabel('Count')
plt.show()

# Add more exploration as needed (e.g., POS tag distribution, sequence lengths, etc.)

# Cell output: ModuleNotFoundError: No module named 'pandas'