In [None]:
# Install the third-party packages this notebook imports. `%pip` (rather than
# `!pip`) installs into the environment of the running kernel; `-q` keeps pip's
# output quiet.
# NOTE(review): no versions are pinned, so a later re-run may resolve different
# releases — consider pinning once a known-good combination is identified.
%pip install -q datasets transformers[sentencepiece] simpletransformers scikit-learn

In [None]:
import os
import zipfile

# Define paths
zip_path = "super-ai-ss-5-named-entity-recognition.zip"
extract_path = "super-ai-ss-5-named-entity-recognition"

# Unzip the dataset only when the target directory does not exist yet, so
# re-running this cell does not re-extract the archive.
if not os.path.exists(extract_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# Flatten directory structure: move the contents of <split>/<split>/ up into
# <split>/ and drop the now-empty inner folder.
# This runs on every execution (not only right after extraction) so that a
# previously interrupted run is repaired; it is a no-op once a split is flat.
for split in ['train', 'eval', 'test']:
    split_path = os.path.join(extract_path, split)
    nested = os.path.join(split_path, split)
    # isdir (not exists): a regular file that happens to share the split's
    # name must not be treated as a nested folder.
    if os.path.isdir(nested):
        for fname in os.listdir(nested):
            os.rename(os.path.join(nested, fname), os.path.join(split_path, fname))
        os.rmdir(nested)


In [None]:
from datasets import load_dataset

# Map each split name used by `datasets` to its CSV file on disk.
# Note that the "validation" split is read from the eval/ folder.
data_files = dict(
    train="super-ai-ss-5-named-entity-recognition/train/train.csv",
    validation="super-ai-ss-5-named-entity-recognition/eval/eval.csv",
    test="super-ai-ss-5-named-entity-recognition/test/test.csv",
)

# Load all three CSV splits in a single call.
dataset = load_dataset("csv", data_files=data_files)

# Leave the loaded object as the cell's last expression so it is displayed.
dataset

In [None]:
from simpletransformers.ner import NERModel, NERArgs
import pandas as pd

# Prepare training and evaluation data
# NOTE(review): only the `labels` column is referenced in this cell; the CSVs
# are presumably in the Simple Transformers NER layout — confirm the remaining
# column names against the data.
train_df = pd.read_csv("super-ai-ss-5-named-entity-recognition/train/train.csv")
eval_df = pd.read_csv("super-ai-ss-5-named-entity-recognition/eval/eval.csv")

# Set up model arguments
model_args = NERArgs()
model_args.num_train_epochs = 3
model_args.train_batch_size = 8
model_args.evaluate_during_training = True
# Label set = distinct values of the training `labels` column, in order of
# first appearance.
model_args.labels_list = list(train_df['labels'].unique())
# Fix the random seed so repeated runs of this cell are comparable.
model_args.manual_seed = 42
# Simple Transformers refuses to train into a non-empty output directory
# unless this flag is set; without it, re-running the notebook fails once a
# previous run has written its checkpoints.
model_args.overwrite_output_dir = True

# Create a NERModel
# use_cuda=False is kept as in the original configuration.
# NOTE(review): switch to True if a GPU is available.
model = NERModel(
    "bert",
    "bert-base-multilingual-cased",
    args=model_args,
    use_cuda=False
)

# Train the model, evaluating on eval_df during training
# (evaluate_during_training is enabled above).
model.train_model(train_df, eval_data=eval_df)


In [None]:
# Score the trained model on the held-out test split.
test_csv_path = "super-ai-ss-5-named-entity-recognition/test/test.csv"
test_df = pd.read_csv(test_csv_path)

# eval_model's return value is unpacked into three parts below; only the
# first (`result`) is printed.
evaluation = model.eval_model(test_df)
result, model_outputs, predictions = evaluation
print(result)
