In [1]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-cased" checkpoint.
# NOTE(review): the label-alignment step later in this notebook calls
# `.word_ids()` on the tokenizer output, which presumably requires a "fast"
# tokenizer implementation — confirm if the checkpoint is changed.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # Adjust the model if needed

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


+ Data preprocessing

In [6]:
def load_iob_data(filepath, encoding="utf-8"):
    """Read a blank-line-separated IOB file into parallel token/label lists.

    Every non-blank line must hold exactly three whitespace-separated fields,
    ``word POS_tag IOB_label``. The POS tag is read but discarded. One or more
    blank lines terminate the current sentence; a final sentence that is not
    followed by a blank line is still returned.

    Args:
        filepath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding;
            pass another codec name for files stored differently.

    Returns:
        A ``(tokens, labels)`` tuple. ``tokens`` is a list of sentences, each
        a list of word strings; ``labels`` has the same shape and holds the
        IOB label string of each word.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            fields.
    """
    tokens, labels = [], []
    sentence_tokens, sentence_labels = [], []

    # An explicit encoding keeps non-ASCII tokens stable across platforms.
    with open(filepath, "r", encoding=encoding) as file:
        for line in file:
            line = line.strip()

            # Blank line: close the sentence collected so far (if any).
            if not line:
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
                continue

            # Token format: word POS_tag IOB_label
            parts = line.split()
            if len(parts) != 3:
                raise ValueError(f"Unexpected format in line: {line}")

            token, _pos_tag, label = parts  # POS tag is not used downstream
            sentence_tokens.append(token)
            sentence_labels.append(label)

    # Flush the last sentence when the file does not end with a blank line.
    if sentence_tokens:
        tokens.append(sentence_tokens)
        labels.append(sentence_labels)

    return tokens, labels

In [3]:
# Parse the train / validation / test files of fold 1 into per-sentence token
# and label lists (see load_iob_data for the expected line format). The paths
# are relative, so they resolve against the current working directory.
train_tokens, train_labels = load_iob_data("fold1/train.txt")
val_tokens, val_labels = load_iob_data("fold1/val.txt")
test_tokens, test_labels = load_iob_data("fold1/test.txt")

# Report the number of sentences read from each split as a quick sanity check.
print(f"Loaded {len(train_tokens)} sentences for training.")
print(f"Loaded {len(val_tokens)} sentences for validation.")
print(f"Loaded {len(test_tokens)} sentences for testing.")

Loaded 1992 sentences for training.
Loaded 850 sentences for validation.
Loaded 864 sentences for testing.


In [7]:
# Gather every label string that occurs in any of the three splits, then derive
# the two lookup tables. Ids are assigned in sorted-label order, so the mapping
# is the same on every run.
unique_labels = {
    tag
    for split_labels in (train_labels, val_labels, test_labels)
    for sentence_tags in split_labels
    for tag in sentence_tags
}
label_to_id = dict(zip(sorted(unique_labels), range(len(unique_labels))))
id_to_label = dict(enumerate(sorted(unique_labels)))

print("label_to_id:", label_to_id)
print("id_to_label:", id_to_label)

label_to_id: {'B-ART': 0, 'B-CON': 1, 'B-LOC': 2, 'B-MAT': 3, 'B-PER': 4, 'B-SPE': 5, 'I-ART': 6, 'I-CON': 7, 'I-LOC': 8, 'I-MAT': 9, 'I-PER': 10, 'I-SPE': 11, 'O': 12}
id_to_label: {0: 'B-ART', 1: 'B-CON', 2: 'B-LOC', 3: 'B-MAT', 4: 'B-PER', 5: 'B-SPE', 6: 'I-ART', 7: 'I-CON', 8: 'I-LOC', 9: 'I-MAT', 10: 'I-PER', 11: 'I-SPE', 12: 'O'}


In [8]:
# Encode pre-split sentences and project word-level labels onto the produced tokens
def tokenize_and_align_labels(tokens, labels):
    """Tokenize a batch of pre-split sentences and align label ids to tokens.

    Uses the notebook-level ``tokenizer`` and ``label_to_id`` objects.

    Args:
        tokens: Batch of sentences; passed to the tokenizer with
            ``is_split_into_words=True``, so each sentence is presumably a
            list of word strings.
        labels: Batch of per-sentence label sequences, indexed in parallel
            with ``tokens`` (one label per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For each
        sentence that entry holds one integer per produced token: the id of
        the word's label on the first token of each word, and ``-100`` on
        tokens without a source word and on the remaining pieces of a word.
        Labels missing from ``label_to_id`` also become ``-100``.

    Raises:
        ValueError: If the number of aligned label rows differs from the
            number of encoded sentences.
    """
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)
    aligned_labels = []

    for i, word_labels in enumerate(labels):
        # One entry per produced token: index of its source word, or None.
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        if word_ids is None:
            # Sentence is skipped here; the length check below then fails.
            print(f"Warning: No word_ids generated for sentence {i}. Tokens: {tokens[i]}")
            continue

        row = []
        last_word = None
        for word_idx in word_ids:
            # Only the first token of a real word carries that word's label.
            starts_word = word_idx is not None and word_idx != last_word
            row.append(label_to_id.get(word_labels[word_idx], -100) if starts_word else -100)
            last_word = word_idx

        aligned_labels.append(row)

    if len(tokenized_inputs["input_ids"]) != len(aligned_labels):
        print(f"Mismatch in length for tokens and labels. Tokens: {len(tokenized_inputs['input_ids'])}, Labels: {len(aligned_labels)}")
        raise ValueError(f"Mismatch in length for tokens and labels: {len(tokenized_inputs['input_ids'])} vs {len(aligned_labels)}")

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

In [9]:
# Tokenize each split and attach token-aligned label ids
# (tokenize_and_align_labels returns the tokenizer output plus a "labels" entry).
train_data = tokenize_and_align_labels(train_tokens, train_labels)
val_data = tokenize_and_align_labels(val_tokens, val_labels)
test_data = tokenize_and_align_labels(test_tokens, test_labels)

# Convert the tokenized data into Hugging Face Dataset objects, one per split.
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

# Print the first training example to verify the structure
# (label positions set to -100 are the ones the alignment step marked as ignored).
print(train_dataset[0])

{'input_ids': [101, 15982, 1407, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [-100, 12, 12, 12, -100]}


In [10]:
# Print each split's Dataset summary (feature names and number of rows).
print(train_dataset)
print(val_dataset)
print(test_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1992
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 850
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 864
})
