In [6]:
from datasets import load_dataset
import os
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import json

In [7]:
ROOT = Path(os.path.abspath(""))

In [8]:
#Create list of dictionaries that contains the sentences and the labels.
def process_dataset(path):
    dataset = []
    sentences = []
    labels = []
    with open(path,"r") as file:
        for line in file :

            tokens = line.strip().split()
            if len(tokens) != 0 : 
                sentences.append(tokens[0])
                labels.append(tokens[3])
            else :
                dataset.append({'sentences':sentences, 'labels': labels})
                sentences = []
                labels = []
    return dataset

In [9]:
train = process_dataset(ROOT / 'dataset/train.txt')
validation = process_dataset(ROOT / 'dataset/valid.txt')
test = process_dataset(ROOT / 'dataset/test.txt')

In [10]:
#Save dataset in json format
for i in ['train','test','validation'] : 
    with open(f'{i}.json', "w") as json_file:
        json.dump(eval(i), json_file, indent=4)  # 'indent' for pretty formatting (optional)


In [11]:
data_files = { "train": "train.json", "validation": "validation.json", "test": "test.json"}
dataset = load_dataset('json', data_files=data_files)

Using custom data configuration default-218ce2b7ad6263e1


Downloading and preparing dataset json/default to /Users/jlbt/.cache/huggingface/datasets/json/default-218ce2b7ad6263e1/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /Users/jlbt/.cache/huggingface/datasets/json/default-218ce2b7ad6263e1/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentences', 'labels'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['sentences', 'labels'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['sentences', 'labels'],
        num_rows: 3684
    })
})

In [13]:
#Encode labels
full_labels=[]
for l in dataset['train']:
    full_labels.extend(l['labels'])

unique_labels = list(set(full_labels))
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(unique_labels)

In [14]:
def encode_label(dataset):
    dataset['encoded_label'] = label_encoder.transform(dataset['labels'])
    return dataset

In [22]:
from transformers import AutoTokenizer
ckpt = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(ckpt)

In [23]:
def tokenize(dataset):
#for idx,  d in enumerate(dataset):
        dataset['inputs'] = tokenizer(dataset['sentences'],is_split_into_words=True)
        return dataset

In [17]:
def words_ids(dataset):
    dataset['word_ids'] = dataset['inputs'].word_ids()

In [18]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label < 4 :
                label += 4
            new_labels.append(label)

    return new_labels


In [19]:
def align(dataset):
    dataset['new_label'] = align_labels_with_tokens(list(dataset['encoded_label']),dataset['inputs'].word_ids())

In [20]:
encode_datasets = dataset.map(
    encode_label,
    #batched=True,
    #remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/14987 [00:00<?, ?ex/s]

  0%|          | 0/3466 [00:00<?, ?ex/s]

  0%|          | 0/3684 [00:00<?, ?ex/s]

In [24]:
tokenized_datasets = encode_datasets.map(
    tokenize,
    #batched=True,
    #remove_columns=raw_datasets["train"].column_names,
)


  0%|          | 0/14987 [00:00<?, ?ex/s]

  0%|          | 0/3466 [00:00<?, ?ex/s]

  0%|          | 0/3684 [00:00<?, ?ex/s]

In [25]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3684
    })
})

In [26]:
align_datasets = tokenized_datasets.map(
    align,
    #batched=True,
    #remove_columns=raw_datasets["train"].column_names,
)

  0%|          | 0/14987 [00:00<?, ?ex/s]

AttributeError: 'dict' object has no attribute 'word_ids'

In [None]:
tokenized_datasets['train']['inputs']

[{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [101, 118, 141, 9244, 9272, 12426, 1942, 118, 102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]},
 {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [101,
   7270,
   22961,
   1528,
   1840,
   1106,
   21423,
   1418,
   2495,
   12913,
   119,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 {'attention_mask': [1, 1, 1, 1],
  'input_ids': [101, 1943, 14428, 102],
  'token_type_ids': [0, 0, 0, 0]},
 {'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  'input_ids': [101,
   26660,
   13329,
   12649,
   15928,
   1820,
   118,
   4775,
   118,
   1659,
   102],
  'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 {'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1],
  'input_ids': [101,
   1109,
   1735,


In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3684
    })
})

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 14987
    })
    validation: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3466
    })
    test: Dataset({
        features: ['sentences', 'labels', 'encoded_label', 'inputs'],
        num_rows: 3684
    })
})

In [None]:
dataset[1]

{'sentences': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'labels': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 'encoded_label': array([2, 8, 1, 8, 8, 8, 1, 8, 8]),
 'inputs': {'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 'new_label': [-100, 2, 8, 1, 8, 8, 8, 1, 8, 8, 8, -100],
 'word_ids': [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]}

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([dataset[i]['inputs'] for i in range(2)])
batch["inputs"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyError: 'inputs'

In [None]:
dataset[2]

{'sentences': ['Peter', 'Blackburn'],
 'labels': ['B-PER', 'I-PER'],
 'encoded_label': array([3, 7]),
 'inputs': {'input_ids': [101, 1943, 14428, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]},
 'new_label': [-100, 3, 7, -100],
 'word_ids': [None, 0, 1, None]}

In [None]:
d={'train':dataset}

In [None]:
d['train']

[{'sentences': ['-DOCSTART-'],
  'labels': ['O'],
  'encoded_label': array([8]),
  'inputs': {'input_ids': [101, 118, 141, 9244, 9272, 12426, 1942, 118, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]},
  'new_label': [-100, 8, 8, 8, 8, 8, 8, 8, -100],
  'word_ids': [None, 0, 0, 0, 0, 0, 0, 0, None]},
 {'sentences': ['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  'labels': ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
  'encoded_label': array([2, 8, 1, 8, 8, 8, 1, 8, 8]),
  'inputs': {'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
  'new_label': [-100, 2, 8, 1, 8, 8, 8, 1, 8, 8, 8, -100],
  'word_ids': [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]},
 {'sentences': ['Peter', 'Blackburn'],
  'labels': ['B-PER', 'I-PER'],