### START

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, recall_score, precision_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("CUDA availaible:", torch.cuda.is_available())
print("Current device index:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))


CUDA availaible: True
Current device index: 0
Device name: NVIDIA GeForce GTX 1650 with Max-Q Design


### DATA PREPARION

In [3]:
def read_conli_file(file_path):
    with open(file_path, "r") as f:
        content = f.read().strip()
        sentences = content.split("\n\n")
        data = []
        for sentence in sentences:
            tokens = sentence.split("\n")
            token_data = []
            for token in tokens:
                token_data.append(token.split())
            data.append(token_data)
    return data


In [4]:

def convert_to_dataset(data,label_map):
    formatted_data = {"tokens":[],"ner_tags":[]}
    for sentence in data:
        tokens = [token_data[0] for token_data in sentence]
        ner_tags = [label_map[token_data[3]] for token_data in sentence]
        formatted_data["tokens"].append(tokens)
        formatted_data["ner_tags"].append(ner_tags)
    return Dataset.from_dict(formatted_data)


In [5]:

train_data = read_conli_file("conll2003/eng.train")
valid_data = read_conli_file("conll2003/eng.testa")
test_data = read_conli_file("conll2003/eng.testb")



In [6]:
label_list = sorted(list(set([token_data[3] for sentence in train_data for token_data in sentence])))
label_map = {label: i for i, label in enumerate(label_list)}

train_data = convert_to_dataset(train_data,label_map)
test_data = convert_to_dataset(test_data,label_map)
valid_data = convert_to_dataset(valid_data,label_map)

datasets = DatasetDict({
    "train":train_data,
    "test":test_data,
    "validation":valid_data
})

### INIT TOKENIZER & MODEL

In [7]:
print(len(datasets["validation"]))

3466


In [8]:
model_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
    

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### METRICS


In [9]:
def compute_metrics(eval_prediction):
    predictions, labels = eval_prediction
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels) 
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_predictions, true_labels),
    }


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],truncation=True, is_split_into_words=True, padding=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [10]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

Map: 100%|██████████| 14987/14987 [00:01<00:00, 14149.77 examples/s]
Map: 100%|██████████| 3684/3684 [00:00<00:00, 10119.97 examples/s]
Map: 100%|██████████| 3466/3466 [00:00<00:00, 14340.39 examples/s]


### Trainer

In [11]:
def data_collator(data):
    input_ids = [torch.tensor(item["input_ids"]) for item in data]
    attention_mask = [torch.tensor(item["attention_mask"]) for item in data]
    labels = [torch.tensor(item["labels"]) for item in data]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        "input_ids":input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["validation"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,

)

In [12]:
trainer.train()

  5%|▌         | 100/1874 [00:50<11:55,  2.48it/s]

{'loss': 0.3357, 'grad_norm': 1.6143169403076172, 'learning_rate': 4.7331910352187837e-05, 'epoch': 0.05}


 11%|█         | 200/1874 [01:36<13:50,  2.01it/s]

{'loss': 0.1206, 'grad_norm': 7.7172064781188965, 'learning_rate': 4.466382070437567e-05, 'epoch': 0.11}


 16%|█▌        | 300/1874 [02:22<11:33,  2.27it/s]

{'loss': 0.0863, 'grad_norm': 0.772691011428833, 'learning_rate': 4.1995731056563505e-05, 'epoch': 0.16}


 21%|██▏       | 400/1874 [03:09<09:08,  2.69it/s]

{'loss': 0.0807, 'grad_norm': 2.7196974754333496, 'learning_rate': 3.932764140875134e-05, 'epoch': 0.21}


 27%|██▋       | 500/1874 [03:59<11:21,  2.02it/s]

{'loss': 0.0807, 'grad_norm': 3.582540988922119, 'learning_rate': 3.665955176093917e-05, 'epoch': 0.27}


                                                  
 27%|██▋       | 500/1874 [04:56<11:21,  2.02it/s]

{'eval_loss': 0.0668988823890686, 'eval_precision': 0.9072869768217442, 'eval_recall': 0.9156849545607539, 'eval_f1': 0.911466621995142, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.96      0.90      0.93      1956\n        MISC       0.85      0.78      0.82      1003\n         ORG       0.84      0.92      0.88      1218\n         PER       0.96      0.97      0.97      1820\n\n   micro avg       0.92      0.91      0.91      5997\n   macro avg       0.90      0.89      0.90      5997\nweighted avg       0.92      0.91      0.91      5997\n', 'eval_runtime': 57.1539, 'eval_samples_per_second': 60.643, 'eval_steps_per_second': 7.594, 'epoch': 0.27}


 32%|███▏      | 600/1874 [05:46<09:31,  2.23it/s]  

{'loss': 0.0664, 'grad_norm': 3.3886260986328125, 'learning_rate': 3.3991462113127e-05, 'epoch': 0.32}


 37%|███▋      | 700/1874 [06:35<10:05,  1.94it/s]

{'loss': 0.0505, 'grad_norm': 3.3517889976501465, 'learning_rate': 3.1323372465314835e-05, 'epoch': 0.37}


 43%|████▎     | 800/1874 [07:24<08:19,  2.15it/s]

{'loss': 0.0602, 'grad_norm': 0.05632724612951279, 'learning_rate': 2.8655282817502672e-05, 'epoch': 0.43}


 48%|████▊     | 900/1874 [08:12<10:10,  1.60it/s]

{'loss': 0.046, 'grad_norm': 2.9125659465789795, 'learning_rate': 2.5987193169690503e-05, 'epoch': 0.48}


 53%|█████▎    | 1000/1874 [09:03<08:42,  1.67it/s]

{'loss': 0.0489, 'grad_norm': 2.8323752880096436, 'learning_rate': 2.3319103521878334e-05, 'epoch': 0.53}


                                                   
 53%|█████▎    | 1000/1874 [10:00<08:42,  1.67it/s]

{'eval_loss': 0.058455970138311386, 'eval_precision': 0.9289178792440207, 'eval_recall': 0.9347021204981488, 'eval_f1': 0.9318010234040769, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.97      0.92      0.95      1946\n        MISC       0.82      0.92      0.87       819\n         ORG       0.91      0.89      0.90      1367\n         PER       0.97      0.97      0.97      1847\n\n   micro avg       0.93      0.93      0.93      5979\n   macro avg       0.92      0.93      0.92      5979\nweighted avg       0.94      0.93      0.93      5979\n', 'eval_runtime': 57.4697, 'eval_samples_per_second': 60.31, 'eval_steps_per_second': 7.552, 'epoch': 0.53}


 59%|█████▊    | 1100/1874 [10:52<07:07,  1.81it/s]  

{'loss': 0.0617, 'grad_norm': 2.8857696056365967, 'learning_rate': 2.0651013874066168e-05, 'epoch': 0.59}


 64%|██████▍   | 1200/1874 [11:41<05:05,  2.20it/s]

{'loss': 0.048, 'grad_norm': 1.5948944091796875, 'learning_rate': 1.7982924226254002e-05, 'epoch': 0.64}


 69%|██████▉   | 1300/1874 [12:30<04:17,  2.23it/s]

{'loss': 0.0471, 'grad_norm': 3.3147616386413574, 'learning_rate': 1.5314834578441836e-05, 'epoch': 0.69}


 75%|███████▍  | 1400/1874 [13:16<04:04,  1.94it/s]

{'loss': 0.0455, 'grad_norm': 1.4808944463729858, 'learning_rate': 1.264674493062967e-05, 'epoch': 0.75}


 80%|████████  | 1500/1874 [14:03<03:18,  1.89it/s]

{'loss': 0.0391, 'grad_norm': 0.057029496878385544, 'learning_rate': 9.978655282817503e-06, 'epoch': 0.8}


                                                   
 80%|████████  | 1500/1874 [14:59<03:18,  1.89it/s]

{'eval_loss': 0.04023347795009613, 'eval_precision': 0.9406935835148266, 'eval_recall': 0.9449680242342645, 'eval_f1': 0.9428259591973805, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.96      0.97      0.96      1810\n        MISC       0.91      0.86      0.88       983\n         ORG       0.92      0.92      0.92      1342\n         PER       0.97      0.97      0.97      1834\n\n   micro avg       0.94      0.94      0.94      5969\n   macro avg       0.94      0.93      0.93      5969\nweighted avg       0.94      0.94      0.94      5969\n', 'eval_runtime': 56.5719, 'eval_samples_per_second': 61.267, 'eval_steps_per_second': 7.672, 'epoch': 0.8}


 85%|████████▌ | 1600/1874 [15:46<02:01,  2.25it/s]  

{'loss': 0.0414, 'grad_norm': 1.4138754606246948, 'learning_rate': 7.310565635005337e-06, 'epoch': 0.85}


 91%|█████████ | 1700/1874 [16:35<01:40,  1.74it/s]

{'loss': 0.0482, 'grad_norm': 1.4132384061813354, 'learning_rate': 4.6424759871931695e-06, 'epoch': 0.91}


 96%|█████████▌| 1800/1874 [17:26<00:28,  2.58it/s]

{'loss': 0.046, 'grad_norm': 3.5463814735412598, 'learning_rate': 1.9743863393810032e-06, 'epoch': 0.96}


100%|██████████| 1874/1874 [18:03<00:00,  1.73it/s]

{'train_runtime': 1083.9852, 'train_samples_per_second': 13.826, 'train_steps_per_second': 1.729, 'train_loss': 0.07341841547059148, 'epoch': 1.0}





TrainOutput(global_step=1874, training_loss=0.07341841547059148, metrics={'train_runtime': 1083.9852, 'train_samples_per_second': 13.826, 'train_steps_per_second': 1.729, 'total_flos': 960565719981294.0, 'train_loss': 0.07341841547059148, 'epoch': 1.0})

### TEST

In [28]:
sentence = """Rave George Washington was born in 1732 and died in 1799. Berlin Aristotle Emanuel Kant Nietzche"""


tokenized_input = tokenizer(sentence, return_tensors="pt").to(model.device)

print(tokenized_input)
outputs = model(**tokenized_input)


predicted_labels = outputs.logits.argmax(-1)[0]


named_entities = [tokenizer.decode([token]) for token, label in zip(tokenized_input["input_ids"][0], predicted_labels) if label != 0 and label != label_map['O']]


print("Named Entities - Example 1:", named_entities)


{'input_ids': tensor([[  101, 16890,  2707,  1667,  1994,  1108,  1255,  1107, 21121,  1477,
          1105,  1452,  1107, 14287,   119,  3206, 18727, 23271, 14812,  2227,
         27453,  2105,  1584,  4386,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]], device='cuda:0')}
Named Entities - Example 1: ['Ra', '##ve', 'George', 'Washington', 'Aristotle', 'Emanuel', 'Ka', '##nt', 'Ni', '##et', '##z', '##che']


In [27]:
trainer.save_model("model/")