In [1]:
PROJECT_DIR = r'G:\PythonProjects\WineRecognition2'
TRAIN_DATASET_PATH = r'G:\PythonProjects\WineRecognition2\data\text\data_and_menu_gen_samples\Halliday_WineSearcher_Bruxelles_MenuGenSamples_v5_BottleSize_fixed.txt'
TEST_DATASET_PATH = r'G:\PythonProjects\WineRecognition2\data\text\menu_txt_tagged_fixed_bottlesize.txt'
DATA_INFO_PATH = r'G:\PythonProjects\WineRecognition2\data_info.json'
VOCAB_PATH = r'G:\PythonProjects\WineRecognition2\data\vocabs\Words_Halliday_WineSearcher_Bruxelles.json'

TASK = "ner"
NUM_EPOCHS=10
MODEL_CHECKPOINT = "distilbert-base-uncased"
BATCH_SIZE = 64
TEST_SIZE = 0.2
LABEL_ALL_TOKENS = True

In [2]:
import os
from datetime import datetime
import sys
import json
import itertools
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)
from nn.utils import CustomDataset, train, plot_losses, generate_tag_to_ix, get_model_confidence
from data_master import DataGenerator, count_unk_foreach_tag

  from pyarrow import HadoopFileSystem


In [3]:
with open(DATA_INFO_PATH) as file:
    label_list = json.load(file)['keys']['all']
    tag_to_ix = generate_tag_to_ix(label_list)

In [4]:
def get_token_dataset(train_dataset_path, test_dataset_path, columns = ('tokens', f'{TASK}_tags')):
    with open(train_dataset_path, encoding='utf-8') as file:
        train_sents = DataGenerator.generate_sents2(file.read().split('\n'))

    with open(test_dataset_path, encoding='utf-8') as file:
        test_sents = DataGenerator.generate_sents2(file.read().split('\n'))
        
    train_df = pd.DataFrame(train_sents, columns=columns)
    test_df = pd.DataFrame(test_sents, columns=columns)
    
    train_df['whole_string'] = train_df['tokens'].apply(' '.join)
    test_df['whole_string'] = test_df['tokens'].apply(' '.join)
    
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)
    
    return train_dataset, test_dataset

In [5]:
old_tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
train_dataset, test_dataset = get_token_dataset(TRAIN_DATASET_PATH, TEST_DATASET_PATH)

In [6]:
def get_training_corpus():
    for start_idx in range(0, len(train_dataset), 1000):
        samples = train_dataset[start_idx : start_idx + 1000]
        yield samples["whole_string"]

In [7]:
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=52000)
tokenizer.save_pretrained(f'./tokenizers/{MODEL_CHECKPOINT}-{datetime.today().strftime("%Y-%m-%d")}')

('./tokenizers/distilbert-base-uncased-2023-03-05\\tokenizer_config.json',
 './tokenizers/distilbert-base-uncased-2023-03-05\\special_tokens_map.json',
 './tokenizers/distilbert-base-uncased-2023-03-05\\vocab.txt',
 './tokenizers/distilbert-base-uncased-2023-03-05\\added_tokens.json',
 './tokenizers/distilbert-base-uncased-2023-03-05\\tokenizer.json')

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{TASK}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(tag_to_ix[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(tag_to_ix[label[word_idx]] if LABEL_ALL_TOKENS else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/735803 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=len(tag_to_ix))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [11]:
model_name = MODEL_CHECKPOINT.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{TASK}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    push_to_hub=False,
)

In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [13]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [14]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [15]:
tokenized_train_eval_dataset = tokenized_train_dataset.train_test_split(test_size=TEST_SIZE)

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_eval_dataset["train"],
    eval_dataset=tokenized_train_eval_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: whole_string, tokens, ner_tags. If whole_string, tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 588642
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 91980
  Number of trainable parameters = 66375184
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0166,0.011717,0.994432,0.994145,0.994288,0.996384
2,0.0078,0.007705,0.996674,0.996173,0.996423,0.997688
3,0.0052,0.005077,0.997586,0.997415,0.997501,0.998406
4,0.0034,0.004526,0.998023,0.997926,0.997975,0.998662
5,0.0026,0.004406,0.998173,0.997991,0.998082,0.998677
6,0.0018,0.004484,0.998394,0.998196,0.998295,0.998861
6,0.0016,0.004462,0.998278,0.998185,0.998232,0.998818


Saving model checkpoint to distilbert-base-uncased-finetuned-ner\checkpoint-500
Configuration saved in distilbert-base-uncased-finetuned-ner\checkpoint-500\config.json
Model weights saved in distilbert-base-uncased-finetuned-ner\checkpoint-500\pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-ner\checkpoint-500\tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-ner\checkpoint-500\special_tokens_map.json
Saving model checkpoint to distilbert-base-uncased-finetuned-ner\checkpoint-1000
Configuration saved in distilbert-base-uncased-finetuned-ner\checkpoint-1000\config.json
Model weights saved in distilbert-base-uncased-finetuned-ner\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-ner\checkpoint-1000\tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-ner\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to distilbert-base-unca

KeyboardInterrupt: 

In [18]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: whole_string, tokens, ner_tags. If whole_string, tokens, ner_tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 147161
  Batch size = 64


{'eval_loss': 0.004462342243641615,
 'eval_precision': 0.9982782408266464,
 'eval_recall': 0.9981850011422628,
 'eval_f1': 0.9982316188071947,
 'eval_accuracy': 0.9988182253672276}

In [None]:
trainer.save_model(f'{MODEL_CHECKPOINT}-wine')

In [None]:
trainer.evaluate(tokenized_test_dataset)