# **Load Data**

In [1]:
!pip install datasets==3.2.0



In [2]:
from datasets import load_dataset

data = load_dataset("thainq107/abte-restaurants")

README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [4]:
data['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

# **Tokenization**

In [5]:
from transformers import AutoTokenizer

# Model for Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_and_align_labels(examples):
    tokenized_inputs = []
    labels = []
    for tokens, tags in zip(examples['Tokens'], examples['Tags']):

        bert_tokens = []
        bert_tags = []
        for i in range(len(tokens)):
            t = tokenizer.tokenize(tokens[i])
            bert_tokens += t
            bert_tags += [int(tags[i])]*len(t)

        bert_ids = tokenizer.convert_tokens_to_ids(bert_tokens)

        tokenized_inputs.append(bert_ids)
        labels.append(bert_tags)

    return {
            'input_ids': tokenized_inputs,
            'labels': labels
        }
preprocessed_data = data.map(tokenize_and_align_labels, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

# **Data Collator**

In [6]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer = tokenizer)

# **Eval**

In [7]:
!pip install seqeval==1.2.2
import numpy as np
from seqeval.metrics import f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        # Ignore Tokens with Idx = -100 (Padding Tokens)
        # Too many padding tokens => Inaccurate accuracy_score because there's too many <PAD>
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = f1_score(true_predictions, true_labels)
    return {"F1-score": results}

Collecting seqeval==1.2.2
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=089f34c2057ace31638e0d6dfa7d12a853c0c998d7364c57b2a19380575e182c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


# **Model**

In [8]:
from transformers import AutoModelForTokenClassification
id2label = {
    0: "0",
    1: "B-Term", # Begin Terminology
    2: "I-Term" # Inside Terminology
}

label2id = {
    "0": 0,
    "B-Term": 1,
    "I-Term": 2
}

In [9]:
# Model Token Classification
model = AutoModelForTokenClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    num_labels = 3, id2label=id2label, label2id = label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Training**

In [10]:
from huggingface_hub import login

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [11]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = './ATE',
    learning_rate = 1e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 10,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    logging_strategy = 'epoch',
    logging_steps = 1,
    report_to = 'none',
    push_to_hub = True,
    load_best_model_at_end = True,
    metric_for_best_model = 'F1-score'
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = preprocessed_data['train'],
    eval_dataset = preprocessed_data['test'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

trainer.train()

# Save Model
trainer.save_model("./ATE")

# Save Model to Hugging Face Hub
trainer.push_to_hub("KanWasTaken/ATE")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1-score
1,0.2899,0.220312,0.683841
2,0.1197,0.189248,0.776599
3,0.0694,0.191853,0.796061
4,0.0459,0.222295,0.800843
5,0.0389,0.228283,0.809047
6,0.0268,0.237394,0.809544
7,0.0225,0.245751,0.808367
8,0.0194,0.255738,0.809441
9,0.017,0.258805,0.810572
10,0.0163,0.2578,0.806428


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/KanWasTaken/ATE/commit/f452d5ff524f65bc5df88e38ad6606468e58a44c', commit_message='KanWasTaken/ATE', commit_description='', oid='f452d5ff524f65bc5df88e38ad6606468e58a44c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/ATE', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/ATE'), pr_revision=None, pr_num=None)

# **Infer**

In [12]:
from transformers import pipeline
token_cls = pipeline(
    model = 'KanWasTaken/ATE',
    aggregation_strategy = 'simple' # Aggregate Tokens Begin-Inside
)

test_sentence = "Shikanoko Nokonoko Koshitantan"
results = token_cls(test_sentence)

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0
