# **Load Data**

In [1]:
!pip install datasets==3.2.0



In [2]:
from datasets import load_dataset

data = load_dataset("thainq107/abte-restaurants")

README.md:   0%|          | 0.00/454 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/183k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/61.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3602 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities'],
        num_rows: 1119
    })
})

In [4]:
data['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1']}

# **Tokenization**

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") # "albert-base-v2"

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
test = tokenizer(data['train'][0]['Tokens'])
test

{'input_ids': [[101, 2021, 102], [101, 1996, 102], [101, 3095, 102], [101, 2001, 102], [101, 2061, 102], [101, 9202, 102], [101, 2000, 102], [101, 2149, 102], [101, 1012, 102]], 'attention_mask': [[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1]]}

In [7]:
def tokenize_and_align_labels(data):
    tokens, tags = [], []
    labels = []

    for tok, pols in zip(data['Tokens'], data['Polarities']):
        bert_tokens = []
        bert_att = []
        pols_label = 0
        for i in range(len(tok)):
            t = tokenizer(tok[i])
            bert_tokens += t
            
            # Don't take the Noun/Subject since we will be using <SEP> to separate it
            if int(pols[i]) != -1:
                bert_att += t
                pols_label = int(pols[i])
                
        tokens.append(' '.join(bert_tokens))
        tags.append(" ".join(bert_att))
        labels.append(pols_label)
    tokenized_inputs = tokenizer(tokens, tags, padding=True, truncation=True,
                                return_tensors = 'pt')
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

preprocessed_data = data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3602 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [8]:
preprocessed_data

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Tags', 'Polarities', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3602
    })
    test: Dataset({
        features: ['Tokens', 'Tags', 'Polarities', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1119
    })
})

In [9]:
preprocessed_data['train'][0]

{'Tokens': ['But', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us', '.'],
 'Tags': ['0', '0', '1', '0', '0', '0', '0', '0', '0'],
 'Polarities': ['-1', '-1', '0', '-1', '-1', '-1', '-1', '-1', '-1'],
 'input_ids': [101,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  102,
  7953,
  1035,
  8909,
  2015,
  3086,
  1035,
  7308,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

# **Eval**

In [10]:
!pip install evaluate==0.4.3

Collecting evaluate==0.4.3
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# **Model**

In [13]:
from transformers import AutoModelForSequenceClassification

id2label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
label2id = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# **Train**

In [15]:
from huggingface_hub import login

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HUGGINGFACE_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Login to Hugging Face
login(HUGGINGFACE_TOKEN)

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = './ATSC',
    learning_rate = 1e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 10,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    logging_strategy = 'epoch',
    logging_steps = 1,
    report_to = 'none',
    push_to_hub = True,
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy'
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = preprocessed_data['train'],
    eval_dataset = preprocessed_data['test'],
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

trainer.train()

# Save Model
trainer.save_model("./ATSC")

# Save Model to Hugging Face Hub
trainer.push_to_hub("KanWasTaken/ATSC")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9519,0.900198,0.649687
2,0.9445,0.903439,0.649687
3,0.9412,0.892403,0.649687
4,0.9388,0.892036,0.649687
5,0.9369,0.895045,0.649687
6,0.9359,0.895692,0.649687
7,0.9327,0.89761,0.649687
8,0.9338,0.90063,0.649687
9,0.9312,0.901289,0.649687
10,0.9317,0.902079,0.649687


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/KanWasTaken/ATSC/commit/22727317dedc5607a501332833c3f452e3a27a59', commit_message='KanWasTaken/ATSC', commit_description='', oid='22727317dedc5607a501332833c3f452e3a27a59', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KanWasTaken/ATSC', endpoint='https://huggingface.co', repo_type='model', repo_id='KanWasTaken/ATSC'), pr_revision=None, pr_num=None)

# **Infer**

In [17]:
from transformers import pipeline

token_classifier = pipeline(
    model="thainq107/abte-restaurants-distilbert-base-uncased",
    aggregation_strategy="simple"
)

classifier = pipeline(
    model="KanWasTaken/ATSC"
)

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/784 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [18]:
test_sentence = 'Shikanoko Nokonoko Koshitantan'
results = token_classifier(test_sentence)
sentence_tags = " ".join([result['word'] for result in results])
pred_label = classifier(f'{test_sentence} [SEP] {sentence_tags}')

print(sentence_tags)
print("-"*59)
print(pred_label)

shi ##kan ##oko no ##kon ##oko ko ##shi ##tan ##tan
-----------------------------------------------------------
[{'label': 'Positive', 'score': 0.5092877149581909}]
