In [1]:
!pip -q install underthesea mlflow
!pip -q install transformers[torch]

In [2]:
import pandas as pd
import re
import numpy as np
from underthesea import word_tokenize

In [3]:
period_strip = re.compile(r'(?!<=\d)(\.)(?!\d)')
comma_strip = re.compile(r'(\d)(,)(\d)')
punctuation_chars = re.escape(r';/[]"{}()=+\_-><@`,?!.')
punctuation = re.compile(r'([{}])'.format(re.escape(punctuation_chars)))
punctuation_with_a_space = re.compile(r'(?<= )([{0}])|([{0}])(?= )'.format(punctuation_chars))

def process_punctuation(s):
    if punctuation.search(s) is None:
        return s
    s = punctuation_with_a_space.sub('', s)
    if re.search(comma_strip, s) is not None:
        s = s.replace(',', '')
    s = punctuation.sub(' ', s)
    s = period_strip.sub('', s)
    return s.strip()

In [4]:
train_df = pd.read_excel('/content/Label.xlsx')[['CONTENT', 'QUALITY']]
train_df.reset_index(drop=True, inplace=True)
train_df['QUALITY'] = train_df['QUALITY'].replace({'Negative': 0, 'Positive': 1, 'Neutral': 2, np.NaN: 3}).astype(int)
train_df.dropna(inplace=True)

train_df['CONTENT'] = train_df['CONTENT'].apply(lambda x: word_tokenize(process_punctuation(x.lower()), format="text"))

test_df = pd.read_excel('/content/NonLabel.xlsx')[['CONTENT']]
test_df['CONTENT'] = test_df['CONTENT'].apply(lambda x: word_tokenize(process_punctuation(x.lower()), format="text"))

In [5]:
from sklearn.model_selection import train_test_split

train_df['dummy_target'] = train_df['QUALITY']
X_train, X_eval, _, _ = train_test_split(
    train_df,
    train_df['dummy_target'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['dummy_target']
)

train_df = pd.DataFrame({
    'CONTENT': X_train['CONTENT'],
    'QUALITY': X_train['QUALITY']
})

eval_df = pd.DataFrame({
    'CONTENT': X_eval['CONTENT'],
    'QUALITY': X_eval['QUALITY']
})

train_df.reset_index(drop=True, inplace=True)
eval_df.reset_index(drop=True, inplace=True)

In [6]:
from transformers import RobertaForSequenceClassification, RobertaConfig, AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)
# model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", num_labels=4, ignore_mismatched_sizes=True)

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
model = RobertaForSequenceClassification.from_pretrained("vinai/phobert-base-v2", num_labels=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from torch.utils.data import Dataset

class CommentDataset(Dataset):
  def __init__(self, df, tokenizer):
    self.df = df
    self.tokenizer = tokenizer
  def __len__(self):
    return len(self.df)
  def __getitem__(self, idx):
    item = self.df.iloc[idx]

    comment = item['CONTENT']

    try:
      label = item['QUALITY']
    except:
      label = None

    tokenized_comment = self.tokenizer(
        comment,
        truncation=True,
        max_length=250
    )

    if label is None:
      return {
        'input_ids': tokenized_comment.input_ids,
        'attention_mask': tokenized_comment.attention_mask
      }

    return {
        'input_ids': tokenized_comment.input_ids,
        'attention_mask': tokenized_comment.attention_mask,
        'labels': label
    }

train_dataset = CommentDataset(train_df, tokenizer)
eval_dataset = CommentDataset(eval_df, tokenizer)
test_dataset = CommentDataset(test_df, tokenizer)

In [8]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)

  result = {
      'accuracy': accuracy_score(predictions, labels) * 100,
      'f1': f1_score(predictions, labels, average='macro') * 100,
      'recall': recall_score(predictions, labels, average='macro') * 100
  }

  return {k: round(v, 4) for k, v in result.items()}

In [10]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="output",
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    logging_strategy='epoch',
    evaluation_strategy = "epoch",
    save_strategy='epoch',
    save_total_limit=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    learning_rate=2e-5,
    weight_decay=0.01,
    dataloader_num_workers=2,
    report_to='mlflow',
    save_safetensors=False,
    fp16=True,
    bf16=False,
    metric_for_best_model='accuracy',
    load_best_model_at_end=True,
    greater_is_better=True
)

# args = TrainingArguments(
#     output_dir="output",
#     lr_scheduler_type='cosine',
#     warmup_ratio=0.1,
#     logging_strategy='epoch',
#     # evaluation_strategy = "epoch",
#     save_strategy='epoch',
#     save_total_limit=1,
#     per_device_train_batch_size=48,
#     per_device_eval_batch_size=48,
#     num_train_epochs=4,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     dataloader_num_workers=2,
#     report_to='mlflow',
#     save_safetensors=False,
#     fp16=True,
#     bf16=False,
#     # metric_for_best_model='accuracy',
#     # load_best_model_at_end=True,
#     # greater_is_better=True
# )

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall
1,1.1905,0.962313,65.5652,35.7787,34.2821
2,0.859,0.766537,71.8261,56.1271,61.1579
3,0.7175,0.795003,71.2174,56.2907,63.2499
4,0.6345,0.723498,72.2609,64.3226,64.2149
5,0.55,0.704994,74.3478,65.2535,65.1984
6,0.4518,0.840989,71.1304,64.6079,64.9316


  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory output/checkpoint-864 already exists and is non-empty. Saving will proceed but saved results may be invalid.


KeyboardInterrupt: 

In [None]:
mapping_rate = ['Negative', 'Positive', 'Neutral', '']

predictions = trainer.predict(test_dataset)[0]
predictions = np.argmax(predictions, axis=1)
predictions = [mapping_rate[pred] for pred in predictions]

In [None]:
test_df['QUALITY'] = predictions
test_df