In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from pathlib import Path
import random
import re

random.seed(13)

# full dataset from Kaggle
# dataset_path = Path('../../dataset/sa/dataset.csv').resolve()

# cleaned dataset
# dataset_heartless_path = Path('../../dataset/sa/dataset_cleaned_heartless.pkl').resolve()

# load with random selection
# dataset = pd.read_csv(dataset_path, skiprows=lambda i: i > 0 and random.random() > p)
# dataset.head()


# load cleaned dataset
# dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)
# dataset.info()

# copied from the first cell of eda.ipynb

dataset_heartless_path = Path('../../dataset/sa/dataset_cleaned_heartless.pkl').resolve()

dataset = pd.read_pickle(dataset_heartless_path)
# dataset = dataset.sample(frac=p)      # no sampling is needed

# convert the text to string object
dataset['review_text'] = dataset['review_text'].astype('str')

# drop any duplicate just in case
dataset = dataset.drop_duplicates(keep='first')

# replace -1 to 0
# then 0 = negative, 1 = positive
# for easier processing
dataset['review_score'] = dataset['review_score'].replace(-1, 0)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4891928 entries, 0 to 4891927
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
dtypes: int64(4), object(2)
memory usage: 223.9+ MB


In [2]:
# first do data cleaning

# convert to string
dataset['review_text'] = dataset['review_text'].astype('str')

In [3]:
dataset = dataset.drop_duplicates(keep='first')

remove some characters, but not do the stop words, stemming and lemmatizing (no doing them in Keras example ??)

In [4]:
def clean(raw):
    """ Remove hyperlinks and markup """
    result = re.sub("<[a][^>]*>(.+?)</[a]>", 'Link.', raw)
    result = re.sub('&gt;', "", result)
    result = re.sub('&#x27;', "'", result)
    result = re.sub('&quot;', '"', result)
    result = re.sub('&#x2F;', ' ', result)
    result = re.sub('<p>', ' ', result)
    result = re.sub('</i>', '', result)
    result = re.sub('&#62;', '', result)
    result = re.sub('<i>', ' ', result)
    result = re.sub("\n", '', result)
    return result

In [5]:
def remove_num(texts):
   output = re.sub(r'\d+', '', texts)
   return output

In [6]:
def deEmojify(x):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'', x)

In [7]:
def unify_whitespaces(x):
    cleaned_string = re.sub(' +', ' ', x)
    return cleaned_string 

In [8]:
def remove_symbols(x):
    cleaned_string = re.sub(r"[^a-zA-Z0-9?!.,]+", ' ', x)
    return cleaned_string

In [9]:
def remove_punctuation(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":",  "!",'"',','))
    return final

In [10]:
def cleaning(df, review_col_name):
    df[review_col_name] = df[review_col_name].apply(clean)
    df[review_col_name] = df[review_col_name].apply(deEmojify)
    # df[review_col_name] = df[review_col_name].str.lower()
    # df[review_col_name] = df[review_col_name].apply(remove_num)
    # df[review_col_name] = df[review_col_name].apply(remove_symbols)
    # df[review_col_name] = df[review_col_name].apply(remove_punctuation)

Time for cleaning

4.8M rows, two functions -> 30.5 sec

In [11]:
cleaning(dataset, 'review_text')

dataset.head(20)

Unnamed: 0,index,app_id,app_name,review_text,review_score,review_votes
0,0,10,Counter-Strike,Ruined my life.,1,0
1,1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,2,10,Counter-Strike,This game saved my virginity.,1,0
3,3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
5,5,10,Counter-Strike,"No r8 revolver, 10/10 will play again.",1,1
6,6,10,Counter-Strike,Still better than Call of Duty: Ghosts...,1,1
7,7,10,Counter-Strike,"cant buy skins, cases, keys, stickers - gaben ...",1,1
8,8,10,Counter-Strike,"Counter-Strike: Ok, after 9 years of unlimited...",1,1
9,9,10,Counter-Strike,Every server is spanish or french. I can now f...,1,0


In [12]:
# remove rows have all whitespaces
dataset['num_of_words'] = dataset['review_text'].apply(lambda x:len(str(x).split()))
dataset = dataset[dataset['num_of_words'] > 0]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4891259 entries, 0 to 4891927
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   index         int64 
 1   app_id        int64 
 2   app_name      object
 3   review_text   object
 4   review_score  int64 
 5   review_votes  int64 
 6   num_of_words  int64 
dtypes: int64(5), object(2)
memory usage: 298.5+ MB


In [13]:
temp = dataset.groupby('review_score').count()['review_text'].reset_index().sort_values(by='review_score',ascending=True)
temp.style.background_gradient(cmap='Purples')

Unnamed: 0,review_score,review_text
0,0,780927
1,1,4110332


Create training, testing and validation dataset

(Training + Testing) : Validation = 8:2

In [14]:
X = dataset['review_text']
y = dataset['review_score']

In [15]:
from sklearn.model_selection import train_test_split
X_train_test, X_valid, y_train_test, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

In [16]:
print(len(X_valid))
print(len(y_valid))
print(len(X_train_test))
print(len(y_train_test))

978252
978252
3913007
3913007


Then we create a balanced dataset for training and testing

In [17]:
from imblearn.under_sampling import RandomUnderSampler

# X_train, X_test, y_train, y_test

# oversampling = RandomOverSampler(sampling_strategy=0.5)     # raise the ratio minority_data:majority_data as 1 (i.e. equal number of samples)
under = RandomUnderSampler(sampling_strategy=1.0, random_state=13)          # then select ? of it

# X_train_resampled, y_train_resampled = oversampling.fit_resample(X_train.to_numpy().reshape(-1, 1), y_train.to_numpy().reshape(-1, 1))
# X_train_resampled, y_train_resampled = under.fit_resample(X_train_resampled, y_train_resampled)

X_train_test_resampled, y_train__test_resampled = under.fit_resample(X_train_test.to_numpy().reshape(-1, 1), y_train_test.to_numpy().reshape(-1, 1))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train_test_resampled, y_train__test_resampled, random_state=13, test_size=.1)

In [19]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

1124715
1124715
124969
124969


Hugging face stuff :D

The original shape of X_train does not match with the requirement (need a flattened 1-D array of strings)

In [20]:
print(X_train.shape)
print(y_train.shape)

(1124715, 1)
(1124715,)


In [21]:
# create a dataset object for handling large amount of data
from datasets import Dataset

ds_train = Dataset.from_dict({
    "text": [str(s) for s in list(X_train.flatten())],
    "label": list(y_train)
})

ds_test = Dataset.from_dict({
    "text": [str(s) for s in list(X_test.flatten())],
    "label": list(y_test)
})

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["text"], max_length=tokenizer.model_max_length, truncation=True)

# the tokenizer only accept list of strings
# tokenized_data = tokenizer(ds_train['text'], return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
# tokenized_data = dict(tokenized_data)

# labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

In [47]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [23]:
tokenizer.model_max_length 

512

In [24]:
ds_train

Dataset({
    features: ['text', 'label'],
    num_rows: 1124715
})

In [25]:
ds_test

Dataset({
    features: ['text', 'label'],
    num_rows: 124969
})

PyTorch Trainer API backend

Copy from https://huggingface.co/docs/transformers/training#train-with-pytorch-trainer

TODO: experiment on the TrainingArguments class, e.g. hyperparameters  
and also different parameters in 'Evaluate' library

Also do logging about the training loss for each epoch (read: https://discuss.huggingface.co/t/using-tensorboard-summarywriter-with-huggingface-trainerapi/23015/5 later)

In [26]:
def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["text"], padding='max_length', max_length=tokenizer.model_max_length, truncation=True)

# apply tokenizer to the dataset
ds_train = ds_train.map(tokenize_dataset, batched=True)
ds_test = ds_test.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 1124715/1124715 [01:48<00:00, 10342.70 examples/s]
Map: 100%|██████████| 124969/124969 [00:12<00:00, 10237.17 examples/s]


In [27]:
# selecting 100k samples (roughly 10%) for training (and testing the program flow
# and 10k samples for testing (reduce time)
n_train = 100 * 1000
n_test = 10 * 1000

ds_train_small = ds_train.shuffle(seed=42).select(range(n_train))
ds_test_small = ds_test.shuffle(seed=13).select(range(n_test))

In [37]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Plot both training and testing loss on the same graph

https://stackoverflow.com/questions/73281901/is-there-a-way-to-plot-training-and-validation-losses-on-the-same-graph-with-hug

In [36]:
import os
from transformers import TrainerCallback
from transformers.integrations import is_tensorboard_available

def custom_rewrite_logs(d, mode):
    '''If you want to combine train and eval for other metrics besides the loss then custom_rewrite_logs should be modified accordingly.'''
    new_d = {}
    eval_prefix = "eval_"
    eval_prefix_len = len(eval_prefix)
    test_prefix = "test_"
    test_prefix_len = len(test_prefix)

    # we combine loss, accuracy, recall and f1

    for k, v in d.items():
        if mode == 'eval' and k.startswith(eval_prefix):
            if k[eval_prefix_len:] == 'loss':
                new_d["combined/" + k[eval_prefix_len:]] = v

            elif k[eval_prefix_len:] == 'accuracy':
                new_d["combined/" + k[eval_prefix_len:]] = v

            elif k[eval_prefix_len:] == 'recall':
                new_d["combined/" + k[eval_prefix_len:]] = v

            elif k[eval_prefix_len:] == 'f1':
                new_d["combined/" + k[eval_prefix_len:]] = v
            
        elif mode == 'test' and k.startswith(test_prefix):
            if k[test_prefix_len:] == 'loss':
                new_d["combined/" + k[test_prefix_len:]] = v

            elif k[test_prefix_len:] == 'accuracy':
                new_d["combined/" + k[test_prefix_len:]] = v

            elif k[test_prefix_len:] == 'recall':
                new_d["combined/" + k[test_prefix_len:]] = v    

            elif k[test_prefix_len:] == 'f1':
                new_d["combined/" + k[test_prefix_len:]] = v

        elif mode == 'train':
            if k == 'loss':
                new_d["combined/" + k] = v

            elif k == 'train_accuracy':
                new_d["combined/" + 'accuracy'] = v

            elif k == 'train_recall':
                new_d["combined/" + 'recall'] = v

            elif k == 'train_f1':
                new_d["combined/" + 'f1'] = v
                
    return new_d


class CombinedTensorBoardCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard).
    Args:
        tb_writer (`SummaryWriter`, *optional*):
            The writer to use. Will instantiate one if not set.
    """

    def __init__(self, tb_writers=None):
        has_tensorboard = is_tensorboard_available()
        if not has_tensorboard:
            raise RuntimeError(
                "TensorBoardCallback requires tensorboard to be installed. Either update your PyTorch version or"
                " install tensorboardX."
            )
        if has_tensorboard:
            try:
                from torch.utils.tensorboard import SummaryWriter  # noqa: F401

                self._SummaryWriter = SummaryWriter
            except ImportError:
                try:
                    from tensorboardX import SummaryWriter

                    self._SummaryWriter = SummaryWriter
                except ImportError:
                    self._SummaryWriter = None
        else:
            self._SummaryWriter = None
        self.tb_writers = tb_writers

    def _init_summary_writer(self, args, log_dir=None):
        log_dir = log_dir or args.logging_dir
        if self._SummaryWriter is not None:
            self.tb_writers = dict(train=self._SummaryWriter(log_dir=os.path.join(log_dir, 'train')),
                                   eval=self._SummaryWriter(log_dir=os.path.join(log_dir, 'eval')))

    def on_train_begin(self, args, state, control, **kwargs):
        if not state.is_world_process_zero:
            return

        log_dir = None

        if state.is_hyper_param_search:
            trial_name = state.trial_name
            if trial_name is not None:
                log_dir = os.path.join(args.logging_dir, trial_name)

        if self.tb_writers is None:
            self._init_summary_writer(args, log_dir)

        for k, tbw in self.tb_writers.items():
            tbw.add_text("args", args.to_json_string())
            if "model" in kwargs:
                model = kwargs["model"]
                if hasattr(model, "config") and model.config is not None:
                    model_config_json = model.config.to_json_string()
                    tbw.add_text("model_config", model_config_json)
            # Version of TensorBoard coming from tensorboardX does not have this method.
            if hasattr(tbw, "add_hparams"):
                tbw.add_hparams(args.to_sanitized_dict(), metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not state.is_world_process_zero:
            return

        if self.tb_writers is None:
            self._init_summary_writer(args)

        for tbk, tbw in self.tb_writers.items():
            logs_new = custom_rewrite_logs(logs, mode=tbk)
            for k, v in logs_new.items():
                if isinstance(v, (int, float)):
                    tbw.add_scalar(k, v, state.global_step)
                else:
                    logger.warning(
                        "Trainer is attempting to log a value of "
                        f'"{v}" of type {type(v)} for key "{k}" as a scalar. '
                        "This invocation of Tensorboard's writer.add_scalar() "
                        "is incorrect so we dropped this attribute."
                    )
            tbw.flush()

    def on_train_end(self, args, state, control, **kwargs):
        for tbw in self.tb_writers.values():
            tbw.close()
        self.tb_writers = None

In [38]:
from transformers import TrainingArguments, Trainer

# Specify where to save the checkpoints from your training:
training_args = TrainingArguments(output_dir="test_trainer_24-11-2023_v5",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                #   warmup_steps=10000,   # seems not required in fine-tuning, as we have less than 10k steps
                                  num_train_epochs=3,
                                  evaluation_strategy="steps",
                                  save_strategy="steps",
                                  save_steps=500,
                                  eval_steps=500,
                                  metric_for_best_model='eval_loss',
                                  load_best_model_at_end=True)

In [31]:
import evaluate

metric_acc = evaluate.load("accuracy")
metric_recall = evaluate.load('recall')
metric_f1 = evaluate.load('f1')

In [32]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred      # logits: an n*num_of_class array with probability, e.g. [[ 1.9851098, -1.6966375],[ 2.7240963, -2.372472 ],...], labels = true labels
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)['accuracy']
    recall = metric_recall.compute(predictions=predictions, references=labels)['recall']
    f1_score = metric_f1.compute(predictions=predictions, references=labels, pos_label=1)['f1']
    # rocauc = metric_rocauc.compute(predictions=predictions, references=labels)['roc_auc']
    return {'accuracy': acc, "recall": recall, "f1": f1_score}

In [33]:
# override original trainer to add evaluation on training dataset as well

import math
import time
from typing import Dict, List, Optional
from transformers.trainer_utils import speed_metrics
from transformers.debug_utils import DebugOption
from transformers.utils import  is_torch_tpu_available

if is_torch_tpu_available(check_device=False):
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met

class MyTrainer(Trainer):
    def evaluate(self, 
                 eval_dataset: Optional[Dataset] = None,
                 ignore_keys: Optional[List[str]] = None,
                 metric_key_prefix: str = "eval"
                 ) -> Dict[str, float]:
        """
            Run evaluation and returns metrics.
            The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
            (pass it to the init `compute_metrics` argument).
            You can also subclass and override this method to inject custom behavior.
            Args:
                eval_dataset (`Dataset`, *optional*):
                    Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
                    not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__`
                    method.
                ignore_keys (`Lst[str]`, *optional*):
                    A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                    gathering predictions.
                metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                    An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                    "eval_bleu" if the prefix is "eval" (default)
            Returns:
                A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
                dictionary also contains the epoch number which comes from the training state.
            """
        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        train_dataloader = self.get_train_dataloader()
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        eval_output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        train_output = eval_loop(
            train_dataloader,
            description='Training Evaluation',
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix="train",
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in eval_output.metrics:
            start_time += eval_output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        eval_output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=eval_output.num_samples,
                num_steps=math.ceil(eval_output.num_samples / total_batch_size),
            )
        )

        train_n_samples = len(self.train_dataset)
        train_output.metrics.update(speed_metrics('train', start_time, train_n_samples))
        self.log(train_output.metrics | eval_output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, train_output.metrics)
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, eval_output.metrics)

        self._memory_tracker.stop_and_update_metrics(eval_output.metrics)
        self._memory_tracker.stop_and_update_metrics(train_output.metrics)

        # only works in Python >= 3.9
        return train_output.metrics | eval_output.metrics

In [39]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds_train_small,
#     eval_dataset=ds_test_small,
#     # train_dataset=ds_train,
#     # eval_dataset=ds_test,
#     compute_metrics=compute_metrics,
#     callbacks=[CombinedTensorBoardCallback]
# )

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train_small,
    eval_dataset=ds_test_small,
    # train_dataset=ds_train,
    # eval_dataset=ds_test,
    compute_metrics=compute_metrics,
    callbacks=[CombinedTensorBoardCallback]
)

In [41]:
trainer.train()

# trainer.train(resume_from_checkpoint=True)

Step,Training Loss,Validation Loss,Accuracy,Recall,F1
500,0.3548,0.282712,0.8865,0.85123,0.88483
1000,0.2765,0.24317,0.9,0.877001,0.89984
1500,0.2536,0.233188,0.9025,0.891449,0.903532
2000,0.2486,0.231922,0.9065,0.895744,0.907526
2500,0.2425,0.226868,0.9091,0.899453,0.910204
3000,0.2405,0.235578,0.9121,0.897696,0.912754
3500,0.1842,0.260092,0.9142,0.901015,0.914948
4000,0.1667,0.243925,0.9109,0.882273,0.910263
4500,0.1634,0.231016,0.9155,0.915072,0.917311
5000,0.1654,0.220146,0.9169,0.926786,0.919516


TrainOutput(global_step=9375, training_loss=0.179933759765625, metrics={'train_runtime': 12653.4358, 'train_samples_per_second': 23.709, 'train_steps_per_second': 0.741, 'total_flos': 7.8933316608e+16, 'train_loss': 0.179933759765625, 'epoch': 3.0})

In [42]:
# save model
# save model only saves the tokenizer with the model
# https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/trainer#transformers.Trainer

trainer.save_model("test_trainer_save_model_24-11-2023_v5")

In [45]:
trainer.evaluate(
    ds_test
)

{'train_loss': 0.11214113980531693,
 'train_accuracy': 0.96127,
 'train_recall': 0.9673028879784151,
 'train_f1': 0.9615281461394046,
 'train_runtime': 518.6035,
 'train_samples_per_second': 192.826,
 'eval_loss': 0.22014589607715607,
 'eval_accuracy': 0.9169,
 'eval_recall': 0.9267864115579851,
 'eval_f1': 0.9195157384987893,
 'eval_runtime': 518.6035,
 'eval_samples_per_second': 19.283,
 'eval_steps_per_second': 0.604}

In [63]:
trainer.predict(
    ds_test.shuffle(seed=1).select(range(5))
)

PredictionOutput(predictions=array([[-2.5350044,  2.991728 ],
       [ 1.0069603, -2.805289 ],
       [-2.4684582,  3.0222573],
       [-2.1462834,  2.3476632],
       [ 1.9798595, -3.9196928]], dtype=float32), label_ids=array([1, 0, 1, 1, 0]), metrics={'test_loss': 0.008759320713579655, 'test_accuracy': 1.0, 'test_recall': 1.0, 'test_f1': 1.0, 'test_runtime': 0.0466, 'test_samples_per_second': 107.387, 'test_steps_per_second': 21.477})

The final probability of each class is calculated by applying softmax to the values for each row

Here marks the end of training :D

---

load the trained model from directory, and create a pipeline for direct inferencing

https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378/7

https://discuss.huggingface.co/t/how-to-load-a-pipeline-saved-with-pipeline-save-pretrained/5373

In [58]:
from transformers import AutoModelForSequenceClassification
from transformers import pipeline

tokenizer_loaded = AutoTokenizer.from_pretrained("bert-base-cased")
model_loaded = AutoModelForSequenceClassification.from_pretrained('test_trainer_save_model_24-11-2023_v5', output_hidden_states=True)

# just load the model
# trainer_test = Trainer(
#     model = model_loaded
# )

# can input TrainingArgument class to make custom evaluation on a testing dataset
# or without, then just predict
# need to form a dataset object, cannot accept string I guess ??
# if with a true label -> automatically perform evaluation
# else just predict

# load the pipeline
pipeline_loaded = pipeline(
    'text-classification',
    model=model_loaded,
    tokenizer=tokenizer_loaded
)

In [55]:
pipeline_loaded

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7fd6c93cac40>

In [68]:
# verifying the probability result output by the model
# and the pipeline
# it's correct

pipeline_loaded(
    ds_test.shuffle(seed=1).select(range(5))['text'],
    padding='max_length', max_length=tokenizer.model_max_length, truncation=True
)

[{'label': 'LABEL_1', 'score': 0.9960368275642395},
 {'label': 'LABEL_0', 'score': 0.9783793687820435},
 {'label': 'LABEL_1', 'score': 0.9958920478820801},
 {'label': 'LABEL_1', 'score': 0.9889470934867859},
 {'label': 'LABEL_0', 'score': 0.9972668886184692}]

In [59]:
pipeline_loaded("After I bought Postal 2, i saw that there was a third one and it had modern-day graphics and i thought that was pretty cool and then I saw all the negative reviews except for the literally only postive review and i thought to myself, hmmm I wonder how bad this game is? Seeing as how the second one was badass and amazingly better than some of the AAA games we get served today, so i bought it, played it and beat it, and now i now why this game is slandered, even RWS has given up on this trainwreck but before i trash this game more i have to say there are parts of this game that are good but far from being awesome.  Like my review of Postal 2, this review will broken down in the same areas  Gameplay, Graphics, Sound and most importantly Story  Warning: this is my opinon and if anything offends you, i deeply apolgize  Gameplay 5/10: the overall gmeplay isn't all to bad, the controls aren't bad but they are a big difference from its predecessor, the main control feature i like about this game is you don't have to use the mouse scroll for changing your guns, you can now use X and C for that which for me (being a console gamer for 16 years) is a big up for me since i'm still new to the PC controls. The game praises it's so called 'advanced AI' but in reality the so-called 'advanced AI' isn't so advanced unless advanced AI means running in your line of fire and killing all the 'advanced AI' in like two seconds  Graphics 6/10: the graphics of this game are not that bad but also not that good for a game that was realeased in 2011. there are times where the graphics acutally look complete and then other times your face palming with more regert than Patrick Stewert.  Sound 8/10: this is the only department that this game did good in. The BGM of this game is amazing, the only time the BGM is bad, is that it is stuck in a never-ending loop, so in the longer levels in the game it starts to become a ear ache. The sound F/X of this game is why the sound gets an 8 out of 10 instead of being a perfect score, the gun noises in this aren't as realistic as they were Postal 2, but they aren't horrible either, at least they are actually sounds instead of one guy making 'pew-pew' noises.  Story 6/10: the only reason this game's story did better than Postal 2 is because it's more of a linear based game, so there is more room for story, but the main reason ot only did a point better was becasue it's story was all over the f***ing place, when i finally beat the game, i had no f***ing idea what the f*** happend, the story has more holes in it than swiss cheese or a Micheal Bay movie (uggh don't get me started on him)  Overall 4.5/10: Overall this game needs A LOT more work and if it was developed on for a little bit longer (i'd say like two more years) it may have been a better game but I personally think that it should have been the exact same game as the second but with a new story and graphics like Halo 3. (ahhh if only) But before i go, the nic people over at Anthology Production, are making a mod for Postal 2, and it's basically taking Postal 3 and putting it into the engine of the second game and more! So i cannot wait for it to come out. so as you can see I do NOT reccomend this game but if you really want to play this s***ty game than wait for that mod.",
                padding='max_length', max_length=tokenizer_loaded.model_max_length, truncation=True)   # just pass the arguments of the tokeinzer like this 

[{'label': 'LABEL_0', 'score': 0.6495449542999268}]

In [60]:
pipeline_loaded('I like this game!!!',
                padding='max_length', max_length=tokenizer_loaded.model_max_length, truncation=True)

[{'label': 'LABEL_1', 'score': 0.9889022707939148}]

In [61]:
pipeline_loaded(["I like this game!!!", "This game sucks."],
                padding='max_length', max_length=tokenizer_loaded.model_max_length, truncation=True)

[{'label': 'LABEL_1', 'score': 0.9889022707939148},
 {'label': 'LABEL_0', 'score': 0.9885446429252625}]

Save model

In [62]:
pipeline_loaded.save_pretrained('test_trainer_save_pipeline_24-11-2023_v5')

# trainer.save_model('model_test_trainer_13-11-2023')

TF/Keras backend

In [None]:
ds_train = ds_train.map(tokenize_dataset)

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers.legacy import Adam

# Load and compile our model
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

# prepare tf dataset
tf_dataset = model.prepare_tf_dataset(ds_train, batch_size=16, shuffle=True, tokenizer=tokenizer)

# Lower learning rates are often better for fine-tuning transformers
model.compile(optimizer=Adam(3e-5), metrics=['accuracy'])  # No loss argument!

model_history = model.fit(tf_dataset)

2023-11-13 23:50:04.841218: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-13 23:50:04.862347: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-13 23:50:04.862379: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-13 23:50:04.862396: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-13 23:50:04.866986: I tensorflow/core/platform/cpu_feature_g

  7436/125063 [>.............................] - ETA: 3:24:53 - loss: 0.2965 - accuracy: 0.8741

KeyboardInterrupt: 

Save the model

In [None]:
from datetime import datetime

model.save_pretrained(f'bert-finetune-sa-gamereviews_{datetime.now().strftime("%Y%m%d%H%M%S")}')

Load the model