### **Import Libraries**

In [1]:
!pip install emoji==0.6.0 -q


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m30.7/51.0 kB[0m [31m777.3 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m719.9 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [21]:
try:
  from datasets import load_dataset
  from datasets import Dataset, DatasetDict
except Exception:
  !pip install datasets -q
  from datasets import load_dataset
  from datasets import Dataset, DatasetDict

import re
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

from tqdm import tqdm

### **Utility Functions**

In [3]:
def stars_to_sentiment(star_rate: int) -> str:
    """
    Converts the star rating of a review to a sentiment.

    Parameters:
    star_rate (int): The star rating of the review. It should be an integer.

    Returns:
    str: The sentiment corresponding to the star rating:
         - "NEG" for negative sentiment (star rating less than 2)
         - "NEU" for neutral sentiment (star rating equal to 2)
         - "POS" for positive sentiment (star rating greater than 2)
    """
    match star_rate:
        case _ if star_rate < 2:
            return "NEG"
        case _ if star_rate == 2:
            return "NEU"
        case _ if star_rate > 2:
            return "POS"

### **Load Dataset**

In [4]:
dataset = load_dataset('yelp_review_full')

train_dataset = dataset['train']
test_dataset  = dataset['test']
del dataset

train_text  = [train_dataset[i]['text'] for i in range(len(train_dataset))]
train_label = [train_dataset[i]['label'] for i in range(len(train_dataset))]
train_label = [stars_to_sentiment(label) for label in train_label]

test_text  = [test_dataset[i]['text'] for i in range(len(test_dataset))]
test_label = [test_dataset[i]['label'] for i in range(len(test_dataset))]
test_label = [stars_to_sentiment(label) for label in test_label]

del train_dataset
del test_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Split training data into two sets, named training and validations set in order to preventing overfiting

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(
    train_text, train_label, test_size = 0.3, shuffle = True, stratify = train_label, random_state = 42
)

len(y_train), len(y_valid), len(test_label)

(455000, 195000, 50000)

In [6]:
train_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(X_train, y_train)]
validation_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(X_valid, y_valid)]
test_data = [{'text': txt, 'label': lbl} for txt, lbl in zip(test_text, test_label)]

# Convert to huggingface dataset api
train_data = Dataset.from_list(train_data)
validation_data = Dataset.from_list(validation_data)
test_data = Dataset.from_list(test_data)

data = DatasetDict()
data['train'] = train_data
data['validation'] = validation_data
data['test'] = test_data

In [7]:
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")

# Tokenize the dataset
def preprocess_function(examples: dict):
    return tokenizer(examples['text'], truncation = True, padding = True, max_length = 128, return_tensors = "tf")

tokenized_data = data.map(preprocess_function, batched = True)



tokenizer_config.json:   0%|          | 0.00/338 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Map:   0%|          | 0/455000 [00:00<?, ? examples/s]

Map:   0%|          | 0/195000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Calculate the baseline accuracy of the model

In [8]:
model = TFAutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")
id2label = model.config.id2label
id2label

config.json:   0%|          | 0.00/949 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/540M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


{0: 'NEG', 1: 'NEU', 2: 'POS'}

In [9]:
logits = model.predict(
    tf.convert_to_tensor(tokenized_data['train']['input_ids']),
    batch_size = 64
).logits

probas = tf.nn.softmax(logits, axis = -1)

predicted_classes = tf.argmax(probas, axis = 1).numpy()

predicted_labels = [id2label[cls] for cls in predicted_classes]



In [23]:
print(classification_report(y_train, predicted_labels, target_names = ['NEG', 'NEU', 'POS']))

              precision    recall  f1-score   support

         NEG       0.79      0.63      0.70    182000
         NEU       0.27      0.38      0.31     91000
         POS       0.73      0.73      0.73    182000

    accuracy                           0.62    455000
   macro avg       0.60      0.58      0.58    455000
weighted avg       0.66      0.62      0.64    455000

