<a href="https://colab.research.google.com/github/MMaggieZhou/sentiment_analysis/blob/main/sentiment_analysis_bert_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Leveraging Pretrained BERT via the Transformers Library from Hugging Face

In [None]:
import re
import os
import torch
import unicodedata
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

## Data Cleaning

The tokenizer provided by transformers uses sub-word tokenization, so very little data cleaning is needed: converting to lower case, normalizing Unicode characters, and removing special characters.

In [64]:
def preprocess_text(text):
    """Clean a raw tweet before it is handed to the tokenizer.

    The steps run in this order: Unicode NFKC normalization, lowercasing,
    removal of every character that is not an ASCII letter, digit or
    whitespace, and finally whitespace collapsing.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string: lowercase ASCII letters and digits separated by
        single spaces, with no leading or trailing whitespace.
    """
    # Normalize before lowercasing: NFKC can expand a symbol into uppercase
    # ASCII (e.g. the trademark sign becomes "TM"), which would otherwise
    # survive as uppercase in the output.
    text = unicodedata.normalize("NFKC", text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Collapse whitespace last: deleting a special character that stood
    # between two spaces ("a & b") would otherwise leave a double space,
    # and deleting one at the edge would leave leading/trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Shared between calls: fitted on the training split, then reused as-is for
# any other split so label ids stay consistent.
label_encoder = LabelEncoder()


def load_and_process(file, training):
    """Read a header-less tweet CSV and add cleaned-text and label-id columns.

    Args:
        file: Path (or buffer) accepted by ``pd.read_csv``; the columns are
            read as id, entity, label, text.
        training: When true, the module-level ``label_encoder`` is fitted on
            this file's labels; otherwise the already-fitted encoder is
            only applied.

    Returns:
        DataFrame indexed by ``id`` with duplicates and rows containing
        missing values removed, plus the columns ``text_processed`` and
        ``label_num``.
    """
    column_names = ['id', 'entity', 'label', 'text']
    frame = pd.read_csv(file, names=column_names)
    frame = frame.set_index('id').drop_duplicates().dropna()

    frame['text_processed'] = frame['text'].apply(preprocess_text)

    # Fit only on training data; other splits must reuse the same mapping.
    encode = label_encoder.fit_transform if training else label_encoder.transform
    frame['label_num'] = encode(frame['label'])

    return frame

# Training split: training=True fits the shared label encoder on these labels.
train_df = load_and_process("/content/twitter_training.csv", True)
# Validation split: training=False reuses the encoder fitted above, so it must
# be loaded after the training split.
test_df = load_and_process("/content/twitter_validation.csv", False)

## Data Preprocessing

Use the tokenizer provided by transformers to convert the text into PyTorch tensors, then wrap those tensors in a `Dataset`, which is the format the trainer expects.

In [67]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenization function
def tokenize_texts(texts, tokenizer, max_length=128):
    """Tokenize a collection of strings into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list before
            being handed to the tokenizer.
        tokenizer: Callable tokenizer that accepts the list of texts plus
            the padding/truncation keyword arguments used below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    batch = list(texts)
    options = {
        "padding": "max_length",  # pad every sequence up to max_length
        "truncation": True,  # cut sequences longer than max_length
        "max_length": max_length,
        "return_tensors": "pt",  # return PyTorch torch.Tensor objects
    }
    return tokenizer(batch, **options)
# Tokenize the cleaned text of both splits with the default max_length.
train_encodings = tokenize_texts(train_df['text_processed'], tokenizer)
val_encodings = tokenize_texts(test_df['text_processed'], tokenizer)

class TweetDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with numeric labels.

    Each item is a dict holding, for one example, every field of the
    encodings plus the example's label under the key ``"labels"``.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and convert the labels to a tensor.

        Args:
            encodings: Mapping from field name to an indexable batch, i.e.
                the tokenizer output (presumably including ``input_ids``
                and ``attention_mask`` -- confirm against the tokenizer).
            labels: Numeric labels, one per example. May be a pandas
                Series, a NumPy array or a plain sequence of numbers.

        Raises:
            ValueError: If a field of ``encodings`` does not hold exactly
                one entry per label.
        """
        self.encodings = encodings

        # pandas objects are converted through to_numpy(); anything else
        # (list, tuple, ndarray) is passed to torch.tensor directly.
        if hasattr(labels, "to_numpy"):
            labels = labels.to_numpy()
        self.labels = torch.tensor(labels)

        # Fail here rather than with an IndexError in the middle of training.
        expected = len(self.labels)
        for key, val in self.encodings.items():
            if len(val) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} entries "
                    f"but {expected} labels were given"
                )

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding fields and the label of example ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Pair each split's encodings with its numeric labels.
# Note: val_dataset is built from test_df (the twitter_validation.csv split).
train_dataset = TweetDataset(train_encodings, train_df['label_num'])
val_dataset = TweetDataset(val_encodings, test_df['label_num'])

##  Pretrained Model Loading & Fine-tune

Load the pre-trained BERT model into [BertForSequenceClassification](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py#L1631), which adds a classification head on top of the BERT model.

Use the provided Trainer class for training and evaluation; it uses AdamW as the optimizer by default and utilizes a GPU/TPU when available.

In [79]:
# Size the classification head from the classes the label encoder was fitted
# on (the training labels) instead of hard-coding the count, so the model
# always matches the data that was actually loaded.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        pred: Object exposing ``predictions`` (per-class logits) and
            ``label_ids`` (true labels).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    true_labels = pred.label_ids
    # Predicted class = index of the largest logit along the last axis.
    predicted = torch.tensor(pred.predictions).argmax(dim=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        true_labels, predicted, average="weighted"
    )
    return {
        "accuracy": accuracy_score(true_labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# The default optimizer is AdamW
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=100,
    # Log locally only, which turned out to make training faster. This
    # replaces the WANDB_DISABLED environment variable, which transformers
    # reports as deprecated. Note that "none" switches off every external
    # reporting integration, not just Weights & Biases.
    report_to="none",
)

# Wire the model, training arguments, both datasets and the metric function
# into a Trainer; compute_metrics is called on each evaluation pass.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Show which device the model parameters live on (e.g. cuda:0 vs cpu).
print(next(trainer.model.parameters()).device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda:0


In [80]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5238,0.262728,0.912,0.913682,0.912,0.911685
2,0.226,0.105904,0.97,0.970104,0.97,0.970021
3,0.112,0.135919,0.97,0.97012,0.97,0.970013


TrainOutput(global_step=13305, training_loss=0.4060885070544891, metrics={'train_runtime': 1174.9732, 'train_samples_per_second': 181.173, 'train_steps_per_second': 11.324, 'total_flos': 1.4002627143038976e+16, 'train_loss': 0.4060885070544891, 'epoch': 3.0})