# BERT testing

This notebook tests our BERT model on two datasets: a Kaggle dataset and a manually scraped one.

## Install packages

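Install the `torch` build that matches your CUDA version, following the official PyTorch website and the instructions in `README.md`. The command below uses the CUDA 12.9 (`cu129`) wheel index; change the index URL if your CUDA version differs.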

In [None]:
%pip install transformers datasets pandas scikit-learn googletrans
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129

## Import packages

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from googletrans import Translator
import re

## Load trained model

This loads the trained model and its tokenizer, as saved by the training script.

In [None]:
# Directory from which both the model and the tokenizer are loaded.
save_dir = "./data/saved_model"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# local_files_only=True: load from files already on disk under save_dir.
model = BertForSequenceClassification.from_pretrained(
    save_dir,
    local_files_only=True,
).to(device)
tokenizer = BertTokenizer.from_pretrained(
    save_dir,
    local_files_only=True,
)

## Load testing data

For testing and evaluating our model, we used two datasets. One is the Kaggle dataset mentioned in `README.md`. The other was scraped and labelled manually from local (Singapore) Google Maps reviews.

### Kaggle data

This data is from Kaggle (see `README.md`).

In [None]:
# Load the labelled Kaggle reviews and keep only the columns used for evaluation.
df = pd.read_csv("./data/reviews-labeled.csv")
df = df[['text', 'label']]
# Drop rows missing either field. A missing label is not equal to "clean", so it
# would otherwise be mapped to 1 below and silently scored as a flagged review.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
# Binary target: 0 for "clean", 1 for every other label value.
df['label'] = (df['label'] != "clean").astype("int64")

# Wrap the frame as a `datasets.Dataset` for tokenization.
test_dataset = Dataset.from_pandas(df)

def tokenize_fn(batch):
    """Tokenize the ``"text"`` field of a batch with the notebook's ``tokenizer``.

    Every sequence is truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize the whole dataset, one batch of rows per tokenize_fn call.
tokenized_dataset = test_dataset.map(tokenize_fn, batched=True)

# Return only the fields read by the evaluation loop, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
tokenized_dataset.set_format(type="torch", columns=tensor_columns)

# Batches of 32; no shuffle argument is passed.
test_loader = DataLoader(dataset=tokenized_dataset, batch_size=32)

### Local (Singapore) data

This data was manually scraped and labelled directly from Google Maps.

In [None]:
# Load the manually labelled local reviews, keep the two columns used for
# evaluation, and drop rows that have no review text.
# NOTE(review): unlike the Kaggle cell, labels are not remapped here;
# presumably this CSV already stores them as 0/1 - confirm.
df = (
    pd.read_csv("./data/test-data-labeled.csv")[['text', 'label']]
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

# Buffers shared with translate_bulk():
#   texts              - the review strings as loaded
#   translated_texts   - starts as a copy of texts; translated entries are overwritten
#   indices            - positions of the reviews detected as non-English
#   texts_to_translate - the reviews at those positions, in the same order
texts = df['text'].to_list()
translated_texts = list(texts)
indices, texts_to_translate = [], []
async def translate_bulk():
    """Translate every review that is not detected as English.

    Works on the notebook-level lists: reads ``texts``, appends to ``indices``
    and ``texts_to_translate``, and overwrites the matching entries of
    ``translated_texts`` with the translated text. Reviews detected as ``'en'``
    are left untouched. No target language is passed to ``translate``, so the
    library default applies.
    """
    async with Translator() as translator:
        # Pass 1: detect the language of each review, one request at a time.
        for position, review in enumerate(texts):
            detection = await translator.detect(review)
            if detection.lang == 'en':
                continue
            indices.append(position)
            texts_to_translate.append(review)

        # Pass 2: translate all collected reviews in a single call.
        translations = await translator.translate(texts_to_translate)

        # Write each translation back to the position it came from.
        for position, translated in zip(indices, translations):
            translated_texts[position] = translated.text
# Top-level await works inside a notebook cell; a plain Python script would
# need asyncio.run(translate_bulk()) instead.
await translate_bulk()
# Replace the text column with the translated list (entries detected as
# English are the original strings).
df['text'] = translated_texts

# Compiled once here rather than on every call to remove_emojis().
# Only emoji / pictographic symbol ranges are listed. A single span from
# U+24C2 to U+1F251 would also match CJK ideographs, kana, Hangul and most other
# BMP scripts, so that part is spelled out as narrow ranges instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes extended
    "\U0001F800-\U0001F8FF"  # supplemental arrows-C
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # mahjong/dominoes/cards, enclosed alphanumerics (flags), enclosed ideographs
    "\U000024C2"             # circled M
    "\U000025A0-\U000027BF"  # geometric shapes, misc symbols, dingbats
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B00-\U00002BFF"  # misc symbols and arrows (stars, circles)
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji code points inside CJK blocks
    "\U0000FE00-\U0000FE0F"  # variation selectors attached to emoji
    "]+",
    flags=re.UNICODE
)


def remove_emojis(text: str) -> str:
    """Return *text* with emoji and pictographic symbols removed.

    Letters of every script (Latin, CJK, kana, Hangul, ...) are preserved;
    only the symbol ranges listed in ``_EMOJI_PATTERN`` are deleted.
    Surrounding whitespace is left as it is.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string stays empty.
    """
    return _EMOJI_PATTERN.sub('', text)
# Strip emojis from every review.
cleaned_text = df["text"].apply(remove_emojis)
df["text"] = cleaned_text

# Quick visual check of the first rows.
preview = df.head()
print(preview)

# Wrap the cleaned frame as a `datasets.Dataset` for tokenization.
test_dataset = Dataset.from_pandas(df)

def tokenize_fn(batch):
    """Run the notebook's ``tokenizer`` over ``batch["text"]``.

    Sequences are truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping holding a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer's output for the batch.
    """
    review_texts = batch["text"]
    encoded = tokenizer(
        review_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Apply tokenize_fn to the dataset in batches.
tokenized_dataset = test_dataset.map(tokenize_fn, batched=True)

# Keep only the fields read by the evaluation loop and return them as torch tensors.
format_spec = {
    "type": "torch",
    "columns": ["input_ids", "attention_mask", "label"],
}
tokenized_dataset.set_format(**format_spec)

# Batches of 32; no shuffle argument is passed.
test_loader = DataLoader(dataset=tokenized_dataset, batch_size=32)

## Evaluation

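This runs the evaluation on the test dataset and calculates the accuracy, precision, recall and F1 scores (0 is clean, 1 is flagged). Both data cells above assign `test_loader`, so the evaluation uses whichever data cell was run most recently.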

In [None]:
# Put the model in evaluation mode before running inference.
model.eval()
all_preds = []
all_labels = []

# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        # Only the model inputs have to be on the model's device.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit per row.
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        # Labels are only compared on the host after the loop, so they are not
        # moved to `device` and back.
        all_labels.extend(batch["label"].cpu().numpy())

# Overall accuracy plus support-weighted precision / recall / F1.
accuracy = accuracy_score(all_labels, all_preds)
weighted_scores = precision_recall_fscore_support(
    all_labels,
    all_preds,
    average="weighted",
)
# The fourth element of the result is not used.
precision, recall, f1 = weighted_scores[:3]

print(
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1 Score:  {f1:.4f}",
    sep="\n",
)

# Per-class scores, in the fixed order label 0 (clean) then label 1 (flagged).
precision_cls, recall_cls, f1_cls, support_cls = precision_recall_fscore_support(
    all_labels,
    all_preds,
    labels=[0, 1],
    average=None,
)

print("Class-wise metrics:")
per_class_rows = zip(precision_cls, recall_cls, f1_cls, support_cls)
for label_id, (p, r, f, s) in enumerate(per_class_rows):
    # Index 0 is the "clean" class; the other index is "flagged".
    line = (
        f"clean -> Precision: {p:.4f}, Recall: {r:.4f}, F1: {f:.4f}, Support: {s}"
        if label_id == 0
        else f"flagged -> Precision: {p:.4f}, Recall: {r:.4f}, F1: {f:.4f}, Support: {s}"
    )
    print(line)