# BERT solution

## Install packages

In [11]:
%pip install transformers datasets scikit-learn pandas
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129

Note: you may need to restart the kernel to use updated packages.
Looking in indexes: https://download.pytorch.org/whl/cu129
Note: you may need to restart the kernel to use updated packages.


## Import packages

In [15]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset
import pandas as pd

## Load data

In [21]:
data = pd.read_csv("./data/reviews_binary.csv")
train_df, val_df = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=42
)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

## Load tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

Map:   0%|          | 0/880 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

## Load model

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training model

In [24]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} - Training loss: {avg_train_loss:.4f}")
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    print("Validation Results:")
    print(classification_report(true_labels, preds, target_names=["clean", "flagged"]))

Epoch 1/3 - Training loss: 0.0769
Validation Results:
              precision    recall  f1-score   support

       clean       0.99      1.00      1.00       218
     flagged       0.00      0.00      0.00         2

    accuracy                           0.99       220
   macro avg       0.50      0.50      0.50       220
weighted avg       0.98      0.99      0.99       220



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 2/3 - Training loss: 0.0527
Validation Results:
              precision    recall  f1-score   support

       clean       0.99      1.00      1.00       218
     flagged       0.00      0.00      0.00         2

    accuracy                           0.99       220
   macro avg       0.50      0.50      0.50       220
weighted avg       0.98      0.99      0.99       220



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 3/3 - Training loss: 0.0213
Validation Results:
              precision    recall  f1-score   support

       clean       0.99      1.00      1.00       218
     flagged       0.00      0.00      0.00         2

    accuracy                           0.99       220
   macro avg       0.50      0.50      0.50       220
weighted avg       0.98      0.99      0.99       220



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Sample input

In [26]:
reviews = [
    "Great service, will come again!",
    "Visit our website for free coupons!",
    "Never visited but heard it's bad"
]
inputs = tokenizer(reviews, padding=True, truncation=True, max_length=128, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

## Predictions

In [None]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)
for review, pred in zip(reviews, predictions):
    label = "clean" if pred.item() == 0 else "flagged"
    print(f"Review: {review}\nPredicted label: {label}\n")

tensor([0, 1, 0], device='cuda:0')
Review: Great service, will come again!
Predicted label: clean

Review: Visit our website for free coupons!
Predicted label: flagged

Review: Never visited but heard it's bad
Predicted label: clean

