In [1]:
import json
import pandas as pd

def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Raw parsed records: training split first, then the test split.
train_data = _read_json("train.json")
test_data = _read_json("test.json")

# Wrap the parsed records in DataFrames.
train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Preview the first rows of each frame, training frame first.
for _frame in (train_df, test_df):
    print(_frame.head())


                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...


In [2]:
from transformers import BertTokenizer
import torch

# use pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def encode_batch(texts, labels=None, max_length=128):
    """Tokenize ``texts`` with the module-level ``tokenizer`` into PyTorch tensors.

    Args:
        texts: A single string, or a collection of strings. ``str``, ``list``
            and ``tuple`` inputs are handed to the tokenizer unchanged; any
            other iterable (for example a pandas Series or a generator) is
            first materialised into a list.
        labels: Optional labels. When given, they are converted with
            ``torch.tensor(labels)`` and stored under the ``"labels"`` key.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            128, the value that used to be hard-coded here.

    Returns:
        The tokenizer output (requested with ``return_tensors="pt"``), with an
        extra ``"labels"`` entry when ``labels`` is not ``None``.
    """
    # Leave already-supported input types untouched so existing callers see
    # identical behavior; only unfamiliar iterables are converted.
    if not isinstance(texts, (str, list, tuple)):
        texts = list(texts)

    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    if labels is not None:
        encodings["labels"] = torch.tensor(labels)
    return encodings


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes every review once, at construction.

    Args:
        df: Frame-like object indexed by column name. Must provide a
            ``"reviews"`` column and, when ``is_train`` is true, a
            ``"sentiments"`` column; both are read via ``.tolist()``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            whose result exposes ``.items()`` yielding
            ``(field_name, per_example_sequences)`` pairs.
        is_train: When true, labels are read from ``df["sentiments"]`` and
            returned with every item; when false no labels are stored.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            128, the value that used to be hard-coded here.
    """

    def __init__(self, df, tokenizer, is_train=True, max_length=128):
        self.texts = df["reviews"].tolist()
        self.labels = df["sentiments"].tolist() if is_train else None
        self.max_length = max_length
        # Tokenize the whole corpus up front; __getitem__ only indexes into it.
        self.encodings = tokenizer(
            self.texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        """Return the number of reviews held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        Every field produced by the tokenizer is converted with
        ``torch.tensor``; a ``"labels"`` tensor is added when labels were
        loaded at construction time.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. The split is bound to new
# names so that `train_df` stays the full labelled frame: re-running this cell
# now always produces the same split instead of re-splitting an already
# reduced `train_df`.
train_split_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# One batch size shared by all three loaders.
BATCH_SIZE = 16

# build datasets (validation rows carry labels too, hence is_train=True)
train_dataset = ReviewDataset(train_split_df, tokenizer, is_train=True)
val_dataset = ReviewDataset(val_df, tokenizer, is_train=True)

test_dataset = ReviewDataset(test_df, tokenizer, is_train=False)

# build DataLoader: only the training loader shuffles, so validation and test
# batches keep the row order of their datasets.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [5]:
from transformers import BertForSequenceClassification

# Seed PyTorch's global RNG before the model is built so that the randomly
# initialised classifier head, and the shuffling/dropout that follow when the
# cells are run in order, are repeatable across fresh runs.
# NOTE(review): some GPU kernels may still be nondeterministic — confirm if
# bit-exact reproducibility is required.
torch.manual_seed(42)

# pre-trained BERT for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Tunable settings for this training run.
NUM_EPOCHS = 10
LEARNING_RATE = 2e-5
CHECKPOINT_DIR = "./sentiment_model"


def _train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()
        outputs = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)


def _evaluate(model, loader, device):
    """Score ``model`` on ``loader`` with gradient tracking disabled.

    Returns:
        A tuple ``(mean_loss, labels, preds, probs)`` where ``labels`` are the
        true labels, ``preds`` the argmax class per example and ``probs`` the
        softmax probability of class index 1, all as flat Python lists in
        loader order.
    """
    model.eval()
    running_loss = 0.0
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for batch in loader:
            labels = batch["labels"].to(device)
            outputs = model(
                batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=labels,
            )
            running_loss += outputs.loss.item()

            logits = outputs.logits
            y_prob.extend(torch.softmax(logits, dim=1)[:, 1].cpu().tolist())
            y_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
            y_true.extend(labels.cpu().tolist())
    return running_loss / len(loader), y_true, y_pred, y_prob


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Not referenced below: the training/validation loss is taken from
# `outputs.loss`, which the model returns when `labels` is passed. Kept only
# so the name stays defined for any code that expects it.
loss_fn = CrossEntropyLoss()

best_val_f1 = 0.0  # record best validation F1 score
best_epoch = 0

for epoch in range(NUM_EPOCHS):
    # ======== Training ========
    avg_train_loss = _train_one_epoch(model, train_loader, optimizer, device)

    # ======== Validation ========
    avg_val_loss, all_labels, all_preds, all_probs = _evaluate(model, val_loader, device)

    acc = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="binary")
    recall = recall_score(all_labels, all_preds, average="binary")
    f1 = f1_score(all_labels, all_preds, average="binary")
    roc_auc = roc_auc_score(all_labels, all_probs)

    print(f"Epoch {epoch+1}")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Loss:   {avg_val_loss:.4f}")
    print(f"  Accuracy:   {acc:.4f}")
    print(f"  Precision:  {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
    print(f"  ROC-AUC:    {roc_auc:.4f}")

    # ======== save best model ========
    # Only a strict improvement in validation F1 overwrites the checkpoint, so
    # the directory always holds the weights of the best epoch so far.
    if f1 > best_val_f1:
        best_val_f1 = f1
        best_epoch = epoch + 1
        model.save_pretrained(CHECKPOINT_DIR)
        tokenizer.save_pretrained(CHECKPOINT_DIR)
        print(f"  >>> Best model saved at epoch {best_epoch} with F1 {best_val_f1:.4f}")



Epoch 1
  Train Loss: 0.2085
  Val Loss:   0.1258
  Accuracy:   0.9568
  Precision:  0.9604, Recall: 0.9906, F1: 0.9752
  ROC-AUC:    0.9788
  >>> Best model saved at epoch 1 with F1 0.9752
Epoch 2
  Train Loss: 0.0857
  Val Loss:   0.1418
  Accuracy:   0.9541
  Precision:  0.9667, Recall: 0.9803, F1: 0.9735
  ROC-AUC:    0.9761
Epoch 3
  Train Loss: 0.0359
  Val Loss:   0.1680
  Accuracy:   0.9541
  Precision:  0.9638, Recall: 0.9835, F1: 0.9735
  ROC-AUC:    0.9727
Epoch 4
  Train Loss: 0.0278
  Val Loss:   0.2129
  Accuracy:   0.9419
  Precision:  0.9406, Recall: 0.9953, F1: 0.9672
  ROC-AUC:    0.9709
Epoch 5
  Train Loss: 0.0216
  Val Loss:   0.2152
  Accuracy:   0.9521
  Precision:  0.9616, Recall: 0.9835, F1: 0.9724
  ROC-AUC:    0.9508
Epoch 6
  Train Loss: 0.0170
  Val Loss:   0.2777
  Accuracy:   0.9507
  Precision:  0.9709, Recall: 0.9717, F1: 0.9713
  ROC-AUC:    0.9343
Epoch 7
  Train Loss: 0.0172
  Val Loss:   0.2503
  Accuracy:   0.9527
  Precision:  0.9703, Recall: 0.97

In [7]:
import os

# Predict with the best checkpoint written during training rather than the
# weights left in memory after the final epoch: the checkpoint is only saved
# when validation F1 improves, so the last epoch is not necessarily the best.
# Falls back to the in-memory `model` when no checkpoint directory exists.
CHECKPOINT_DIR = "./sentiment_model"
if os.path.isdir(CHECKPOINT_DIR):
    best_model = BertForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    best_model.to(device)
else:
    best_model = model
best_model.eval()

predictions = []

with torch.no_grad():
    # `test_loader` is unshuffled, so predictions follow the row order of the
    # test set and the positional ids below line up with it.
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = best_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())

# generate DataFrame: ids are 0-based positions in the test set
result_df = pd.DataFrame({
    "id": range(len(predictions)),
    "sentiments": predictions
})

# save CSV
result_df.to_csv("submission_bert.csv", index=False)
print(result_df.head())


   id  sentiments
0   0           0
1   1           1
2   2           1
3   3           1
4   4           1
