In [1]:
import json
import pandas as pd

def _read_json(path):
    """Open `path` as UTF-8 text and return the object parsed by ``json.load``."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Parsed JSON content of each split, kept under the original names.
train_data = _read_json("train.json")
test_data = _read_json("test.json")

# One DataFrame per split, built directly from the parsed JSON.
train_df, test_df = pd.DataFrame(train_data), pd.DataFrame(test_data)

# Preview the first rows of each frame (training split first).
for preview_frame in (train_df, test_df):
    print(preview_frame.head())


                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1
                                             reviews
0  I bought 2 sleepers.  sleeper had holes in the...
1  I dare say these are just about the sexiest th...
2  everything about the transaction (price, deliv...
3  Not bad for just a shirt.  Very durable, and m...
4  These are truly wrinkle free and longer than t...


In [2]:
from transformers import BertTokenizer
import torch

# Load the pre-trained tokenizer registered under the name "bert-base-uncased".
# This module-level `tokenizer` is the object that `encode_batch` calls, that is
# handed to `ReviewDataset`, and that is saved next to the model after training.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_batch(texts, labels=None):
    """Tokenize `texts` with the module-level `tokenizer`.

    Args:
        texts: Passed unchanged as the first argument to `tokenizer`.
        labels: Optional labels. When provided they are converted with
            ``torch.tensor`` and stored in the result under ``"labels"``.

    Returns:
        The tokenizer's output (requested with truncation, padding,
        ``max_length=128`` and ``return_tensors="pt"``), with a ``"labels"``
        entry added only when `labels` is not None.
    """
    tokenizer_options = dict(
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )
    batch = tokenizer(texts, **tokenizer_options)

    # Unlabelled input: return the encodings as they are.
    if labels is None:
        return batch

    batch["labels"] = torch.tensor(labels)
    return batch


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    """Dataset that tokenizes every review once, at construction time.

    Args:
        df: DataFrame with a ``"reviews"`` column and, when `is_train` is
            true, a ``"sentiments"`` column.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``.
            Its result must provide ``.items()`` yielding
            ``(field_name, per_example_sequence)`` pairs.
        is_train: When true, labels are read from ``df["sentiments"]`` and
            returned with every item; when false no labels are stored.
        max_length: Value forwarded to the tokenizer's ``max_length``
            argument. Defaults to 128, the value that was previously
            hard-coded, so existing callers are unaffected.
    """

    def __init__(self, df, tokenizer, is_train=True, max_length=128):
        self.texts = df["reviews"].tolist()
        self.labels = df["sentiments"].tolist() if is_train else None
        self.max_length = max_length
        # Tokenize the whole corpus up front; __getitem__ only indexes into it.
        self.encodings = tokenizer(
            self.texts,
            truncation=True,
            padding=True,
            max_length=max_length
        )

    def __len__(self):
        """Return the number of reviews held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`.

        The dict has one entry per field of the tokenizer output, plus
        ``"labels"`` when the dataset was built with ``is_train=True``.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item


In [4]:

# Build both datasets with the shared tokenizer, training split first.
# Only the training split is constructed with labels (is_train=True).
train_dataset, test_dataset = (
    ReviewDataset(frame, tokenizer, is_train=has_labels)
    for frame, has_labels in ((train_df, True), (test_df, False))
)

# Training batches: 16 examples each, with shuffling enabled.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)


In [5]:
from transformers import BertForSequenceClassification

# Seed PyTorch's global RNG before the model is built. The classification head
# (classifier.weight / classifier.bias) is not part of the checkpoint and is
# newly initialised, so without a fixed seed every fresh run starts from
# different weights. Seeding here also fixes the random state that later cells
# draw from, so a Restart & Run All is repeatable.
torch.manual_seed(42)

# pre-trained BERT for sequence classification, configured with two labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss

# Pick CUDA when torch reports it as available, otherwise the CPU, and move
# the model there.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE: loss_fn is created here but not referenced by the loop below; the loss
# used for backprop is the one the model returns when `labels` is passed.
loss_fn = CrossEntropyLoss()

# Fine-tuning: 5 passes over train_loader, reporting the mean batch loss.
for epoch in range(5):  # 5 epochs
    model.train()
    batch_losses = []
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "labels")
        )

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# Write the fine-tuned model and the tokenizer to the same directory.
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")


Epoch 1, Loss: 0.1895096193783822
Epoch 2, Loss: 0.07892597656416431
Epoch 3, Loss: 0.042552456824706375
Epoch 4, Loss: 0.02025858199378007
Epoch 5, Loss: 0.01521793132318238


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

In [7]:
# Put the model in evaluation mode and collect one predicted class per review.
model.eval()
predictions = []
test_loader = DataLoader(test_dataset, batch_size=16)

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[field].to(device) for field in ("input_ids", "attention_mask")
        )

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())

# Submission table: ids run from 0 in the order the predictions were produced.
result_df = pd.DataFrame(
    {"id": range(len(predictions)), "sentiments": predictions}
)

# Write the CSV without the index column, then preview the first rows.
result_df.to_csv("submission_bert.csv", index=False)
print(result_df.head())


   id  sentiments
0   0           0
1   1           1
2   2           1
3   3           1
4   4           1
