In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Labelled training rows; later cells read the title, description, caption
# and classes columns from this frame.
train_data = pd.read_csv(
    '/kaggle/input/retail-dataset/train_dataset_with_captions.csv'
)

# Unlabelled test rows; later cells read the title, description, caption
# and ID columns from this frame.
test_data = pd.read_csv(
    '/kaggle/input/retail-dataset/test_features_with_captions.csv'
)

In [4]:
class CustomDataset(Dataset):
    """Labelled text dataset that tokenizes one record per item.

    Each item joins title, caption and description (in that order, separated
    by single spaces), asks the tokenizer to truncate/pad the result to
    ``max_len`` tokens, and pairs it with the record's class label.

    Args:
        titles: Indexable sequence of titles; its length is the dataset size.
        descriptions: Indexable sequence of descriptions, indexed like ``titles``.
        captions: Indexable sequence of captions, indexed like ``titles``.
        labels: Indexable sequence of class ids, indexed like ``titles``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Token length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, titles, descriptions, captions, labels, tokenizer, max_len):
        self.titles = titles
        self.descriptions = descriptions
        self.captions = captions
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records (the length of ``titles``)."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened
            to 1-D) and ``'labels'`` (a ``torch.long`` scalar tensor).
        """
        # NOTE(review): str() turns a missing value (float NaN) into the literal
        # text "nan"; confirm whether the source columns can contain NaN.
        title = str(self.titles[idx])
        description = str(self.descriptions[idx])
        caption = str(self.captions[idx])
        text = title + " " + caption + " " + description
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit truncation: without it the tokenizer only warns and falls
            # back to an implicit default strategy when max_length is given.
            truncation=True,
            # Replaces the deprecated pad_to_max_length=True flag.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it
            # so the DataLoader can stack items into (batch, length) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every encoded example is requested at this token length
MAX_LEN = 128
train_dataset = CustomDataset(
    train_data['title'].values,
    train_data['description'].values,
    train_data['caption'].values,
    train_data['classes'].values,
    tokenizer,
    MAX_LEN,
)

# Shuffled mini-batches for training
BATCH_SIZE = 32
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
class BERTModel(torch.nn.Module):
    """Wrapper around ``BertForSequenceClassification`` that returns only logits.

    Args:
        num_classes: Passed to the wrapped model as ``num_labels``.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped classifier and return the ``logits`` of its output."""
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Initialize the model
# Seed torch's global RNG before building the model: the classification head
# is newly initialized (it is not part of the pretrained checkpoint), so
# without a seed every fresh run starts from different weights.
# NOTE(review): this improves run-to-run repeatability; some GPU kernels may
# still be non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Size of the classification head's output.
# NOTE(review): presumably the number of distinct values in
# train_data.classes — verify against the data.
NUM_CLASSES = 21
model = BERTModel(NUM_CLASSES)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Replicate the model across GPUs when more than one is visible.
# NOTE: re-running this cell on a multi-GPU machine wraps the already-wrapped
# model a second time.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model = model.to(device)

# AdamW here is the implementation imported from transformers; correct_bias
# is specific to that implementation.
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW, which has no correct_bias flag — consider migrating.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

def train_epoch(model, data_loader, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits accepted by ``CrossEntropyLoss``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of per-batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: If ``data_loader`` has length zero.
    """
    model = model.train()
    criterion = torch.nn.CrossEntropyLoss()
    running_loss = 0.0

    for batch in data_loader:
        token_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        logits = model(input_ids=token_ids, attention_mask=mask)
        batch_loss = criterion(logits, targets)
        running_loss += batch_loss.item()

        # Clear gradients from the previous step before back-propagating.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return running_loss / len(data_loader)

# Number of full passes over the training data
EPOCHS = 6

for epoch in range(EPOCHS):
    # One optimisation pass; the returned value is the mean per-batch loss.
    loss = train_epoch(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {loss}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/6, Loss: 0.9471843160832013
Epoch 2/6, Loss: 0.46772997094086616
Epoch 3/6, Loss: 0.2862978635998057
Epoch 4/6, Loss: 0.17797295653324338
Epoch 5/6, Loss: 0.10787584093805211
Epoch 6/6, Loss: 0.07238885954270069


In [7]:
class TestDataset(Dataset):
    """Unlabelled text dataset that tokenizes one record per item.

    Each item joins title, caption and description (in that order, separated
    by single spaces) and asks the tokenizer to truncate/pad the result to
    ``max_len`` tokens. No label is returned.

    Args:
        titles: Indexable sequence of titles; its length is the dataset size.
        descriptions: Indexable sequence of descriptions, indexed like ``titles``.
        captions: Indexable sequence of captions, indexed like ``titles``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Token length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, titles, descriptions, captions, tokenizer, max_len):
        self.titles = titles
        self.descriptions = descriptions
        self.captions = captions
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records (the length of ``titles``)."""
        return len(self.titles)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'``, both flattened
            to 1-D.
        """
        # NOTE(review): str() turns a missing value (float NaN) into the literal
        # text "nan"; confirm whether the source columns can contain NaN.
        title = str(self.titles[idx])
        description = str(self.descriptions[idx])
        caption = str(self.captions[idx])
        text = title + " " + caption + " " + description
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # Explicit truncation: without it the tokenizer only warns and falls
            # back to an implicit default strategy when max_length is given.
            truncation=True,
            # Replaces the deprecated pad_to_max_length=True flag.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it
            # so the DataLoader can stack items into (batch, length) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# Build the unlabelled test dataset with the same tokenizer and length limit
# that were used for training.
test_dataset = TestDataset(
    test_data['title'].values,
    test_data['description'].values,
    test_data['caption'].values,
    tokenizer,
    MAX_LEN,
)

# No shuffling: batches are produced in the row order of test_data.
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Prediction function
def predict(model, data_loader, device):
    """Return the highest-scoring class index for every example in ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning a 2-D tensor of per-class scores.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A flat list with one predicted class index per example, in the order
        the loader yielded them.
    """
    model = model.eval()
    collected = []
    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            scores = model(input_ids=token_ids, attention_mask=mask)
            best_class = torch.max(scores, dim=1).indices
            collected.extend(best_class.cpu().numpy())
    return collected

# Run inference over the full test loader with the trained model
test_predictions = predict(
    model=model,
    data_loader=test_dataloader,
    device=device,
)

In [8]:
# Pair every test ID with its predicted class and write the submission CSV
# (without the DataFrame index column).
submission = pd.DataFrame(
    data={
        'ID': test_data['ID'],
        'classes': test_predictions,
    }
)
submission.to_csv('submission_bert_with_captions_2.csv', index=False)