In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw product table from the CSV sitting next to the notebook.
df = pd.read_csv(filepath_or_buffer='My_Product_Descriptions.csv')

In [None]:
# Preview only the first rows instead of rendering the whole table.
df.head()

In [5]:
# Rename for convenience
df = df.rename(columns={'Category': 'label'})

# Rows without a category cannot be used for supervised training, and a NaN
# label would otherwise end up as a key in label2id and in the stratify column.
# .copy() so the column assignments below write to an independent frame.
df = df.dropna(subset=['label']).copy()

# Build the model input as "<name> — <description>".
# A missing name or description would make the concatenated text NaN (a float
# rather than a string), so missing parts are replaced by an empty string first.
product_name = df['Product Name'].fillna('').astype(str).str.strip()
description = df['Description'].fillna('').astype(str).str.strip()
df['text'] = product_name + " — " + description

# Encode labels: ids are assigned in order of first appearance in the frame.
label2id = {label: idx for idx, label in enumerate(df['label'].unique())}
id2label = {idx: label for label, idx in label2id.items()}
df['label_id'] = df['label'].map(label2id)

# Train-validation split (80/20), stratified so both parts keep the class mix;
# the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_id'], random_state=42)

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast
import torch # Import torch

# Fast tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

class ProductDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with integer label ids.

    All texts are tokenized once in ``__init__`` (truncated and padded to
    ``max_length``); ``__getitem__`` only converts the stored encodings of a
    single row into tensors.

    Args:
        texts: List of strings to tokenize.
        labels: Integer label ids aligned position-by-position with ``texts``.
            May be a pandas Series (accessed positionally via ``.iloc``, so
            its index does not matter), a NumPy array, or a plain list/tuple.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding='max_length', max_length=max_length)`` and returning a
            mapping from field name to per-example sequences.
        max_length: Length every example is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a length mismatch would otherwise only surface as an
        # index error (or silently misaligned labels) during iteration.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding='max_length',
                                   max_length=max_length)
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for row ``idx``, including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Positional lookup: pandas objects need .iloc because their index may
        # be shuffled (e.g. after a train/test split); plain sequences and
        # arrays are indexed directly.
        labels = self.labels
        label = labels.iloc[idx] if hasattr(labels, 'iloc') else labels[idx]
        # Class ids are emitted as int64 regardless of the container's dtype.
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Build one dataset per split: texts go in as plain Python lists,
# label ids as the corresponding DataFrame column.
train_dataset = ProductDataset(
    texts=train_df['text'].tolist(),
    labels=train_df['label_id'],
    tokenizer=tokenizer,
)
val_dataset = ProductDataset(
    texts=val_df['text'].tolist(),
    labels=val_df['label_id'],
    tokenizer=tokenizer,
)

# Batches of 16; only the training loader reshuffles its data.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:
import torch
from transformers import BertForSequenceClassification
from torch.optim import AdamW # Import AdamW from torch.optim

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BERT with a classification head sized to the number of categories; the
# label maps are handed to from_pretrained alongside num_labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

optimizer = AdamW(params=model.parameters(), lr=2e-5)

In [None]:
from tqdm.auto import tqdm

epochs = 10

for epoch in range(epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    running_train_loss = 0
    train_bar = tqdm(train_loader, desc=f"Train Epoch {epoch+1}")
    for batch in train_bar:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        running_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Mean of the per-batch losses.
    avg_train_loss = running_train_loss / len(train_loader)

    # ---- Validation pass: no gradients, accumulate loss and hit count ----
    model.eval()
    running_val_loss = 0
    n_correct = 0
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Val Epoch {epoch+1}")
        for batch in val_bar:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            running_val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit.
            preds = outputs.logits.argmax(dim=-1)
            hits = preds == batch['labels']
            n_correct += hits.sum().item()
    avg_val_loss = running_val_loss / len(val_loader)
    # Accuracy is per example, so divide by the dataset size, not the batch count.
    val_acc = n_correct / len(val_dataset)

    print(f"\nEpoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")


In [10]:
# Persist the fine-tuned weights and the tokenizer files side by side in the
# same directory.
model.save_pretrained(save_directory='bert-product-classifier')
tokenizer.save_pretrained(save_directory='bert-product-classifier')

Device set to use cuda:0


[{'label': 'Coffee', 'score': 0.8371042013168335}]


In [14]:
# Load for inference
from transformers import pipeline

# Model and tokenizer are both read back from the directory written by the
# save step. The deprecated `return_all_scores=False` argument is omitted:
# per the transformers text-classification pipeline docs, the default already
# returns only the top label, in the same `[{'label': ..., 'score': ...}]`
# shape, without triggering the deprecation warning.
classifier = pipeline(
    "text-classification",
    model='bert-product-classifier',
    tokenizer='bert-product-classifier'
)

# Example
print(classifier("Fruity Coffee - A fruity medium roast coffee with notes of caramel"))


Device set to use cuda:0


[{'label': 'Coffee', 'score': 0.8371042013168335}]
