In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw product catalogue.
df = pd.read_csv('My_Product_Descriptions.csv')

# Rename for convenience and drop rows without a category: an unlabeled row
# cannot be used for supervised training, and it would leave NaN in
# `label_id`, which breaks the stratified split below. `.copy()` gives an
# independent frame so the column assignments that follow are unambiguous.
df = (
    df.rename(columns={'Category': 'label'})
      .dropna(subset=['label'])
      .copy()
)

# Build the model input as "<name> — <description>". Missing fields are
# replaced with empty strings first; otherwise NaN would propagate through the
# string concatenation and the tokenizer would later receive a float instead
# of text.
product_name = df['Product Name'].fillna('').astype(str).str.strip()
description = df['Description'].fillna('').astype(str).str.strip()
df['text'] = product_name + " — " + description

# Encode labels. Sorting the distinct labels makes the label -> id mapping
# depend only on the set of categories, not on the row order of the CSV.
label2id = {label: idx for idx, label in enumerate(sorted(df['label'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
df['label_id'] = df['label'].map(label2id)

# Train-validation split: 80/20, stratified on the class id so both parts keep
# the same class proportions; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df['label_id'], random_state=42
)

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizerFast
import torch

# Load the fast DistilBERT tokenizer published with the uncased base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

class ProductDataset(Dataset):
    """Map-style dataset of tokenized texts paired with integer class ids.

    Every text is tokenized once in the constructor, truncated and padded to
    ``max_length``. Indexing returns a dict with one tensor per tokenizer
    output field plus a ``'labels'`` tensor, i.e. the keyword arguments the
    training loop forwards to the model.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Tokenize ``texts`` and store the matching ``labels``.

        Args:
            texts: Iterable of strings (list, pandas Series, ...).
            labels: Iterable of integer class ids, one per text. Any iterable
                works (list, NumPy array, pandas Series); values are read in
                iteration order, so a Series' index labels are ignored.
            tokenizer: Callable invoked as
                ``tokenizer(texts, truncation=True, padding='max_length',
                max_length=max_length)`` that returns a mapping from field
                name to a per-sample sequence.
            max_length: Length every sample is truncated/padded to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length, or a
                label cannot be converted to ``int`` (for example NaN).
        """
        texts = list(texts)
        # Materialise the labels as plain ints so lookups are positional for
        # every container type and bad values fail here, not mid-training.
        labels = [int(label) for label in labels]
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_length
        )
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of sample ``idx`` as a dict, including ``'labels'``."""
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        # Class ids are emitted as int64 explicitly, independent of the input type.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

def _build_dataset(frame):
    """Wrap the 'text' and 'label_id' columns of one split in a ProductDataset."""
    return ProductDataset(frame['text'].tolist(), frame['label_id'], tokenizer)


# Tokenize both splits: training first, then validation.
train_dataset, val_dataset = (
    _build_dataset(split) for split in (train_df, val_df)
)

# Mini-batches of 16; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
import torch
from transformers import DistilBertForSequenceClassification
from torch.optim import AdamW

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained DistilBERT with a sequence-classification head sized to the
# number of known labels; both label <-> id mappings are handed to the model.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# AdamW over all model parameters with a fine-tuning learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# %%
from tqdm.auto import tqdm

epochs = 10

for epoch in range(epochs):
    # 1-based epoch number shown in the progress bars and the summary line.
    epoch_no = epoch + 1

    # ---- Training pass: one optimizer update per mini-batch ----
    model.train()
    train_loss = 0
    train_bar = tqdm(train_loader, desc=f"Train Epoch {epoch_no}")
    for batch in train_bar:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()
    avg_train_loss = train_loss / len(train_loader)

    # ---- Validation pass: no gradients, accumulate loss and hit count ----
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        val_bar = tqdm(val_loader, desc=f"Val Epoch {epoch_no}")
        for batch in val_bar:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit per sample.
            predicted = outputs.logits.argmax(dim=-1)
            hits = predicted == batch['labels']
            correct += hits.sum().item()
    # Loss is averaged over batches, accuracy over individual samples.
    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / len(val_dataset)

    summary_parts = [
        f"\nEpoch {epoch_no}",
        f"Train Loss: {avg_train_loss:.4f}",
        f"Val Loss: {avg_val_loss:.4f}",
        f"Val Acc: {val_acc:.4f}",
    ]
    print(" | ".join(summary_parts))

# %%
# Persist the fine-tuned weights and the tokenizer files into one directory
# so both can later be reloaded from the same path.
output_dir = 'distilbert-product-classifier'
model.save_pretrained(save_directory=output_dir)
tokenizer.save_pretrained(save_directory=output_dir)

In [None]:
from transformers import pipeline

# Reload the saved model and tokenizer from disk as a text-classification
# pipeline. The deprecated `return_all_scores=False` argument is deliberately
# not passed: returning only the top-scoring label is already the pipeline's
# default, so the output is unchanged and the deprecation warning goes away.
classifier = pipeline(
    "text-classification",
    model='distilbert-product-classifier',
    tokenizer='distilbert-product-classifier',
)

# Example: classify a single product description.
print(
    classifier(
        "Fruity Coffee - A fruity medium roast coffee with notes of caramel"
    )
)