In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, AlbertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

2024-07-04 22:57:30.421937: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the labelled product catalogue used for fine-tuning.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index that was written into the CSV — confirm, then consider index_col=0.
df = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first rows to sanity-check the columns
df.head(n=12)

Unnamed: 0,Product Name,Subcategory
0,Mini Caramel Stroopwafels,"Biscuits, Chin Chin & Cookies"
1,Large Caramel Stroopwafels,"Biscuits, Chin Chin & Cookies"
2,Jumbo Caramel Stroopwafels,"Biscuits, Chin Chin & Cookies"
3,Cheddar & Cracked Black Pepper Bites,"Biscuits, Chin Chin & Cookies"
4,Ricotta & Spinach Tortelloni,Pasta & Noodles
5,Sicilian Cannoli Salted Caramel,Chocolates & Sweets
6,Salted Caramel Cannoli,Chocolates & Sweets
7,Hazelnut Chocolate Cannoli,Chocolates & Sweets
8,Aragostine Hazelnut Chocolate,Chocolates & Sweets
9,Aragostine Lemon,Chocolates & Sweets


In [3]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Product Name'].tolist(),
    df['Subcategory'].tolist(),
    test_size=0.2,
    random_state=42,
)


Decided to use ALBERT Base v2 because it is a transformer model, which means it is very efficient for NLP tasks, including multi-class classification.

In [4]:
# Load the pretrained ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Encode each split with identical settings: truncate over-long inputs and
# pad every sequence to the longest one in its own split
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, val_texts)
)

In [5]:
# Map every distinct subcategory string to an integer class id
# (ids follow first-appearance order in the DataFrame)
label_map = {label: i for i, label in enumerate(df['Subcategory'].unique())}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

# Define ALBERT model for sequence classification.
# The id<->label mappings are stored in the model config so that a saved
# checkpoint reports real subcategory names instead of generic LABEL_n ids.
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_map),
    id2label={i: label for label, i in label_map.items()},
    label2id=label_map,
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',     # evaluate on the validation split after every epoch
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=100,               # log the training loss every 100 optimizer steps
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define custom metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores); the arg-max over the last
            axis of ``predictions`` is used as the predicted class.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall`` and ``f1``; the last
        three are support-weighted averages over the classes.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    # Some classes are never predicted, which leaves their precision undefined.
    # zero_division=0 scores those classes as 0 (the same value the default
    # produces) without emitting a warning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [7]:
# Dataset wrapper that serves tokenized examples and their labels
class CustomDataset(torch.utils.data.Dataset):
    """Pair tokenizer encodings with integer class labels, one example per index."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the parallel sequence of labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the target under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so they can be indexed example by example
eval_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)

# Assemble the Trainer from the model, its arguments, both datasets and the metric callback
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [8]:
# Fine-tune ALBERT
trainer.train()

# Persist the tokenizer files next to the fine-tuned weights so that
# './production' can be reloaded on its own with from_pretrained().
# The model is saved last so the cell still ends on a call that returns None.
tokenizer.save_pretrained('./production')
model.save_pretrained('./production')

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 3.107032299041748, 'eval_accuracy': 0.16666666666666666, 'eval_precision': 0.057761078527207556, 'eval_recall': 0.16666666666666666, 'eval_f1': 0.07709878496022121, 'eval_runtime': 4.2285, 'eval_samples_per_second': 28.379, 'eval_steps_per_second': 1.892, 'epoch': 1.0}
{'loss': 3.0766, 'grad_norm': 34.19195556640625, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.67}


  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 3.0092153549194336, 'eval_accuracy': 0.18333333333333332, 'eval_precision': 0.07552681992337164, 'eval_recall': 0.18333333333333332, 'eval_f1': 0.09685185185185187, 'eval_runtime': 3.9017, 'eval_samples_per_second': 30.756, 'eval_steps_per_second': 2.05, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.867724657058716, 'eval_accuracy': 0.26666666666666666, 'eval_precision': 0.21933456830482004, 'eval_recall': 0.26666666666666666, 'eval_f1': 0.19663489586392255, 'eval_runtime': 4.55, 'eval_samples_per_second': 26.374, 'eval_steps_per_second': 1.758, 'epoch': 3.0}
{'loss': 2.7922, 'grad_norm': 97.9991455078125, 'learning_rate': 1.6666666666666667e-05, 'epoch': 3.33}


  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.7348508834838867, 'eval_accuracy': 0.325, 'eval_precision': 0.2562016687016687, 'eval_recall': 0.325, 'eval_f1': 0.24925249952839088, 'eval_runtime': 4.4069, 'eval_samples_per_second': 27.23, 'eval_steps_per_second': 1.815, 'epoch': 4.0}
{'loss': 2.4014, 'grad_norm': 43.74592208862305, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 2.643828868865967, 'eval_accuracy': 0.30833333333333335, 'eval_precision': 0.26779067949018687, 'eval_recall': 0.30833333333333335, 'eval_f1': 0.24708042853204143, 'eval_runtime': 3.5219, 'eval_samples_per_second': 34.072, 'eval_steps_per_second': 2.271, 'epoch': 5.0}
{'train_runtime': 344.4846, 'train_samples_per_second': 6.938, 'train_steps_per_second': 0.871, 'train_loss': 2.7567286682128906, 'epoch': 5.0}


In [9]:
# Smoke-test the fine-tuned model on a single product name
text_to_predict = "Mini Caramel Stroopwafels"

# Encode the text and move the tensors to the device the model lives on;
# without this the forward pass fails when training ran on a GPU.
inputs = tokenizer(text_to_predict, return_tensors="pt").to(model.device)

model.eval()  # switch to inference mode (e.g. dropout off)
with torch.no_grad():  # no gradients are needed for prediction
    outputs = model(**inputs)
predicted_label_idx = outputs.logits.argmax(dim=-1).item()

# Invert label_map explicitly instead of relying on dict key order
predicted_label = {idx: label for label, idx in label_map.items()}[predicted_label_idx]

print(f"Predicted subcategory for '{text_to_predict}': {predicted_label}")

Predicted subcategory for 'Mini Caramel Stroopwafels': Biscuits, Chin Chin & Cookies


In [10]:
# Create the inverse mapping: integer class id -> subcategory name
label2category = dict(zip(label_map.values(), label_map.keys()))

In [11]:
# Show the complete class id -> subcategory name mapping
print(label2category)

{0: 'Biscuits, Chin Chin & Cookies', 1: 'Pasta & Noodles', 2: 'Chocolates & Sweets', 3: 'Flour & Baking Powder', 4: 'Baking Tools & Accessories', 5: 'Fruit Juices & Yoghurt', 6: 'Nuts & Seeds', 7: 'Sugar, Honey & Sweeteners', 8: 'Canned Foods', 9: 'Salt & Seasoning Cubes', 10: 'Everyday Tea', 11: 'Grains & Rice', 12: 'Oats & Instant Cereals', 13: 'Breakfast & Cereals', 14: 'Cooking Tools & Accessories', 15: 'Cooking Sauces', 16: 'Alcoholic Drinks', 17: 'Fizzy Drinks & Malt', 18: 'Non-Alcoholic Drinks', 19: 'Snacks & Confectioneries', 20: 'Butter, Cheese & Other Spreads', 21: 'Herbs & Spices', 22: 'Chips & Snacks', 23: 'Herbal Teas', 24: 'Beverages & Milk', 25: 'Diapering', 26: 'Baby & Toddler Health', 27: 'Daily Care', 28: 'Feeding & Nursing', 29: 'Toys & Gears', 30: 'School Bag', 31: 'Beer', 32: 'Wines & Champagne '}
