In [125]:
import pandas as pd

In [126]:
df = pd.read_csv("labelled_training_data.csv")
df.sample(10)

Unnamed: 0,Food Name,Label
57,Raw Egg,Not Junk
184,"Coffee, Prepared From Grounds",Not Junk
403,"Seven Seas, Omega 3 Capsules, Max Strength",Not Junk
67,"Tesco, Flat Peaches",Not Junk
220,"Optimum Nutrition, Micronized Creatine Powder",Not Junk
494,Prosciutto,Not Junk
430,"Soybeans, Cooked from Dried",Not Junk
240,Bao mooli,Not Junk
218,"Nectarine, Fresh",Not Junk
49,"Diet Coke, Caffeine Free",Junk


Convert labels to integers (0 for Not Junk, 1 for Junk)

In [127]:
label_map = {"Not Junk":0, "Junk":1}
df["Label"] = df["Label"].map(label_map)

BERT requires input in a specific format (tokenized text), so we’ll use the BertTokenizer to preprocess the food item descriptions.

In [128]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
def encode_data(texts, labels, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=64)
    return encodings, labels




Now, create a custom PyTorch Dataset to return the tokenized data in the format BERT expects.

In [129]:
import torch
from torch.utils.data import Dataset

class FoodDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=64)
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])
        return item
    def __len__(self):
        return len(self.labels)

# Prepare the dataset
training_texts = df["Food Name"].tolist()
training_label = df["Label"].tolist()

# Create the dataset
training_dataset = FoodDataset(training_texts, training_label, tokenizer)

Check if GPU is available

In [130]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


Now, load a pre-trained BERT model with a classification head.

In [131]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Move the model to the GPU

In [132]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Next, we define the training arguments, such as the number of epochs, batch size, etc.

In [133]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,   # batch size for training
    per_device_eval_batch_size=16,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    fp16=True                        # # Mixed precision for better performance on GPUs
)

Now, set up the Trainer class with the model, training arguments, and dataset.

In [134]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=None  # You can add validation data here if available
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Finally, train the model using the Trainer class.

In [135]:
# Train the model
trainer.train()

  0%|          | 0/150 [00:00<?, ?it/s]

{'loss': 0.8405, 'grad_norm': 14.686184883117676, 'learning_rate': 9e-07, 'epoch': 0.2}
{'loss': 0.7651, 'grad_norm': 7.164673328399658, 'learning_rate': 1.9e-06, 'epoch': 0.4}
{'loss': 0.6021, 'grad_norm': 9.920513153076172, 'learning_rate': 2.9e-06, 'epoch': 0.6}
{'loss': 0.5109, 'grad_norm': 5.5381879806518555, 'learning_rate': 3.9e-06, 'epoch': 0.8}
{'loss': 0.424, 'grad_norm': 6.73447847366333, 'learning_rate': 4.9000000000000005e-06, 'epoch': 1.0}
{'loss': 0.3655, 'grad_norm': 4.916894912719727, 'learning_rate': 5.9e-06, 'epoch': 1.2}
{'loss': 0.2459, 'grad_norm': 6.577718734741211, 'learning_rate': 6.900000000000001e-06, 'epoch': 1.4}
{'loss': 0.1739, 'grad_norm': 2.1829209327697754, 'learning_rate': 7.9e-06, 'epoch': 1.6}
{'loss': 0.1466, 'grad_norm': 13.699748992919922, 'learning_rate': 8.9e-06, 'epoch': 1.8}
{'loss': 0.2009, 'grad_norm': 4.182320594787598, 'learning_rate': 9.900000000000002e-06, 'epoch': 2.0}
{'loss': 0.0747, 'grad_norm': 1.6432679891586304, 'learning_rate': 

TrainOutput(global_step=150, training_loss=0.3061061461766561, metrics={'train_runtime': 9.2439, 'train_samples_per_second': 259.63, 'train_steps_per_second': 16.227, 'total_flos': 33299992944000.0, 'train_loss': 0.3061061461766561, 'epoch': 3.0})

After training, you can use the model to classify new food items.

In [136]:
# Function to classify new items
def classify_food_item(item):
    # Move tokenizer output tensors to the GPU (if available)
    encoding = tokenizer(item, truncation=True, padding=True, max_length=64, return_tensors='pt').to(device)
    
    # Ensure that the model and data are on the same device
    outputs = model(**encoding)  # Model is already on GPU
    logits = outputs.logits
    
    predicted_class = torch.argmax(logits, dim=1).item()
    return 'Junk' if predicted_class == 1 else 'Not Junk'


In [137]:
# Example usage
new_item = "cucumber"
classification = classify_food_item(new_item)
print(f"The item '{new_item}' is classified as: {classification}")

The item 'cucumber' is classified as: Junk


In [138]:
import pandas as pd

In [139]:
df = pd.read_csv("unlabelled_training_data.csv")
df.head()

Unnamed: 0,Food Name,Category
0,"Fage, Total, Greek Strained Yoghurt, 5% Fat",Dairy and Egg Products
1,"Bulk, Pure Whey Isolate, Pistachio Ice Cream",Supplements
2,"Eggs, Cooked",Dairy and Egg Products
3,"Centrum Advance, Multivitamin",Supplements
4,"Seven Seas, Omega 3 Capsules, Max Strength",Supplements


In [140]:
df["generated_label"] = df["Food Name"].apply(classify_food_item)

In [141]:
df.sample(20)

Unnamed: 0,Food Name,Category,generated_label
361,"White Rice, Steamed",Cereal Grains and Pasta,Not Junk
158,Prosciutto,Pork Products,Not Junk
480,"Figs, Fresh",Fruits and Fruit Juices,Not Junk
641,"Coffee, Prepared From Grounds",Beverages,Not Junk
275,"Diet Coke, Caffeine Free",Beverages,Junk
362,"Chicken Breast, Skin Removed Before Cooking",Poultry Products,Not Junk
310,Kimchi,Vegetables and Vegetable Products,Not Junk
199,"Nectarine, Fresh",Fruits and Fruit Juices,Not Junk
523,"Pistachio Nuts, Raw",Nut and Seed Products,Not Junk
90,"Carrots, Cooked From Fresh",Vegetables and Vegetable Products,Not Junk
