In [2]:
# Import libraries
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from transformers import AutoTokenizer, AutoModel, AutoConfig, Trainer, TrainingArguments, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch.nn as nn
import os


In [5]:
# Download datasets
!wget -P /kaggle/working -nc "https://raw.githubusercontent.com/HammadxSaj/Sem-Eval-Task10-Dataset/refs/heads/main/final_cleaned_train.csv"
!wget -P /kaggle/working -nc "https://raw.githubusercontent.com/HammadxSaj/Sem-Eval-Task10-Dataset/refs/heads/main/final_cleaned_validation.csv"


  pid, fd = os.forkpty()


File '/kaggle/working/final_cleaned_train.csv' already there; not retrieving.

File '/kaggle/working/final_cleaned_validation.csv' already there; not retrieving.



In [6]:
# Load the training data
df = pd.read_csv('//kaggle/input/dataset/data.csv')

# Inspect the dataframe
df.head()


Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product,combined_text,label
0,1994,1,7,us,Recall Notification: FSIS-024-94,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage,Recall Notification: FSIS-024-94 Case Number: ...,55
1,1994,3,10,us,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage,Recall Notification: FSIS-033-94 Case Number: ...,56
2,1994,3,28,us,Recall Notification: FSIS-014-94,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices,Recall Notification: FSIS-014-94 Case Number: ...,55
3,1994,4,3,us,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat,Recall Notification: FSIS-009-94 Case Number: ...,90
4,1994,7,1,us,Recall Notification: FSIS-001-94,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast,Recall Notification: FSIS-001-94 Case Number: ...,90


In [7]:
df['text'] = df['hazard-category'] + ' ' + df['product-category'] + ' ' + df['text']

# Drop 'hazard-category' and 'product-category'
df = df.drop(columns=['hazard-category', 'product-category'])

In [8]:
df

Unnamed: 0,year,month,day,country,title,text,hazard,product,combined_text,label
0,1994,1,7,us,Recall Notification: FSIS-024-94,"biological meat, egg and dairy products Case N...",listeria monocytogenes,smoked sausage,Recall Notification: FSIS-024-94 Case Number: ...,55
1,1994,3,10,us,Recall Notification: FSIS-033-94,"biological meat, egg and dairy products Case N...",listeria spp,sausage,Recall Notification: FSIS-033-94 Case Number: ...,56
2,1994,3,28,us,Recall Notification: FSIS-014-94,"biological meat, egg and dairy products Case N...",listeria monocytogenes,ham slices,Recall Notification: FSIS-014-94 Case Number: ...,55
3,1994,4,3,us,Recall Notification: FSIS-009-94,"foreign bodies meat, egg and dairy products Ca...",plastic fragment,thermal processed pork meat,Recall Notification: FSIS-009-94 Case Number: ...,90
4,1994,7,1,us,Recall Notification: FSIS-001-94,"foreign bodies meat, egg and dairy products Ca...",plastic fragment,chicken breast,Recall Notification: FSIS-001-94 Case Number: ...,90
...,...,...,...,...,...,...,...,...,...,...
12308,2022,6,6,us,False,"allergens confectionery phrase: NAPERVILLE, Il...",milk and products thereof,sprinkle mix,"False phrase: NAPERVILLE, Ill., June 2, 2022 (...",59
12309,2022,6,14,ie,False,"allergens cocoa and cocoa preparations, coffee...",almond,chocolate spread with hazelnuts,False : Undeclared Almond in Batches of SPAR S...,5
12310,2022,6,23,us,Paraphrase: Daily Harvest Issues Voluntary Rec...,biological fruits and vegetables . French Lent...,other,frozen leek,Paraphrase: Daily Harvest Issues Voluntary Rec...,73
12311,2022,7,4,hk,False,"biological meat, egg and dairy products CFS fo...",virus,bovine meat and offal,False CFS follows up on imported frozen beef a...,125


In [9]:
# Data preprocessing

# Drop unnecessary columns for training
df = df[['text', 'hazard', 'product']]

# Drop rows with missing values
df.dropna(inplace=True)

# # Initialize label encoders
# hazard_category_encoder = LabelEncoder()
# product_category_encoder = LabelEncoder()
hazard_encoder = LabelEncoder()
product_encoder = LabelEncoder()

# Fit the encoders
# hazard_category_encoder.fit(df['hazard-category'])
# product_category_encoder.fit(df['product-category'])
hazard_encoder.fit(df['hazard'])
product_encoder.fit(df['product'])

# Transform the labels
# df['hazard-category'] = hazard_category_encoder.transform(df['hazard-category'])
# df['product-category'] = product_category_encoder.transform(df['product-category'])
df['hazard'] = hazard_encoder.transform(df['hazard'])
df['product'] = product_encoder.transform(df['product'])

# Split the data into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Number of training samples: {len(train_df)}")
print(f"Number of validation samples: {len(val_df)}")


Number of training samples: 9850
Number of validation samples: 2463


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hazard'] = hazard_encoder.transform(df['hazard'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product'] = product_encoder.transform(df['product'])


In [10]:
# Define the FoodHazardDataset class
class FoodHazardDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, hazards, products):
        self.encodings = encodings
        # self.hazard_categories = hazard_categories
        # self.product_categories = product_categories
        self.hazards = hazards
        self.products = products

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # item['hazard_category_labels'] = torch.tensor(self.hazard_categories[idx])
        # item['product_category_labels'] = torch.tensor(self.product_categories[idx])
        item['hazard_labels'] = torch.tensor(self.hazards[idx])
        item['product_labels'] = torch.tensor(self.products[idx])
        return item

    def __len__(self):
        return len(self.hazards)


In [11]:
# Define the number of unique labels for each category
# num_hazard_category_labels = len(hazard_category_encoder.classes_)
# num_product_category_labels = len(product_category_encoder.classes_)
num_hazard_labels = len(hazard_encoder.classes_)
num_product_labels = len(product_encoder.classes_)


In [12]:
from transformers import AutoModel
import torch.nn as nn

class TransformerForFoodHazardClassification(nn.Module):
    def __init__(self, model_name, num_labels_dict):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # Uncomment the line below if you want to use dropout
        # self.dropout = nn.Dropout(self.transformer.config.hidden_dropout_prob)

        hidden_size = self.transformer.config.hidden_size

        # Classifiers for the four labels
        # self.hazard_category_classifier = nn.Linear(hidden_size, num_labels_dict['hazard_category'])
        # self.product_category_classifier = nn.Linear(hidden_size, num_labels_dict['product_category'])
        self.hazard_classifier = nn.Linear(hidden_size, num_labels_dict['hazard'])
        self.product_classifier = nn.Linear(hidden_size, num_labels_dict['product'])

        # Loss function
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                hazard_labels=None, product_labels=None):
        # Check if the model supports token_type_ids
        if "token_type_ids" in self.transformer.forward.__code__.co_varnames:
            outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        else:
            # For DistilBERT and similar models that do not accept token_type_ids
            outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # Select pooled output for models like BERT and DeBERTa, or use CLS token for others
        if hasattr(outputs, 'pooler_output'):
            pooled_output = outputs.pooler_output
        else:
            pooled_output = outputs.last_hidden_state[:, 0, :]  # CLS token

        # Apply dropout if using
        # pooled_output = self.dropout(pooled_output)

        # Predict the four labels
        # hazard_category_logits = self.hazard_category_classifier(pooled_output)
        # product_category_logits = self.product_category_classifier(pooled_output)
        hazard_logits = self.hazard_classifier(pooled_output)
        product_logits = self.product_classifier(pooled_output)

        loss = None
        if hazard_labels is not None and product_labels is not None:
            # Compute loss for each task
            # hazard_category_loss = self.loss_fct(hazard_category_logits, hazard_category_labels)
            # product_category_loss = self.loss_fct(product_category_logits, product_category_labels)
            hazard_loss = self.loss_fct(hazard_logits, hazard_labels)
            product_loss = self.loss_fct(product_logits, product_labels)

            # Aggregate losses
            loss = hazard_loss + product_loss

        # Return the loss and logits
        output = (hazard_logits, product_logits)
        return ((loss,) + output) if loss is not None else output


In [13]:
# Define the compute_metrics function to calculate both accuracy and average F1 score across all labels

def compute_metrics(pred):

    labels = pred.label_ids

    preds = pred.predictions



    # Unpack labels and predictions for each task

    # hazard_category_labels = labels[0]

    # product_category_labels = labels[1]

    hazard_labels = labels[0]

    product_labels = labels[1]



    # hazard_category_preds = preds[0].argmax(-1)

    # product_category_preds = preds[1].argmax(-1)

    hazard_preds = preds[0].argmax(-1)

    product_preds = preds[1].argmax(-1)



    # Compute accuracy for each task (can be used separately if needed)

    # hazard_category_acc = accuracy_score(hazard_category_labels, hazard_category_preds)

    # product_category_acc = accuracy_score(product_category_labels, product_category_preds)

    hazard_acc = accuracy_score(hazard_labels, hazard_preds)

    product_acc = accuracy_score(product_labels, product_preds)



    # Compute F1 score for each task

    # hazard_category_f1 = f1_score(hazard_category_labels, hazard_category_preds, average='weighted')

    # product_category_f1 = f1_score(product_category_labels, product_category_preds, average='weighted')

    hazard_f1 = f1_score(hazard_labels, hazard_preds, average='weighted')

    product_f1 = f1_score(product_labels, product_preds, average='weighted')



    # Compute average F1 score across all tasks

    avg_f1 = (hazard_f1 + product_f1) / 2



    # Optionally, you can also compute average accuracy across tasks if needed

    avg_acc = (hazard_acc + product_acc) / 2



    # Return a dictionary with both accuracy and average F1 score

    return {

        'hazard_acc': hazard_acc,

        'product_acc': product_acc,

        'avg_accuracy': avg_acc,

        'avg_f1': avg_f1

    }

In [14]:
# Define the data collator
def data_collator(batch):
    return {
        'input_ids': torch.stack([x['input_ids'] for x in batch]),
        'attention_mask': torch.stack([x['attention_mask'] for x in batch]),
        'hazard_labels': torch.tensor([x['hazard_labels'] for x in batch]),
        'product_labels': torch.tensor([x['product_labels'] for x in batch]),
    }


In [15]:
# Function to train and save a model
def train_and_save_model(model_name, output_dir):
    """
    Trains and saves a model (only the final model after the last epoch).
    
    Args:
    - model_name: the pre-trained model name or path.
    - output_dir: directory to save the model
    """
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the text data
    train_texts = train_df['text'].tolist()
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)

    val_texts = val_df['text'].tolist()
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

    # Prepare the datasets
    train_dataset = FoodHazardDataset(
        train_encodings,
        # train_df['hazard-category'].tolist(),
        # train_df['product-category'].tolist()
        train_df['hazard'].tolist(),
        train_df['product'].tolist()
    )

    val_dataset = FoodHazardDataset(
        val_encodings,
        # val_df['hazard-category'].tolist(),
        # val_df['product-category'].tolist()
        val_df['hazard'].tolist(),
        val_df['product'].tolist()
    )

    # Define the number of labels
    num_labels_dict = {
        # 'hazard_category': num_hazard_category_labels,
        # 'product_category': num_product_category_labels
        'hazard': num_hazard_labels,
        'product': num_product_labels
    }

    # Initialize the model
    model = TransformerForFoodHazardClassification(model_name, num_labels_dict)

    # Move the model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)

    # Training arguments

    if model_name == "microsoft/deberta-base":
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=8,  # Train for 8 epochs
            per_device_train_batch_size=8,  # Adjust based on your GPU memory
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            save_strategy="no",  # Do not save after each epoch
            logging_dir='./logs',
            logging_steps=10,
            warmup_steps=500,
            weight_decay=0.01,
            report_to=[]  # Disable W&B logging
        )
    else:
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=8,  # Train for 8 epochs
            per_device_train_batch_size=16,  # Adjust based on your GPU memory
            per_device_eval_batch_size=16,
            evaluation_strategy="epoch",
            save_strategy="no",  # Do not save after each epoch
            logging_dir='./logs',
            logging_steps=10,
            warmup_steps=500,
            weight_decay=0.01,
            report_to=[]  # Disable W&B logging
        )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        optimizers=(AdamW(model.parameters(), lr=1e-5), None),
    )

    # Train the model
    trainer.train()

    # Save the model only after the last epoch (epoch 8)
    if model_name == 'allenai/scibert_scivocab_uncased':
        # Save the model only after the last epoch (epoch 8)
        state_dict = {k: v.contiguous() if isinstance(v, torch.Tensor) else v for k, v in model.state_dict().items()}
        torch.save(state_dict, os.path.join(output_dir, "scibert_weights"))
    else:
        trainer.save_model(output_dir)

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}:")
    print(eval_results)

    # Clear GPU memory
    del model
    torch.cuda.empty_cache()

    return eval_results  # Return evaluation results instead of the trainer


In [16]:
torch.cuda.empty_cache()

In [17]:
# Train DeBERTa Large
trainer_deberta_base = train_and_save_model('microsoft/deberta-base', 'deberta-base-model')

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Hazard Acc,Product Acc,Avg Accuracy,Avg F1
1,8.8029,8.570143,0.496143,0.049939,0.273041,0.21401
2,7.554,7.097185,0.649614,0.142915,0.396265,0.340783
3,6.3441,6.216573,0.750305,0.220057,0.485181,0.435436
4,5.703,5.606031,0.812018,0.282582,0.5473,0.503038
5,5.1443,5.197421,0.830288,0.334551,0.58242,0.538545
6,4.954,4.920473,0.846123,0.365814,0.605968,0.562844
7,4.3671,4.76668,0.855461,0.387333,0.621397,0.579406
8,4.6721,4.708915,0.854243,0.395047,0.624645,0.583338


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation results for microsoft/deberta-base:
{'eval_loss': 4.708914756774902, 'eval_hazard_acc': 0.8542427933414535, 'eval_product_acc': 0.3950466910272026, 'eval_avg_accuracy': 0.624644742184328, 'eval_avg_f1': 0.5833376924914732, 'eval_runtime': 81.1817, 'eval_samples_per_second': 30.339, 'eval_steps_per_second': 1.897, 'epoch': 8.0}


In [18]:
torch.cuda.empty_cache()

In [19]:
trainer_distilbert_base = train_and_save_model('distilbert-base-uncased', 'distilbert-base-uncased-model')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Hazard Acc,Product Acc,Avg Accuracy,Avg F1
1,10.828,10.793424,0.219245,0.0203,0.119773,0.050819
2,9.318,9.020288,0.452294,0.066585,0.25944,0.197959
3,8.3289,8.102565,0.603735,0.133983,0.368859,0.314556
4,7.6936,7.579804,0.685749,0.181486,0.433618,0.381342
5,7.2143,7.238029,0.726756,0.209095,0.467925,0.417688
6,7.0478,7.010336,0.747868,0.215591,0.48173,0.432752
7,6.9059,6.883865,0.758831,0.227771,0.493301,0.444632
8,6.8952,6.841282,0.760455,0.232643,0.496549,0.447511


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation results for distilbert-base-uncased:
{'eval_loss': 6.841282367706299, 'eval_hazard_acc': 0.76045473000406, 'eval_product_acc': 0.23264311814859928, 'eval_avg_accuracy': 0.49654892407632967, 'eval_avg_f1': 0.4475109004021072, 'eval_runtime': 25.9272, 'eval_samples_per_second': 94.997, 'eval_steps_per_second': 2.97, 'epoch': 8.0}


In [20]:
torch.cuda.empty_cache()

In [21]:
trainer_scibert = train_and_save_model('allenai/scibert_scivocab_uncased', 'scibert_scivocab_uncased-model')

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Hazard Acc,Product Acc,Avg Accuracy,Avg F1
1,10.9128,10.866262,0.198538,0.019488,0.109013,0.050833
2,9.2581,9.066277,0.519285,0.058059,0.288672,0.221704
3,8.4529,8.275174,0.623224,0.117337,0.37028,0.313707
4,7.7959,7.799679,0.685749,0.142915,0.414332,0.36025
5,7.3561,7.472742,0.725538,0.168088,0.446813,0.397808
6,7.2083,7.259977,0.747056,0.182704,0.46488,0.415307
7,7.0338,7.13051,0.758019,0.1892,0.473609,0.426142
8,6.9772,7.088196,0.764921,0.190418,0.47767,0.431126


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Evaluation results for allenai/scibert_scivocab_uncased:
{'eval_loss': 7.08819580078125, 'eval_hazard_acc': 0.7649208282582217, 'eval_product_acc': 0.1904181892001624, 'eval_avg_accuracy': 0.47766950872919206, 'eval_avg_f1': 0.43112616205141685, 'eval_runtime': 46.3307, 'eval_samples_per_second': 53.161, 'eval_steps_per_second': 1.662, 'epoch': 8.0}


In [22]:
torch.cuda.empty_cache()

In [23]:
# concat_df = pd.read_csv('/kaggle/input/haz-prod-cat/submission.csv')

In [4]:
test_df = pd.read_csv('/kaggle/working/final_cleaned_validation.csv')

In [25]:
# test_df = pd.concat([testdf, concat_df], axis=1)

In [6]:
test_df

Unnamed: 0,year,month,day,country,title,text
0,1994,5,5,us,Recall Notification: FSIS-017-94,Date Opened: Date Closed: Name: KOEGEL MEATS I...
1,1994,5,12,us,Recall Notification: FSIS-048-94,Date Opened: Date Closed: Name: COLUMBUS SALAM...
2,1995,4,16,us,Recall Notification: FSIS-032-95,Date Opened: Date Closed: Recall Class: N Name...
3,1998,7,16,ca,Archive - ALLERGY ALERT -- PRESENCE OF UNDECLA...,PRESENCE OF UNDECLARED NUTS IN ORIGINALE AUGUS...
4,1998,8,6,us,Recall Notification: FSIS-018-98,Recall Notification Report: RNR018-98 Date Ope...
...,...,...,...,...,...,...
560,2022,6,29,au,The Fresh Salad Co Thai Coconut Wild Rice Prep...,Page Content ​ ​​​​ ​Date published: Product i...
561,2022,7,18,au,Powered by Plants Pty Ltd — Cleanfit Plant Pro...,PRA number 2022/19525 Published date Product d...
562,2022,7,20,ca,Certain Enjoy Life brand Soft Baked Cookies – ...,Food recall warning Certain Enjoy Life brand S...
563,2022,7,28,hk,Imported biscuit may contain allergen (peanuts),Imported biscuit may contain allergen (peanuts...


In [7]:
test_df['text'] = test_df['text']
# Drop 'hazard-category' and 'product-category'
# test_df = test_df.drop(columns=['hazard-category', 'product-category'])

In [8]:
test_df

Unnamed: 0,year,month,day,country,title,text
0,1994,5,5,us,Recall Notification: FSIS-017-94,Date Opened: Date Closed: Name: KOEGEL MEATS I...
1,1994,5,12,us,Recall Notification: FSIS-048-94,Date Opened: Date Closed: Name: COLUMBUS SALAM...
2,1995,4,16,us,Recall Notification: FSIS-032-95,Date Opened: Date Closed: Recall Class: N Name...
3,1998,7,16,ca,Archive - ALLERGY ALERT -- PRESENCE OF UNDECLA...,PRESENCE OF UNDECLARED NUTS IN ORIGINALE AUGUS...
4,1998,8,6,us,Recall Notification: FSIS-018-98,Recall Notification Report: RNR018-98 Date Ope...
...,...,...,...,...,...,...
560,2022,6,29,au,The Fresh Salad Co Thai Coconut Wild Rice Prep...,Page Content ​ ​​​​ ​Date published: Product i...
561,2022,7,18,au,Powered by Plants Pty Ltd — Cleanfit Plant Pro...,PRA number 2022/19525 Published date Product d...
562,2022,7,20,ca,Certain Enjoy Life brand Soft Baked Cookies – ...,Food recall warning Certain Enjoy Life brand S...
563,2022,7,28,hk,Imported biscuit may contain allergen (peanuts),Imported biscuit may contain allergen (peanuts...


In [9]:
test_df = test_df[['text']]
test_texts = test_df['text'].tolist()

In [10]:
from torch.utils.data import DataLoader, TensorDataset
from safetensors.torch import load_file
import numpy as np
import torch

def get_model_logits(model_name, model_dir, test_texts, batch_size=8):
    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize the test data
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')

    # Convert tokenized inputs to a TensorDataset
    test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])

    # Use DataLoader to load the data in batches
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Define the number of labels
    num_labels_dict = {
        # 'hazard_category': num_hazard_category_labels,
        # 'product_category': num_product_category_labels
        'hazard': num_hazard_labels,
        'product': num_product_labels
    }

    # Initialize the model
    model = TransformerForFoodHazardClassification(model_name, num_labels_dict)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the model state dict from model.safetensors
    if model_name == "allenai/scibert_scivocab_uncased":
        state_dict = torch.load(f"{model_dir}/scibert_weights", map_location=device)
        model.load_state_dict(state_dict)
    else:
        state_dict = load_file(f"{model_dir}/model.safetensors")
        model.load_state_dict(state_dict)

    # Move model to the GPUs (using DataParallel for multiple GPUs)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # Default to cuda:0
    model = torch.nn.DataParallel(model, device_ids=[0, 1])  # Use both GPU 0 and GPU 1
    model.to(device)

    # Initialize dictionaries to accumulate logits
    # all_hazard_category_logits = []
    # all_product_category_logits = []
    all_hazard_logits = []
    all_product_logits = []
    
    with torch.no_grad():
        model.eval()
        for batch in test_dataloader:
            input_ids, attention_mask = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Unpack logits and move them to CPU
            hazard_logits, product_logits = outputs
            # all_hazard_category_logits.append(hazard_category_logits.cpu().numpy())
            # all_product_category_logits.append(product_category_logits.cpu().numpy())
            all_hazard_logits.append(hazard_logits.cpu().numpy())
            all_product_logits.append(product_logits.cpu().numpy())

    # Concatenate all logits from batches along the batch dimension (axis=0)
    # hazard_category_logits_concat = np.concatenate(all_hazard_category_logits, axis=0)
    # product_category_logits_concat = np.concatenate(all_product_category_logits, axis=0)
    hazard_logits_concat = np.concatenate(all_hazard_logits, axis=0)
    product_logits_concat = np.concatenate(all_product_logits, axis=0)

    # Free GPU memory by deleting model and test_encodings
    del model
    del test_encodings

    # Clear CUDA cache
    torch.cuda.empty_cache()

    # Return logits as a structured dictionary
    return {
        # 'hazard_category': hazard_category_logits_concat,
        # 'product_category': product_category_logits_concat
        'hazard': hazard_logits_concat,
        'product': product_logits_concat
    }


In [24]:
# After evaluating the model
torch.cuda.empty_cache()


In [26]:
# Get logits from DeBERTa Large
deberta_base_logits = get_model_logits('microsoft/deberta-base', 'deberta-base-model', test_texts)


In [32]:
# After evaluating the model
torch.cuda.empty_cache()


In [33]:
distilbert_base_logits = get_model_logits('distilbert-base-uncased', 'distilbert-base-uncased-model', test_texts)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [40]:
# After evaluating the model
torch.cuda.empty_cache()


In [41]:
scibert_logits = get_model_logits('allenai/scibert_scivocab_uncased', 'scibert_scivocab_uncased-model', test_texts)

  state_dict = torch.load(f"{model_dir}/scibert_weights", map_location=device)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [42]:
# Average the logits
# hazard_category_logits_avg = (distilbert_base_logits['hazard_category'] + deberta_base_logits['hazard_category']) / 2
# product_category_logits_avg = (distilbert_base_logits['product_category'] + deberta_base_logits['product_category']) / 2
# hazard_logits_avg = (distilbert_base_logits['hazard'] + deberta_base_logits['hazard'] + scibert_logits['hazard']) / 3
# product_logits_avg = (distilbert_base_logits['product'] + deberta_base_logits['product'] + scibert_logits['product']) / 3
hazard_logits_avg = scibert_logits['hazard']
product_logits_avg = scibert_logits['product']

In [43]:
# Get predicted labels
# hazard_category_preds = np.argmax(hazard_category_logits_avg, axis=1)
# product_category_preds = np.argmax(product_category_logits_avg, axis=1)
hazard_preds = np.argmax(hazard_logits_avg, axis=1)
product_preds = np.argmax(product_logits_avg, axis=1)

In [44]:
# Decode the predicted labels using the label encoders
# hazard_category_labels = hazard_category_encoder.inverse_transform(hazard_category_preds)
# product_category_labels = product_category_encoder.inverse_transform(product_category_preds)
hazard_labels = hazard_encoder.inverse_transform(hazard_preds)
product_labels = product_encoder.inverse_transform(product_preds)

In [45]:
# Create a DataFrame for the predictions
output_df = pd.DataFrame({
    'hazard': hazard_labels,
    'product': product_labels
})

# Save the output DataFrame to a CSV file
output_df.to_csv('test_predictions_ensemble.csv', index=False)

# For subtask 1 (hazard-category and product-category)
subtask2_df = output_df[['hazard', 'product']]
subtask2_df.to_csv('subtask2_predictions_ensemble.csv', index=True)


In [46]:
# Analyze the predictions
print("Hazard Predictions:")
print(subtask2_df['hazard'].value_counts())

print("\nProduct Predictions:")
print(subtask2_df['product'].value_counts())

Hazard Predictions:
hazard
salmonella                                        78
listeria monocytogenes                            75
milk and products thereof                         72
other                                             49
escherichia coli                                  30
inspection issues                                 30
peanuts and products thereof                      29
cereals containing gluten and products thereof    26
soybeans and products thereof                     24
eggs and products thereof                         24
plastic fragment                                  20
metal fragment                                    16
sulphur dioxide and sulphites                     13
glass fragment                                    12
almond                                            10
sesame seeds and products thereof                  7
unauthorised substance ethylene oxide              6
mustard and products thereof                       5
clostridium botulin