# Neural Network Classification Benchmark
In this notebook we are going to do the classification using a neural network.

In [2]:
# Loading Packages
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd


## Loading the Data
First we load the data and split it into a train set and a dev set.

In [3]:
# Load the labelled incidents; the first CSV column is used as the row index.
data = pd.read_csv('datasets/incidents_train.csv', index_col=0)

# Hold out 20% of the rows as the dev split (fixed seed so the split is reproducible).
# Explicit copies are taken because new columns are assigned to the splits later on;
# assigning to a slice of `data` may otherwise emit pandas' SettingWithCopyWarning.
trainset, devset = (
    split.copy()
    for split in train_test_split(data, test_size=0.2, random_state=4)
)

data.head()

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
0,1994,1,7,us,Recall Notification: FSIS-024-94,Case Number: 024-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,smoked sausage
1,1994,3,10,us,Recall Notification: FSIS-033-94,Case Number: 033-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria spp,sausage
2,1994,3,28,us,Recall Notification: FSIS-014-94,Case Number: 014-94 \n Date Opene...,biological,"meat, egg and dairy products",listeria monocytogenes,ham slices
3,1994,4,3,us,Recall Notification: FSIS-009-94,Case Number: 009-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,thermal processed pork meat
4,1994,7,1,us,Recall Notification: FSIS-001-94,Case Number: 001-94 \n Date Opene...,foreign bodies,"meat, egg and dairy products",plastic fragment,chicken breast


## Defining the Neural Network
Here we define the neural network that we will use.\
The code for this was found online and was adjusted for the specific problem.

In [4]:
# Neural Network Definition
class TextClassifierNN(nn.Module):
    """Feed-forward classifier with a single hidden layer: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the last linear layer (one score
    per class); no softmax is applied inside the module.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Attribute names are kept unchanged: they determine the state_dict keys.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map a batch of feature vectors to per-class scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

### Initializing the vectorizer for the title column
I am going to use the same vectorizer that I used in the SVM model for the title column.

In [None]:
# TF-IDF vectorizer for the title column: character n-grams of length 3 to 6.
title_tfidf_vect = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(3, 6),
    max_df=0.5,
    min_df=3,
)


### Defining a function to train the Neural Network
The training function takes as input the vectorizer and the name of the column used to train the model.\
It then trains the model on the trainset and evaluates it on the devset, printing the F1 scores for each target label.

In [None]:
def training(vectorizer, column, hidden_dim=128, batch_size=32, epochs=10, learning_rate=1e-3):
    """Train one TextClassifierNN per target label and score it on the dev split.

    Reads the module-level frames `data`, `trainset` and `devset`. For each of the
    four target labels a fresh model is trained on TF-IDF features of `column`, the
    macro and micro F1 on the dev split are printed, and the decoded predictions are
    stored in `devset['predictions-<label>']`.

    The label columns of `trainset` and `devset` are never overwritten, so the
    function can be called repeatedly (e.g. once per text column) without the
    second call seeing already-encoded integer labels.

    Args:
        vectorizer: unfitted scikit-learn style vectorizer; it is fitted on the
            training split only.
        column: name of the text column used to build the features.
        hidden_dim: width of the hidden layer.
        batch_size: number of samples per optimisation step.
        epochs: number of passes over the training split.
        learning_rate: Adam learning rate.

    Returns:
        None. Results are printed and written into `devset`.
    """
    # Fit the vocabulary on the training split only, then densify for torch.
    X_train = vectorizer.fit_transform(trainset[column]).toarray()
    X_dev = vectorizer.transform(devset[column]).toarray()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The feature tensors do not depend on the label, so build them once.
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_dev_tensor = torch.tensor(X_dev, dtype=torch.float32)

    for label in ('hazard-category', 'product-category', 'hazard', 'product'):
        print(label.upper())

        # Fit the encoder on the full data so every class seen in either split
        # gets an id; encode into local arrays instead of mutating the frames.
        le = LabelEncoder()
        le.fit(data[label])
        y_train = le.transform(trainset[label])
        y_dev = le.transform(devset[label])
        num_classes = len(le.classes_)

        y_train_tensor = torch.tensor(y_train, dtype=torch.long)

        # A fresh model, optimizer and loss for this label.
        model = TextClassifierNN(input_dim=X_train.shape[1], hidden_dim=hidden_dim, num_classes=num_classes)
        model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        # Train in fixed-order mini-batches.
        model.train()
        for epoch in range(epochs):
            for start in range(0, len(X_train_tensor), batch_size):
                batch_X = X_train_tensor[start:start + batch_size].to(device)
                batch_y = y_train_tensor[start:start + batch_size].to(device)

                # Forward pass
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Note: this is the loss of the last mini-batch of the epoch, not an average.
            print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

        # Evaluate on the dev split.
        model.eval()
        with torch.no_grad():
            dev_outputs = model(X_dev_tensor.to(device))
            dev_predictions = torch.argmax(dev_outputs, axis=1).cpu().numpy()

        # Store predictions as string labels; later cells read these columns.
        devset['predictions-' + label] = le.inverse_transform(dev_predictions)
        print(f'  macro: {f1_score(y_dev, dev_predictions, zero_division=0, average="macro"):.2f}')
        print(f'  micro: {f1_score(y_dev, dev_predictions, zero_division=0, average="micro"):.2f}')



### Training the model with the title column

In [None]:
# Train and evaluate one model per target label using the title column.
training(vectorizer=title_tfidf_vect, column='title')

HAZARD-CATEGORY
Epoch 1, Loss: 2.9374
Epoch 2, Loss: 2.0666
Epoch 3, Loss: 1.3694
Epoch 4, Loss: 0.5874
Epoch 5, Loss: 0.1745
Epoch 6, Loss: 0.0689
Epoch 7, Loss: 0.0363
Epoch 8, Loss: 0.0225
Epoch 9, Loss: 0.0154
Epoch 10, Loss: 0.0111
  macro: 0.70
  micro: 0.84
PRODUCT-CATEGORY
Epoch 1, Loss: 2.2444
Epoch 2, Loss: 0.8472
Epoch 3, Loss: 0.1400
Epoch 4, Loss: 0.0413
Epoch 5, Loss: 0.0181
Epoch 6, Loss: 0.0099
Epoch 7, Loss: 0.0062
Epoch 8, Loss: 0.0043
Epoch 9, Loss: 0.0031
Epoch 10, Loss: 0.0024
  macro: 0.59
  micro: 0.75
HAZARD
Epoch 1, Loss: 5.2973
Epoch 2, Loss: 4.4153
Epoch 3, Loss: 3.6276
Epoch 4, Loss: 2.6175
Epoch 5, Loss: 1.3581
Epoch 6, Loss: 0.3871
Epoch 7, Loss: 0.1124
Epoch 8, Loss: 0.0508
Epoch 9, Loss: 0.0283
Epoch 10, Loss: 0.0181
  macro: 0.34
  micro: 0.62
PRODUCT
Epoch 1, Loss: 6.9475
Epoch 2, Loss: 6.2187
Epoch 3, Loss: 5.3830
Epoch 4, Loss: 4.2386
Epoch 5, Loss: 2.6933
Epoch 6, Loss: 0.9914
Epoch 7, Loss: 0.1801
Epoch 8, Loss: 0.0565
Epoch 9, Loss: 0.0273
Epoch 1

### Defining the function that returns the scores for each Subtask

In [19]:
def compute_score(hazards_true, products_true, hazards_pred, products_pred):
    """Return the average of the hazard macro-F1 and the product macro-F1.

    The product F1 is computed only on the rows whose hazard prediction equals
    the true hazard; the hazard F1 is computed on all rows.

    Args:
        hazards_true: true hazard labels.
        products_true: true product labels; must support boolean-mask indexing.
        hazards_pred: predicted hazard labels.
        products_pred: predicted product labels; must support boolean-mask indexing.

    Returns:
        The mean of the two macro-F1 values.
    """
    # Macro F1 over every hazard prediction.
    hazard_f1 = f1_score(hazards_true, hazards_pred, average='macro')

    # Products are scored only where the hazard was predicted correctly.
    hazard_hit = hazards_pred == hazards_true
    product_f1 = f1_score(
        products_true[hazard_hit],
        products_pred[hazard_hit],
        average='macro',
    )

    return (hazard_f1 + product_f1) / 2.

The scores for the title-trained predictions.

In [None]:
# Sub-task 1 scores the category-level labels.
subtask1_score = compute_score(
    devset['hazard-category'],
    devset['product-category'],
    devset['predictions-hazard-category'],
    devset['predictions-product-category'],
)
print(f"Score Sub-Task 1: {subtask1_score:.3f}")

# Sub-task 2 scores the fine-grained hazard and product labels.
subtask2_score = compute_score(
    devset['hazard'],
    devset['product'],
    devset['predictions-hazard'],
    devset['predictions-product'],
)
print(f"Score Sub-Task 2: {subtask2_score:.3f}")

Score Sub-Task 1: 0.655
Score Sub-Task 2: 0.299


### Text trained Neural Network
Next we are going to do the same thing with the text column.\
First we initialize the vectorizer for the text column and then run the model with it.

In [14]:
# TF-IDF vectorizer for the text column: word unigrams and bigrams.
text_tfidf_vect = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    max_df=0.5,
    min_df=5,
)

In [17]:
# Train and evaluate one model per target label using the text column.
training(vectorizer=text_tfidf_vect, column='text')

HAZARD-CATEGORY
Epoch 1, Loss: 3.2612
Epoch 2, Loss: 1.0725
Epoch 3, Loss: 0.1582
Epoch 4, Loss: 0.0249
Epoch 5, Loss: 0.0084
Epoch 6, Loss: 0.0036
Epoch 7, Loss: 0.0019
Epoch 8, Loss: 0.0012
Epoch 9, Loss: 0.0008
Epoch 10, Loss: 0.0006
  macro: 0.74
  micro: 0.92
PRODUCT-CATEGORY
Epoch 1, Loss: 2.1968
Epoch 2, Loss: 0.2833
Epoch 3, Loss: 0.0215
Epoch 4, Loss: 0.0048
Epoch 5, Loss: 0.0018
Epoch 6, Loss: 0.0009
Epoch 7, Loss: 0.0005
Epoch 8, Loss: 0.0003
Epoch 9, Loss: 0.0002
Epoch 10, Loss: 0.0002
  macro: 0.52
  micro: 0.71
HAZARD
Epoch 1, Loss: 6.0265
Epoch 2, Loss: 3.3981
Epoch 3, Loss: 1.4899
Epoch 4, Loss: 0.1636
Epoch 5, Loss: 0.0444
Epoch 6, Loss: 0.0207
Epoch 7, Loss: 0.0118
Epoch 8, Loss: 0.0078
Epoch 9, Loss: 0.0057
Epoch 10, Loss: 0.0044
  macro: 0.40
  micro: 0.74
PRODUCT
Epoch 1, Loss: 7.0307
Epoch 2, Loss: 4.9953
Epoch 3, Loss: 2.9247
Epoch 4, Loss: 0.5080
Epoch 5, Loss: 0.0338
Epoch 6, Loss: 0.0125
Epoch 7, Loss: 0.0062
Epoch 8, Loss: 0.0037
Epoch 9, Loss: 0.0026
Epoch 1

The scores for the text-trained predictions.

In [20]:
# Sub-task 1 scores the category-level labels.
subtask1_score = compute_score(
    devset['hazard-category'],
    devset['product-category'],
    devset['predictions-hazard-category'],
    devset['predictions-product-category'],
)
print(f"Score Sub-Task 1: {subtask1_score:.3f}")

# Sub-task 2 scores the fine-grained hazard and product labels.
subtask2_score = compute_score(
    devset['hazard'],
    devset['product'],
    devset['predictions-hazard'],
    devset['predictions-product'],
)
print(f"Score Sub-Task 2: {subtask2_score:.3f}")

Score Sub-Task 1: 0.639
Score Sub-Task 2: 0.285


### Evaluating
In the title trained model we achieved the scores:\
 ST-1: 0.655 and ST-2: 0.299\
While in the text trained model we achieved the scores:\
 ST-1: 0.639 and ST-2: 0.285\
for the specific train and dev sets we chose.

In [18]:
# Show the dev split, including the `predictions-*` columns written by the most
# recent training() call, next to the true labels.
devset

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product,predictions-hazard-category,predictions-product-category,predictions-hazard,predictions-product
2209,2017,5,5,us,Unilever Issues Allergy Alert on Undeclared Pe...,Unilever is voluntarily recalling a limited nu...,allergens,ices and desserts,peanuts and products thereof,ice cream,allergens,ices and desserts,peanuts and products thereof,ice cream
3107,2018,10,18,us,"GHSE, LLC Recalls Salads Containing Meat Produ...","WASHINGTON, Oct. 17, 2018 – GHSE, LLC, a Green...",biological,cereals and bakery products,listeria monocytogenes,corn,biological,fruits and vegetables,listeria monocytogenes,salads
3226,2018,12,11,us,Moonstruck Chocolate Co. Issues Allergy Alert ...,"Moonstruck Chocolate Company of Portland, Oreg...",allergens,confectionery,hazelnut,candies,allergens,"cocoa and cocoa preparations, coffee and tea",hazelnut,candies
802,2013,4,2,us,Arkansas Firm Recalls Breaded Chicken Products...,"WASHINGTON, April 2, 2013 - Tyson Foods Inc., ...",allergens,"meat, egg and dairy products",soybeans and products thereof,chicken preparations,allergens,"meat, egg and dairy products",soybeans and products thereof,chicken based products
1460,2015,12,15,uk,Orthodox Coconut Palm brand Coconut Juice reca...,Orthodox Coconut Palm brand Coconut Juice is r...,allergens,non-alcoholic beverages,milk and products thereof,coconut juice,allergens,non-alcoholic beverages,milk and products thereof,coconut juice
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,2018,1,19,us,"Café Spice GCT Inc., Recalls Ready-To-Eat Chic...","WASHINGTON, Jan. 18, 2018 – Café Spice GCT Inc...",allergens,"meat, egg and dairy products",milk and products thereof,chicken based products,allergens,"meat, egg and dairy products",milk and products thereof,chicken based products
5441,2021,11,20,uk,Waitrose recalls Waitrose Thrive Fish Pie beca...,Waitrose is recalling Waitrose Thrive Fish Pie...,allergens,seafood,eggs and products thereof,fish products,allergens,prepared dishes and snacks,eggs and products thereof,ready to eat - cook meals
4416,2020,8,28,ca,Picoudi brand microgreens recalled due to Salm...,Food Recall Warning - Picoudi brand microgreen...,biological,fruits and vegetables,salmonella,fresh vegetables,biological,fruits and vegetables,salmonella,alfalfa sprouts
2465,2017,11,15,us,Casa Sanchez Foods Recalls,"Casa Sanchez Foods of Hayward, CA, is recallin...",biological,"soups, broths, sauces and condiments",listeria monocytogenes,guacamole,biological,"soups, broths, sauces and condiments",listeria monocytogenes,vegetables
