In [10]:
!pip install numpy torch pandas scikit-learn crepes transformers tqdm

Collecting crepes
  Downloading crepes-0.7.1-py3-none-any.whl.metadata (19 kB)
Downloading crepes-0.7.1-py3-none-any.whl (29 kB)
Installing collected packages: crepes
Successfully installed crepes-0.7.1


In [19]:
!wget -P /kaggle/working -nc "https://raw.githubusercontent.com/HammadxSaj/Sem-Eval-Task10-Dataset/refs/heads/main/final_cleaned_train.csv"
!wget -P /kaggle/working -nc "https://raw.githubusercontent.com/HammadxSaj/Sem-Eval-Task10-Dataset/refs/heads/main/final_cleaned_validation.csv"

File '/kaggle/working/final_cleaned_train.csv' already there; not retrieving.

File '/kaggle/working/final_cleaned_validation.csv' already there; not retrieving.



In [20]:
import pandas as pd
import numpy as np

data = pd.read_csv('/kaggle/working/final_cleaned_train.csv', index_col=0)
data.sample()

Unnamed: 0_level_0,month,day,country,title,text,hazard-category,product-category,hazard,product
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015,10,23,uk,Mr Naga Hot Pepper Pickle recalled due to unde...,Shahnaz Food Products Ltd is recalling jars of...,allergens,fruits and vegetables,mustard and products thereof,pickled peppers


In [21]:
# select input and label from data:
X = data['title']
y = data[['hazard-category', 'product-category']]

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_dev, y_train, y_dev = {}, {}, {}, {}

# create train and development sets:
for column in y.columns:
    X_train[column], X_dev[column], y_train[column], y_dev[column] = train_test_split(X.values, y[column].values, test_size=.2, shuffle=True, random_state=42, stratify=y[column])
    print(column.upper())
    print('  Size of development set:', X_dev[column].shape)
    print('  Size of train set:      ', X_train[column].shape)

HAZARD-CATEGORY
  Size of development set: (1194,)
  Size of train set:       (4772,)
PRODUCT-CATEGORY
  Size of development set: (1194,)
  Size of train set:       (4772,)


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create and train input embedding:
tfidf = TfidfVectorizer().fit(X)

# since TfidfVectorizer.transform(...) returns a sparse matrix which 'crepes'
# does not handle well, we use the following utility function to encode our texts:
phi = lambda x: tfidf.transform(x).toarray()

# print a sample of the vocabulary to show that we learned something:
list(tfidf.vocabulary_.keys())[:5]

['recall', 'notification', 'fsis', '024', '94']

In [24]:
# create label to one-hot and reverse dictionaries:
id2label = {column:np.unique(y[column]) for column in y.columns}
label2id = {column:{l:i for i, l in enumerate(id2label[column])} for column in y.columns}

# show label-id mapping:
label2id['hazard-category']

{'allergens': 0,
 'biological': 1,
 'chemical': 2,
 'food additives and flavourings': 3,
 'foreign bodies': 4,
 'fraud': 5,
 'migration': 6,
 'organoleptic aspects': 7,
 'other hazard': 8,
 'packaging defect': 9}

In [25]:
label2id['product-category']

{'alcoholic beverages': 0,
 'cereals and bakery products': 1,
 'cocoa and cocoa preparations, coffee and tea': 2,
 'confectionery': 3,
 'dietetic foods, food supplements, fortified foods': 4,
 'fats and oils': 5,
 'feed materials': 6,
 'food additives and flavourings': 7,
 'food contact materials': 8,
 'fruits and vegetables': 9,
 'herbs and spices': 10,
 'honey and royal jelly': 11,
 'ices and desserts': 12,
 'meat, egg and dairy products': 13,
 'non-alcoholic beverages': 14,
 'nuts, nut products and seeds': 15,
 'other food product / mixed': 16,
 'pet feed': 17,
 'prepared dishes and snacks': 18,
 'seafood': 19,
 'soups, broths, sauces and condiments': 20,
 'sugars and syrups': 21}

In [26]:
from crepes import WrapClassifier
from sklearn.linear_model import LogisticRegression

# create a conformal base classifiers based on Logistic Regression:
base_classifiers = {column:WrapClassifier(LogisticRegression()) for column in y.columns}

for column in base_classifiers:
    # train the base classifier:
    base_classifiers[column].fit(
        phi(X_train[column]),
        [label2id[column][l] for l in y_train[column]]
    )

    # calibrate the base classifier:
    base_classifiers[column].calibrate(
        phi(X_dev[column]),
        [label2id[column][l] for l in y_dev[column]],
        class_cond=True
    )

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
import getpass
from huggingface_hub import login
login(getpass.getpass('Enter your huggingface API-key:'))

Enter your huggingface API-key: ········


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [28]:
import transformers
import torch

# create llm pipeline:
llm = transformers.pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
)

# Get special tokens for later:
bos_token_id = llm.tokenizer.convert_tokens_to_ids('<|begin_of_text|>')
eos_token_id = llm.tokenizer.convert_tokens_to_ids('<|eot_id|>')
pad_token_id = llm.tokenizer.convert_tokens_to_ids('<|eot_id|>')

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [29]:
from sklearn.metrics import pairwise_distances

def get_few_shot_examples(column, text, examples_per_class=2):
    examples = []

    # generate conformal prediction set:
    prediction_set = id2label[column][base_classifiers[column].predict_set(phi([text])).astype(bool)[0]]
    
    for y in prediction_set:
        # get texts in current class:
        texts = X_train[column][y_train[column] == y]

        # generate embeddings of texts in class:
        embeddings = phi([text] + texts.tolist())

        # calculate cosine-similarity:
        similarity = (1. - pairwise_distances(embeddings, metric='cosine'))[1:,0]

        # get closest sample of training data based on embeddings:
        for j in np.argsort(similarity)[::-1][:examples_per_class]:
            examples.append((texts[j], y, similarity[j]))

    # sort samples based on embedding from training data:
    examples.sort(key=lambda e: e[2], reverse=True)

    return examples

# print sample output:
samples = get_few_shot_examples('hazard-category', X_dev['hazard-category'][0])
samples

[('J. L. King & Co—Tuna Pasta Salad', 'biological', 0.35513942995404624),
 ('Coles—Garden Vegetable Pasta Sauce', 'foreign bodies', 0.3419897147547253),
 ('Enrico’s Kitchen Pty Ltd—Frozen pizza and pasta products',
  'foreign bodies',
  0.3336328866991509),
 ('Avanza Pasta, LLC Recalls Beef and Poultry Products  Produced Without Benefit of Inspection',
  'fraud',
  0.3164180527682221),
 ('Avanza Pasta, LLC Recalls Beef and Poultry Products  Produced Without Benefit of Inspection',
  'fraud',
  0.3164180527682221),
 ('Recall of Me2you Italian Pasta Salad due to the Possible Presence of Listeria monocytogenes',
  'biological',
  0.26290953289773256),
 ("Recall of “Bellamy's Organic - Organic Brown Rice Pasta Stars”",
  'chemical',
  0.2357641851388549),
 ('Various brands of juice products recalled due to off odour',
  'organoleptic aspects',
  0.1527461141783999),
 ('baketime ltd recalls various branded biscuit, snack and confectionery products',
  'other hazard',
  0.1339119818958855),


In [30]:
def create_prompt(column, text, examples):
    # helper function replacing quotation marks in the text:
    replace_qm = lambda s: s.replace('"', "'")

    # create context:
    context = f'We are looking for food {column.split("-")[0]}s in texts. Here are some labeled examples sorted from most probable to least probable:\n'

    for x, y, _ in examples:
        context += f'\n"{replace_qm(x)}" => {y}'

    return {"role": "user", "content": f'{context}\n\nPlease predict the correct class for the following sample. Only provide the class label.\n\n"{replace_qm(text)}" => '}

# print sample prompt:
print(create_prompt('hazard-category', X_dev['hazard-category'][0], samples)["content"])

We are looking for food hazards in texts. Here are some labeled examples sorted from most probable to least probable:

"J. L. King & Co—Tuna Pasta Salad" => biological
"Coles—Garden Vegetable Pasta Sauce" => foreign bodies
"Enrico’s Kitchen Pty Ltd—Frozen pizza and pasta products" => foreign bodies
"Avanza Pasta, LLC Recalls Beef and Poultry Products  Produced Without Benefit of Inspection" => fraud
"Avanza Pasta, LLC Recalls Beef and Poultry Products  Produced Without Benefit of Inspection" => fraud
"Recall of Me2you Italian Pasta Salad due to the Possible Presence of Listeria monocytogenes" => biological
"Recall of “Bellamy's Organic - Organic Brown Rice Pasta Stars”" => chemical
"Various brands of juice products recalled due to off odour" => organoleptic aspects
"baketime ltd recalls various branded biscuit, snack and confectionery products" => other hazard
"Jumbo Importers recalls various KOO canned products because of defective cans" => packaging defect
"west london sandwiches rec

In [31]:
# load test set:
data_test = pd.read_csv('/kaggle/working/final_cleaned_validation.csv', index_col=0)
X_test = data_test['title'].values

data_test.sample()

Unnamed: 0_level_0,month,day,country,title,text
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019,4,2,us,Wakefern Food Corp. Voluntarily Recalls Wholes...,Wakefern Food Corp. has initiated a voluntary ...


In [None]:
from tqdm.notebook import tqdm

# predict test set:
for column in base_classifiers:
    # add new column to dataframe:
    data_test[column] = ['']*len(data_test)
    mask = data_test.columns == column

    for i, x in enumerate(tqdm(X_test, desc='Assessing texts')):
        # get 2 most similar texts in the training data:
        examples = get_few_shot_examples(column, x)
        
        if len(examples) > 1:
            # create prompt:
            prompt = create_prompt(column, x, examples)
    
            # prompt LLM:
            data_test.iloc[i, mask] = llm([prompt],
                bos_token_id=bos_token_id,
                eos_token_id=eos_token_id,
                pad_token_id=pad_token_id,
                max_new_tokens=32,
                do_sample=False,
                temperature=None,
                top_p=None
            )[0]["generated_text"][-1]["content"]
    
        else: data_test.iloc[i, mask] = examples[0]

data_test.sample()

In [None]:
data_test[['hazard-category', 'product-category']].to_csv('submission.csv')