<a href="https://colab.research.google.com/github/IGieckI/AIExperiments/blob/main/A2/Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U bitsandbytes transformers accelerate

import os
import random
import warnings
import pandas as pd
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from huggingface_hub import login

random.seed(42)

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


## Setup and Data Loading

In [15]:
# Login with huggingface token to get access to some models
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# Download of the two dataset from the github repository
!wget -O "a2_test.csv" "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2025-2026/Assignment%202/data/a2_test.csv"
!wget -O "demonstrations.csv" "https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2025-2026/Assignment%202/data/demonstrations.csv"

--2025-11-19 16:52:54--  https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2025-2026/Assignment%202/data/a2_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51789 (51K) [text/plain]
Saving to: ‘a2_test.csv’


2025-11-19 16:52:54 (7.58 MB/s) - ‘a2_test.csv’ saved [51789/51789]

--2025-11-19 16:52:54--  https://raw.githubusercontent.com/nlp-unibo/nlp-course-material/refs/heads/main/2025-2026/Assignment%202/data/demonstrations.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224102 (219K) [tex

In [17]:
# Load data into pandas DataFrames
test_df = pd.read_csv("a2_test.csv")
demonstrations_df = pd.read_csv("demonstrations.csv")

print("Test Set Label distribution:\n", test_df["label_category"].value_counts())
test_df.head()

Test Set Label distribution:
 label_category
derogation    60
prejudiced    60
animosity     60
not-sexist    60
threats       60
Name: count, dtype: int64


Unnamed: 0,rewire_id,text,label_category
0,sexism2022_english-11647,She may be worth what someone paid for her in ...,derogation
1,sexism2022_english-15883,The men who white knight for women eventually ...,prejudiced
2,sexism2022_english-12494,Two thoughts to keep in mind about women and A...,derogation
3,sexism2022_english-26,"Wimmin. No matter how good they have it, they'...",prejudiced
4,sexism2022_english-7475,"So just a few months ago, while Republican wom...",prejudiced


In [18]:
print("Demonstrations Label distribution\n", demonstrations_df["label_category"].value_counts())
demonstrations_df.head()

Demonstrations Label distribution
 label_category
derogation    200
threats       200
animosity     200
prejudiced    200
not-sexist    200
Name: count, dtype: int64


Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split
0,sexism2022_english-7417,female egg donors are allowed to be uggo as fu...,sexist,derogation,2.1 descriptive attacks,train
1,sexism2022_english-10155,My girlfriend is mentally retarded. Or if you ...,sexist,derogation,2.1 descriptive attacks,test
2,sexism2022_english-15154,Lock her UP!!!!,sexist,threats,1.2 incitement and encouragement of harm,train
3,sexism2022_english-13387,So you want to live in a world where women are...,sexist,animosity,"3.1 casual use of gendered slurs, profanities,...",dev
4,sexism2022_english-2990,"Special episode of Loomy Does Politics, by pop...",sexist,prejudiced,4.2 supporting systemic discrimination against...,train


In [19]:
# Creating the ground truth labels from the test set into numeric format
label_mapping = {
    "not-sexist": 0,
    "threats": 1,
    "derogation": 2,
    "animosity": 3,
    "prejudiced": 4,
}

le = LabelEncoder()
le.fit(
    list(label_mapping.keys())
)
y_true = le.transform(test_df["label_category"])

# Creating also the inverse mapping numeric_id to labels_name
idx_to_label = {v: k for k, v in label_mapping.items()}
label_names = [
    idx_to_label[i] for i in sorted(idx_to_label)
]

# Get the list of texts to process
test_texts = test_df["text"].tolist()

print(f"Ground truth labels examples: {y_true[:5]}")
print(f"Label names for plots: {label_names}")

Ground truth labels examples: [1 3 1 3 3]
Label names for plots: ['not-sexist', 'threats', 'derogation', 'animosity', 'prejudiced']


## Models Setup

In [20]:
# Defining the models we are going to use
models_to_test = {
    "Meta-Llama3.1-8B": "meta-llama/Llama-3.1-8B-Instruct",
    #"Mistral-7B": "mistralai/Mistral-7B-Instruct-v0.3",
    # "DeepSeek-R1-7B" : "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", # As this is a reasoning model we could test this too for bonus points
}

# Defining a 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Dictionary to hold loaded models and tokenizers
loaded_models = {}

def load_model_and_tokenizer(model_id, config):
    """
    Loads a model and its tokenizer with 4-bit quantization
    """
    print(f"Loading tokenizer for {model_id}")

    tokenizer = AutoTokenizer.from_pretrained(
        model_id
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set pad_token to eos_token for {model_id}")

    print(f"Loading model {model_id} with 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=config,
        device_map="cuda",
    )
    print(f"Successfully loaded {model_id}")
    return model, tokenizer

In [21]:
# Load all models
# !!! It's probably better not to load all the models at once, but to load them one by one and use them in that time, such that the total memory usage is lower
# !!! Moreover, we should try uploading the models on google drive and see if they load faster
for model_name, model_id in models_to_test.items():
    try:
        model, tokenizer = load_model_and_tokenizer(
            model_id, quantization_config
        )
        loaded_models[model_name] = {"model": model, "tokenizer": tokenizer}
    except Exception as e:
        print(f"Failed to load {model_name}. Error: {e}")

Loading tokenizer for meta-llama/Llama-3.1-8B-Instruct


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Set pad_token to eos_token for meta-llama/Llama-3.1-8B-Instruct
Loading model meta-llama/Llama-3.1-8B-Instruct with 4-bit quantization...


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Successfully loaded meta-llama/Llama-3.1-8B-Instruct


## Prompt Setup

In [22]:
# Define the zero-shot prompt template
ZERO_SHOT_PROMPT_TEMPLATE = [
    {
        'role': 'system',
        'content': 'You are an annotator for sexism detection.'
    },
    {
        'role': 'user',
        'content': """Your task is to classify input text as non-sexist
         or sexist. If sexist, classify input text according to one
         of the following four categories: threats, derogation,
         animosity, prejudiced discussion.

         Below you find sexist categories definitions:
         Threats: the text expresses intent or desire to harm a woman.
         Derogation: the text describes a woman in a derogative manner.
         Animosity: the text contains slurs or insults towards a woman.
         Prejudiced discussion: the text expresses supports for
         mistreatment of women as individuals.

         Respond only by writing one of the following categories:
         not-sexist, threats, derogation, animosity, prejudiced.

        TEXT: {text}

        ANSWER:
        """
    }
]

def prepare_prompts(texts, prompt_template, examples_str=None):
    """
    This function formats input text samples into instructions prompts.
    It's designed to handle both zero-shot and few-shot (via examples_str)

    Inputs:
      texts: input texts to classify via prompting
      prompt_template: the prompt template (list of dicts)
      examples_str: (Optional) A formatted string of few-shot examples

    Outputs:
      A list of chat histories (list of lists of dicts)
    """
    prepared_prompts = []

    # Deep copy the template components
    system_prompt = prompt_template[0].copy()
    user_content_template = prompt_template[1]['content']

    # If few-shot examples are provided, inject them
    if examples_str:
        # Check if the placeholder exists
        if "{examples}" in user_content_template:
            user_content_template = user_content_template.replace("{examples}", examples_str)
        else:
            print("Warning: `examples_str` provided but no `{examples}` placeholder found in template.")

    # Process each text
    for text in texts:
        # Create the final user content string
        final_user_content = user_content_template.replace("{text}", text)

        # Create the chat history for this specific sample
        chat_history = [
            system_prompt,
            {"role": "user", "content": final_user_content}
        ]
        prepared_prompts.append(chat_history)

    return prepared_prompts

In [23]:
# Test the function with the first text
print("Testing prepare_prompts (zero-shot):")
sample_prompts = prepare_prompts(
    [test_texts[0]],
    ZERO_SHOT_PROMPT_TEMPLATE,
    None,  # Tokenizer not needed for this implementation
)
import json

print(json.dumps(sample_prompts[0], indent=2))

Testing prepare_prompts (zero-shot):
[
  {
    "role": "system",
    "content": "You are an annotator for sexism detection."
  },
  {
    "role": "user",
    "content": "Your task is to classify input text as non-sexist\n         or sexist. If sexist, classify input text according to one\n         of the following four categories: threats, derogation,\n         animosity, prejudiced discussion.\n\n         Below you find sexist categories definitions:\n         Threats: the text expresses intent or desire to harm a woman.\n         Derogation: the text describes a woman in a derogative manner.\n         Animosity: the text contains slurs or insults towards a woman.\n         Prejudiced discussion: the text expresses supports for\n         mistreatment of women as individuals.\n\n         Respond only by writing one of the following categories:\n         not-sexist, threats, derogation, animosity, prejudiced.\n\n        TEXT: She may be worth what someone paid for her in terms of stimul

## Metrics

In [24]:
def process_response(response):
    """
    This function takes a textual response generated by the LLM
    and processes it to map the response to a numeric label.

    It takes the generated response from LLM as input and checks for
	the presence of specific keywords corresponding to each label, then
	returns the appropriate numeric label based on the mapping.
    note: if no keyword is found, default is set to 0 (not-sexist) for task 4
    """
    # Clean and normalize the response
    clean_response = str(response).lower().strip().replace("'", "")

    # Check for each label keyword
    if "threats" in clean_response:
        return 1
    if "derogation" in clean_response:
        return 2
    if "animosity" in clean_response:
        return 3
    if "prejudiced" in clean_response:
        return 4

    # If the response contains 'not-sexist', 'non-sexist' or nothing (task 4), 0 is returned
    return 0

def compute_metrics(y_pred_raw, y_true):
    """
    This function takes predicted raw responses and ground-truth labels
    and computes macro F1-score and fail-ratio.

    Inputs:
      y_pred_raw: list of raw LLM string responses
      y_true: list of ground-truth numeric labels

    Outputs:
      dictionary containing 'macro_f1' and 'fail_ratio'
    """
    expected_keywords = [
        "not-sexist",
        "threats",
        "derogation",
        "animosity",
        "prejudiced",
    ]

    y_pred_parsed = []
    failed_responses = 0

    for raw_response in y_pred_raw:
        # Check for failure (none of the answer keywords present)
        clean_raw = str(raw_response).lower().strip()
        if not any(keyword in clean_raw for keyword in expected_keywords):
            failed_responses += 1

        # Parse the response
        parsed_label = process_response(raw_response)
        y_pred_parsed.append(parsed_label)

    # Compute metrics
    fail_ratio = failed_responses / len(y_pred_raw)
    macro_f1 = f1_score(y_true, y_pred_parsed, average="macro")

    print("\n--- Classification Report ---")
    print(
        classification_report(
            y_true, y_pred_parsed, target_names=label_names, zero_division=0
        )
    )
    print("------------------------------")

    return {"macro_f1": macro_f1, "fail_ratio": fail_ratio}, y_pred_parsed

def generate_responses(model, tokenizer, prompt_examples):
    """
    This function implements the inference loop for an LLM model.
    Given a set of examples, the model is tasked to generate
    a response.

    Inputs:
      model: LLM model instance for prompting
      tokenizer: The tokenizer for the model
      prompt_examples: A list of pre-processed chat histories

    Outputs:
      generated responses (list of strings)
    """
    print(f"Generating {len(prompt_examples)} responses...")
    # We use a pipeline for efficient batching and clean text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )

    # Generate responses in batches
    # We set max_new_tokens low because we only expect a 1-2 word answer.
    # return_full_text=False ensures we only get the model's answer,
    # not the prompt.
    outputs = []
    for out in tqdm(
        pipe(
            prompt_examples,
            max_new_tokens=20, # !!! With reasoning, this value should be increased, while in direct inference is better to keep the value low (to just print the answer)
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # For deterministic output
            return_full_text=False,
            batch_size=32,  # Adjust batch size based on VRAM
        ),
        total=len(prompt_examples),
    ):
        # Extract the generated text
        if out and isinstance(out, list) and len(out) > 0:
            outputs.append(out[0]["generated_text"])
        else:
            outputs.append("")  # Append empty string on failure

    return outputs

## Zero-Shot Inference

In [25]:
# Store results for final comparison
experiment_results = []
# Store parsed predictions for confusion matrices
parsed_predictions = {}
# Store raw responses for qualitative analysis
raw_responses_log = {}

for model_name, components in loaded_models.items():
    print(f"\n--- Running Zero-Shot Inference for {model_name} ---")
    model = components["model"]
    tokenizer = components["tokenizer"]
    tokenizer.padding_side = 'left'

    # Prepare Prompts
    prompts = prepare_prompts(
        test_texts,
        ZERO_SHOT_PROMPT_TEMPLATE,
        examples_str=None,
    )

    # Generate Responses
    raw_responses = generate_responses(model, tokenizer, prompts)
    raw_responses_log[f"{model_name}_zero_shot"] = raw_responses

    # Compute Metrics
    metrics, y_pred = compute_metrics(raw_responses, y_true)
    parsed_predictions[f"{model_name}_zero_shot"] = y_pred

    # Store results
    result_entry = {
        "model": model_name,
        "setting": "Zero-Shot",
        "macro_f1": metrics["macro_f1"],
        "fail_ratio": metrics["fail_ratio"],
    }
    experiment_results.append(result_entry)

    print(
        f"{model_name} Zero-Shot Results: Macro F1 = {metrics['macro_f1']:.4f}, Fail Ratio = {metrics['fail_ratio']:.4f}"
    )

Device set to use cuda
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- Running Zero-Shot Inference for Meta-Llama3.1-8B ---
Generating 300 responses...


  0%|          | 0/300 [00:00<?, ?it/s]


--- Classification Report ---
              precision    recall  f1-score   support

  not-sexist       0.07      0.05      0.06        60
     threats       0.06      0.03      0.04        60
  derogation       0.14      0.07      0.09        60
   animosity       0.13      0.27      0.17        60
  prejudiced       0.18      0.20      0.19        60

    accuracy                           0.12       300
   macro avg       0.11      0.12      0.11       300
weighted avg       0.11      0.12      0.11       300

------------------------------
Meta-Llama3.1-8B Zero-Shot Results: Macro F1 = 0.1107, Fail Ratio = 0.0233


## Few-Shot Inference

In [26]:
FEW_SHOT_PROMPT_TEMPLATE = [
    {
        "role": "system",
        "content": "You are an annotator for sexism detection.",
    },
    {
        "role": "user",
        "content": """Your task is to classify input text as non-sexist
         or sexist. If sexist, classify input text according to one
         of the following four categories: threats, derogation,
         animosity, prejudiced discussion.

         Below you find sexist categories definitions:
         Threats: the text expresses intent or desire to harm a woman.
         Derogation: the text describes a woman in a derogative manner.
         Animosity: the text contains slurs or insults towards a woman.
         Prejudiced discussion: the text expresses supports for
         mistreatment of women as individuals.

         Respond only by writing one of the following categories:
         not-sexist, threats, derogation, animosity, prejudiced.

        EXAMPLES:
        {examples}

        TEXT: {text}

        ANSWER:
        """,
    },
]

N_SHOTS = 2
# !!! ANOTHER WAT WE COULD DO IS WITH A SORT OF DYNAMIC DEMONSTRATIONS SELECTION BASED ON THE TEST INPUT OR SOMETHING
print(f"Building {N_SHOTS}-shot demonstrations...")

demo_samples = []

# Ensure labels are in the same format as our mapping
labels_to_sample = list(label_mapping.keys())

for label in labels_to_sample:
	class_samples = demonstrations_df[demonstrations_df["label_category"] == label]

	n_samples = min(N_SHOTS, len(class_samples))
	selected = class_samples.sample(n=n_samples, random_state=42)
	demo_samples.extend(selected.to_dict("records"))

# Shuffle the selected demonstrations
random.shuffle(demo_samples)

# Format into the example string
examples_str = "\n"
for sample in demo_samples:
	examples_str += f"TEXT: {sample['text']}\n"
	examples_str += f"ANSWER: {sample['label_category']}\n\n"

few_shot_examples_str = examples_str.strip()

print("--- Demonstration Examples ---")
print(few_shot_examples_str)
print("------------------------------")

for model_name, components in loaded_models.items():
    print(f"\n--- Running Few-Shot Inference for {model_name} ---")
    model = components["model"]
    tokenizer = components["tokenizer"]

    # Prepare Prompts
    prompts = prepare_prompts(
        test_texts,
        FEW_SHOT_PROMPT_TEMPLATE,
        examples_str=few_shot_examples_str,
    )

    # Generate Responses
    raw_responses = generate_responses(model, tokenizer, prompts)
    raw_responses_log[f"{model_name}_few_shot"] = raw_responses

    # Compute Metrics
    metrics, y_pred = compute_metrics(raw_responses, y_true)
    parsed_predictions[f"{model_name}_few_shot"] = y_pred

    # Store results
    result_entry = {
        "model": model_name,
        "setting": f"{N_SHOTS}-Shot",
        "macro_f1": metrics["macro_f1"],
        "fail_ratio": metrics["fail_ratio"],
    }
    experiment_results.append(result_entry)

    print(
        f"{model_name} Few-Shot Results: Macro F1 = {metrics['macro_f1']:.4f}, Fail Ratio = {metrics['fail_ratio']:.4f}"
    )

Device set to use cuda


Building 2-shot demonstrations...
--- Demonstration Examples ---
TEXT: I'd just pretend to get locked down, set a wedding date, get her pregnant, then leave the country. Nobody is going to extradite me based on that horseshit allegation.
ANSWER: threats

TEXT: No one knows the struggle of young black men like some old white bitch from Arkansas. Yea girl you go. #presidentialdebates
ANSWER: derogation

TEXT: Two things. 1st is when a woman posts that, she's saying she's ignorant. Ignorance is the lack of awareness. 2nd is a little thug in him is an invitation to slap her around, you know, just a little.
ANSWER: threats

TEXT: Good Christian Girl > Into Bondage Are you retarded? How is the cognitive dissonance not blowing your mind apart? All women like it rough but the ones in BDSM are damaged beyond belief and completely fucked up in the head.
ANSWER: derogation

TEXT: I think Torba is afraid of getting sued because people can interpret that tweet as glorifying violence against people?

  0%|          | 0/300 [00:00<?, ?it/s]


--- Classification Report ---
              precision    recall  f1-score   support

  not-sexist       0.17      0.10      0.12        60
     threats       0.13      0.07      0.09        60
  derogation       0.07      0.03      0.05        60
   animosity       0.07      0.07      0.07        60
  prejudiced       0.16      0.42      0.24        60

    accuracy                           0.14       300
   macro avg       0.12      0.14      0.11       300
weighted avg       0.12      0.14      0.11       300

------------------------------
Meta-Llama3.1-8B Few-Shot Results: Macro F1 = 0.1131, Fail Ratio = 0.0000


## Error Analysis

In [27]:
results_df = pd.DataFrame(experiment_results)
results_df = results_df.sort_values(
    by=["model", "setting"], ascending=[True, False]
).reset_index(drop=True)
results_df = results_df.set_index(["model", "setting"])

print("--- Overall Experiment Results ---")
display(
    results_df.style.format(
        {"macro_f1": "{:.4f}", "fail_ratio": "{:.2%}"}
    )
)


--- Overall Experiment Results ---


Unnamed: 0_level_0,Unnamed: 1_level_0,macro_f1,fail_ratio
model,setting,Unnamed: 2_level_1,Unnamed: 3_level_1
Meta-Llama3.1-8B,Zero-Shot,0.1107,2.33%
Meta-Llama3.1-8B,2-Shot,0.1131,0.00%
