# Libraries

In [None]:
!pip install transformers
!pip install lime
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# First model classification

In [None]:
# Baseline run: classify every row of the dataset and measure accuracy
# against the ground-truth 'hatespeech' column.
from sklearn.metrics import accuracy_score

# Import the dataset.
# `read_csv` has no `ignore_index` keyword (passing it raises TypeError).
# The index is rebuilt with drop=True so that the positional `.loc[i, ...]`
# lookups used throughout the notebook are valid and no stray 'index'
# column is added to the frame.
data = pd.read_csv("data.csv")
data = data.reset_index(drop=True)

# Configure the classification model
MODEL = "tomh/toxigen_hatebert"  # Pre-trained hate speech model
# Text-classification pipeline with the BERT tokenizer; inputs longer than
# the model limit are truncated. The pipeline is placed on the device
# selected at the top of the notebook.
clf = pipeline(
    "text-classification",
    model=MODEL,
    tokenizer="bert-base-cased",
    truncation=True,
    device=device,
)

# Run the prediction on every row of the dataset
list_result = []  # Raw pipeline outputs, one single-element list per row
for i, text in enumerate(data["text"]):
    print(i)  # Progress monitoring
    list_result.append(clf(text))

# Extract the predicted label strings ("LABEL_0" / "LABEL_1")
risultati = [result[0]["label"] for result in list_result]

# Translate the model's label strings into the dataset's numeric encoding so
# that predictions can be compared with the 'hatespeech' column.
# NOTE(review): the mapping (LABEL_0 -> 2 = hate speech, LABEL_1 -> 0 =
# non-hate speech) is kept exactly as in the original; confirm it against the
# model card.
# An explicit lookup replaces the chained `data.col.replace(..., inplace=True)`
# calls, which pandas warns about and which do not reliably update the frame.
label_to_class = {"LABEL_0": 2, "LABEL_1": 0}
data["prediction_label_iniziale"] = [label_to_class[label] for label in risultati]

# Convert the predicted and true labels to integers for accuracy calculation
data["prediction_label_iniziale"] = data["prediction_label_iniziale"].astype(int)
data["hatespeech"] = data["hatespeech"].astype(int)

# Accuracy of the model before any attack
accuracy = accuracy_score(data["hatespeech"], data["prediction_label_iniziale"])

# Print the accuracy
print("Accuracy:", accuracy)

# LIME

In [None]:
# Import necessary libraries
import numpy as np
import lime  # Library for interpretable machine learning
import torch
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer  # Lime explainer for text data

# Names LIME attaches to the probability columns returned by the classifier
# function (matched by position).
class_names = ["LABEL_0", "LABEL_1"]

# Load the tokenizer and the hate speech classifier from Hugging Face, and
# place the model on the selected device (GPU or CPU).
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("tomh/toxigen_hatebert").to(device)

# Classifier function handed to LIME: maps a batch of texts to class probabilities
def predictor(texts):
    """Return the model's class probabilities for a batch of texts.

    Args:
        texts: Sequence of strings to classify (LIME passes its perturbed
            samples here).

    Returns:
        A NumPy array with one row per text holding the softmax of the
        model's logits.
    """
    # Truncate like the classification pipeline does, so over-long inputs do
    # not exceed the model's maximum sequence length.
    encoded = tokenizer(
        list(texts), return_tensors="pt", padding=True, truncation=True
    ).to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    # Normalise over the class dimension explicitly (the implicit-dim form of
    # softmax is deprecated).
    probas = F.softmax(logits, dim=-1).cpu().numpy()
    return probas

# Build the LIME explainer used for every sample
explainer = LimeTextExplainer(class_names=class_names)

# Per-sample explanation results: the selected words and their LIME weights
lista_words_general = []
lista_values_general = []

# Explain the prediction of every row in the dataset
for row_idx in range(len(data)):
    torch.cuda.empty_cache()  # Release cached GPU memory between samples
    with torch.no_grad():  # Inference only
        print(row_idx)  # Progress monitoring

        # 2 is the numeric code stored for "LABEL_0" predictions
        predicted_label_0 = data.loc[row_idx, "prediction_label_iniziale"] == 2

        # Explain the current text with up to 20 features from 100 perturbed samples
        exp = explainer.explain_instance(
            data.loc[row_idx, "text"], predictor, num_features=20, num_samples=100
        )

        # NOTE(review): `exp.as_list()` is called with its default label, so
        # the weights are relative to one class only; confirm that the sign
        # filter below picks the words the attack is meant to target.
        if predicted_label_0:
            # Keep positively weighted words, largest weight first
            ranked = sorted(
                (pair for pair in exp.as_list() if pair[1] > 0),
                key=lambda pair: pair[1],
                reverse=True,
            )
        else:
            # Keep negatively weighted words, most negative first
            ranked = sorted(
                (pair for pair in exp.as_list() if pair[1] < 0),
                key=lambda pair: pair[1],
            )

        # Store the words and their weights for this sample
        lista_words_general.append([word for word, _ in ranked])
        lista_values_general.append([weight for _, weight in ranked])

# Attach the ranked words to the DataFrame as a new column
data["lime_words"] = lista_words_general

# Set up the adversarial attack

In [None]:
import math

# Swap out the single character at `index` for `replacement`
def replace_str_index(text, index=0, replacement=''):
    """Return `text` with the character at `index` replaced by `replacement`.

    With the defaults the first character is simply dropped.
    """
    head = text[:index]
    tail = text[index + 1:]
    return "{}{}{}".format(head, replacement, tail)

# Look up a replacement word (a different lemma name) via `wordnet`
# NOTE(review): `wordnet` is not imported anywhere in the visible part of this
# file; it presumably refers to nltk.corpus.wordnet - confirm before use.
def bug_sub_w(word):
    """Return the first usable lemma name for `word`, or "" if there is none.

    The synsets of `word` are scanned in order; within each synset the first
    lemma name that differs from `word` is taken as the candidate.
    """
    for synset in wordnet.synsets(word):
        candidate = next(
            (lemma for lemma in synset.lemma_names() if lemma != word), ""
        )
        if candidate != "":
            return candidate
    return ""

# Split the word in two by inserting a space at its midpoint
def bug_insert(word):
    """Return `word` with a space inserted at index len(word) // 2.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        return word
    middle = len(word) // 2
    return " ".join((word[:middle], word[middle:]))

# Drop the character at the midpoint of the word
def bug_delete(word):
    """Return `word` without the character at index len(word) // 2.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        return word
    chars = list(word)
    del chars[len(chars) // 2]
    return "".join(chars)

# Exchange the midpoint character with the one just before it
def bug_swap(word):
    """Return `word` with the characters at len(word) // 2 - 1 and len(word) // 2 swapped.

    Words shorter than two characters are returned unchanged; a two-character
    word is simply reversed.
    """
    if len(word) < 2:
        return word
    chars = list(word)
    middle = len(chars) // 2
    chars[middle - 1], chars[middle] = chars[middle], chars[middle - 1]
    return "".join(chars)

# Function to replace a character near the middle of the word with a common
# "leet" look-alike (e.g., 'a' to '@')
_LEET_SUBSTITUTIONS = {
    "a": "@",
    "b": "8",
    "c": "(",
    "e": "3",
    "g": "6",
    "h": "#",
    "i": "!",
    "l": "1",
    "o": "0",
    "s": "$",
    "t": "7",
    "z": "2",
}


def bug_sub_c(word):
    """Return the lower-cased `word` with one middle character "leet"-substituted.

    The character at index ceil(len(word) / 2) is tried first; if it has no
    look-alike, the character just before it is tried. At most one character
    is replaced. Words of two characters or fewer, and words where neither
    candidate has a look-alike, are returned lower-cased but otherwise
    unchanged.
    """
    word = word.lower()
    if len(word) <= 2:
        return word

    middle = math.ceil(len(word) / 2)
    # A single table lookup replaces the two duplicated if/elif chains.
    for position in (middle, middle - 1):
        substitute = _LEET_SUBSTITUTIONS.get(word[position])
        if substitute is not None:
            return word[:position] + substitute + word[position + 1:]
    return word


# Attack on 1 word

In [None]:
import random
import re

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Number of top-ranked LIME words perturbed in each sentence
n_words = 1

# Define the list of attack functions
# NOTE(review): bug_sub_c is listed twice, so it is drawn more often than the
# other perturbations; confirm this weighting is intended.
funzioni = [bug_insert, bug_delete, bug_swap, bug_sub_c, bug_sub_c]

# Build the perturbed version of every sentence
lista_frasi = []
for x in range(len(data)):
    print(x)
    frase = data.loc[x, "text"]  # Original, un-attacked text
    lime_words = data.loc[x, "lime_words"][:n_words]  # Most important word(s)
    # Randomly select one attack function per targeted word
    funzioni_selezionate = random.sample(funzioni, n_words)

    # Perturbed form of each targeted word
    replacements = {
        word: funzione(word)
        for word, funzione in zip(lime_words, funzioni_selezionate)
        if word
    }
    if replacements:
        # Replace whole-word occurrences only, in a single pass, so that
        # substrings of other words and already-perturbed text are left alone.
        pattern = re.compile(
            r"\b(?:" + "|".join(map(re.escape, replacements)) + r")\b"
        )
        frase = pattern.sub(lambda match: replacements[match.group(0)], frase)

    lista_frasi.append(frase)  # Add the modified sentence to the list

# Keep the original 'text' column untouched so that other attack experiments
# start from the clean sentences; store the perturbed ones separately.
data["text_attacked"] = lista_frasi

# Predict the label of each modified sentence
list_result = []
for i, frase in enumerate(lista_frasi):
    print(i)
    list_result.append(clf(frase))

# Extract the predicted labels from the results
risultati = [result[0]["label"] for result in list_result]

# Convert the model's label strings to the dataset's numeric encoding
# (same mapping as for the initial predictions). An explicit lookup replaces
# the chained inplace `replace` calls on the column.
label_to_class = {"LABEL_0": 2, "LABEL_1": 0}
data["prediction_label_finale"] = [label_to_class[label] for label in risultati]
data["prediction_label_finale"] = data["prediction_label_finale"].astype(int)

# Calculate accuracy by comparing the predicted labels with the true labels
accuracy = accuracy_score(data["hatespeech"], data["prediction_label_finale"])

print("ACCURACY AFTER ATTACK ON 1 WORD:", accuracy)

# Compare correct predictions on the class-2 samples before and after the attack
data_pred = data[data["hatespeech"] == 2]  # Samples whose true label is 2
data_initial = data_pred[data_pred.prediction_label_iniziale == 2]  # Correct before the attack
data_final = data_pred[data_pred.prediction_label_finale == 2]  # Correct after the attack

# Print the number of correct predictions before and after the attack
print("Number of correct predictions before attack", len(data_initial))
print("Number of correct predictions after attack", len(data_final))

# Attack Success Rate (ASR): percentage of correct predictions lost after the
# attack. Reported as NaN when there were no correct predictions to lose.
if len(data_initial) > 0:
    asr = ((len(data_initial) - len(data_final)) / len(data_initial)) * 100
else:
    asr = float("nan")
print("ASR: ", asr)

# Confusion matrix over every class value that actually occurs, so the tick
# labels always match the matrix rows and columns.
class_values = sorted(set(data["hatespeech"]) | set(data["prediction_label_finale"]))
cm = confusion_matrix(
    list(data["hatespeech"]), list(data["prediction_label_finale"]), labels=class_values
)

# Plot the confusion matrix using seaborn for better visualization
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=[f"Predicted {value}" for value in class_values],
    yticklabels=[f"Actual {value}" for value in class_values],
)
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.title("Confusion Matrix")
plt.show()

# Attack on 3 words

In [None]:
# Attack on 3 words - same steps as the 1-word attack
import random
import re

# Number of top-ranked LIME words perturbed in each sentence
n_words = 3

# Attack functions to draw from
# NOTE(review): bug_sub_c is listed twice, so it is drawn more often than the
# other perturbations; confirm this weighting is intended.
funzioni = [bug_insert, bug_delete, bug_swap, bug_sub_c, bug_sub_c]

# Build the perturbed version of every sentence
lista_frasi = []
for x in range(len(data)):
    print(x)
    frase = data.loc[x, "text"]
    lime_words = data.loc[x, "lime_words"][:n_words]  # Words to attack
    funzioni_selezionate = random.sample(funzioni, n_words)

    # Each targeted word gets its own randomly drawn perturbation. The
    # original loop re-read the sentence on every pass over the functions, so
    # only the last function's edits survived.
    replacements = {
        word: funzione(word)
        for word, funzione in zip(lime_words, funzioni_selezionate)
        if word
    }
    if replacements:
        # Replace whole-word occurrences only, in a single pass, so that
        # substrings of other words and already-perturbed text are left alone.
        pattern = re.compile(
            r"\b(?:" + "|".join(map(re.escape, replacements)) + r")\b"
        )
        frase = pattern.sub(lambda match: replacements[match.group(0)], frase)

    lista_frasi.append(frase)

# Keep the original 'text' column untouched so that other attack experiments
# start from the clean sentences; store the perturbed ones separately.
data["text_attacked"] = lista_frasi

# Predict the label of each modified sentence
list_result = [clf(frase) for frase in lista_frasi]

# Extract the predicted labels from the results
risultati = [result[0]["label"] for result in list_result]

# Convert the model's label strings to the dataset's numeric encoding
# (same mapping as for the initial predictions). An explicit lookup replaces
# the chained inplace `replace` calls on the column.
label_to_class = {"LABEL_0": 2, "LABEL_1": 0}
data["prediction_label_finale"] = [label_to_class[label] for label in risultati]
data["prediction_label_finale"] = data["prediction_label_finale"].astype(int)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(data["hatespeech"], data["prediction_label_finale"])

print("ACCURACY AFTER ATTACK ON 3 WORDS:", accuracy)

# Compare correct predictions on the class-2 samples before and after the attack
data_pred = data[data["hatespeech"] == 2]
data_iniziale = data_pred[data_pred.prediction_label_iniziale == 2]
data_finale = data_pred[data_pred.prediction_label_finale == 2]
print("Numero predizioni giuste pre-attacco", len(data_iniziale))
print("Numero predizioni giuste post-attacco", len(data_finale))

# Attack Success Rate (ASR): percentage of correct predictions lost after the
# attack. Reported as NaN when there were no correct predictions to lose.
if len(data_iniziale) > 0:
    asr = ((len(data_iniziale) - len(data_finale)) / len(data_iniziale)) * 100
else:
    asr = float("nan")
print("ASR: ", asr)

# Confusion matrix over every class value that actually occurs, so the tick
# labels always match the matrix rows and columns.
class_values = sorted(set(data["hatespeech"]) | set(data["prediction_label_finale"]))
cm = confusion_matrix(
    list(data["hatespeech"]), list(data["prediction_label_finale"]), labels=class_values
)

# Plot confusion matrix using seaborn
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=[f"Predicted {value}" for value in class_values],
    yticklabels=[f"Actual {value}" for value in class_values],
)
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.title("Confusion Matrix")
plt.show()


# Attack on 5 words

In [None]:
# Attack on 5 words - same steps as the 1-word attack
import random
import re

# Number of top-ranked LIME words perturbed in each sentence
n_words = 5

# List of functions that perform different types of text manipulations
# NOTE(review): bug_sub_c is listed twice; confirm this is intended.
funzioni = [bug_insert, bug_delete, bug_swap, bug_sub_c, bug_sub_c]

# List to store modified sentences
lista_frasi = []

# Loop through each row in the dataset
for x in range(len(data)):
    print(x)
    frase = data.loc[x, "text"]  # Original sentence
    lime_words = data.loc[x, "lime_words"][:n_words]  # Words to attack

    # Draw the attack functions in random order, one per targeted word
    funzioni_selezionate = random.sample(funzioni, n_words)

    # Each targeted word gets its own perturbation. The original loop re-read
    # the sentence on every pass over the functions, so only the last
    # function's edits survived.
    replacements = {
        word: funzione(word)
        for word, funzione in zip(lime_words, funzioni_selezionate)
        if word
    }
    if replacements:
        # Replace whole-word occurrences only, in a single pass, so that
        # substrings of other words and already-perturbed text are left alone.
        pattern = re.compile(
            r"\b(?:" + "|".join(map(re.escape, replacements)) + r")\b"
        )
        frase = pattern.sub(lambda match: replacements[match.group(0)], frase)

    lista_frasi.append(frase)  # Add the modified sentence to the list

# Keep the original 'text' column untouched; store the perturbed sentences
# in their own column.
data["text_attacked"] = lista_frasi

# Apply the classifier to each modified sentence
list_result = [clf(frase) for frase in lista_frasi]

# Extract the predicted labels from the classifier's result
risultati = [result[0]["label"] for result in list_result]

# Convert the model's label strings to the dataset's numeric encoding
# (same mapping as for the initial predictions). An explicit lookup replaces
# the chained inplace `replace` calls on the column.
label_to_class = {"LABEL_0": 2, "LABEL_1": 0}
data["prediction_label_finale"] = [label_to_class[label] for label in risultati]

# Convert the prediction labels to integers
data["prediction_label_finale"] = data["prediction_label_finale"].astype(int)

# Import the necessary metric for evaluating accuracy
from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model after applying the attacks
accuracy = accuracy_score(data["hatespeech"], data["prediction_label_finale"])
print("ACCURACY AFTER ATTACK ON 5 WORDS:", accuracy)

# Filter the rows where the true label is 2 (target label)
data_pred = data[data["hatespeech"] == 2]
# Rows where the initial prediction was also 2
data_iniziale = data_pred[data_pred.prediction_label_iniziale == 2]
# Rows where the prediction after the attack is 2
data_finale = data_pred[data_pred.prediction_label_finale == 2]

# Print the number of correct predictions before and after the attack
print("Numero predizioni giuste pre-attacco", len(data_iniziale))
print("Numero predizioni giuste post-attacco", len(data_finale))

# Attack Success Rate (ASR): percentage of correct predictions lost after the
# attack. Reported as NaN when there were no correct predictions to lose.
if len(data_iniziale) > 0:
    asr = ((len(data_iniziale) - len(data_finale)) / len(data_iniziale)) * 100
else:
    asr = float("nan")
print("ASR: ", asr)

# Confusion matrix over every class value that actually occurs, so the tick
# labels always match the matrix rows and columns.
class_values = sorted(set(data["hatespeech"]) | set(data["prediction_label_finale"]))
cm = confusion_matrix(
    list(data["hatespeech"]), list(data["prediction_label_finale"]), labels=class_values
)

# Plot the confusion matrix using seaborn for better visualization
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=[f"Predicted {value}" for value in class_values],
    yticklabels=[f"Actual {value}" for value in class_values],
)
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.title("Confusion Matrix")
plt.show()