In [None]:
import pandas as pd
import json
import os
import ast
import re
import numpy as np
from datasets import Dataset
import matplotlib.pyplot as plt

# For LLM
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer, setup_chat_format, SFTConfig, DataCollatorForCompletionOnlyLM

import torch
from time import time
 
# For wandb
import wandb
# Set seed
import pickle
set_seed(42)
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers import BatchEncoding, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel
from peft import LoraConfig, PeftModel, get_peft_model, get_peft_model_state_dict
import seaborn as sns
from copy import deepcopy
from tqdm import tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Move the kernel into the project root so the relative "data/raw/..." paths
# used by later cells resolve.
# NOTE: when the kernel is not started in /root this steps up one directory,
# so re-running the cell moves up again each time; run it once per kernel.
started_in_root = os.getcwd() == '/root'
if not started_in_root:
    os.chdir("..")
else:
    new_path = "/root/0_Thesis/0_final/"
    os.chdir(new_path)
print(os.getcwd())

/root/0_Thesis/0_final


In [None]:
# Hub ids of the pre-quantised ("bnb-4bit") instruct checkpoints to label with.
mstral7b = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
llama8b = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
gemma9b = "unsloth/gemma-2-9b-it-bnb-4bit"
qwen14b = "unsloth/Qwen2.5-14B-Instruct-bnb-4bit"

# Each entry is [model id, batch size]; the labeling loops read them by position.
models = [
    [mstral7b, 200],
    [llama8b, 200],
    [gemma9b, 100],
    [qwen14b, 100],
]

# Section 1: Binary labeling (hate speech vs. normal)

In [None]:
def labeling_binary(model_id, df,  batch_size = 200):
    """Score every row of ``df["text"]`` as hate speech (1) vs. normal (2) with one LLM.

    The model is loaded through ``FastLanguageModel.from_pretrained`` in 4-bit,
    each text is wrapped in the annotation prompt and the model's chat template,
    and a single forward pass per batch yields the next-token distribution.
    The probabilities of the two label tokens are stored per row.

    Args:
        model_id: Hub id of the model to load. It also selects the system
            message, the chat layout and the label-token ids, and it prefixes
            the output column names.
        df: DataFrame with a ``"text"`` column. It is modified in place.
        batch_size: Number of prompts per forward pass.

    Returns:
        The same ``df`` with a ``"prompt"`` column (overwritten on every call)
        and two new columns ``model_id + "_label_1"`` and
        ``model_id + "_label_2"`` holding the softmax probability of each label
        token. The two values are taken from the full-vocabulary softmax, so
        they do not necessarily sum to 1.

    Raises:
        AssertionError: if exactly two label-token ids cannot be resolved.
    """
    max_seq_length = 500

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = max_seq_length,
        dtype = torch.float16,
        load_in_4bit = True,
    )

    FastLanguageModel.for_inference(model)
    # The label distribution is read from the LAST position of every row, so
    # padding must go on the left; with right padding the last position of a
    # shorter prompt would be a pad token. Matches labeling_multi.
    tokenizer.padding_side = "left"

    user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
    1 for Hate Speech,
    2 for Normal Speech.

    Provide only the number corresponding to the category. Do not include any explanation or additional text or your thinking process.
    Do you think the following comment is hate speech or offensive speech?
    \n"{comment}"\n
    Your Answer:
    '''

    if "Qwen" in model_id:
        system_content = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant"
    else:
        system_content = "You are a helpful assistant"
    # Gemma prompts are built from the user turn only (no system message).
    use_system_message = "gemma" not in model_id

    def preprocess(text):
        """Render one comment into the full chat-formatted prompt string."""
        messages = [{
            "role": "user",
            "content": user_message_template.format(comment=text)
        }]
        if use_system_message:
            messages.insert(0, {"role": "system", "content": system_content})

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        if "mistral" in model_id:
            # Mistral prompts get one trailing space before the answer position.
            prompt += " "
        return prompt

    df["prompt"] = df["text"].apply(preprocess)

    # Vocabulary ids whose probabilities are reported, in label order (1, 2).
    if "Qwen" in model_id:
        label_token_ids = tokenizer(["12"])['input_ids'][0]
    elif "lama" in model_id:
        # Hard-coded ids; presumably the tokens "1" and "2" -- TODO confirm against the tokenizer.
        label_token_ids = [16,17]
    elif "mistral" in model_id:
        # Hard-coded ids; presumably the tokens "1" and "2" -- TODO confirm against the tokenizer.
        label_token_ids = [29508, 29518]
    else:
        # Drop the first id of the encoding (presumably a BOS token -- TODO confirm).
        label_token_ids = tokenizer(["12"])['input_ids'][0][1:]
    assert len(label_token_ids) == 2

    label_index = torch.tensor(label_token_ids)

    def process_task(texts):
        """Return a [len(texts), 2] array of label-token probabilities."""
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to('cuda')

        with torch.no_grad():
            outputs = model(**encoding)
            logits = outputs.logits  # Shape: [batch_size, sequence_length, vocab_size]
        last_token_logits = logits[:, -1, :]  # Shape: [batch_size, vocab_size]
        probabilities = torch.softmax(last_token_logits, dim=-1)
        return probabilities[:, label_index].float().cpu().numpy()

    prompts = df["prompt"].tolist()
    probs_value = []
    # Slicing never produces an empty batch, so no separate tail flush is needed.
    for start in tqdm(range(0, len(prompts), batch_size)):
        probs_value += process_task(prompts[start:start + batch_size]).tolist()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    df[model_id +"_label_1"] = [pair[0] for pair in probs_value]
    df[model_id +"_label_2"] = [pair[1] for pair in probs_value]
    return df

In [None]:
# Binary labeling of the m1 datasets: every model adds its probability columns
# to the same frame, and the frame is written out after each model so partial
# results survive an interrupted run.
m1_files = ["m1_eng.csv", "m1_deu.csv", "m1_vie.csv"]
for name_ds in m1_files:
    raw_path = "data/raw/" + name_ds
    df = pd.read_csv(raw_path)
    for model_id in models:
        model_name, model_batch_size = model_id[0], model_id[1]
        df = labeling_binary(model_id=model_name, df=df, batch_size=model_batch_size)
        df.to_csv(raw_path + "_binary", index=False)

In [None]:
# Binary labeling of the m2 datasets: every model adds its probability columns
# to the same frame, and the frame is written out after each model so partial
# results survive an interrupted run.
m2_files = ["m2_eng_171k.csv", "m2_eng_130k.csv", "m2_deu_155k.csv", "m2_vie_6k.csv"]
for name_ds in m2_files:
    raw_path = "data/raw/" + name_ds
    df = pd.read_csv(raw_path)
    for model_id in models:
        model_name, model_batch_size = model_id[0], model_id[1]
        df = labeling_binary(model_id=model_name, df=df, batch_size=model_batch_size)
        df.to_csv(raw_path + "_binary", index=False)

# Section 2: Multi-class labeling (hate / offensive / normal)

In [None]:
def labeling_multi(model_id, df, batch_size = 200, zero_shot=True):
    """Score every row of ``df["text"]`` as hate (1), offensive (2) or normal (3).

    The model is loaded through ``FastLanguageModel.from_pretrained`` in 4-bit,
    each text is wrapped in the annotation prompt and the model's chat template,
    and a single forward pass per batch yields the next-token distribution.
    The probabilities of the three label tokens are stored per row.

    Args:
        model_id: Hub id of the model to load. It also selects the system
            message, the chat layout and the label-token ids, and it prefixes
            the output column names.
        df: DataFrame with a ``"text"`` column. It is modified in place.
        batch_size: Number of prompts per forward pass. The value passed by the
            caller is honoured (it was previously overwritten with 300).
        zero_shot: When True the short instruction-only prompt is used; when
            False a longer prompt with step-by-step guidance and three worked
            examples is used.

    Returns:
        The same ``df`` with a ``"prompt"`` column (overwritten on every call)
        and three new columns ``model_id + "_label_1"`` .. ``"_label_3"``
        holding the softmax probability of each label token. The values are
        taken from the full-vocabulary softmax, so they do not necessarily
        sum to 1.

    Raises:
        AssertionError: if exactly three label-token ids cannot be resolved.
    """
    max_seq_length = 500

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id,
        max_seq_length = max_seq_length,
        dtype = torch.float16,
        load_in_4bit = True,
    )

    FastLanguageModel.for_inference(model)
    # The label distribution is read from the LAST position of every row, so
    # padding must go on the left.
    tokenizer.padding_side = "left"


    user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
    1 for Hate Speech.
    2 for Offensive Speech.
    3 for Normal Speech.

    Provide only the number corresponding to the category. Do not include any explanation or additional text or your thinking process.
    Do you think the following comment is hate speech or offensive speech?
    \n"{comment}"\n
    Your Answer:
    '''
    if not zero_shot: 
        user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
            1 for Hate Speech.
            2 for Offensive Speech.
            3 for Normal Speech.

            Think step by step:
            - Identify whether the comment contains explicit hate towards a group or individual based on identity characteristics (Hate Speech).
            - If the comment is aggressive, rude, or contains insults but lacks clear hateful intent, classify it as Offensive Speech.
            - If the comment is neutral or non-offensive, classify it as Normal Speech.

            Examples:
            Comment: "People of [group] are a disease and should be removed."
            Your Answer: 1

            Comment: "You're so stupid, just shut up!"
            Your Answer: 2

            Comment: "I hope you have a great day!"
            Your Answer: 3

            Provide only the number corresponding to the category. Do not include any explanation or additional text or your thinking process.
            Do you think the following comment is hate speech or offensive speech?
            \n"{comment}"\n
            Your Answer:
            '''

    if "Qwen" in model_id:
        system_content = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant"
    else:
        system_content = "You are a helpful assistant"
    # Gemma prompts are built from the user turn only (no system message).
    use_system_message = "gemma" not in model_id

    def preprocess(text):
        """Render one comment into the full chat-formatted prompt string."""
        messages = [{
            "role": "user",
            "content": user_message_template.format(comment=text)
        }]
        if use_system_message:
            messages.insert(0, {"role": "system", "content": system_content})

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        if "mistral" in model_id:
            # Mistral prompts get one trailing space before the answer position.
            prompt += " "
        return prompt

    df["prompt"] = df["text"].apply(preprocess)

    # Vocabulary ids whose probabilities are reported, in label order (1, 2, 3).
    if "Qwen" in model_id:
        label_token_ids = tokenizer(["123"])['input_ids'][0]
    elif "lama" in model_id:
        # Hard-coded ids; presumably the tokens "1", "2", "3" -- TODO confirm against the tokenizer.
        label_token_ids = [16,17,18]
    elif "mistral" in model_id:
        # Hard-coded ids; presumably the tokens "1", "2", "3" -- TODO confirm.
        # (The original code carried a stray note "29549" here; its meaning is unclear.)
        label_token_ids = [29508, 29518, 29538]
    else:
        # Drop the first id of the encoding (presumably a BOS token -- TODO confirm).
        label_token_ids = tokenizer(["123"])['input_ids'][0][1:]
    assert len(label_token_ids) == 3

    label_index = torch.tensor(label_token_ids)

    def process_task(texts):
        """Return a [len(texts), 3] array of label-token probabilities."""
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model(**encoding)
            logits = outputs.logits  # Shape: [batch_size, sequence_length, vocab_size]
        last_token_logits = logits[:, -1, :]  # Shape: [batch_size, vocab_size]
        probabilities = torch.softmax(last_token_logits, dim=-1)
        return probabilities[:, label_index].float().cpu().numpy()

    prompts = df["prompt"].tolist()
    probs_value = []
    # Slicing never produces an empty batch, so a row count that is an exact
    # multiple of batch_size (or an empty frame) no longer sends an empty list
    # to the tokenizer in a trailing flush.
    for start in tqdm(range(0, len(prompts), batch_size)):
        probs_value += process_task(prompts[start:start + batch_size]).tolist()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

    df[model_id +"_label_1"] = [triple[0] for triple in probs_value]
    df[model_id +"_label_2"] = [triple[1] for triple in probs_value]
    df[model_id +"_label_3"] = [triple[2] for triple in probs_value]

    return df

In [None]:
# Three-class zero-shot labeling of the m2 datasets: every model adds its
# probability columns to the same frame, and the frame is written out after
# each model so partial results survive an interrupted run.
m2_files = ["m2_eng_171k.csv", "m2_eng_130k.csv", "m2_deu_155k.csv", "m2_vie_6k.csv"]
for name_ds in m2_files:
    raw_path = "data/raw/" + name_ds
    df = pd.read_csv(raw_path)
    for model_id in models:
        model_name, model_batch_size = model_id[0], model_id[1]
        df = labeling_multi(model_id=model_name, df=df, batch_size=model_batch_size, zero_shot=True)
        df.to_csv(raw_path + "_multi", index=False)

In [None]:
# NOTE(review): this cell repeats the zero-shot multi-class run on the same
# files with the same arguments and writes to the same "_multi" output paths,
# so it overwrites the result of an identical earlier run. Possibly
# zero_shot=False with a different output suffix was intended -- confirm.
m2_files = ["m2_eng_171k.csv", "m2_eng_130k.csv", "m2_deu_155k.csv", "m2_vie_6k.csv"]
for name_ds in m2_files:
    raw_path = "data/raw/" + name_ds
    df = pd.read_csv(raw_path)
    for model_id in models:
        model_name, model_batch_size = model_id[0], model_id[1]
        df = labeling_multi(model_id=model_name, df=df, batch_size=model_batch_size, zero_shot=True)
        df.to_csv(raw_path + "_multi", index=False)

# SECTION 3: LGB
