In [None]:
import pandas as pd
import json
import os
import ast
import re
import numpy as np
from datasets import Dataset
import matplotlib.pyplot as plt

# For LLM
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)
from trl import SFTTrainer, setup_chat_format, SFTConfig, DataCollatorForCompletionOnlyLM

import torch
from time import time
 
# For wandb
import wandb
# Set seed
import pickle
set_seed(42)
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from transformers import BatchEncoding, DataCollatorForSeq2Seq
from unsloth import FastLanguageModel
from peft import LoraConfig, PeftModel, get_peft_model, get_peft_model_state_dict
import seaborn as sns
from copy import deepcopy
from tqdm import tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Move to the project directory so the relative "data/raw/..." paths used later resolve.
# NOTE(review): when the kernel does not start in /root, this steps up one directory,
# and it does so again on every re-run of the cell — confirm this is intended.
started_in_root = os.getcwd() == '/root'
if not started_in_root:
    os.chdir("..")
else:
    new_path = "/root/0_Thesis/0_final/"
    os.chdir(new_path)
print(os.getcwd())

/root/0_Thesis/0_final


In [None]:

# Model identifiers passed to FastLanguageModel.from_pretrained by the filtering
# functions; they are also used as the prefix of the probability column names.
# Going by the repo names these are pre-quantized 4-bit Qwen2.5-Instruct checkpoints.
qwen05 = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
qwen14b = "unsloth/Qwen2.5-14B-Instruct-bnb-4bit"


# Section 1: Filtering with Qwen 0.5B (hate speech vs. normal speech)

In [None]:
def filtering_qwen05(model_id, df, batch_size=2000):
    """Score each comment in ``df["text"]`` as hate speech (1) vs. normal speech (2).

    The model is loaded through Unsloth (4-bit weights, float16 compute), every
    comment is wrapped in a chat prompt, and one forward pass per batch is run.
    The next-token distribution at the last prompt position is read and the
    probabilities of the two answer tokens are stored per row.

    Args:
        model_id: Model name handed to ``FastLanguageModel.from_pretrained``. It
            also selects the model-family-specific prompt/token handling and is
            used as the prefix of the output column names.
        df: DataFrame with a ``"text"`` column. It is modified in place: a
            ``"prompt"`` column and the two probability columns are added.
        batch_size: Number of prompts per forward pass; must be at least 1.

    Returns:
        The same ``df`` object with the extra columns ``model_id + "_label_1"``
        and ``model_id + "_label_2"``. The values are softmax probabilities taken
        over the whole vocabulary, so the two columns need not sum to 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        AssertionError: If the answer-token lookup does not yield exactly two ids.
    """
    # NOTE(review): filtering_qwen14B repeats this logic with a different prompt;
    # a shared helper taking the prompt template would remove the duplication.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    max_seq_length = 500
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=torch.float16,
        load_in_4bit=True,
    )

    FastLanguageModel.for_inference(model)
    # With left padding the last real token of every prompt sits at position -1,
    # which is the position whose logits are read in process_task.
    tokenizer.padding_side = "left"

    user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
    1 for Hate Speech,
    2 for Normal Speech.

    Provide only the number corresponding to the category. Do not include any explanation or additional text or your thinking process.
    Do you think the following comment is hate speech or offensive speech?
    \n"{comment}"\n
    Your Answer:
    '''

    # Model-family switches are fixed for the whole call, so resolve them once.
    if "Qwen" in model_id:
        system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant"
    else:
        system_prompt = "You are a helpful assistant"
    skip_system_message = "gemma" in model_id
    append_space = "mistral" in model_id

    def preprocess(text):
        """Render one comment into the chat-formatted prompt string."""
        user_message = {
            "role": "user",
            "content": user_message_template.format(comment=text),
        }
        if skip_system_message:
            messages = [user_message]
        else:
            messages = [{"role": "system", "content": system_prompt}, user_message]

        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        if append_space:
            prompt += " "
        return prompt

    df["prompt"] = df["text"].apply(preprocess)

    # Token ids whose probabilities are reported as label 1 and label 2.
    if "Qwen" in model_id:
        stop_token_id = tokenizer(["12"])['input_ids'][0]
    elif "lama" in model_id:
        # presumably the ids of "1" and "2" in that vocabulary — TODO confirm
        stop_token_id = [16, 17]
    elif "mistral" in model_id:
        # presumably the ids of "1" and "2" in that vocabulary — TODO confirm
        stop_token_id = [29508, 29518]
    else:
        # presumably skips a leading special token added by the tokenizer — TODO confirm
        stop_token_id = tokenizer(["12"])['input_ids'][0][1:]
    assert len(stop_token_id) == 2

    label_indices = torch.tensor(stop_token_id)

    def process_task(texts):
        """Return an array of shape [len(texts), 2] with the two label probabilities."""
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to('cuda')

        with torch.no_grad():
            logits = model(**encoding).logits  # [batch_size, sequence_length, vocab_size]
        last_token_logits = logits[:, -1, :]  # [batch_size, vocab_size]
        probabilities = torch.softmax(last_token_logits, dim=-1)
        return probabilities[:, label_indices].float().cpu().numpy()

    prompts = df["prompt"].tolist()
    probs_value = []
    # The bar counts prompts (not batches) and has a known total.
    with tqdm(total=len(prompts)) as progress:
        for start in range(0, len(prompts), batch_size):
            batch = prompts[start:start + batch_size]
            probs_value += process_task(batch).tolist()
            progress.update(len(batch))
            # Release cached GPU memory before the next batch is allocated.
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    df[model_id + "_label_1"] = [pair[0] for pair in probs_value]
    df[model_id + "_label_2"] = [pair[1] for pair in probs_value]
    return df

In [None]:
# Score every language split with the 0.5B model and keep only the rows whose
# label-1 probability is at least 0.5; the result is written next to the input file.
for name_ds in ("m1_eng.csv", "m1_deu.csv", "m1_vie.csv"):
    raw_path = "data/raw/" + name_ds
    df = filtering_qwen05(model_id=qwen05, df=pd.read_csv(raw_path), batch_size=2000)
    keep_mask = df[qwen05 + "_label_1"] >= 0.5
    df = df[keep_mask]
    df.to_csv(raw_path + "_binary_filter_qwen05", index=False)

# Section 2: Filtering with Qwen 14B (human-to-human discussion comments)

In [None]:
def filtering_qwen14B(model_id, df, batch_size=300):
    """Score whether each comment in ``df["text"]`` is human-to-human discussion.

    Label 1 means "the comment is from a human engaging with another human in an
    online discussion", label 2 means it does not fit that criterion. The model is
    loaded through Unsloth (4-bit weights, float16 compute), every comment is
    wrapped in a chat prompt, and one forward pass per batch is run. The
    next-token distribution at the last prompt position is read and the
    probabilities of the two answer tokens are stored per row.

    Args:
        model_id: Model name handed to ``FastLanguageModel.from_pretrained``. It
            also selects the model-family-specific prompt/token handling and is
            used as the prefix of the output column names.
        df: DataFrame with a ``"text"`` column. It is modified in place: a
            ``"prompt"`` column and the two probability columns are added.
        batch_size: Number of prompts per forward pass; must be at least 1.

    Returns:
        The same ``df`` object with the extra columns ``model_id + "_label_1"``
        and ``model_id + "_label_2"``. The values are softmax probabilities taken
        over the whole vocabulary, so the two columns need not sum to 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        AssertionError: If the answer-token lookup does not yield exactly two ids.
    """
    # NOTE(review): filtering_qwen05 repeats this logic with a different prompt;
    # a shared helper taking the prompt template would remove the duplication.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    max_seq_length = 500
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=torch.float16,
        load_in_4bit=True,
    )

    FastLanguageModel.for_inference(model)
    # With left padding the last real token of every prompt sits at position -1,
    # which is the position whose logits are read in process_task.
    tokenizer.padding_side = "left"

    user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
    1 → The comment is from a human engaging with another human in an online discussion.  
    2 → The comment does not fit this criterion.  

    Provide **only** the corresponding number as your answer. Do not include any explanations, reasoning, or additional text.
    Is the following comment from a human engaging with another human in an online discussion?  

    \n"{comment}"\n
    Your Answer:
    '''

    # Model-family switches are fixed for the whole call, so resolve them once.
    if "Qwen" in model_id:
        system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant"
    else:
        system_prompt = "You are a helpful assistant"
    skip_system_message = "gemma" in model_id
    append_space = "mistral" in model_id

    def preprocess(text):
        """Render one comment into the chat-formatted prompt string."""
        user_message = {
            "role": "user",
            "content": user_message_template.format(comment=text),
        }
        if skip_system_message:
            messages = [user_message]
        else:
            messages = [{"role": "system", "content": system_prompt}, user_message]

        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        if append_space:
            prompt += " "
        return prompt

    df["prompt"] = df["text"].apply(preprocess)

    # Token ids whose probabilities are reported as label 1 and label 2.
    if "Qwen" in model_id:
        stop_token_id = tokenizer(["12"])['input_ids'][0]
    elif "lama" in model_id:
        # presumably the ids of "1" and "2" in that vocabulary — TODO confirm
        stop_token_id = [16, 17]
    elif "mistral" in model_id:
        # presumably the ids of "1" and "2" in that vocabulary — TODO confirm
        stop_token_id = [29508, 29518]
    else:
        # presumably skips a leading special token added by the tokenizer — TODO confirm
        stop_token_id = tokenizer(["12"])['input_ids'][0][1:]
    assert len(stop_token_id) == 2

    label_indices = torch.tensor(stop_token_id)

    def process_task(texts):
        """Return an array of shape [len(texts), 2] with the two label probabilities."""
        encoding = tokenizer(texts, padding=True, return_tensors='pt').to('cuda')

        with torch.no_grad():
            logits = model(**encoding).logits  # [batch_size, sequence_length, vocab_size]
        last_token_logits = logits[:, -1, :]  # [batch_size, vocab_size]
        probabilities = torch.softmax(last_token_logits, dim=-1)
        return probabilities[:, label_indices].float().cpu().numpy()

    prompts = df["prompt"].tolist()
    probs_value = []
    # The bar counts prompts (not batches) and has a known total.
    with tqdm(total=len(prompts)) as progress:
        for start in range(0, len(prompts), batch_size):
            batch = prompts[start:start + batch_size]
            probs_value += process_task(batch).tolist()
            progress.update(len(batch))
            # Release cached GPU memory before the next batch is allocated.
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    df[model_id + "_label_1"] = [pair[0] for pair in probs_value]
    df[model_id + "_label_2"] = [pair[1] for pair in probs_value]
    return df

In [None]:
# Score every language split with the 14B model and keep only the rows whose
# label-1 probability is at least 0.5; the result is written next to the input file.
# NOTE(review): batch_size=2000 overrides the function default of 300 — confirm this
# is intended for the 14B model and not carried over from the 0.5B cell.
# NOTE(review): this reads the raw CSVs again, not the files written by Section 1.
for name_ds in ("m1_eng.csv", "m1_deu.csv", "m1_vie.csv"):
    raw_path = "data/raw/" + name_ds
    df = filtering_qwen14B(model_id=qwen14b, df=pd.read_csv(raw_path), batch_size=2000)
    keep_mask = df[qwen14b + "_label_1"] >= 0.5
    df = df[keep_mask]
    df.to_csv(raw_path + "_binary_filter_qwen14b", index=False)