In [1]:
import pandas as pd
import json
import os
import ast
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
from time import time
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, f1_score,  recall_score, confusion_matrix, precision_score
from transformers import (
    set_seed,
)
set_seed(42)

from unsloth import FastLanguageModel
max_seq_length = 500
from tqdm import tqdm

import pickle

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Seeds for the repeated evaluation draws: the evaluation loop builds one
# sampled evaluation set per entry and pools the results across all of them.
random_state_list = [42, 57, 120, 98, 65, 74]

In [3]:
# Switch to the directory that the relative data paths below are resolved from.
# When the kernel starts in /root the project directory is entered directly;
# otherwise the kernel is assumed to start one level below it.
# NOTE(review): the ".." branch is not idempotent - re-running this cell climbs
# one more directory each time.
new_path = "/root/0_Thesis/0_final/" if os.getcwd() == '/root' else ".."
os.chdir(new_path)
print(os.getcwd())

/root/0_Thesis/0_final


# Section 1: Load Human-Labeled Data

In [None]:
# RE-EVALUATION SET
# df_evaluation = pd.read_csv("data/human/re_evaluation.csv")
# df_evaluation['len_text'] = df_evaluation['text'].str.len()
# df_evaluation = df_evaluation[df_evaluation['len_text'] <= 300]
# print(df_evaluation.shape)
# df_evaluation.ds.value_counts()

(2645, 4)


ds
hate     1000
ethos     945
hasoc     700
Name: count, dtype: int64

In [4]:
# Load the combined human-labelled corpus and keep only texts of at most
# 300 characters (rows whose text is missing have a NaN length and are dropped
# by the comparison as well).
df_evaluation = pd.read_csv("data/human/1_combine_hate_ds.csv")
df_evaluation['len_text'] = df_evaluation['text'].str.len()
# .copy() detaches the filtered rows from the frame read from disk, so the
# "prompt" column assigned to df_evaluation later in the notebook is written to
# an independent frame instead of a slice (avoids SettingWithCopyWarning).
df_evaluation = df_evaluation[df_evaluation['len_text'] <= 300].copy()
print(df_evaluation.shape)

(84083, 7)


In [5]:
# Number of rows contributed by each source dataset.
df_evaluation["dataset"].value_counts()

dataset
ViHSD           30571
HateSpeechX     20022
Sexism          13631
GermEval2019    12131
GermEval2021     3457
Covid            2164
US_election      2107
Name: count, dtype: int64

In [6]:
# Class balance of the binary label column.
df_evaluation["hate_label_id"].value_counts()

hate_label_id
1    58915
0    25168
Name: count, dtype: int64

In [7]:
# Number of rows per language code.
df_evaluation["language"].value_counts()

language
eng    37924
vie    30571
deu    15588
Name: count, dtype: int64

In [8]:
random_state = 42
def create_df_for_eval(random_state=42, n=500, df=None):
    """Draw a per-dataset sample of label-0 and label-1 rows for evaluation.

    For every distinct value of the ``dataset`` column, up to ``n`` rows with
    ``hate_label_id == 0`` are sampled; the remainder of the per-dataset quota
    of ``2 * n`` rows is filled with rows where ``hate_label_id == 1``. With
    the default ``n=500`` this yields 1000 rows per dataset.

    Parameters
    ----------
    random_state : int
        Seed passed to ``DataFrame.sample`` for both label groups.
    n : int
        Target number of label-0 rows per dataset. Capped at the number of
        label-0 rows actually available in that dataset.
    df : pandas.DataFrame, optional
        Frame to sample from; must have ``dataset`` and ``hate_label_id``
        columns. Defaults to the notebook-level ``df_evaluation``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all datasets, concatenated with a fresh
        ``0..len-1`` index. An empty frame if the source has no datasets.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a dataset has fewer label-1 rows
        than are needed to fill its quota.
    """
    source = df_evaluation if df is None else df
    per_dataset_total = 2 * n

    # Collect the pieces and concatenate once instead of growing a frame
    # inside the loop.
    sampled_parts = []
    for ds in np.unique(source['dataset']):
        df_tmp = source[source['dataset'] == ds]
        label_0_rows = df_tmp[df_tmp['hate_label_id'] == 0]
        label_1_rows = df_tmp[df_tmp['hate_label_id'] == 1]

        # Use a per-dataset count so the caller's `n` is honoured and never
        # overwritten (the previous version reset it to 500 on every pass).
        n_label_0 = min(n, len(label_0_rows))
        sampled_parts.append(label_0_rows.sample(n=n_label_0, random_state=random_state))
        sampled_parts.append(
            label_1_rows.sample(n=per_dataset_total - n_label_0, random_state=random_state)
        )

    if not sampled_parts:
        return pd.DataFrame()
    return pd.concat(sampled_parts, ignore_index=True)
# Build one evaluation sample with seed 42 and check its size and label balance.
df_for_evaluation = create_df_for_eval(random_state=42)
print(df_for_evaluation.shape)
print(df_for_evaluation["hate_label_id"].value_counts())

(7000, 7)
hate_label_id
1    3852
0    3148
Name: count, dtype: int64


In [9]:
# Preview the texts of the sampled rows whose label id is 0.
df_for_evaluation.loc[df_for_evaluation["hate_label_id"].eq(0), "text"]

0       There is absolutely no way that a country with...
1       @Johnusadollar @AnnieBurns14 What a typical, p...
2       #coronavirus Were all going to die because som...
3       @NortherntreeY @Lyndsayunihead @sarahgrowls @r...
4       NOT #COVIDー19!!  IT SHOULD BE CALLED THE #Wuha...
                              ...                        
6495           Admin đánh rơi não trước khi đăng bài này?
6496                                            Anh em cc
6497    Hiến máu phải chờ xét nghiệm có kqua rồi mới đ...
6498    No qua ma oi nguoi ta giau kg ai ma dem khoe n...
6499    Trường Giang nói mới tỉnh ra bạn à. Chứ đâu ai...
Name: text, Length: 3148, dtype: object

# Section 2: Load Model

In [None]:
# Candidate checkpoints for evaluation. Some entries already carry the
# "dangdangde/" namespace and some do not.
model_lists = [
    "m2.v2.lgb.Llama1B",
    "m2.v2.lgb.Qwen14B",
    "dangdangde/m2.v8.lgb.Llama1B",
    "dangdangde/m2.v9.lgb.Llama1B",
    "dangdangde/m1.m2.v1.mean.Llama1B",
    "dangdangde/m2.v4.lgb.Qwen14B",
]
base_model = list()

In [11]:
# Pick the checkpoint to evaluate. Entries in model_lists are a mix of bare
# names and names that already start with "dangdangde/", so the namespace is
# only prepended when it is missing (otherwise it would be doubled).
selected_model_name = model_lists[1]
model_id = (
    selected_model_name
    if selected_model_name.startswith("dangdangde/")
    else "dangdangde/" + selected_model_name
)

In [12]:
# Load the selected checkpoint in 4-bit with bfloat16 compute.
dtype = torch.bfloat16
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to inference mode.
FastLanguageModel.for_inference(model)
# Pad on the left so the last position of every row in a batch is the final
# prompt token; the scoring code reads the logits at position -1.
tokenizer.padding_side = "left"

==((====))==  Unsloth 2025.2.5: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Unsloth 2025.2.5 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [13]:

# Instruction given to the model as the user turn. "{comment}" is filled in with
# the text to annotate; the model is asked to answer with the single digit
# 1 (Hate Speech) or 2 (Normal).
user_message_template = '''You are tasked with annotating speech. Your response must be a single valid number:
1 for Hate Speech,
2 for Normal.

Provide only the number corresponding to the category. Do not include any explanation or additional text.
Do you think the following comment is hate speech or offensive speech?
\n"{comment}"\n
Your Answer:
'''

def preprocess(text):
    """Turn one comment into the chat-formatted prompt string for the model.

    The comment is inserted into ``user_message_template`` and wrapped as a
    user turn. A system turn is prepended unless the selected model is a
    gemma model, and the tokenizer's chat template renders the turns to a
    plain string that ends with the generation prompt.

    Parameters
    ----------
    text : str
        The comment to annotate.

    Returns
    -------
    str
        The rendered, untokenized prompt.
    """
    user_message = {
        "role": "user",
        "content": user_message_template.format(comment=text),
    }

    if "Qwen" in model_id:
        system_message = {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant"}
    else:
        system_message = {"role": "system", "content": "You are a helpful assistant"}

    # The original condition tested "gemma" twice; the second operand now
    # covers capitalised ids. Gemma gets no system turn - presumably because
    # its chat template does not accept one (TODO confirm).
    if "gemma" in model_id or "Gemma" in model_id:
        messages = [user_message]
    else:
        messages = [system_message, user_message]

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Mistral prompts get a trailing space. Both spellings are checked so this
    # agrees with the answer-token selection, which matches "mistral" and
    # "Mistral" alike.
    if "mistral" in model_id or "Mistral" in model_id:
        prompt += " "

    return prompt

# Render a prompt for every row of the full corpus.
# NOTE(review): the evaluation loop rebuilds the "prompt" column on each sampled
# frame, so this full-corpus pass looks redundant - confirm before removing.
df_evaluation["prompt"] = df_evaluation["text"].map(preprocess)

In [14]:
# Resolve the two vocabulary ids whose next-token probabilities are read out:
# the tokens for the answers "1" and "2". The hard-coded pairs are presumably
# those tokens' ids in the respective vocabularies - TODO confirm.
if "Qwen" in model_id:
    if "DeepSeek" in model_id:
        stop_token_id = [16, 17]
    else:
        stop_token_id = tokenizer(["12"])['input_ids'][0]
elif "Llama" in model_id or "llama" in model_id:
    stop_token_id = [16, 17]
elif "mistral" in model_id or "Mistral" in model_id:
    stop_token_id = [29508, 29518]
else:
    # Drop the first id the tokenizer emits (presumably a BOS token - verify).
    stop_token_id = tokenizer(["12"])['input_ids'][0][1:]

# "12" must have been split into exactly one token per digit.
assert len(stop_token_id) == 2
def process_task(texts):
    """Score a batch of prompts by the next-token probability of each answer.

    The prompts are tokenized with padding, run through the model in a single
    forward pass without gradients, and the logits at the last sequence
    position are soft-maxed over the vocabulary. This relies on the tokenizer
    padding on the left, so that position -1 is a real prompt token in every
    row.

    Parameters
    ----------
    texts : list of str
        Rendered prompts.

    Returns
    -------
    tuple of numpy.ndarray
        Two 1-D float arrays with one entry per prompt: the probability of
        ``stop_token_id[0]`` and of ``stop_token_id[1]`` as the next token.
    """
    batch = tokenizer(texts, padding=True, return_tensors='pt').to('cuda')
    with torch.no_grad():
        all_logits = model(**batch).logits
    next_token_probs = torch.softmax(all_logits[:, -1, :], dim=-1)

    answer_ids = torch.tensor(stop_token_id)
    # Cast to float32 before handing over to numpy.
    first_answer_probs, second_answer_probs = (
        next_token_probs[:, answer_ids[position]].float().cpu().numpy()
        for position in range(2)
    )
    return first_answer_probs, second_answer_probs

# Section 3: Evaluation

In [15]:
# Number of prompts sent through the model per forward pass in the evaluation loop.
batch_size = 100

In [16]:
# Evaluate the model on one freshly sampled evaluation set per seed and report
# pooled metrics over all seeds.

# Cut-off applied to P(first answer token) for the thresholded metrics below.
decision_threshold = 0.3

# Loop-invariant, so resolved once. It is not consumed below: process_task
# places its tensors on 'cuda' itself.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

all_y_pred = []
all_y_true = []
model_probs_dict = {}
for random_state in random_state_list:
    df_for_evaluation = create_df_for_eval(random_state)
    df_for_evaluation["prompt"] = df_for_evaluation["text"].apply(preprocess)
    prompts = df_for_evaluation["prompt"].tolist()

    probs_value_1 = []
    probs_value_2 = []
    texts = []

    # Iterating the list directly (no enumerate) lets tqdm show the total.
    for value in tqdm(prompts):
        texts.append(value)

        # Flush as soon as a full batch has been collected.
        if len(texts) == batch_size:
            selected_probs_1, selected_probs_2 = process_task(texts)
            probs_value_1 += selected_probs_1.tolist()
            probs_value_2 += selected_probs_2.tolist()
            texts = []

            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Score the final, partially filled batch (if any).
    if len(texts) != 0:
        selected_probs_1, selected_probs_2 = process_task(texts)
        probs_value_1 += selected_probs_1.tolist()
        probs_value_2 += selected_probs_2.tolist()

    # Positive class = rows with hate_label_id == 0, scored with the
    # probability of the first answer token.
    # NOTE(review): this assumes label id 0 is the class the prompt calls "1" -
    # confirm against the dataset's label mapping.
    y_true = np.array(df_for_evaluation['hate_label_id'] == 0, dtype=int)

    all_y_true.extend(y_true)
    all_y_pred.extend(probs_value_1)

    # Keep the raw per-seed scores so that metrics can be recomputed later.
    model_probs_dict[random_state] = {
        "probs_value_1": probs_value_1,
        "probs_value_2": probs_value_2,
        "y_true": y_true
    }

# AUC is computed on the raw scores, before they are thresholded.
print(f"AUC: {roc_auc_score(np.array(all_y_true), np.array(all_y_pred))}")
all_y_pred = np.array(all_y_pred) >= decision_threshold
print(f"Acc: {accuracy_score(all_y_true , all_y_pred)}")
print(f"Precision: {precision_score(all_y_true , all_y_pred)}")
print(f"Recall: {recall_score(all_y_true , all_y_pred)}")
print(f"f1_score: {f1_score(all_y_true , all_y_pred)}")

# Optional: persist the per-seed probabilities. model_probs_dict is keyed by
# seed (not by model_id), so the whole dict is dumped. model_id contains a "/",
# so the target directory must exist before this is enabled.
# prob_save_path = f"{model_id}.pkl"
# with open(prob_save_path, "wb") as f:
#     pickle.dump(model_probs_dict, f)

7000it [06:46, 17.23it/s]
7000it [06:49, 17.10it/s]
7000it [06:46, 17.21it/s]
7000it [06:54, 16.89it/s]
7000it [06:55, 16.85it/s]
7000it [06:53, 16.91it/s]

AUC: 0.79746841600499
Acc: 0.7090714285714286
Precision: 0.7444110532874001
Recall: 0.5376958915713681
f1_score: 0.6243890442962098



