<h1 style="text-align: center;">Conversation Generation with Persona</h1>

## Import Library

In [1]:
!pip install torch evaluate transformers accelerate>=0.26.0 sentencepiece einops sacrebleu google.generativeai openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import os
import torch
import numpy as np
import pandas as pd
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer, 
    GenerationConfig, 
    TrainingArguments, 
    Trainer,
    BitsAndBytesConfig,
    pipeline,
    DataCollatorWithPadding
)
from huggingface_hub import login
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from dotenv import load_dotenv

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Configure logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Global Config

In [None]:
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPEN_API_KEY = os.getenv("OPEN_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [None]:
# Define models
MODELS = [
    "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct",
    "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "sail/Sailor2-8B",
    "google/gemma-2-2b-it",
    "google/gemma-2-9b-it",
]

## Load Dataset

In [4]:
data = load_dataset("JavaneseHonorifics/Unggah-Ungguh", "conversation")

train-00000-of-00001.csv:   0%|          | 0.00/57.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160 [00:00<?, ? examples/s]

In [5]:
data["train"][1]

{'index': 1,
 'role a': 'Peer',
 'role b': 'Peer',
 'context': "Speaker A asks speaker B about what Speaker B's father ate today. Speaker A and Speaker B have equal status or position and have familiar interactions.",
 'a utterance': 'Bapakmu dhahar apa dina iki?',
 'a utterance category': 1,
 'b utterance': 'Bapakku dhahar pecel dina iki.',
 'b utterance category': 1}

In [6]:
def map_category_to_number(category):
    category_mapping = {
        'Ngoko': 0,
        'Ngoko lugu': 0,
        'Ngoko alus': 1,
        'Krama': 2,
        'Krama madya': 2,
        'Krama alus': 3
    }
    return category_mapping.get(category.strip(), -1)

def excel_to_dataset_dict(file_path):
    df = pd.read_excel(file_path)

    df['A Utterance Category'] = df['A Utterance Category'].apply(map_category_to_number)
    df['B Utterance Category'] = df['B Utterance Category'].apply(map_category_to_number)

    df = df.rename(columns={
        'Role A': 'role_a',
        'Role B': 'role_b',
        'Context': 'context',
        'A Utterance': 'a_utterance',
        'A Utterance Category': 'a_utterance_category',
        'B Utterance': 'b_utterance',
        'B Utterance Category': 'b_utterance_category'
    })
    
    df.reset_index(inplace=True)
    
    dataset = Dataset.from_pandas(df)
    
    dataset_dict = DatasetDict({
        'train': dataset
    })
    
    return dataset_dict

## Load Model

In [10]:
login(token=HF_TOKEN)

In [11]:
model_name_1 = MODELS[0] # "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct"

tokenizer1 = AutoTokenizer.from_pretrained(model_name_1)

model1 = AutoModelForCausalLM.from_pretrained(
    model_name_1,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/share/pkg.8/python3/3.10.12/install/lib/python3.

In [12]:
model_name_2 = MODELS[1] # "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct"

tokenizer2 = AutoTokenizer.from_pretrained(model_name_2)

model2 = AutoModelForCausalLM.from_pretrained(
    model_name_2,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
model_name_3 = MODELS[2] # "meta-llama/Llama-3.1-8B-Instruct"

tokenizer3 = AutoTokenizer.from_pretrained(model_name_3)

model3 = AutoModelForCausalLM.from_pretrained(
    model_name_3,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
model_name_4 = MODELS[3] # "sail/Sailor2-8B"

tokenizer4 = AutoTokenizer.from_pretrained(model_name_4)

model4 = AutoModelForCausalLM.from_pretrained(
    model_name_4,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
model_name_5 = MODELS[4] # "google/gemma-2-2b-it"

tokenizer5 = AutoTokenizer.from_pretrained(model_name_5)

model5 = AutoModelForCausalLM.from_pretrained(
    model_name_5,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
model_name_6 = MODELS[5] # "google/gemma-2-9b-it"

tokenizer6 = AutoTokenizer.from_pretrained(model_name_6)

model6 = AutoModelForCausalLM.from_pretrained(
    model_name_6,
    device_map="auto",
    trust_remote_code=True,
    cache_dir=f"./cache"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## Define Variabel to Store Inference Result

In [17]:
model_names = [
    "model1", "model2", "model3", "model4", "model5", "model6",
    "modelOpenAI", "modelGemini"
]

for hint in ["noHint", "withHint"]:
    for speaker in ["A", "B"]:
        for model in model_names:
            globals()[f"{model}_{hint}_{speaker}"] = []

In [24]:
def evaluate_model(model, tokenizer, data, hint=False):
    generation_config = GenerationConfig(
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        early_stopping=True,
        repetition_penalty=1.2,
        min_length=10,
        max_new_tokens=100
    )
    A = []
    B = []
    
    data = data['train']
    
    for idx, item in enumerate(data):
        torch.cuda.empty_cache()
        
        prompt = f"""Create a conversation between A as {item["role a"]} and B as {item["role b"]} in Javanese language with this context: `{item["context"]}`\n"""
        prompt += "Please follow this format:\n"
        prompt += "A: `<UTTERANCE>`\n"
        prompt += "B: `<UTTERANCE>`\n\n"
        if hint:
            prompt += "Use this Javanese's honorific level usage as a hint:\n"
            prompt += "1. Ngoko Ngoko:\n"
            prompt += "- Used for informal conversations with peers or lower-status individuals\n"
            prompt += "- Common in close relationships or familiar interactions\n"
            prompt += "2. Ngoko Alus:\n"
            prompt += "- Adds respect when speaking to equals or higher-status individuals in informal or close relationships\n"
            prompt += "- Flexible for conversations with mixed-status participants\n"
            prompt += "- Talked with a person with equals status about other person who has a higher-status\n"
            prompt += "3. Krama:\n"
            prompt += "- Used for respectful conversations with equals or lower-status individuals, especially when not close\n"
            prompt += "- Suitable for maintaining formality in less familiar interactions\n"
            prompt += "4. Krama Alus:\n"
            prompt += "- Expresses high respect in conversations with higher-status individuals or unfamiliar equals\n"
            prompt += "- Essential for formal interactions requiring utmost politeness\n"
            prompt += "- Talked with a person with higher-status about other person who has a higher-status\n"
        prompt += "Answer:\n"
        
        model_input_ids = tokenizer(prompt, return_tensors='pt').input_ids
        
        try:
            outputs = model.generate(
                input_ids=model_input_ids,
                generation_config=generation_config
            )
            decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            del outputs
            torch.cuda.empty_cache()

            decoded_output = decoded_output.replace("```java", "").replace("```javanese", "").replace("```jawa", "").replace("```", "")

            a_count = 0
            b_count = 0
            a_utterance = ""
            b_utterance = ""

            lines = decoded_output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith("A:"):
                    a_count += 1
                    if a_count == 2:
                        if '"' in line:
                            a_utterance = line[line.find('"'):line.rfind('"')+1].strip('"')
                        else:
                            a_utterance = line.replace("A:", "").strip()
                elif line.startswith("B:"):
                    b_count += 1
                    if b_count == 2:
                        if '"' in line:
                            b_utterance = line[line.find('"'):line.rfind('"')+1].strip('"')
                        else:
                            b_utterance = line.replace("B:", "").strip()

            a_utterance = a_utterance.strip('`').strip()
            b_utterance = b_utterance.strip('`').strip()

            print(f"\nInstance {idx + 1}:")
            print(f"Role A: {item['role_a']}")
            print(f"Role B: {item['role_b']}")
            print(f"Context: {item['context']}")
            print(f"Utterance A: {a_utterance}")
            print(f"Utterance B: {b_utterance}")
            print("-" * 50)

            A.append(a_utterance if a_utterance else "")
            B.append(b_utterance if b_utterance else "")
            
        except Exception as e:
            print(f"\nError processing instance {idx + 1}:")
            print(f"Role A: {item['role_a']}")
            print(f"Role B: {item['role_b']}")
            print(f"Context: {item['context']}")
            print(f"Error: {str(e)}")
            print("-" * 50)
            A.append("")
            B.append("")
            torch.cuda.empty_cache()
            continue
            
        torch.cuda.empty_cache()
    print("\nDONE")
    return A, B

In [39]:
def evaluate_api_model(data, model_type="openai", hint=False):
    A = []
    B = []
    
    data = data["train"]

    rate_limiter = RateLimiter(min_delay_ms=1000)

    if model_type == "openai":
        client = OpenAI(api_key=OPEN_API_KEY)
    elif model_type == "gemini":
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel('gemini-1.5-pro')
    else:
        raise ValueError("model_type must be either 'openai' or 'gemini'")
    
    for idx, item in enumerate(data):
        try:
            prompt = f"""Create a conversation between A as {item["role_a"]} and B as {item["role_b"]} in Javanese language with this context: `{item["context"]}`\n"""
            prompt += "Please follow this format:\n"
            prompt += "A: `<UTTERANCE>`\n"
            prompt += "B: `<UTTERANCE>`\n\n"
            if hint:
                prompt += "Use this Javanese's honorific level usage as a hint:\n"
                prompt += "1. Ngoko Ngoko:\n"
                prompt += "- Used for informal conversations with peers or lower-status individuals\n"
                prompt += "- Common in close relationships or familiar interactions\n"
                prompt += "2. Ngoko Alus:\n"
                prompt += "- Adds respect when speaking to equals or higher-status individuals in informal or close relationships\n"
                prompt += "- Flexible for conversations with mixed-status participants\n"
                prompt += "- Talked with a person with equals status about other person who has a higher-status\n"
                prompt += "3. Krama:\n"
                prompt += "- Used for respectful conversations with equals or lower-status individuals, especially when not close\n"
                prompt += "- Suitable for maintaining formality in less familiar interactions\n"
                prompt += "4. Krama Alus:\n"
                prompt += "- Expresses high respect in conversations with higher-status individuals or unfamiliar equals\n"
                prompt += "- Essential for formal interactions requiring utmost politeness\n"
                prompt += "- Talked with a person with higher-status about other person who has a higher-status\n"
            prompt += "Answer:\n"

            rate_limiter.wait()

            if model_type == "openai":
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=100,
                    top_p=0.9
                )
                decoded_output = response.choices[0].message.content
                
            else:
                response = model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        temperature=0.7,
                        top_p=0.9,
                        top_k=50,
                        max_output_tokens=100
                    )
                )
                decoded_output = response.text

            a_count = 0
            b_count = 0
            a_utterance = ""
            b_utterance = ""

            lines = decoded_output.split("\n")
            for line in lines:
                line = line.strip()
                if line.startswith("A:"):
                    a_count += 1
                    if a_count == 2:
                        if '"' in line:
                            a_utterance = line[line.find('"'):line.rfind('"')+1].strip('"')
                        else:
                            a_utterance = line.replace("A:", "").strip()
                elif line.startswith("B:"):
                    b_count += 1
                    if b_count == 2:
                        if '"' in line:
                            b_utterance = line[line.find('"'):line.rfind('"')+1].strip('"')
                        else:
                            b_utterance = line.replace("B:", "").strip()

            a_utterance = a_utterance.strip('`').strip()
            b_utterance = b_utterance.strip('`').strip()

            print(f"\nInstance {idx + 1}:")
            print(f"Role A: {item['role_a']}")
            print(f"Role B: {item['role_b']}")
            print(f"Context: {item['context']}")
            print(f"Utterance A: {a_utterance}")
            print(f"Utterance B: {b_utterance}")
            print(f"Model: {model_type}")
            print("-" * 50)

            A.append(a_utterance if a_utterance else "")
            B.append(b_utterance if b_utterance else "")
            
        except Exception as e:
            print(f"\nError processing instance {idx + 1}:")
            print(f"Role A: {item['role_a']}")
            print(f"Role B: {item['role_b']}")
            print(f"Context: {item['context']}")
            print(f"Error: {str(e)}")
            print(f"Model: {model_type}")
            print("-" * 50)
            A.append("")
            B.append("")
            continue
            
    print("\nDONE")
    return A, B

In [41]:
model1_noHint_A, model1_noHint_B = evaluate_model(model1, tokenizer1, data, False)
model2_noHint_A, model2_noHint_B = evaluate_model(model2, tokenizer2, data, False)
model3_noHint_A, model3_noHint_B = evaluate_model(model3, tokenizer3, data, False)
model4_noHint_A, model4_noHint_B = evaluate_model(model4, tokenizer4, data, False)
model5_noHint_A, model5_noHint_B = evaluate_model(model5, tokenizer5, data, False)
model6_noHint_A, model6_noHint_B = evaluate_model(model6, tokenizer6, data, False)
modelOpenAI_noHint_A, modelOpenAI_noHint_B = evaluate_api_model(data, 'openai', False)
modelGemini_noHint_A, modelGemini_noHint_B = evaluate_api_model(data, 'gemini', False)


Instance 1:
Role A: Teacher
Role B: Student
Context: Speaker A asked speaker B about what he had eaten today.  Speaker A has a higher status or position than Speaker B.  
Utterance A: Lha lawuhe opo, Le? (And what was the side dish, son?)
Utterance B: Tahu tempe kaliyan sambel thok, Bu. (Just tofu, tempeh, and sambal
Model: gemini
--------------------------------------------------

Instance 2:
Role A: Peer
Role B: Peer
Context: Speaker A asks speaker B about what Speaker B's father ate today. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 3:
Role A: Older sibling
Role B: Younger sibling
Context: Speaker A asks speaker B about what Speaker B learned at school today. Speaker B has a lower status or position than Speaker A.
Utterance A: Sing basa Jawi mau sinau apa? Coba critakna! (What did you learn in Javanese class? Tell me!)
Utterance B: Sinau



Instance 24:
Role A: Batch leader
Role B: Batch member
Context: Speaker A tells speaker B to distribute gifts to each class leader. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Yo, ojo lali. Ben cepet rampung urusane.` (Yeah, don't forget. So that the task finishes
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 25:
Role A: Student council president
Role B: Student council secretary
Context: Speaker A asked about the assignment he gave to speaker B yesterday. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 26:
Role A: School principal
Role B: Student
Context: Speaker A asked about speaker B who was visited by people from the education office. Speaker B has a lower status or position than Speaker A.
Utterance A: Lho, ana apa ta kok kowe sing dipilih? (Wh


Instance 48:
Role A: Friend
Role B: Friend
Context: Speaker A asked about his first place of play with speaker B. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 49:
Role A: Study group partner
Role B: Study group partner
Context: Speaker A asks if speaker B has studied for tomorrow's test. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 50:
Role A: Student
Role B: Teacher
Context: Speaker A asks the current teaching location from speaker B. Speaker B has a higher status or position than Speaker A.  
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 51:
Role A: Peer
Role B: Peer
Context: Speaker A tells speaker B to wake up his friend who is next to him be

KeyboardInterrupt: 

In [43]:
model1_withHint_A, model1_withHint_B = evaluate_model(model1, tokenizer1, data, True)
model2_withHint_A, model2_withHint_B = evaluate_model(model2, tokenizer2, data, True)
model3_withHint_A, model3_withHint_B = evaluate_model(model3, tokenizer3, data, True)
model4_withHint_A, model4_withHint_B = evaluate_model(model4, tokenizer4, data, True)
model5_withHint_A, model5_withHint_B = evaluate_model(model5, tokenizer5, data, True)
model6_withHint_A, model6_withHint_B = evaluate_model(model6, tokenizer6, data, True)
modelOpenAI_withHint_A, modelOpenAI_withHint_B = evaluate_api_model(data, 'openai', True)
modelGemini_withHint_A, modelGemini_withHint_B = evaluate_api_model(data, 'gemini', True)


Instance 1:
Role A: Teacher
Role B: Student
Context: Speaker A asked speaker B about what he had eaten today.  Speaker A has a higher status or position than Speaker B.  
Utterance A: Hey, what have you eaten today?
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 2:
Role A: Peer
Role B: Peer
Context: Speaker A asks speaker B about what Speaker B's father ate today. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Bapakmu dhahar napa dina iki, Le? (Your father ate what today, dude?)
Utterance B: Dhahar sega pecel kali
Model: gemini
--------------------------------------------------

Instance 3:
Role A: Older sibling
Role B: Younger sibling
Context: Speaker A asks speaker B about what Speaker B learned at school today. Speaker B has a lower status or position than Speaker A.
Utterance A: Lha, basa Jawine sinau apa? (Ngoko)
Utterance B: Sinau babagan aksara Jawa, Mas.  (Krama Alus)
Model: gemini
--


Instance 24:
Role A: Batch leader
Role B: Batch member
Context: Speaker A tells speaker B to distribute gifts to each class leader. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Bro, kado-kadone dibagi nang ketua kelas kabeh yo.` (Bro, distribute the gifts to all the class leaders, okay?)
Utterance B: Oke,
Model: gemini
--------------------------------------------------

Instance 25:
Role A: Student council president
Role B: Student council secretary
Context: Speaker A asked about the assignment he gave to speaker B yesterday. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Wah, cepet yo. Tak kira butuh wektu luwih suwe. (Wow, that's fast
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 26:
Role A: School principal
Role B: Student
Context: Speaker A asked about speaker B who was visited by people from the education office. Speaker B has a lowe


Instance 49:
Role A: Study group partner
Role B: Study group partner
Context: Speaker A asks if speaker B has studied for tomorrow's test. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Kowe wis sinau durung kanggo tes sesuk?` (Have you studied yet for tomorrow's test?)
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 50:
Role A: Student
Role B: Teacher
Context: Speaker A asks the current teaching location from speaker B. Speaker B has a higher status or position than Speaker A.  
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 51:
Role A: Peer
Role B: Peer
Context: Speaker A tells speaker B to wake up his friend who is next to him because the Friday sermon has been completed. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------


Instance 77:
Role A: Presidential aide
Role B: President
Context: Speaker A gave a drink of wedang uwuh to speaker B. Speaker B has a higher status or position than Speaker A.  
Utterance A: Mboten usah repot-repot, Pak. Menawi kirang legi, wonten gula pasir ing meja.
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 78:
Role A: Wife
Role B: Husband
Context: Speaker A asked if there was anything in the heart of speaker B. Speaker A has a lower status or position than Speaker B.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 79:
Role A: Stranger
Role B: Stranger
Context: Speaker A greets Speaker B, who is a stranger of the same age. Speaker A and Speaker B have equal status but want to show mutual respect to each other.
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 80:
Role A: Echelon 2 official (Director)
Role B: Echelon 1 official (Di


Instance 100:
Role A: Fellow Neighbor
Role B: Fellow Neighbor (Younger)
Context: Speaker A is asking Speaker B to move the car that is parked in front of Speaker A's house. Speaker A has a higher status or position than Speaker B.  
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 101:
Role A: Friend
Role B: Friend
Context: Speaker A is apologizing to Speaker B for not being able to attend Speaker A's wedding. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Iki lho, anake lara dadakan. Kudu tak gowo nang dokter.` (This, you see, my
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 102:
Role A: an Adult
Role B: a little child
Context: Speaker A is asking Speaker B if he can tie his shoes or not, and then offers assistance and advice. Speaker A has a higher status or position than Speaker B.  
Utterance A: Loh, kok durung isa? Wis gedhe lho kowe


Instance 123:
Role A: A citizen
Role B: Head of Village
Context: Speaker A is asking Speaker B to provide input regarding the community meal event. Speaker A has a lower status or position than Speaker B.  
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 124:
Role A: MC
Role B: High ranking public officials
Context: Speaker A honors Speaker B, who is the guest of honor at the formal event. Speaker A and Speaker B have equal status or position, but they are not yet accustomed to communicating with each other, yet both still show mutual respect toward one another. 
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 125:
Role A: Son
Role B: Father
Context: Speaker A invites Speaker B to attend the celebration of his child's graduation. Speaker A has a lower status or position than Speaker B.  
Utterance A: 
Utterance B: 
Model: gemini
------------------------------------------------


Instance 146:
Role A: Junior Level Public Official
Role B: Director
Context: Speaker A asks about Speaker B's willingness to open the training event next week. Speaker B has a higher status or position than Speaker A.  
Utterance A: 
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 147:
Role A: Peer in office
Role B: Peer in office
Context: Speaker A invites Speaker B to deliver a letter to the Minister. Speaker A and Speaker B have equal status or position and have familiar interactions.
Utterance A: Iki lho, ana surat penting arep dikirim menyang Pak Menteri. Aku lagi akeh gawean. Kira-kira kowe gelem ngeterke ora?` (This, there's an important letter that needs
Utterance B: 
Model: gemini
--------------------------------------------------

Instance 148:
Role A: Childhood friend
Role B: Childhood friend
Context: Speaker A invites Speaker B to buy a guitar at the store. Speaker A and Speaker B have equal status or position and have familiar inte

## Load Evaluator Model

In [25]:
model_path = "./Model/javanese-distilbert"

In [26]:
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [27]:
trainer = Trainer(model=model, tokenizer=tokenizer)



In [28]:
def preprocess_function(examples, tokenizer):
    return tokenizer(examples, truncation=True, padding='max_length', max_length=128)

## Evaluation

In [49]:
def process_and_predict(data, model_name, hint_status, speaker, tokenizer, trainer):
    print(f"Processing {model_name}_{hint_status}_{speaker} with {len(data)} samples")

    tokenized_data = [preprocess_function(x, tokenizer) for x in data]

    dataset = Dataset.from_dict({
        "input_ids": [item['input_ids'] for item in tokenized_data],
        "attention_mask": [item['attention_mask'] for item in tokenized_data]
    })

    output = trainer.predict(dataset)
    predictions = output.predictions.argmax(-1)

    print(f"Processed {model_name}_{hint_status}_{speaker}")
    return predictions

models = ["model1", "model2", "model3", "model4", "model5", "model6", "modelOpenAI", "modelGemini"]
hint_statuses = ["noHint", "withHint"]
speakers = ["A", "B"]

predictions_dict = {}

for model in models:
    for hint_status in hint_statuses:
        for speaker in speakers:
            data_var_name = f"{model}_{hint_status}_{speaker}"
            predictions = predictions_dict.get(data_var_name, [])

            groundtruth_label_key = f"{speaker.lower()}_utterance_category"
            ground_truth = data1[groundtruth_label_key]
            
            actual_data = globals()[data_var_name]
            
            if data:
                predictions_dict[data_var_name] = process_and_predict(
                    actual_data, model, hint_status, speaker, tokenizer, trainer
                )

Processing model1_noHint_A with 160 samples


Processed model1_noHint_A
Processing model1_noHint_B with 160 samples


Processed model1_noHint_B
Processing model1_withHint_A with 160 samples


Processed model1_withHint_A
Processing model1_withHint_B with 160 samples


Processed model1_withHint_B
Processing model2_noHint_A with 160 samples


Processed model2_noHint_A
Processing model2_noHint_B with 160 samples


Processed model2_noHint_B
Processing model2_withHint_A with 160 samples


Processed model2_withHint_A
Processing model2_withHint_B with 160 samples


Processed model2_withHint_B
Processing model3_noHint_A with 160 samples


Processed model3_noHint_A
Processing model3_noHint_B with 160 samples


Processed model3_noHint_B
Processing model3_withHint_A with 160 samples


Processed model3_withHint_A
Processing model3_withHint_B with 160 samples


Processed model3_withHint_B
Processing model4_noHint_A with 160 samples


Processed model4_noHint_A
Processing model4_noHint_B with 160 samples


Processed model4_noHint_B
Processing model4_withHint_A with 160 samples


Processed model4_withHint_A
Processing model4_withHint_B with 160 samples


Processed model4_withHint_B
Processing model5_noHint_A with 160 samples


Processed model5_noHint_A
Processing model5_noHint_B with 160 samples


Processed model5_noHint_B
Processing model5_withHint_A with 160 samples


Processed model5_withHint_A
Processing model5_withHint_B with 160 samples


Processed model5_withHint_B
Processing model6_noHint_A with 160 samples


Processed model6_noHint_A
Processing model6_noHint_B with 160 samples


Processed model6_noHint_B
Processing model6_withHint_A with 160 samples


Processed model6_withHint_A
Processing model6_withHint_B with 160 samples


Processed model6_withHint_B
Processing modelOpenAI_noHint_A with 160 samples


Processed modelOpenAI_noHint_A
Processing modelOpenAI_noHint_B with 160 samples


Processed modelOpenAI_noHint_B
Processing modelOpenAI_withHint_A with 160 samples


Processed modelOpenAI_withHint_A
Processing modelOpenAI_withHint_B with 160 samples


Processed modelOpenAI_withHint_B
Processing modelGemini_noHint_A with 160 samples


Processed modelGemini_noHint_A
Processing modelGemini_noHint_B with 160 samples


Processed modelGemini_noHint_B
Processing modelGemini_withHint_A with 160 samples


Processed modelGemini_withHint_A
Processing modelGemini_withHint_B with 160 samples


Processed modelGemini_withHint_B


In [73]:
def calculate_metrics_with_report(y_true, y_pred, model, hint_status, speaker):
    overall_metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='macro'),
        "recall": recall_score(y_true, y_pred, average='macro'),
        "f1_score": f1_score(y_true, y_pred, average='macro')
    }

    report_dict = classification_report(y_true, y_pred, output_dict=True)

    report_rows = []

    unique_labels = sorted(set(y_true))
    for label in unique_labels:
        true_positives = sum((y_pred == label))
        total = len(y_true)
        
        class_accuracy = (true_positives) / total
        
        metrics = report_dict[str(label)]
        row = {
            'model': model,
            'hint_status': hint_status,
            'speaker': speaker,
            'label': label,
            'precision': metrics['precision'],
            'recall': metrics['recall'],
            'f1-score': metrics['f1-score'],
            'support': metrics['support'],
            'accuracy': class_accuracy
        }
        report_rows.append(row)
    
    return overall_metrics, report_rows

metrics_list = []
classification_report_rows = []
detailed_results = []

for model in models:
    for hint_status in hint_statuses:
        for speaker in speakers:
            data_var_name = f"{model}_{hint_status}_{speaker}"
            predictions = predictions_dict.get(data_var_name, None)
            
            if predictions is None or len(predictions) == 0:
                continue
                
            data = globals().get(data_var_name, [])
            if not data:
                continue
            
            groundtruth_label_key = f"{speaker.lower()}_utterance_category"
            ground_truth = dataset[groundtruth_label_key][:len(predictions)]
            
            if len(ground_truth) != len(predictions):
                print(f"Warning: Mismatch in lengths for {data_var_name}")
                continue

            metrics, report_rows = calculate_metrics_with_report(
                ground_truth, predictions, model, hint_status, f"{speaker} Utterance"
            )
 
            metrics["model"] = model
            metrics["hint_status"] = hint_status
            metrics["speaker"] = f"{speaker} Utterance"
            metrics_list.append(metrics)
            classification_report_rows.extend(report_rows)

            for idx, (true_label, pred_label) in enumerate(zip(ground_truth, predictions)):
                detailed_results.append({
                    "model": model,
                    "hint_status": hint_status,
                    "speaker": speaker,
                    "instance": idx,
                    "true_label": true_label,
                    "predicted_label": pred_label,
                    "correct": true_label == pred_label
                })

results_df = pd.DataFrame(metrics_list)
classification_report_df = pd.DataFrame(classification_report_rows)
detailed_df = pd.DataFrame(detailed_results)

final_df = results_df.pivot_table(
    index="model",
    columns=["hint_status", "speaker"],
    values=["accuracy", "precision", "recall", "f1_score"]
).round(4)

final_df = final_df.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1, level=[0, 1])

final_df.to_csv('overall_metrics.csv')
classification_report_df.to_csv('classification_report.csv', index=False)

print("\nOverall Metrics:")
print(final_df)

print("\nClassification Report by Label:")

sample_report = classification_report_df[
    (classification_report_df['model'] == models[0]) & 
    (classification_report_df['hint_status'] == hint_statuses[0]) &
    (classification_report_df['speaker'] == f"{speakers[0]} Utterance")
]
print(sample_report)


Overall Metrics:
hint_status      noHint                                                  \
speaker     A Utterance                            B Utterance            
               accuracy f1_score precision  recall    accuracy f1_score   
model                                                                     
model1           0.4125   0.1514    0.1107  0.2391      0.5875   0.1880   
model2           0.4188   0.1502    0.1088  0.2428      0.5750   0.2090   
model3           0.4062   0.1630    0.2784  0.2350      0.5500   0.1789   
model4           0.4875   0.2725    0.3707  0.3349      0.5812   0.2310   
model5           0.4250   0.1821    0.1375  0.2740      0.5750   0.2249   
model6           0.4250   0.2249    0.2142  0.2982      0.5812   0.2083   
modelGemini      0.4625   0.2279    0.3608  0.2940      0.6000   0.1905   
modelOpenAI      0.4312   0.1797    0.3644  0.2490      0.6000   0.1905   

hint_status                      withHint                             \
speaker  

In [74]:
final_df

hint_status,noHint,noHint,noHint,noHint,noHint,noHint,noHint,noHint,withHint,withHint,withHint,withHint,withHint,withHint,withHint,withHint
speaker,A Utterance,A Utterance,A Utterance,A Utterance,B Utterance,B Utterance,B Utterance,B Utterance,A Utterance,A Utterance,A Utterance,A Utterance,B Utterance,B Utterance,B Utterance,B Utterance
Unnamed: 0_level_2,accuracy,f1_score,precision,recall,accuracy,f1_score,precision,recall,accuracy,f1_score,precision,recall,accuracy,f1_score,precision,recall
model,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
model1,0.4125,0.1514,0.1107,0.2391,0.5875,0.188,0.1536,0.2423,0.4312,0.1923,0.1618,0.2776,0.5875,0.1873,0.1526,0.2423
model2,0.4188,0.1502,0.1088,0.2428,0.575,0.209,0.2122,0.2512,0.4125,0.1486,0.1078,0.2391,0.575,0.184,0.1503,0.2371
model3,0.4062,0.163,0.2784,0.235,0.55,0.1789,0.1477,0.2268,0.425,0.1511,0.109,0.2464,0.6,0.2499,0.3004,0.2817
model4,0.4875,0.2725,0.3707,0.3349,0.5812,0.231,0.2888,0.2621,0.45,0.237,0.2837,0.2858,0.5625,0.1837,0.152,0.232
model5,0.425,0.1821,0.1375,0.274,0.575,0.2249,0.2798,0.2573,0.4375,0.16,0.3606,0.2534,0.6062,0.2168,0.2362,0.2641
model6,0.425,0.2249,0.2142,0.2982,0.5812,0.2083,0.1994,0.2538,0.4125,0.1889,0.3976,0.263,0.5812,0.1883,0.155,0.2397
modelGemini,0.4625,0.2279,0.3608,0.294,0.6,0.1905,0.1548,0.2474,0.425,0.1696,0.3662,0.2459,0.6062,0.2155,0.2048,0.2641
modelOpenAI,0.4312,0.1797,0.3644,0.249,0.6,0.1905,0.1548,0.2474,0.425,0.1635,0.235,0.2459,0.6,0.192,0.1569,0.2474


In [75]:
detailed_df

Unnamed: 0,model,hint_status,speaker,instance,true_label,predicted_label,correct
0,model1,noHint,A,0,0,0,True
1,model1,noHint,A,1,1,0,False
2,model1,noHint,A,2,0,0,True
3,model1,noHint,A,3,0,0,True
4,model1,noHint,A,4,0,0,True
...,...,...,...,...,...,...,...
5115,modelGemini,withHint,B,155,0,0,True
5116,modelGemini,withHint,B,156,0,0,True
5117,modelGemini,withHint,B,157,3,2,False
5118,modelGemini,withHint,B,158,0,0,True


In [84]:
classification_report_df[classification_report_df["model"]=="modelGemini"]

Unnamed: 0,model,hint_status,speaker,label,precision,recall,f1-score,support,accuracy
112,modelGemini,noHint,A Utterance,0,0.528,0.956522,0.680412,69.0,0.78125
113,modelGemini,noHint,A Utterance,1,0.0,0.0,0.0,9.0,0.0125
114,modelGemini,noHint,A Utterance,2,0.04,0.125,0.060606,8.0,0.15625
115,modelGemini,noHint,A Utterance,3,0.875,0.094595,0.170732,74.0,0.05
116,modelGemini,noHint,B Utterance,0,0.619355,0.989691,0.761905,97.0,0.96875
117,modelGemini,noHint,B Utterance,1,0.0,0.0,0.0,11.0,0.0
118,modelGemini,noHint,B Utterance,2,0.0,0.0,0.0,15.0,0.03125
119,modelGemini,noHint,B Utterance,3,0.0,0.0,0.0,37.0,0.0
120,modelGemini,withHint,A Utterance,0,0.464789,0.956522,0.625592,69.0,0.8875
121,modelGemini,withHint,A Utterance,1,0.0,0.0,0.0,9.0,0.0


In [77]:
detailed_df[detailed_df['model'] == 'model2']

Unnamed: 0,model,hint_status,speaker,instance,true_label,predicted_label,correct
640,model2,noHint,A,0,0,0,True
641,model2,noHint,A,1,1,0,False
642,model2,noHint,A,2,0,0,True
643,model2,noHint,A,3,0,0,True
644,model2,noHint,A,4,0,0,True
...,...,...,...,...,...,...,...
1275,model2,withHint,B,155,0,0,True
1276,model2,withHint,B,156,0,0,True
1277,model2,withHint,B,157,3,0,False
1278,model2,withHint,B,158,0,0,True
