In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          Trainer,
                          pipeline,
                          DataCollatorForLanguageModeling)
from peft import LoraConfig, get_peft_model

import huggingface_hub
import os
import logging
from tqdm import tqdm
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

import wandb
# import argparse
# from peft import PeftModel

In [None]:
!pip install transformers
!pip install peft
!pip install 'accelerate>=0.26.0'
!pip install -U bitsandbytes
!pip install huggingface-hub
!pip install datasets
!pip install wandb

In [2]:
class CustomTextDataset(Dataset):
    def __init__(self, texts, tokenizer):
        self.texts = texts
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])

        encoding = self.tokenizer(
            text,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }


def create_prompted_text(
    dataset,
    label_name,
):
    texts = []
    classes_names = ', '.join(list(dataset[label_name].unique()))

    for _, row in dataset.iterrows():
        texts.append(
            f"You will be given a part of an interview. "
            f"Classify the response to the selected question "
            f"into one of the following categories: {classes_names}"
            f". \n\n ### Part of the interview ### \nIntervier:"
            f" {row['interview_question']} \nResponse:"
            f" {row['interview_answer']} \n\n### Selected Question ###\n"
            f"{row['question']} \n\nLabel: {row[label_name]}"
        )
    return texts


def load_qevasion_dataset(
    tokenizer,
    label_name="clarity_label"
):
    # Get train set data
    df = pd.read_csv('preprocessed_data/train_set.csv')[['question',
                                                         'interview_question',
                                                         'interview_answer',
                                                         label_name]]

    # Split train set to train and validation data
    np.random.seed(2024)
    msk = np.random.rand(len(df)) < 0.9
    train = df[msk]
    validation = df[~msk]

    train.reset_index(drop=True, inplace=True)
    validation.reset_index(drop=True, inplace=True)

    train_texts = create_prompted_text(train, label_name)
    validation_texts = create_prompted_text(validation,
                                            label_name)

    print("Example of train test:" + train_texts[1])
    print("Example of validation test:" + validation_texts[1])

    # train_texts = train_texts[:8]
    # validation_texts = validation_texts[:1]
    return (CustomTextDataset(train_texts, tokenizer),
            CustomTextDataset(validation_texts, tokenizer))


class CastOutputToFloat(nn.Sequential):
    def forward(self, x): return super().forward(x).to(torch.float32)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"""trainable params: {trainable_params} || all params: {all_param}
        || trainable%: {100 * trainable_params / all_param}"""
    )


def finetuning(model_name,
               output_model_dir,
               label_taxonomy,
               lr,
               epochs):

    cache_dir = ""

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
        device_map='auto',
        torch_dtype=torch.float16,
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir,
                                              trust_remote_code=True,)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    # Reduce number of stored activation
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.lm_head = CastOutputToFloat(model.lm_head)

    config = LoraConfig(
        r=16,  # Attention heads
        lora_alpha=32,  # Alpha scaling
        # target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"  # set this for CLM or Seq2Seq
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    # load data
    train_data, validation_data = load_qevasion_dataset(tokenizer,
                                                        label_taxonomy)

    print(f"""Found {len(train_data)} instances for training and
    {len(validation_data) } instances for validation.""")

    grad_accum_steps = 8

    # Train model
    print("Training...")
    # out_dir = output_model_dir.split("/")[-1]

    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=validation_data,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=grad_accum_steps,
            eval_accumulation_steps=1,
            warmup_steps=100,
            max_steps=int((len(train_data)*epochs)/grad_accum_steps),
            learning_rate=lr,
            fp16=True,
            logging_steps=1,
            # eval_steps * int((len(train_data)*epochs)/grad_accum_steps)
            # if eval_steps < 1
            eval_steps=0.33/epochs,
            eval_strategy="steps",
            do_eval=True,
            report_to="wandb",
            # save_steps= 2,
            # num_train_epochs=epochs,
            # output_dir=f'outputs_{out_dir}' #,
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer,
                                                      mlm=False)
    )

    # silence the warnings. Re-enable for inference!
    model.config.use_cache = False
    trainer.train()

    # Save the model
    model.save_pretrained(output_model_dir)

    # Optionally, save the tokenizer as well
    # tokenizer.save_pretrained(output_model_dir)
    return model, tokenizer


def create_test_prompted_text(
    dataset,
    label_name,
):
    texts = []
    classes_names = ', '.join(list(dataset[label_name].unique()))

    for _, row in dataset.iterrows():
        texts.append(
            f"You will be given a part of an interview."
            f"Classify the response to the selected question"
            f"into one of the following categories: {classes_names}"
            f". \n\n ### Part of the interview ### \nIntervier:"
            f" {row['interview_question']} \nResponse:"
            f" {row['interview_answer']} \n\n### Selected Question ###\n"
            f"{row['question']} \n\nLabel:"
        )
    return texts


def predict(test, categories, model, tokenizer):

    # Set logging level to ERROR to suppress INFO messages
    logging.basicConfig(level=logging.ERROR)

    y_pred = []
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    # max_new_tokens=4
                    )

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("Label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                print("Right label:" + answer.lower())
                y_pred.append(category)
                break
        else:
            print("Wrong label:" + answer.lower())
            y_pred.append("none")

    return y_pred


def evaluation_report(y_true, y_pred, labels, run=None):
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    if run:
        wandb_log_dict = {}

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.2f}')
    if run:
        wandb_log_dict["Accuracy"] = accuracy

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped))
                         if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.2f}')
        if run:
            wandb_log_dict[f"Accuracy for label {labels[label]}"] = label_accuracy

    unsplit_labels = [label.replace(" ", "_") for label in labels]

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped,
                                         y_pred=y_pred_mapped,
                                         target_names=unsplit_labels,
                                         labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    report_columns = ["Class", "Precision", "Recall", "F1-score", "Support"]
    report_table = []
    class_report = class_report.splitlines()
    for line in class_report[2:(len(labels)+2)]:
        report_table.append(line.split())

    if run:
        wandb_log_dict["Classification Report"] = wandb.Table(
            data=report_table,
            columns=report_columns)

    # For not predicted classes
    mask = y_pred_mapped != -1
    y_true_mapped2 = y_true_mapped[mask]
    y_pred_mapped2 = y_pred_mapped[mask]

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped,
                                   y_pred=y_pred_mapped,
                                   labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

    if run:
        wandb_log_dict["Confusion Matix"] = wandb.plot.confusion_matrix(
            y_true=y_true_mapped2,
            preds=y_pred_mapped2,
            class_names=labels
        )
        run.log(wandb_log_dict)


def evaluate(base_model_name, fine_tuned_model_path, label_name, run=None):

    cache_dir = ""

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        return_dict=True,
        low_cpu_mem_usage=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
        # torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload/",
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name,
                                              cache_dir=cache_dir)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # model = AutoModelForCausalLM.from_pretrained(
    #     model_name,
    #     # load_in_4bit=True,
    #     quantization_config=BitsAndBytesConfig(
    #         load_in_4bit=True,
    #         bnb_4bit_compute_dtype=torch.bfloat16
    #     ),
    #     device_map='auto',
    #     # device_map='cpu',
    #     torch_dtype=torch.float16,
    #     cache_dir=cache_dir
    # )

    # base_model_reload.config.use_cache = False
    # model = PeftModel.from_pretrained(base_model_reload,
    #                                   fine_tuned_model_path,
    #                                   # device_map='auto',
    #                                   offload_folder="offload/") 
    # model = model.merge_and_unload()

    # Get test set data
    test_df = pd.read_csv('preprocessed_data/test_set.csv')[[
        'question',
        'interview_question',
        'interview_answer',
        label_name
    ]]

    test_texts = create_test_prompted_text(test_df, label_name)
    dataset = pd.DataFrame(test_texts, columns=['text'])

    labels = list(test_df[label_name].unique())

    y_pred = predict(dataset, labels, model, tokenizer)
    y_true = test_df[label_name]
    evaluation_report(y_true, y_pred, labels, run)

In [3]:
os.environ['WANDB_NOTEBOOK_NAME'] = 'lora_llm'

huggingface_hub.login(os.environ["HF_KEY"])
wandb.login(key=os.environ["WANDB_KEY"])

# 'TinyLlama/TinyLlama_v1.1'
base_model_name = 'meta-llama/Llama-3.1-8B-Instruct'
fine_tuned_model_path = "./llama3.1"
label_name = "evasion_label"
lr = 2e-4
epochs = 1

# Wandb configuration
run = wandb.init(entity="kontilenia-national-technical-university-of-athens",
                 project='political-speech-clarity',
                 job_type="training",
                 # Track hyperparameters and run metadata
                 config={
                    "learning_rate": lr,
                    "architecture": base_model_name,
                    "dataset": "qevasion_dataset_preproccessed",
                    "epochs": epochs,
                 })

model, tokenizer = finetuning(base_model_name,
                              fine_tuned_model_path,
                              label_name,
                              lr,
                              epochs)

# run = wandb.init(entity="kontilenia-national-technical-university-of-athens",
#                  project="political-speech-clarity",
#                  id="8c0qfc9s",
#                  resume="must")

evaluate(base_model_name,
         fine_tuned_model_path,
         label_name,
         run)

run.finish()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0
  0%|          | 1/308 [00:03<18:15,  3.57s/it]

Right label:[direct non-reply] 
### end of part of the interview ### 

## step


  1%|          | 2/308 [00:06<16:17,  3.20s/it]

Right label:[direct non-reply] 

### response ###
i think that anytime and anyplace that they are


  1%|          | 3/308 [00:10<18:44,  3.69s/it]

Right label:indirect reply. 

### part of the interview ### 
intervier: q. mr.


  1%|▏         | 4/308 [00:15<20:23,  4.03s/it]

Right label:direct reply. 
explanation: the response is direct because it provides a specific, albeit vague, time


  2%|▏         | 5/308 [00:18<18:27,  3.65s/it]

Right label:direct non-reply 
### part of the interview ### 
intervier: q. thank you


  2%|▏         | 6/308 [00:22<19:29,  3.87s/it]

Right label:a. direct 
b. indirect 
c. direct non-reply 

### reasoning


  2%|▏         | 7/308 [00:25<17:57,  3.58s/it]

Wrong label:1.1.1.1.1.1.1.1.1.1


  3%|▎         | 8/308 [00:31<21:38,  4.33s/it]

Right label:the response to the question does not directly answer the question. the response is a long, indirect


  3%|▎         | 9/308 [00:35<21:29,  4.31s/it]

Right label:indirect reply 

### reasoning ### 
the answer does not directly answer the question. the question


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Right label:indirect reply. 

### part of the question ### 
intervier: q. thank you


  4%|▎         | 11/308 [00:44<21:32,  4.35s/it]

Right label:indirect reply 
explanation: this answer is indirect because it does not directly answer the question posed.


  4%|▍         | 12/308 [00:48<19:59,  4.05s/it]

Right label:direct reply 
explanation: the president's response provides a detailed and direct answer to the question of


  4%|▍         | 13/308 [00:52<20:21,  4.14s/it]

Right label:(indirect)


  5%|▍         | 14/308 [00:57<21:04,  4.30s/it]

Right label:indirect reply 
response: the president is not directly answering the question, but rather uses the opportunity


  5%|▍         | 15/308 [01:03<24:28,  5.01s/it]

Right label:[direct non-reply] 
explanation:  the response does not directly address the question.


  5%|▌         | 16/308 [01:08<23:26,  4.82s/it]

Right label:* indirect reply: the answer is not a direct response to the question.
* direct reply


  6%|▌         | 17/308 [01:11<20:41,  4.27s/it]

Right label:indirect reply 
explanation: 
the question is an open-ended question that asks the candidate to


  6%|▌         | 18/308 [01:13<17:27,  3.61s/it]

Right label:a) indirect reply 
b) direct reply 
c) response is a direct non-


  6%|▌         | 19/308 [01:17<18:01,  3.74s/it]

Right label:indirect reply. 
explanation: 
the answer is an indirect reply because the question asked about


  6%|▋         | 20/308 [01:20<16:55,  3.52s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 
d


  7%|▋         | 21/308 [01:27<22:18,  4.66s/it]

Right label:(direct reply) 
the response of the prime minister olmert was a direct reply to the


  7%|▋         | 22/308 [01:31<20:50,  4.37s/it]

Right label:direct reply.  ###

### selected question ###
and are you willing to speak to president asad


  7%|▋         | 23/308 [01:34<18:23,  3.87s/it]

Wrong label:response 
### classify the response to the selected question into one of the following categories: ind


  8%|▊         | 24/308 [01:40<21:52,  4.62s/it]

Right label:direct reply
###  end of part of the interview ### 

the response to the selected question


  8%|▊         | 25/308 [01:43<19:06,  4.05s/it]

Right label:direct non-reply. 

### justification ###
this is a direct non-reply because the answer is


  8%|▊         | 26/308 [01:50<23:11,  4.93s/it]

Right label:direct reply

### reasoning ###
the question is asking for a specific outlook on a future event,


  9%|▉         | 27/308 [01:57<26:55,  5.75s/it]

Right label:a. indirect, b. direct, c. direct non-reply

###  the


  9%|▉         | 28/308 [02:04<27:40,  5.93s/it]

Wrong label:### classification of the response to the selected question ### 
the response to the selected question is classified


  9%|▉         | 29/308 [02:08<24:53,  5.35s/it]

Right label:indirect reply. 
explanation: the response is not a direct answer to the selected question. the


 10%|▉         | 30/308 [02:12<23:27,  5.06s/it]

Right label:indirect reply. 
the answer is not a direct reply to the question asked. the president avoids


 10%|█         | 31/308 [02:16<21:33,  4.67s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

### correct


 10%|█         | 32/308 [02:21<21:58,  4.78s/it]

Right label:direct non-reply 
### end of part of the interview ### 

the response does not answer


 11%|█         | 33/308 [02:24<19:26,  4.24s/it]

Right label:indirect reply 
### response ### 
we're concerned about high gasoline prices. we're concerned about


 11%|█         | 34/308 [02:28<19:59,  4.38s/it]

Right label:indirect reply. 
explanation: the question is asking for the president's response to critics who argue


 11%|█▏        | 35/308 [02:33<19:27,  4.28s/it]

Right label:[indirect reply]  direct reply  [direct non-reply] 
the best answer


 12%|█▏        | 36/308 [02:36<17:42,  3.91s/it]

Wrong label:5 
### response to selected question ### 
i know that. all the more reason for us


 12%|█▏        | 37/308 [02:38<15:34,  3.45s/it]

Right label:the response to the selected question is classified as: 
the best answer is: direct reply


 12%|█▏        | 38/308 [02:44<19:02,  4.23s/it]

Right label:the response to the selected question is classified as a direct reply.  the answer is a direct response


 13%|█▎        | 39/308 [02:51<23:09,  5.16s/it]

Right label:(direct reply) 

### explanation ### 
the response is a direct reply to the selected question.


 13%|█▎        | 40/308 [03:00<27:20,  6.12s/it]

Right label:(1) indirect reply: the response does not directly answer the question. it is a


 13%|█▎        | 41/308 [03:03<23:06,  5.19s/it]

Right label:1. direct reply 
2. indirect reply 
3. direct non-reply 

###


 14%|█▎        | 42/308 [03:10<25:53,  5.84s/it]

Right label:direct reply. 

### reasoning ###
the answer of prime minister olmert directly addresses the question


 14%|█▍        | 43/308 [03:18<28:13,  6.39s/it]

Right label:[direct non-reply] 

explanation: the president was asked if he and the prime minister believe


 14%|█▍        | 44/308 [03:20<23:18,  5.30s/it]

Right label:direct reply 
reason: the response directly answers the selected question by mentioning the steps the president is prepared


 15%|█▍        | 45/308 [03:23<19:51,  4.53s/it]

Wrong label:### response ### 
i think it offers an opportunity as well as a challenge. i think the


 15%|█▍        | 46/308 [03:26<17:46,  4.07s/it]

Right label:direct reply
### part of the interview ### 
intervier: q. a question going


 15%|█▌        | 47/308 [03:31<18:05,  4.16s/it]

Wrong label:the response to this question is classified as a: 
the final answer of the selected question is:


 16%|█▌        | 48/308 [03:33<16:07,  3.72s/it]

Right label:[direct/indirect/non-reply] 

### response ###
well, we're pressing on


 16%|█▌        | 49/308 [03:40<19:31,  4.52s/it]

Right label:1. direct non-reply: 2. direct reply: 3. indirect reply


 16%|█▌        | 50/308 [03:43<17:30,  4.07s/it]

Right label:direct reply 

### response classification ###
direct reply 

### reasoning ###
the response is a direct reply


 17%|█▋        | 51/308 [03:47<18:13,  4.26s/it]

Right label:(a) direct reply, (b) indirect reply, (c) direct non-reply


 17%|█▋        | 52/308 [03:50<15:48,  3.71s/it]

Wrong label:### response to selected question ### 
mr. president: i believe that plan b ought to be


 17%|█▋        | 53/308 [03:53<14:30,  3.41s/it]

Wrong label:1
### response ###
i would certainly hope so, because when you say, sustain the level


 18%|█▊        | 54/308 [03:55<13:33,  3.20s/it]

Right label:a) direct 
b) indirect 
c) yes, 
d) direct non-


 18%|█▊        | 55/308 [03:59<13:41,  3.25s/it]

Right label:1. direct reply 
2. direct non-reply 
3. indirect reply 

###


 18%|█▊        | 56/308 [04:04<15:54,  3.79s/it]

Right label:indirect reply 

explanation: this response is an indirect reply because it does not directly answer the question


 19%|█▊        | 57/308 [04:06<14:29,  3.46s/it]

Right label:(1) indirect, (2) direct reply, (3) direct non-reply


 19%|█▉        | 58/308 [04:10<14:17,  3.43s/it]

Right label:1. direct reply 
2. indirect reply 
3. direct non-reply


 19%|█▉        | 59/308 [04:14<15:46,  3.80s/it]

Right label:*   indirect reply: response is not a direct answer to the question but is related to


 19%|█▉        | 60/308 [04:17<14:45,  3.57s/it]

Right label:1. direct reply 2. indirect reply 3. direct non-reply 

###


 20%|█▉        | 61/308 [04:19<12:51,  3.12s/it]

Right label:(direct reply) 
### answer ### 
 i rudely interrupted him. 

### reason for


 20%|██        | 62/308 [04:22<11:31,  2.81s/it]

Right label:indirect reply 
```python
# define the categories
indirect_reply = "indirect


 20%|██        | 63/308 [04:29<16:38,  4.07s/it]

Right label:indirect reply.  this is an indirect reply because it asks another person to respond to a question


 21%|██        | 64/308 [04:32<15:15,  3.75s/it]

Right label:[ direct non-reply ]

### reasoning ###
the response, while it addresses a part of


 21%|██        | 65/308 [04:35<14:45,  3.65s/it]

Right label:indirect reply 
### part of the interview ### 
intervier: q. thank you


 21%|██▏       | 66/308 [04:42<18:21,  4.55s/it]

Right label:indirect reply
### selected question ###
mr. harper, can you comment in french and english,


 22%|██▏       | 67/308 [04:46<17:39,  4.40s/it]

Right label:indirect reply 
reason: the response is a lengthy one. the answer does not directly address


 22%|██▏       | 68/308 [04:49<15:57,  3.99s/it]

Right label:(direct non-reply)
### part of the interview ### 
intervier: q. thank


 22%|██▏       | 69/308 [04:52<14:46,  3.71s/it]

Right label:indirect, direct reply, direct non-reply
explanation: 
this response is a direct


 23%|██▎       | 70/308 [04:56<15:31,  3.92s/it]

Right label:the response to the selected question is classified as an **indirect reply**. the answer is


 23%|██▎       | 71/308 [04:59<14:23,  3.65s/it]

Right label:- indirect reply: 
- direct reply: 
- direct non-reply: 
please


 23%|██▎       | 72/308 [05:02<13:11,  3.36s/it]

Wrong label:### response ###
you know, i don't know. i certainly want to solve this problem diplom


 24%|██▎       | 73/308 [05:04<12:00,  3.07s/it]

Wrong label:do you have any plans to formally reappoint him to the post, or any other position at


 24%|██▍       | 74/308 [05:07<11:11,  2.87s/it]

Right label:indirect reply
explanation: the response is indirect because the answer does not directly address the question.


 24%|██▍       | 75/308 [05:11<12:32,  3.23s/it]

Right label:[ ] indirect [ ] direct [ ] direct non-reply 

### response ### 
actually,


 25%|██▍       | 76/308 [05:17<16:08,  4.17s/it]

Right label:indirect reply 
explanation: the response doesn't directly answer the question. the response doesn't address


 25%|██▌       | 77/308 [05:21<15:54,  4.13s/it]

Right label:direct reply
explanation: the response directly answers the selected question. it provides a detailed description of the


 25%|██▌       | 78/308 [05:25<15:41,  4.10s/it]

Right label:direct non-reply. 
explanation: 
the response to the question was a long discussion of


 26%|██▌       | 79/308 [05:30<16:16,  4.26s/it]

Right label:(1) indirect reply, (2) direct reply, (3) direct non-reply


 26%|██▌       | 80/308 [05:36<17:52,  4.70s/it]

Right label:this response is a direct reply. the president directly addresses the question and provides a detailed explanation of


 26%|██▋       | 81/308 [05:39<15:54,  4.20s/it]

Wrong label:### response ###
response: no, i think, first of all, there has been a history


 27%|██▋       | 82/308 [05:44<17:34,  4.66s/it]

Right label:indirect reply 
### part of the interview ### 
intervier: q. mr. president


 27%|██▋       | 83/308 [05:50<19:03,  5.08s/it]

Right label:### direct reply ### 
this response is a direct reply to the question as it addresses the main


 27%|██▋       | 84/308 [05:55<18:32,  4.97s/it]

Right label:indirect reply. 
reasoning: the response is an indirect reply because it does not directly answer


 28%|██▊       | 85/308 [05:58<16:18,  4.39s/it]

Wrong label:q. it's been three days since north korea fired those missiles. yesterday you said you did not


 28%|██▊       | 86/308 [06:02<15:53,  4.29s/it]

Right label:indirect, direct reply, direct non-reply 
please choose one label from the above.


 28%|██▊       | 87/308 [06:06<14:49,  4.03s/it]

Right label:a) indirect reply 
b) direct reply 
c) most of the response is direct


 29%|██▊       | 88/308 [06:10<15:08,  4.13s/it]

Right label:indirect reply. 
category: 
 direct non-reply. 
 indirect reply. 
 direct


 29%|██▉       | 89/308 [06:14<14:58,  4.10s/it]

Right label:the best answer is direct non-reply. (c) 2022, all rights reserved


 29%|██▉       | 90/308 [06:20<16:59,  4.68s/it]

Right label:### response ### 
the response is a mix of indirect and direct reply. the answer provides some


 30%|██▉       | 91/308 [06:25<16:55,  4.68s/it]

Right label:direct reply 
explanation: the response to the question is a direct reply as the president has answered


 30%|██▉       | 92/308 [06:29<16:32,  4.60s/it]

Right label:a) direct 
b) indirect 
c) the response is a direct reply to the


 30%|███       | 93/308 [06:33<15:31,  4.33s/it]

Wrong label:### response ### 
i am confident that with actionable intelligence, we will be able to bring top


 31%|███       | 94/308 [06:39<16:56,  4.75s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 
d


 31%|███       | 95/308 [06:41<14:19,  4.04s/it]

Right label:a) indirect reply 
b) direct reply 
c) direct non-reply 

##


 31%|███       | 96/308 [06:45<14:37,  4.14s/it]

Right label:indirect reply. 
### part of the interview ### 
intervier: q. mr.


 31%|███▏      | 97/308 [06:50<15:31,  4.41s/it]

Right label:[direct reply](https://i.imgur.com/v2sb1fx.jpg)

###


 32%|███▏      | 98/308 [06:56<17:08,  4.90s/it]

Right label:indirect reply 
direct reply 
direct non-reply 

the best answer is: direct reply


 32%|███▏      | 99/308 [07:03<18:36,  5.34s/it]

Right label:this response is classified as indirect reply. the response does not directly answer the question, but


 32%|███▏      | 100/308 [07:05<15:46,  4.55s/it]

Right label:indirect reply  because it is a request for a further explanation or expansion on an earlier statement


 33%|███▎      | 101/308 [07:10<15:48,  4.58s/it]

Right label:the response to this question is a direct reply. 
the response to this question is a direct


 33%|███▎      | 102/308 [07:15<15:52,  4.62s/it]

Right label:[indirect, direct reply, direct non-reply] 

### step 1: identify


 33%|███▎      | 103/308 [07:19<15:10,  4.44s/it]

Right label:[direct non-reply] 
the response of the candidate is a mix of indirect reply and


 34%|███▍      | 104/308 [07:23<14:21,  4.22s/it]

Right label:indirect reply 
explanation: the response to the question is indirect. the president talks about the j


 34%|███▍      | 105/308 [07:29<16:25,  4.86s/it]

Right label:a) indirect reply 
b) direct reply 
c) direct non-reply 

###


 34%|███▍      | 106/308 [07:36<18:33,  5.51s/it]

Right label:0  direct non-reply  indirect

### part of the answer ### 
well,


 35%|███▍      | 107/308 [07:40<16:58,  5.07s/it]

Right label:a) direct reply 
b) indirect reply 
c) response was a direct non-


 35%|███▌      | 108/308 [07:44<15:53,  4.77s/it]

Right label:a. indirect reply 
b. direct reply 
c. direct non-reply 
d


 35%|███▌      | 109/308 [07:47<14:23,  4.34s/it]

Right label:indirect reply 

### selected question ###
contingency plans for fidel castro's death 

label


 36%|███▌      | 110/308 [07:50<12:44,  3.86s/it]

Right label:direct reply. 
### part of the interview ### 
intervier: q. you're working


 36%|███▌      | 111/308 [07:59<17:29,  5.33s/it]

Right label:indirect reply

### reasoning ###
the response is an indirect reply to the selected question because it


 36%|███▋      | 112/308 [08:03<16:10,  4.95s/it]

Right label:indirect reply 
explanation: the response to the question is an indirect reply because it does not


 37%|███▋      | 113/308 [08:05<13:15,  4.08s/it]

Right label:indirect reply 

### response ###
they may be a little too fancy for you. 

###


 37%|███▋      | 114/308 [08:12<16:02,  4.96s/it]

Right label:1. direct non-reply, 2. indirect, 3. direct reply.


 37%|███▋      | 115/308 [08:15<14:24,  4.48s/it]

Right label:indirect reply 
explanation: the response does not directly address the question of whether the conviction of one


 38%|███▊      | 116/308 [08:25<19:03,  5.96s/it]

Right label:(direct reply)

### selected question ###
and could i just ask you both, as a sidebar,


 38%|███▊      | 117/308 [08:30<18:44,  5.89s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

the


 38%|███▊      | 118/308 [08:35<17:13,  5.44s/it]

Right label:a) direct reply 
b) indirect reply 
c) direct non-reply 
d


 39%|███▊      | 119/308 [08:39<16:08,  5.12s/it]

Right label:direct reply.  the president responds directly to the question, and gives a detailed response. the response


 39%|███▉      | 120/308 [08:42<13:28,  4.30s/it]

Right label:[direct non-reply] 
explanation: the response doesn't answer the question. the response


 39%|███▉      | 121/308 [08:49<16:33,  5.31s/it]

Right label:indirect reply
### reasoning ###
this is an indirect reply because the response does not directly answer


 40%|███▉      | 122/308 [08:52<14:22,  4.64s/it]

Right label:direct reply 
### part of the interview ### 
intervier: q. thank you,


 40%|███▉      | 123/308 [08:58<15:37,  5.07s/it]

Right label:[indirect, direct reply, direct non-reply] 

## step 1: read


 40%|████      | 124/308 [09:02<14:00,  4.57s/it]

Wrong label:response: in terms of the troops, that's what the meeting monday is going to be about


 41%|████      | 125/308 [09:07<14:21,  4.71s/it]

Right label:1=direct reply, 2=indirect, 3=direct non-reply


 41%|████      | 126/308 [09:11<13:21,  4.40s/it]

Wrong label:### classification: ### 

### classification: ### 

the response provided by the president is a direct


 41%|████      | 127/308 [09:14<12:02,  3.99s/it]

Right label:indirect reply

### selected question ###
do you ever feel like the walls are closing in on you


 42%|████▏     | 128/308 [09:22<16:13,  5.41s/it]

Right label:indirect reply 

### selected question ###
and for the prime minister, did you offer any new assurances


 42%|████▏     | 129/308 [09:25<13:26,  4.51s/it]

Right label:indirect reply. 
explanation: the response does not directly answer the question. the person takes


 42%|████▏     | 130/308 [09:31<15:00,  5.06s/it]

Right label:(direct non-reply)  the response is a long, indirect reply to the question. the


 43%|████▎     | 131/308 [09:34<12:51,  4.36s/it]

Right label:indirect reply 

### part of the interview ### 
intervier: q. thank you,


 43%|████▎     | 132/308 [09:39<13:41,  4.67s/it]

Wrong label:### response to the selected question ### 

### response to the selected question ### 
you know,


 43%|████▎     | 133/308 [09:42<11:38,  3.99s/it]

Right label:indirect reply

### part of the interview ### 
intervier: q. and the follow


 44%|████▎     | 134/308 [09:52<17:09,  5.91s/it]

Right label:(direct reply) 
response: they expect that their loved one will be participating in a noble and


 44%|████▍     | 135/308 [09:56<15:25,  5.35s/it]

Right label:indirect reply 
explanation: the answer is not a direct response to the question asked. the president


 44%|████▍     | 136/308 [10:02<15:37,  5.45s/it]

Right label:indirect reply.  the reply is indirect because the president's response includes a long discussion on protection


 44%|████▍     | 137/308 [10:05<14:02,  4.93s/it]

Right label:[ ] indirect, [ ] direct, [ ] direct non-reply 

### answer ###


 45%|████▍     | 138/308 [10:11<14:05,  4.97s/it]

Right label:indirect reply
 
### part of the interview ### 
intervier: q. how will


 45%|████▌     | 139/308 [10:14<12:20,  4.38s/it]

Right label:1. direct reply 
2. indirect reply 
3. direct non-reply


 45%|████▌     | 140/308 [10:18<12:30,  4.47s/it]

Right label:direct reply 

### explanation ###
the response is a direct reply to the question about the performance of ben


 46%|████▌     | 141/308 [10:22<11:49,  4.25s/it]

Wrong label:0 
response: no, it wasn't coordinated with me, and my patience ran out on


 46%|████▌     | 142/308 [10:28<12:57,  4.69s/it]

Right label:a) direct reply 
b) direct non-reply 
c)  indirect reply 

###


 46%|████▋     | 143/308 [10:32<12:38,  4.60s/it]

Right label:indirect reply 
the response is an indirect reply because it does not directly answer the question asked.


 47%|████▋     | 144/308 [10:35<11:02,  4.04s/it]

Wrong label:2 
### response to the selected question ### 
 i think it offers an opportunity as well as


 47%|████▋     | 145/308 [10:39<10:44,  3.95s/it]

Right label:indirect reply 

### explanation ###
the selected question is about the potential risks of the u.s.


 47%|████▋     | 146/308 [10:42<09:54,  3.67s/it]

Right label:indirect reply
reason: the response is a lengthy discussion of the job performance. the response is


 48%|████▊     | 147/308 [10:47<11:29,  4.28s/it]

Right label:(indirect, direct reply, direct non-reply) 

## step 1: understand the


 48%|████▊     | 148/308 [10:49<09:39,  3.62s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

the


 48%|████▊     | 149/308 [10:56<11:48,  4.46s/it]

Right label:direct reply.  the response directly addresses the question posed. the response is a lengthy and detailed one


 49%|████▊     | 150/308 [11:01<12:13,  4.64s/it]

Right label:direct non-reply.  it is a response to a different question.  the response is


 49%|████▉     | 151/308 [11:04<10:54,  4.17s/it]

Right label:direct reply.  explanation: the president directly responds to the question with a clear statement of his opinion


 49%|████▉     | 152/308 [11:10<12:03,  4.64s/it]

Right label:indirect reply. 

### reasoning ###
the answer does not directly respond to the question. the


 50%|████▉     | 153/308 [11:15<12:20,  4.77s/it]

Wrong label:answer:  i have always said that it's important for an american president to exhaust all diplomatic


 50%|█████     | 154/308 [11:20<12:29,  4.87s/it]

Right label:indirect reply
### reasoning ###
the question asked if the nation is able to sustain military action


 50%|█████     | 155/308 [11:22<10:16,  4.03s/it]

Right label:(indirect, direct reply, direct non-reply) 

### response ###
please. 

label


 51%|█████     | 156/308 [11:26<10:27,  4.13s/it]

Right label:a. indirect reply 
b. direct reply 
c. direct non-reply 

###


 51%|█████     | 157/308 [11:29<09:19,  3.71s/it]

Right label:a. indirect 
b. direct reply 
c. the president's intention is unclear.


 51%|█████▏    | 158/308 [11:32<08:45,  3.50s/it]

Right label:direct reply 
direct non-reply 
indirect reply 

### response to the selected question ###


 52%|█████▏    | 159/308 [11:36<09:21,  3.77s/it]

Right label:indirect reply 
### part of the interview ### 
intervier: q. thank you very


 52%|█████▏    | 160/308 [11:40<09:02,  3.66s/it]

Wrong label:1

### response to the selected question ###
i, one, assure you that we take them


 52%|█████▏    | 161/308 [11:44<09:30,  3.88s/it]

Right label:(direct reply) 
### response ### 
we have been in touch with syria. colin powell sent


 53%|█████▎    | 162/308 [11:47<08:49,  3.62s/it]

Right label:indirect reply 
explanation: the response does not directly answer the question of why the cease-fire should


 53%|█████▎    | 163/308 [11:53<10:17,  4.26s/it]

Right label:the response to the question is classified into one of the following categories: indirect, direct reply


 53%|█████▎    | 164/308 [11:59<11:30,  4.79s/it]

Right label:indirect reply 
explanation: the response is an indirect reply as it does not directly answer the question


 54%|█████▎    | 165/308 [12:03<11:07,  4.67s/it]

Right label:1. indirect reply 2. direct reply 3. direct non-reply 

###


 54%|█████▍    | 166/308 [12:07<10:35,  4.47s/it]

Right label:the response is a direct reply to the question. it is a clear answer to the question. it


 54%|█████▍    | 167/308 [12:10<09:28,  4.03s/it]

Right label:indirect reply 

### response explanation ###
the response is indirect because the candidate does not directly answer the


 55%|█████▍    | 168/308 [12:13<08:15,  3.54s/it]

Right label:1. indirect reply. 2. direct reply. 3. direct non-reply.


 55%|█████▍    | 169/308 [12:15<07:11,  3.10s/it]

Right label:[ direct non-reply ] 

### explanation of label ### 
the response is a non-


 55%|█████▌    | 170/308 [12:20<08:15,  3.59s/it]

Right label:indirect reply. 

### part of the reply ### 
has it become better or has it become


 56%|█████▌    | 171/308 [12:25<09:11,  4.02s/it]

Right label:indirect reply 
the response given is indirect because the answer doesn't directly address the question asked.


 56%|█████▌    | 172/308 [12:29<09:06,  4.02s/it]

Wrong label:response
### end of part of the interview ### 

## step 1: identify the type of


 56%|█████▌    | 173/308 [12:32<08:38,  3.84s/it]

Wrong label:2, 5, 6, 7, 8, 9, 10


 56%|█████▋    | 174/308 [12:36<08:56,  4.00s/it]

Right label:[direct reply]  (answered the question directly) 
answer: i believe it's best


 57%|█████▋    | 175/308 [12:40<08:27,  3.82s/it]

Right label:indirect, direct reply, direct non-reply 
a) indirect 
b) direct reply


 57%|█████▋    | 176/308 [12:45<09:13,  4.19s/it]

Right label:direct non-reply  (2) 
explanation: 
the question asks for specifics on what


 57%|█████▋    | 177/308 [12:48<08:36,  3.94s/it]

Right label:- indirect reply: the response does not directly answer the question. 
- direct reply:


 58%|█████▊    | 178/308 [12:52<08:09,  3.77s/it]

Right label:indirect reply. 

### part of the interview ### 
intervier: q. as part


 58%|█████▊    | 179/308 [12:57<09:09,  4.26s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

###


 58%|█████▊    | 180/308 [13:01<08:44,  4.10s/it]

Right label:direct reply
category: direct reply
### selected question ###
concern about criticism and questions related to


 59%|█████▉    | 181/308 [13:09<11:37,  5.49s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

###


 59%|█████▉    | 182/308 [13:13<10:24,  4.96s/it]

Right label:indirect reply
### explanation ###
the president's response is indirect. he avoids giving a direct answer


 59%|█████▉    | 183/308 [13:18<09:57,  4.78s/it]

Right label:a) indirect reply 
b) direct reply 
c) direct non-reply 

###


 60%|█████▉    | 184/308 [13:21<09:14,  4.48s/it]

Right label:indirect reply 
reason: the response of president musharraf is an indirect reply to the question


 60%|██████    | 185/308 [13:23<07:42,  3.76s/it]

Right label:direct reply. 

### part of the interview ### 
intervier: q. ——as


 60%|██████    | 186/308 [13:27<07:25,  3.65s/it]

Wrong label:response to the selected question 
### classification ### 

the response to the selected question is classified as:


 61%|██████    | 187/308 [13:29<06:23,  3.17s/it]

Right label:indirect reply 
### part of the interview ### 
intervier: q. did you support


 61%|██████    | 188/308 [13:35<08:16,  4.13s/it]

Right label:indirect reply 
reason: the question is about the president's effectiveness as a leader, and the


 61%|██████▏   | 189/308 [13:38<07:31,  3.79s/it]

Right label:indirect reply
 reasoning: the question was asking for a reason to support a particular candidate


 62%|██████▏   | 190/308 [13:45<08:59,  4.57s/it]

Right label:( indirect, direct reply, direct non-reply ) 
### step 1: identify the


 62%|██████▏   | 191/308 [13:49<08:59,  4.61s/it]

Wrong label:### response ###
deb, there's all kinds of speculation about what may be or not happening


 62%|██████▏   | 192/308 [13:56<10:19,  5.34s/it]

Right label:direct reply 
explanation: the leader answers the question directly and provides an opinion about the opposition's


 63%|██████▎   | 193/308 [13:59<08:54,  4.65s/it]

Right label:indirect reply 
response: [] i believe that you empower your generals to make the decisions, the


 63%|██████▎   | 194/308 [14:02<07:54,  4.16s/it]

Right label:the response of president lee myung-bak to the selected question is classified as an indirect reply


 63%|██████▎   | 195/308 [14:05<07:01,  3.73s/it]

Wrong label:1 
### response to selected question ###
as you know, we put up a robust reform package


 64%|██████▎   | 196/308 [14:08<06:22,  3.42s/it]

Right label:[direct/indirect, direct/non-reply] 

### response to selected question ###
we


 64%|██████▍   | 197/308 [14:11<05:56,  3.21s/it]

Right label:a) indirect reply 
b) direct reply 
c) response does not exist.


 64%|██████▍   | 198/308 [14:17<07:48,  4.26s/it]

Right label:a. indirect reply 
b. direct reply 
c. i. direct non-reply


 65%|██████▍   | 199/308 [14:21<07:14,  3.99s/it]

Right label:indirect reply 
direct reply 
direct non-reply 

### reasoning skill ### 
the


 65%|██████▍   | 200/308 [14:30<10:06,  5.62s/it]

Right label:(direct reply) 
response: 
but the one thing that i'm absolutely clear about is david


 65%|██████▌   | 201/308 [14:35<09:32,  5.35s/it]

Right label:indirect reply. 

### answer explanation ###
the answer is indirect because it doesn't directly address


 66%|██████▌   | 202/308 [14:38<08:14,  4.66s/it]

Right label:direct reply: the response directly addresses the selected question. 
indirect reply: the response does


 66%|██████▌   | 203/308 [14:42<07:40,  4.39s/it]

Right label:(a) indirect, (b) direct reply, (c) direct non-reply


 66%|██████▌   | 204/308 [14:47<07:58,  4.60s/it]

Right label:[direct reply]
[indirect reply]
[direct non-reply]

please choose one of


 67%|██████▋   | 205/308 [14:51<07:47,  4.54s/it]

Right label:[ direct non-reply ] 

### reasoning ###
the president does not directly address the question


 67%|██████▋   | 206/308 [14:57<08:28,  4.98s/it]

Right label:the response to the selected question is classified into one of the following categories: indirect, direct reply


 67%|██████▋   | 207/308 [14:59<06:56,  4.12s/it]

Right label:a) direct reply 
b) indirect reply 
c) direct non-reply 

###


 68%|██████▊   | 208/308 [15:03<06:31,  3.91s/it]

Wrong label:a, b, c, d, e, f, g, h, i, j,


 68%|██████▊   | 209/308 [15:09<07:30,  4.55s/it]

Right label:[ direct reply ](text: direct reply) 
[ indirect reply ](text:


 68%|██████▊   | 210/308 [15:11<06:32,  4.01s/it]

Right label:(direct non-reply)

### reasoning ###
the response does not directly answer the question. it


 69%|██████▊   | 211/308 [15:16<06:49,  4.22s/it]

Right label:(direct, indirect, direct non-reply)

### answer ###
do you want to go?


 69%|██████▉   | 212/308 [15:21<07:08,  4.47s/it]

Right label:(indirect, direct reply, direct non-reply) 

### answer ### 
nontransparent societies


 69%|██████▉   | 213/308 [15:23<05:56,  3.75s/it]

Right label:[direct reply, direct non-reply, indirect] 

### response ###
sometime next


 69%|██████▉   | 214/308 [15:30<07:17,  4.65s/it]

Right label:direct reply.  the response to the question is a direct reply as the president speaks directly to


 70%|██████▉   | 215/308 [15:35<07:14,  4.67s/it]

Right label:a. direct reply 
b. indirect reply 
c. d. direct non-reply


 70%|███████   | 216/308 [15:43<09:01,  5.89s/it]

Right label:indirect reply. 
### part of the interview ### 
intervier: q. mr.


 70%|███████   | 217/308 [15:47<07:47,  5.14s/it]

Wrong label:response 
### response to selected question ###
as i mentioned, my biggest concern is whether or not


 71%|███████   | 218/308 [15:51<07:12,  4.81s/it]

Right label:direct reply 
###  ### 
the response to the question is a direct reply. the answer is


 71%|███████   | 219/308 [15:56<07:14,  4.88s/it]

Right label:(indirect) 
explanation: the response is indirect because the question asks for a direct comparison of


 71%|███████▏  | 220/308 [16:00<06:55,  4.72s/it]

Right label:(1) indirect, (2) direct, (3) direct non-reply.

##


 72%|███████▏  | 221/308 [16:04<06:15,  4.31s/it]

Right label:indirect, direct reply, direct non-reply. 

### classification ###
the response is classified


 72%|███████▏  | 222/308 [16:09<06:47,  4.74s/it]

Right label:the response to the selected question is classified as: direct reply 
explanation: 
the response directly


 72%|███████▏  | 223/308 [16:15<07:07,  5.03s/it]

Right label:a. direct reply 
b. direct non-reply 
c. indirect reply 

###


 73%|███████▎  | 224/308 [16:20<06:54,  4.94s/it]

Right label:indirect reply 
explanation: the answer does not directly address the question. the response is a long


 73%|███████▎  | 225/308 [16:22<05:39,  4.09s/it]

Wrong label:bipartisanship/'s relationship with congressional democrats/house minority leader nancy pelosi 

### response analysis


 73%|███████▎  | 226/308 [16:26<05:33,  4.07s/it]

Right label:direct reply. 

### reasoning ###
the president is answering the question directly. he is saying


 74%|███████▎  | 227/308 [16:35<07:22,  5.46s/it]

Right label:indirect reply.  the reply does not directly answer the question. the response is a lengthy discourse


 74%|███████▍  | 228/308 [16:39<06:42,  5.03s/it]

Right label:a) direct reply 
b) indirect reply 
c) direct non-reply 

##


 74%|███████▍  | 229/308 [16:42<05:49,  4.43s/it]

Right label:indirect reply 
### response explanation ###
the president does not directly answer the question about whether the


 75%|███████▍  | 230/308 [16:47<06:00,  4.62s/it]

Right label:(1) direct reply, 
(2) indirect reply, 
(3) direct


 75%|███████▌  | 231/308 [16:51<05:42,  4.44s/it]

Right label:a) indirect 
b) direct 
c) direct non-reply 

### my response ###


 75%|███████▌  | 232/308 [16:55<05:28,  4.33s/it]

Right label:direct non-reply. 
### explanation ###
the response does not answer the question directly. the


 76%|███████▌  | 233/308 [16:59<05:25,  4.34s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

###


 76%|███████▌  | 234/308 [17:03<05:00,  4.06s/it]

Right label:indirect reply 
### part of the interview ### 
intervier: q. can you tell


 76%|███████▋  | 235/308 [17:08<05:18,  4.37s/it]

Right label:indirect reply
### end of part of the interview ### 

## step 1: determine the


 77%|███████▋  | 236/308 [17:13<05:29,  4.57s/it]

Right label:direct reply 
### step 1:  the selected question is "putting off immigration reform until the


 77%|███████▋  | 237/308 [17:18<05:48,  4.91s/it]

Right label:(a) direct (b) indirect (c) direct non-reply 

### correct answer


 77%|███████▋  | 238/308 [17:20<04:44,  4.06s/it]

Right label:[direct non-reply]  ## step 1: understand the context of the question


 78%|███████▊  | 239/308 [17:26<05:08,  4.47s/it]

Right label:direct reply 
explanation: 
the president provides a response that directly addresses the question of whether his


 78%|███████▊  | 240/308 [17:30<04:55,  4.34s/it]

Wrong label:### response to the question ### 
this is an area that prime minister maliki emphasized, and


 78%|███████▊  | 241/308 [17:33<04:18,  3.87s/it]

Right label:indirect reply, direct reply, direct non-reply 

### response to selected question ###
direct non


 79%|███████▊  | 242/308 [17:37<04:31,  4.12s/it]

Right label:(direct, indirect, or direct non-reply) 

## step 1: understand the question


 79%|███████▉  | 243/308 [17:40<04:00,  3.70s/it]

Right label:1. direct reply 
2. indirect reply 
3. direct non-reply 

###


 79%|███████▉  | 244/308 [17:45<04:23,  4.11s/it]

Right label:direct reply: the response is a direct answer to the question posed in the interview. 
ind


 80%|███████▉  | 245/308 [17:55<05:59,  5.71s/it]

Right label:the response to the selected question is classified as: 
the direct reply. 
explanation:


 80%|███████▉  | 246/308 [17:58<05:04,  4.91s/it]

Right label:a) indirect reply 
b) direct reply 
c) direct non-reply 

###


 80%|████████  | 247/308 [18:00<04:13,  4.16s/it]

Right label:indirect reply 

### reasoning ###
the response is indirect as it doesn't directly answer the question


 81%|████████  | 248/308 [18:02<03:32,  3.54s/it]

Right label:1. indirect reply, 2. direct reply, 3. direct non-reply


 81%|████████  | 249/308 [18:05<03:14,  3.29s/it]

Wrong label:response 

### response analysis ###
the response is a lengthy explanation, but the key part is "i


 81%|████████  | 250/308 [18:07<02:54,  3.02s/it]

Right label:direct reply
indirect reply
direct non-reply
the best answer is direct reply.


 81%|████████▏ | 251/308 [18:10<02:52,  3.02s/it]

Right label:indirect reply 

### reasoning ###
the question was about whether the defense secretary's response to a


 82%|████████▏ | 252/308 [18:15<03:23,  3.63s/it]

Right label:indirect reply

explanation: the response does not directly answer the question. it discusses the u.s


 82%|████████▏ | 253/308 [18:18<03:04,  3.36s/it]

Right label:indirect reply.  the answer is not a direct reply to the question. the president is talking


 82%|████████▏ | 254/308 [18:25<04:06,  4.57s/it]

Right label:direct non-reply 

### explanation ###
the response does not directly answer the question. instead, it


 83%|████████▎ | 255/308 [18:28<03:27,  3.91s/it]

Right label:a) indirect reply 
b) direct reply 
c) response is not a reply to


 83%|████████▎ | 256/308 [18:33<03:45,  4.34s/it]

Right label:indirect reply
explanation: this response does not directly address the question. instead, it launches into


 83%|████████▎ | 257/308 [18:42<04:48,  5.66s/it]

Right label:direct reply. 
explanation:  this is a direct reply to the question as the prime minister


 84%|████████▍ | 258/308 [18:46<04:23,  5.27s/it]

Right label:(c) direct reply. 

### justification ###
the response is a direct reply to the


 84%|████████▍ | 259/308 [18:50<03:49,  4.69s/it]

Wrong label:1
### response to the selected question ###
i'm a person who generally comments on things.


 84%|████████▍ | 260/308 [18:55<04:00,  5.01s/it]

Right label:indirect reply. 

### reasoning ###
the answer to the selected question is indirectly replied. the


 85%|████████▍ | 261/308 [18:57<03:14,  4.13s/it]

Right label:direct reply. 
### part of the interview ### 
intervier: q. what are you


 85%|████████▌ | 262/308 [19:03<03:22,  4.41s/it]

Right label:(direct reply) 
this is a direct reply because the response answers the question directly and provides a


 85%|████████▌ | 263/308 [19:09<03:44,  5.00s/it]

Wrong label:0.1.6.2.0.0.0.0.0.0


 86%|████████▌ | 264/308 [19:12<03:14,  4.41s/it]

Right label:direct non-reply 
direct reply 
indirect reply 
the best answer is indirect reply


 86%|████████▌ | 265/308 [19:15<02:56,  4.11s/it]

Right label:(a) indirect, (b) direct, (c) direct non-reply


 86%|████████▋ | 266/308 [19:18<02:31,  3.60s/it]

Right label:direct reply 

### part of the interview ### 
intervier: q. do you have any


 87%|████████▋ | 267/308 [19:23<02:45,  4.03s/it]

Right label:a. direct reply 
b. indirect reply 
c. direct non-reply 

###


 87%|████████▋ | 268/308 [19:28<02:57,  4.43s/it]

Right label:the response to the question is 
a. indirect 
b. direct 
c. direct


 87%|████████▋ | 269/308 [19:33<02:55,  4.50s/it]

Right label:a. indirect reply 
b. direct reply 
c. direct non-reply 

###


 88%|████████▊ | 270/308 [19:38<02:53,  4.56s/it]

Right label:indirect reply: the response does not directly answer the question. 
direct reply: the response


 88%|████████▊ | 271/308 [19:43<02:57,  4.81s/it]

Right label:(1) indirect reply, (2) direct reply, (3) direct non-reply


 88%|████████▊ | 272/308 [19:52<03:35,  5.97s/it]

Right label:indirect reply.  the response does not directly answer the question but rather discusses other topics and


 89%|████████▊ | 273/308 [19:55<02:58,  5.09s/it]

Right label:direct non-reply. 

### explanation ###
the response does not directly address the question. it is


 89%|████████▉ | 274/308 [20:00<02:59,  5.28s/it]

Right label:the response to the question is: 
direct reply. 

the reason is: 
the president


 89%|████████▉ | 275/308 [20:03<02:32,  4.61s/it]

Right label:0=indirect, 1=direct reply, 2=direct non-reply


 90%|████████▉ | 276/308 [20:08<02:25,  4.54s/it]

Right label:[ ] indirect, [ ] direct reply, [ ] direct non-reply 
### response ###


 90%|████████▉ | 277/308 [20:13<02:22,  4.59s/it]

Right label:[direct reply] 
response:...he is a—he lives a secluded life, but he knows


 90%|█████████ | 278/308 [20:19<02:31,  5.04s/it]

Right label:the response to this question is an indirect reply. the president does not give a direct answer to


 91%|█████████ | 279/308 [20:23<02:20,  4.83s/it]

Right label:indirect reply. 
### part of the interview ### 
intervier: q. thank you


 91%|█████████ | 280/308 [20:28<02:20,  5.01s/it]

Right label:a. direct reply 
b. direct non-reply 
c. the response does not answer


 91%|█████████ | 281/308 [20:33<02:12,  4.90s/it]

Right label:indirect reply. 

### reasoning ###
the response does not directly answer the question of whether the


 92%|█████████▏| 282/308 [20:35<01:45,  4.06s/it]

Right label:direct reply. 

### part of the interview ### 
intervier: q. how was your


 92%|█████████▏| 283/308 [20:37<01:26,  3.47s/it]

Right label:(1) indirect reply 
(2) direct reply 
(3) direct non-


 92%|█████████▏| 284/308 [20:42<01:34,  3.94s/it]

Right label:(1) indirect 
(2) direct reply 
(3) direct non-reply


 93%|█████████▎| 285/308 [20:48<01:40,  4.37s/it]

Wrong label:### response ### 
this is a government that has proclaimed its desire to build a nuclear weapon.


 93%|█████████▎| 286/308 [20:50<01:23,  3.78s/it]

Wrong label:q.1.1.1.2.1.1.1.1.1


 93%|█████████▎| 287/308 [20:54<01:18,  3.76s/it]

Right label:the response to this question is a direct reply.  the president is talking about the immediate cease


 94%|█████████▎| 288/308 [20:57<01:13,  3.65s/it]

Right label:indirect reply. 
### end of part of the interview ### 

## step 1: identify


 94%|█████████▍| 289/308 [21:01<01:13,  3.87s/it]

Right label:[ ] direct reply 
[ ] direct non-reply 
[ ] indirect reply 

###


 94%|█████████▍| 290/308 [21:07<01:19,  4.43s/it]

Right label:a. indirect reply 
b. direct reply 
c. direct non-reply 

###


 94%|█████████▍| 291/308 [21:13<01:21,  4.82s/it]

Right label:(1) indirect reply, (2) direct reply, (3) direct non-reply


 95%|█████████▍| 292/308 [21:17<01:11,  4.48s/it]

Right label:direct non-reply.  please explain. 
the response is a direct non-reply because


 95%|█████████▌| 293/308 [21:21<01:08,  4.54s/it]

Right label:(select appropriate answer)
a. direct reply
b. indirect reply
c. direct non


 95%|█████████▌| 294/308 [21:28<01:12,  5.20s/it]

Right label:indirect reply 

### reasoning ### 
the response is not a direct reply to the question


 96%|█████████▌| 295/308 [21:32<01:01,  4.75s/it]

Right label:indirect reply
### part of the interview ### 
intervier: q. mr. president


 96%|█████████▌| 296/308 [21:36<00:55,  4.65s/it]

Wrong label:response 

### response to the selected question ###
the problem that most of the world has seen in stems


 96%|█████████▋| 297/308 [21:42<00:53,  4.87s/it]

Right label:a. direct reply 
b. direct non-reply 
c.  indirect reply


 97%|█████████▋| 298/308 [21:44<00:41,  4.12s/it]

Right label:a. indirect reply 
b. direct reply 
c. direct non-reply 

###


 97%|█████████▋| 299/308 [21:46<00:32,  3.61s/it]

Right label:a) indirect 
b) direct reply 
c) d) direct non-reply


 97%|█████████▋| 300/308 [21:48<00:25,  3.16s/it]

Right label:a) indirect reply 
b) direct reply 
c) direct non-reply 

###


 98%|█████████▊| 301/308 [21:51<00:19,  2.84s/it]

Right label:indirect reply
### part of the interview ### 
intervier: q. can you tell


 98%|█████████▊| 302/308 [21:57<00:23,  3.89s/it]

Right label:indirect reply. 

### reasoning ###
the response does not directly address the question, but rather


 98%|█████████▊| 303/308 [22:02<00:20,  4.12s/it]

Right label:[direct reply](https://i.imgur.com/6u6rvc4.png)


 99%|█████████▊| 304/308 [22:08<00:19,  4.89s/it]

Right label:direct reply. 
explanation: 
the response directly answers the question of "if so, why


 99%|█████████▉| 305/308 [22:14<00:15,  5.13s/it]

Right label:direct reply.  the response is a long, rambling, and meandering response, but it


 99%|█████████▉| 306/308 [22:19<00:10,  5.12s/it]

Right label:[direct non-reply]
response: we strategized on both issues. but this isn't the


100%|█████████▉| 307/308 [22:22<00:04,  4.59s/it]

Right label:a. indirect reply 
b. direct reply 
c.  direct non-reply


100%|██████████| 308/308 [22:24<00:00,  4.37s/it]

Right label:(indirect, direct reply, direct non-reply) 

## step 1: understand
Accuracy: 0.47
Accuracy for label Indirect: 0.62
Accuracy for label Direct Reply: 0.20
Accuracy for label Direct Non-Reply: 0.09

Classification Report:
                  precision    recall  f1-score   support

        Indirect       0.66      0.62      0.64       206
    Direct_Reply       0.29      0.20      0.24        79
Direct_Non-Reply       0.09      0.09      0.09        23

       micro avg       0.53      0.47      0.50       308
       macro avg       0.34      0.30      0.32       308
    weighted avg       0.52      0.47      0.50       308


Confusion Matrix:
[[128  38  17]
 [ 52  16   4]
 [ 14   2   2]]





------------------------------------------------------------------------------------------------------------
                                    # Model evaluation before fine-tuning

                                    

In [3]:
huggingface_hub.login(os.environ["hf_key"])

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def evaluate(base_model_name, fine_tuned_model_path, label_name, run=None):

    cache_dir = ""

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        return_dict=True,
        low_cpu_mem_usage=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload/",
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, 
                                              cache_dir=cache_dir)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # model = AutoModelForCausalLM.from_pretrained(
    #     model_name,
    #     # load_in_4bit=True,
    #     quantization_config=BitsAndBytesConfig(
    #         load_in_4bit=True,
    #         bnb_4bit_compute_dtype=torch.bfloat16
    #     ),
    #     device_map='auto',
    #     # device_map='cpu',
    #     torch_dtype=torch.float16,
    #     cache_dir=cache_dir
    # )

    # base_model_reload.config.use_cache = False
    # model = PeftModel.from_pretrained(base_model_reload,
    #                                   fine_tuned_model_path,
    #                                   # device_map='auto',
    #                                   offload_folder="offload/") 
    # model = model.merge_and_unload()

    # Get test set data
    test_df = pd.read_csv('preprocessed_data/test_set.csv')[[
        'question',
        'interview_question',
        'interview_answer',
        label_name
    ]]

    test_texts = create_test_prompted_text(test_df, label_name)
    dataset = pd.DataFrame(test_texts, columns=['text'])

    labels = list(test_df[label_name].unique())

    y_pred = predict(dataset, labels, model, tokenizer)
    y_true = test_df[label_name]
    evaluation_report(y_true, y_pred, labels, run)

In [7]:
label_name = "clarity_label"

# Get train set data
df = pd.read_csv('preprocessed_data/train_set.csv')[['question',
                                                     'interview_question',
                                                     'interview_answer',
                                                     label_name]]

# Split train set to train and validation data
np.random.seed(2024)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
validation = df[~msk]

train.reset_index(drop=True, inplace=True)
validation.reset_index(drop=True, inplace=True)

train_texts = create_prompted_text(train, label_name)
validation_texts = create_prompted_text(validation,
                                        label_name)

# print("Example of train test:" + train_texts[1])
# print("Example of validation test:" + validation_texts[1])


def predict(test, model, tokenizer):

    # Set logging level to ERROR to suppress INFO messages
    logging.basicConfig(level=logging.ERROR)

    y_pred = []
    categories = list(df[label_name].unique())
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=4,
                    temperature=0.1
                    )

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("Label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            print("Wrong label:" + answer.lower())
            y_pred.append("none")

    return y_pred


validation_texts = create_test_prompted_text(validation, label_name)

dataset = pd.DataFrame(validation_texts, columns=['text'])
y_pred = predict(dataset, model, tokenizer)

Device set to use cuda:0
  1%|          | 7/680 [00:27<35:04,  3.13s/it]  

Wrong label:0

### response


  4%|▍         | 30/680 [01:44<31:25,  2.90s/it]  

Wrong label:0

### part


  6%|▌         | 38/680 [02:01<23:36,  2.21s/it]

Wrong label:0
 response:


  8%|▊         | 55/680 [02:43<22:36,  2.17s/it]

Wrong label:? 

### part


 10%|█         | 70/680 [03:31<32:23,  3.19s/it]

Wrong label:? 

### step


 11%|█         | 75/680 [03:42<24:38,  2.44s/it]

Wrong label:0
 response:


 11%|█▏        | 77/680 [03:47<22:55,  2.28s/it]

Wrong label:0
 response:


 12%|█▏        | 81/680 [03:54<16:37,  1.67s/it]

Wrong label:? 

### step


 13%|█▎        | 87/680 [04:03<16:20,  1.65s/it]

Wrong label:0
 response:


 13%|█▎        | 88/680 [04:04<14:50,  1.50s/it]

Wrong label:0
 response:


 15%|█▍        | 99/680 [04:28<17:11,  1.78s/it]

Wrong label:0
 response:


 18%|█▊        | 121/680 [05:18<17:52,  1.92s/it]

Wrong label:0
 response:


 19%|█▊        | 126/680 [05:29<17:36,  1.91s/it]

Wrong label:0
 response:


 19%|█▉        | 131/680 [05:38<16:47,  1.84s/it]

Wrong label:? 

### step


 25%|██▍       | 169/680 [07:06<16:22,  1.92s/it]

Wrong label:? 

### step


 25%|██▌       | 173/680 [07:17<19:50,  2.35s/it]

Wrong label:0
 response:


 26%|██▋       | 179/680 [07:33<23:29,  2.81s/it]

Wrong label:0
 response:


 26%|██▋       | 180/680 [07:35<20:52,  2.50s/it]

Wrong label:0
 response:


 27%|██▋       | 184/680 [07:43<17:17,  2.09s/it]

Wrong label:0
 response:


 28%|██▊       | 191/680 [08:00<17:22,  2.13s/it]

Wrong label:? 

### response


 31%|███       | 210/680 [09:00<23:10,  2.96s/it]

Wrong label:0
 response:


 33%|███▎      | 226/680 [09:33<12:06,  1.60s/it]

Wrong label:0
 response:


 35%|███▌      | 238/680 [10:04<15:34,  2.11s/it]

Wrong label:0
 response:


 41%|████▏     | 282/680 [11:28<09:30,  1.43s/it]

Wrong label:0
 response:


 54%|█████▎    | 365/680 [15:48<17:44,  3.38s/it]

Wrong label:0
 response:


 60%|██████    | 408/680 [19:53<16:10,  3.57s/it]

Wrong label:0
 response:


 61%|██████▏   | 417/680 [20:30<13:04,  2.98s/it]

Wrong label:? 

### step


 61%|██████▏   | 418/680 [20:32<10:39,  2.44s/it]

Wrong label:? 

### reason


 63%|██████▎   | 430/680 [21:11<09:38,  2.31s/it]

Wrong label:0
 response:


 69%|██████▊   | 467/680 [23:34<10:31,  2.96s/it]

Wrong label:? 

### response


 69%|██████▉   | 469/680 [23:40<09:30,  2.70s/it]

Wrong label:? 

### response


 70%|███████   | 479/680 [24:17<11:22,  3.40s/it]

Wrong label:0
 response:


 79%|███████▊  | 534/680 [27:50<06:12,  2.55s/it]

Wrong label:? 

### step


 79%|███████▉  | 540/680 [28:05<05:23,  2.31s/it]

Wrong label:0
 response:


 85%|████████▌ | 580/680 [30:18<05:58,  3.59s/it]

Wrong label:? 

### step


 92%|█████████▏| 628/680 [32:57<03:23,  3.92s/it]

Wrong label:0

### response


 99%|█████████▉| 673/680 [35:37<00:17,  2.55s/it]

Wrong label:0
 response:


100%|██████████| 680/680 [35:57<00:00,  3.17s/it]


In [9]:
def evaluate(y_true, y_pred):
    labels = list(df[label_name].unique())
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)


y_true = validation[label_name]
evaluate(y_true, y_pred)

Accuracy: 0.341
Accuracy for label Direct Reply: 0.853
Accuracy for label Indirect: 0.013
Accuracy for label Direct Non-Reply: 0.562

Classification Report:
                  precision    recall  f1-score   support

    Direct Reply       0.38      0.85      0.53       218
        Indirect       0.33      0.01      0.02       389
Direct Non-Reply       0.30      0.56      0.39        73

       micro avg       0.36      0.34      0.35       680
       macro avg       0.34      0.48      0.31       680
    weighted avg       0.34      0.34      0.22       680


Confusion Matrix:
[[186   2  20]
 [284   5  77]
 [ 20   8  41]]


In [8]:
y_pred

['Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'none',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'none',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'none',
 'Indirect',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'none',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Dire

In [52]:
dataset.iloc[0]["text"]

"Classify the interviewee's response into one of the following\n        categories: Indirect, Direct Reply, Direct Non-Reply. Analyze the part of the interview provided below, focusing\n        specifically on the interviewee's response to the marked question.\n\n ### Part of the interview ### \n Intervier:Q. Hi, I'm Jongjin Park of Money Today. First of all, I would like to ask a question to President Yoon Suk Yeol of the Republic of Korea. I heard that you stated that a new chapter has opened in our trilateral cooperation with the two countries. Compared to the previous summits, what would be the most significant outcome that you gained through this summit?And also, from the perspective of our people, what would be the benefit that the people of Korea would feel from these strengthening of ties?And now my question goes to President Biden. During this summit, the issues of detainees or prisoners of wars—and you mentioned that there will be further cooperation in these human rights iss

In [12]:
y_pred1 = [label.replace("none", "Indirect") for label in y_pred]
y_pred1
evaluate(y_true, y_pred1)

Accuracy: 0.375
Accuracy for label Direct Reply: 0.853
Accuracy for label Indirect: 0.072
Accuracy for label Direct Non-Reply: 0.562

Classification Report:
                  precision    recall  f1-score   support

    Direct Reply       0.38      0.85      0.53       218
        Indirect       0.54      0.07      0.13       389
Direct Non-Reply       0.30      0.56      0.39        73

        accuracy                           0.38       680
       macro avg       0.41      0.50      0.35       680
    weighted avg       0.46      0.38      0.28       680


Confusion Matrix:
[[186  12  20]
 [284  28  77]
 [ 20  12  41]]
