In [2]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          Trainer,
                          pipeline,
                          DataCollatorForLanguageModeling)
from peft import LoraConfig, get_peft_model

import huggingface_hub
import os
import logging
from tqdm import tqdm
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

# import wandb
# import argparse
# from peft import PeftModel

In [1]:
!pip install transformers
!pip install peft
!pip install 'accelerate>=0.26.0'
!pip install -U bitsandbytes
!pip install huggingface-hub
!pip install datasets
# !pip install wandb

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m184.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.32.4-py3-none-any.whl (512 kB)
Downloading safetensors-0.

In [3]:
class CustomTextDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per lookup.

    Every item is a dict holding flattened ``input_ids`` and
    ``attention_mask`` tensors obtained from
    ``tokenizer(text, return_tensors='pt')``. No padding or truncation is
    requested here.
    """

    def __init__(self, texts, tokenizer):
        # Only references are stored; tokenization happens in __getitem__.
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(str(self.texts[idx]), return_tensors='pt')
        # Flatten each tensor to 1-D (the 'pt' encoding presumably carries a
        # leading batch axis of size one).
        return {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }


def create_prompted_text(
    dataset,
    label_name,
):
    """Build one training prompt per row, ending with the row's gold label.

    Args:
        dataset: DataFrame with ``interview_question``, ``interview_answer``,
            ``question`` and ``label_name`` columns.
        label_name: Name of the label column; its unique values (in order of
            first appearance) are listed in the prompt as the categories.

    Returns:
        list[str]: One prompt per row, in row order.
    """
    category_list = ', '.join(list(dataset[label_name].unique()))

    # The instruction header is identical for every row, so build it once.
    header = (
        "You will be given a part of an interview. "
        "Classify the response to the selected question "
        f"into one of the following categories: {category_list}"
        ". \n\n ### Part of the interview ### \nIntervier:"
    )

    return [
        header
        + f" {record['interview_question']} \nResponse:"
        + f" {record['interview_answer']} \n\n### Selected Question ###\n"
        + f"{record['question']} \n\nLabel: {record[label_name]}"
        for _, record in dataset.iterrows()
    ]


def load_qevasion_dataset(
    tokenizer,
    label_name="clarity_label",
    max_train_samples=8,
    max_validation_samples=1,
):
    """Load the training CSV, split it 90/10 and wrap both parts as datasets.

    Args:
        tokenizer: Tokenizer handed to ``CustomTextDataset``.
        label_name: Label column to read and to embed in the prompts.
        max_train_samples: Keep only the first N training prompts. The
            default of 8 preserves the previous hard-coded debug subset;
            pass ``None`` to train on the full split.
        max_validation_samples: Same for validation prompts (default 1;
            ``None`` keeps all of them).

    Returns:
        tuple: ``(train_dataset, validation_dataset)`` as
        ``CustomTextDataset`` instances.
    """
    # Get train set data
    columns = ['question', 'interview_question', 'interview_answer',
               label_name]
    df = pd.read_csv('preprocessed_data/train_set.csv')[columns]

    # Split train set to train and validation data (seeded, so the split is
    # reproducible across runs).
    np.random.seed(2024)
    msk = np.random.rand(len(df)) < 0.9
    # Build fresh frames instead of resetting the index in place on slices.
    train = df[msk].reset_index(drop=True)
    validation = df[~msk].reset_index(drop=True)

    train_texts = create_prompted_text(train, label_name)
    validation_texts = create_prompted_text(validation, label_name)

    # Only show an example when the second prompt actually exists.
    if len(train_texts) > 1:
        print("Example of train test:" + train_texts[1])
    if len(validation_texts) > 1:
        print("Example of validation test:" + validation_texts[1])

    # Slicing with None keeps the whole list.
    train_texts = train_texts[:max_train_samples]
    validation_texts = validation_texts[:max_validation_samples]
    return (CustomTextDataset(train_texts, tokenizer),
            CustomTextDataset(validation_texts, tokenizer))


def compute_metrics(eval_pred=None):
    """Placeholder metrics hook that reports nothing.

    Args:
        eval_pred: Evaluation predictions supplied by the caller. Accepted
            (and ignored) so the function can be used where a one-argument
            metrics callback is expected; it remains callable with no
            arguments.

    Returns:
        dict: Always an empty metrics mapping.
    """
    print("All good")
    return {}


class CastOutputToFloat(nn.Sequential):
    """Sequential wrapper whose forward output is cast to float32."""

    def forward(self, x):
        wrapped_output = super().forward(x)
        return wrapped_output.to(torch.float32)


def print_trainable_parameters(model):
    """
    Report how many of the model's parameters require gradients.

    Prints the trainable count, the total count and the trainable
    percentage; returns nothing.
    """
    sizes = [(tensor.numel(), tensor.requires_grad)
             for _, tensor in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable} || all params: {total}\n"
        f"        || trainable%: {100 * trainable / total}"
    )


def finetuning(model_name,
               output_model_dir,
               label_taxonomy,
               lr,
               epochs):
    """Fine-tune a 4-bit quantized causal LM with LoRA adapters.

    Args:
        model_name: Model id or path passed to ``from_pretrained``.
        output_model_dir: Directory the trained PEFT model is saved to.
        label_taxonomy: Label column name forwarded to
            ``load_qevasion_dataset``.
        lr: Learning rate for the optimizer.
        epochs: Number of passes over the training set; used to derive
            ``max_steps`` and the evaluation cadence.

    Returns:
        tuple: ``(model, tokenizer)`` after training.
    """
    cache_dir = ""

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
        device_map='auto',
        torch_dtype=torch.float16,
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir,
                                              trust_remote_code=True,)
    # Adding a brand-new '[PAD]' token gives it an id beyond the model's
    # embedding matrix unless the embeddings are resized. Prefer reusing EOS
    # as padding; only add a new token (and resize) when there is no EOS.
    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            tokenizer.pad_token = tokenizer.eos_token
        else:
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))

    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    # Reduce number of stored activations
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.lm_head = CastOutputToFloat(model.lm_head)

    config = LoraConfig(
        r=16,  # Rank of the LoRA update matrices
        lora_alpha=32,  # Alpha scaling
        # target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"  # set this for CLM or Seq2Seq
    )

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

    # load data
    train_data, validation_data = load_qevasion_dataset(tokenizer,
                                                        label_taxonomy)

    print(f"""Found {len(train_data)} instances for training and
    {len(validation_data) } instances for validation.""")

    grad_accum_steps = 8
    # With very small training sets the integer division truncates to 0;
    # always run at least one optimizer step.
    max_steps = max(1, int((len(train_data) * epochs) / grad_accum_steps))

    # Train model
    print("Training...")
    # out_dir = output_model_dir.split("/")[-1]

    # NOTE: `compute_metrics` is deliberately not wired in. It is a Trainer
    # argument (TrainingArguments rejects it with a TypeError), and the
    # placeholder in this notebook computes nothing. Evaluation loss is still
    # reported at every eval step.
    trainer = Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=validation_data,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=grad_accum_steps,
            eval_accumulation_steps=1,
            warmup_steps=100,
            max_steps=max_steps,
            learning_rate=lr,
            fp16=True,
            logging_steps=1,
            # A float below 1 is interpreted as a fraction of the total
            # number of training steps.
            eval_steps=0.33/epochs,
            eval_strategy="steps",
            do_eval=True,
            # report_to="wandb",
            # save_steps= 2,
            # num_train_epochs=epochs,
            # output_dir=f'outputs_{out_dir}' #,
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer,
                                                      mlm=False)
    )

    # silence the warnings. Re-enable for inference!
    model.config.use_cache = False
    trainer.train()

    # Save the model
    model.save_pretrained(output_model_dir)

    # Optionally, save the tokenizer as well
    # tokenizer.save_pretrained(output_model_dir)
    return model, tokenizer


def create_test_prompted_text(
    dataset,
    label_name,
):
    """Build one inference prompt per row, ending with an open ``Label:``.

    The wording and spacing match the training prompts produced by
    ``create_prompted_text``; only the gold label after ``Label:`` is left
    out so the model has to generate it.

    Args:
        dataset: DataFrame with ``interview_question``, ``interview_answer``,
            ``question`` and ``label_name`` columns.
        label_name: Name of the label column; its unique values are listed in
            the prompt as the candidate categories.

    Returns:
        list[str]: One prompt per row, in row order.
    """
    texts = []
    classes_names = ', '.join(list(dataset[label_name].unique()))

    for _, row in dataset.iterrows():
        texts.append(
            # Trailing spaces matter: adjacent literals are concatenated, so
            # without them the sentences fuse ("interview.Classify").
            f"You will be given a part of an interview. "
            f"Classify the response to the selected question "
            f"into one of the following categories: {classes_names}"
            f". \n\n ### Part of the interview ### \nIntervier:"
            f" {row['interview_question']} \nResponse:"
            f" {row['interview_answer']} \n\n### Selected Question ###\n"
            f"{row['question']} \n\nLabel:"
        )
    return texts


def _first_mentioned_category(answer, categories):
    """Return the category named earliest in ``answer`` (case-insensitive).

    Ties at the same position go to the longer name. Returns ``None`` when no
    category occurs in the text.
    """
    lowered = answer.lower()
    best = None
    for category in categories:
        position = lowered.find(category.lower())
        if position < 0:
            continue
        rank = (position, -len(category))
        if best is None or rank < best[0]:
            best = (rank, category)
    return None if best is None else best[1]


def predict(test, categories, model, tokenizer):
    """Generate a label for every prompt in ``test``.

    Args:
        test: DataFrame with a ``text`` column holding the prompts.
        categories: Candidate label names to look for in the generation.
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.

    Returns:
        list[str]: One entry per prompt: the matched category, or ``"none"``
        when the generation mentions no category.
    """
    # Set logging level to ERROR to suppress INFO messages
    logging.basicConfig(level=logging.ERROR)

    y_pred = []
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    # max_new_tokens=4
                    )

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        generated = result[0]['generated_text']

        # Read the answer right after the prompt. Splitting on the last
        # "Label:" would pick up any "Label:" the model invents later in its
        # continuation instead of its actual answer.
        if generated.startswith(prompt):
            answer = generated[len(prompt):].strip()
        else:
            answer = generated.split("Label:")[-1].strip()

        # Determine the predicted category: the one mentioned first wins,
        # independent of the order of `categories`.
        category = _first_mentioned_category(answer, categories)
        if category is not None:
            print("Right label:" + answer.lower())
            y_pred.append(category)
        else:
            print("Wrong label:" + answer.lower())
            y_pred.append("none")

    return y_pred


def evaluation_report(y_true, y_pred, labels, run=None):
    """Print accuracy, per-label accuracy, a classification report and a
    confusion matrix for string-labelled predictions.

    Args:
        y_true: Gold label names.
        y_pred: Predicted label names; values not in ``labels`` are encoded
            as -1.
        labels: Ordered label names; position in this list is the class id.
        run: Optional tracking run. When truthy, scalar metrics are collected
            in a dict, but the code that would log them is currently
            commented out.
    """
    label_to_id = {name: position for position, name in enumerate(labels)}
    class_ids = list(range(len(labels)))

    # Unknown names map to -1.
    encode = np.vectorize(lambda value: label_to_id.get(value, -1))
    y_true_mapped = encode(y_true)
    y_pred_mapped = encode(y_pred)

    wandb_log_dict = {} if run else None

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.2f}')
    if run:
        wandb_log_dict["Accuracy"] = accuracy

    # Accuracy restricted to the rows of each gold class
    for class_id in set(y_true_mapped):
        rows = y_true_mapped == class_id
        label_accuracy = accuracy_score(y_true_mapped[rows],
                                        y_pred_mapped[rows])
        print(f'Accuracy for label {labels[class_id]}: {label_accuracy:.2f}')
        if run:
            wandb_log_dict[f"Accuracy for label {labels[class_id]}"] = label_accuracy

    # Underscored names keep each class on a single whitespace-free token so
    # the textual report can be split into columns below.
    unsplit_labels = [name.replace(" ", "_") for name in labels]

    # Classification report
    class_report = classification_report(y_true=y_true_mapped,
                                         y_pred=y_pred_mapped,
                                         target_names=unsplit_labels,
                                         labels=class_ids)
    print('\nClassification Report:')
    print(class_report)

    # Tabular form of the per-class rows (consumed only by the disabled
    # wandb logging below).
    report_columns = ["Class", "Precision", "Recall", "F1-score", "Support"]
    report_lines = class_report.splitlines()
    report_table = [row.split()
                    for row in report_lines[2:(len(labels) + 2)]]

    # if run:
    #     wandb_log_dict["Classification Report"] = wandb.Table(
    #         data=report_table,
    #         columns=report_columns)

    # Rows whose prediction matched a known class (also only used by the
    # disabled wandb logging).
    mask = y_pred_mapped != -1
    y_true_mapped2 = y_true_mapped[mask]
    y_pred_mapped2 = y_pred_mapped[mask]

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped,
                                   y_pred=y_pred_mapped,
                                   labels=class_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)

    # if run:
    #     wandb_log_dict["Confusion Matrix"] = wandb.plot.confusion_matrix(
    #         y_true=y_true_mapped2,
    #         preds=y_pred_mapped2,
    #         class_names=labels
    #     )
    #     run.log(wandb_log_dict)


def evaluate(base_model_name,
             fine_tuned_model_path,
             label_name="clarity_label",
             model=None,
             tokenizer=None,
             run=None):
    """Run the test-set prompts through a model and print a report.

    Args:
        base_model_name: Model id used to load the model and/or tokenizer
            when they are not supplied.
        fine_tuned_model_path: Path of the fine-tuned adapter.
            NOTE(review): currently unused — when ``model`` is omitted only
            the base model is loaded, so pass the trained model explicitly
            to evaluate the fine-tuned weights.
        label_name: Label column of the test CSV.
        model: Already-loaded model; loaded from ``base_model_name`` if None.
        tokenizer: Already-loaded tokenizer; loaded from ``base_model_name``
            if None.
        run: Optional tracking run forwarded to ``evaluation_report``.
    """
    # Compare against None explicitly: truthiness of model/tokenizer objects
    # depends on whether they define __len__/__bool__.
    if model is None:

        model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            return_dict=True,
            low_cpu_mem_usage=True,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16
            ),
            # torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
            offload_folder="offload/",
            cache_dir=""
        )
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(base_model_name,
                                                  cache_dir="")

    # Reuse EOS for padding instead of adding a new '[PAD]' token, whose id
    # would fall outside the model's embedding matrix unless it is resized.
    # A tokenizer that already has a pad token is left untouched.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Get test set data
    test_df = pd.read_csv('preprocessed_data/test_set.csv')[[
        'question',
        'interview_question',
        'interview_answer',
        label_name
    ]]

    test_texts = create_test_prompted_text(test_df, label_name)
    dataset = pd.DataFrame(test_texts, columns=['text'])

    labels = list(test_df[label_name].unique())

    y_pred = predict(dataset, labels, model, tokenizer)
    y_true = test_df[label_name]
    evaluation_report(y_true, y_pred, labels, run)

In [None]:
os.environ['WANDB_NOTEBOOK_NAME'] = 'lora_llm'
os.environ["WANDB_DISABLED"] = "true"

# Load the API key from the secrets.json file
with open('secrets.json', 'r') as file:
    secrets = json.load(file)
    huggingface_hub.login(secrets.get('HF_KEY'))
    # wandb.login(secrets.get('WANDB_KEY'))

# 'TinyLlama/TinyLlama_v1.1'
base_model_name = 'meta-llama/Llama-3.1-8B-Instruct'
fine_tuned_model_path = "./llama3.1"
label_name = "evasion_label"
lr = 2e-4
epochs = 1

# Wandb configuration
# run = wandb.init(entity="kontilenia-national-technical-university-of-athens",
#                  project='political-speech-clarity',
#                  job_type="training",
#                  # Track hyperparameters and run metadata
#                  config={
#                     "learning_rate": lr,
#                     "architecture": base_model_name,
#                     "dataset": "qevasion_dataset_preproccessed",
#                     "epochs": epochs,
#                  })

model, tokenizer = finetuning(base_model_name,
                              fine_tuned_model_path,
                              label_name,
                              lr,
                              epochs)

# run = wandb.init(entity="kontilenia-national-technical-university-of-athens",
#                  project="political-speech-clarity",
#                  id="8c0qfc9s",
#                  resume="must")

# Pass everything after the path by keyword: the third positional parameter
# of evaluate() is `label_name`, so positional `model, tokenizer` would be
# bound to `label_name` and `model` respectively. Also evaluate on the same
# label column the model was fine-tuned on.
evaluate(base_model_name,
         fine_tuned_model_path,
         label_name=label_name,
         model=model,
         tokenizer=tokenizer)
#        run=run)

# run.finish()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------
                                    # Model evaluation before fine-tuning

                                    

In [3]:
# Authenticate with the Hub using the token stored in the `hf_key`
# environment variable.
huggingface_hub.login(os.environ["hf_key"])

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the base model with 4-bit weights and bfloat16 compute.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        load_in_4bit=True,
    ),
    torch_dtype="float16",
    device_map="auto",
)
model.config.pretraining_tp = 1
model.config.use_cache = False

# Reuse the end-of-sequence token id as the padding id.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def evaluate(base_model_name, fine_tuned_model_path, label_name, run=None):
    """Load the base model and report its performance on the test set.

    NOTE(review): this redefines ``evaluate`` from the earlier cell with a
    different signature, so whichever cell ran last wins.

    Args:
        base_model_name: Model id used to load the model and tokenizer.
        fine_tuned_model_path: Path of the fine-tuned adapter. Currently
            unused: the adapter-loading code below is disabled, so the base
            model is what gets evaluated.
        label_name: Label column of the test CSV.
        run: Optional tracking run forwarded to ``evaluation_report``.
    """
    cache_dir = ""

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        return_dict=True,
        low_cpu_mem_usage=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        ),
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        offload_folder="offload/",
        cache_dir=cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name,
                                              cache_dir=cache_dir)
    # Reuse EOS for padding (as the base-model cell in this notebook does)
    # instead of adding a new '[PAD]' token, whose id would fall outside the
    # model's embedding matrix unless it is resized.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Disabled: attach and merge the fine-tuned adapter.
    # model.config.use_cache = False
    # model = PeftModel.from_pretrained(model,
    #                                   fine_tuned_model_path,
    #                                   # device_map='auto',
    #                                   offload_folder="offload/")
    # model = model.merge_and_unload()

    # Get test set data
    test_df = pd.read_csv('preprocessed_data/test_set.csv')[[
        'question',
        'interview_question',
        'interview_answer',
        label_name
    ]]

    test_texts = create_test_prompted_text(test_df, label_name)
    dataset = pd.DataFrame(test_texts, columns=['text'])

    labels = list(test_df[label_name].unique())

    y_pred = predict(dataset, labels, model, tokenizer)
    y_true = test_df[label_name]
    evaluation_report(y_true, y_pred, labels, run)

In [7]:
label_name = "clarity_label"

# Read only the columns the prompts need, plus the chosen label column.
df = pd.read_csv('preprocessed_data/train_set.csv')[
    ['question', 'interview_question', 'interview_answer', label_name]
]

# Seeded random 80/20 split into train and validation rows.
np.random.seed(2024)
msk = np.random.rand(len(df)) < 0.8
train = df[msk].reset_index(drop=True)
validation = df[~msk].reset_index(drop=True)

# Prompts that include the gold label.
train_texts = create_prompted_text(train, label_name)
validation_texts = create_prompted_text(validation, label_name)

# print("Example of train test:" + train_texts[1])
# print("Example of validation test:" + validation_texts[1])


def predict(test, model, tokenizer, categories=None):
    """Generate a label for every prompt in ``test``.

    NOTE(review): this redefines ``predict`` from the earlier cell, which
    takes ``categories`` as its second positional argument; callers written
    for that signature break once this cell has run.

    Args:
        test: DataFrame with a ``text`` column holding the prompts.
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        categories: Candidate label names. When omitted, falls back to the
            unique values of the notebook-level ``df[label_name]``.

    Returns:
        list[str]: One entry per prompt: the first category (in list order)
        found in the generated answer, or ``"none"`` if there is none.
    """
    # Set logging level to ERROR to suppress INFO messages
    logging.basicConfig(level=logging.ERROR)

    y_pred = []
    if categories is None:
        # Backward-compatible default: read the labels from notebook globals.
        categories = list(df[label_name].unique())
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=4,
                    temperature=0.1
                    )

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        # Keep whatever follows the last "Label:" in the returned text.
        answer = result[0]['generated_text'].split("Label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            print("Wrong label:" + answer.lower())
            y_pred.append("none")

    return y_pred


# Rebuild the validation prompts without the gold label, then run inference.
validation_texts = create_test_prompted_text(validation, label_name)
dataset = pd.DataFrame({'text': validation_texts})
y_pred = predict(dataset, model, tokenizer)

Device set to use cuda:0
  1%|          | 7/680 [00:27<35:04,  3.13s/it]  

Wrong label:0

### response


  4%|▍         | 30/680 [01:44<31:25,  2.90s/it]  

Wrong label:0

### part


  6%|▌         | 38/680 [02:01<23:36,  2.21s/it]

Wrong label:0
 response:


  8%|▊         | 55/680 [02:43<22:36,  2.17s/it]

Wrong label:? 

### part


 10%|█         | 70/680 [03:31<32:23,  3.19s/it]

Wrong label:? 

### step


 11%|█         | 75/680 [03:42<24:38,  2.44s/it]

Wrong label:0
 response:


 11%|█▏        | 77/680 [03:47<22:55,  2.28s/it]

Wrong label:0
 response:


 12%|█▏        | 81/680 [03:54<16:37,  1.67s/it]

Wrong label:? 

### step


 13%|█▎        | 87/680 [04:03<16:20,  1.65s/it]

Wrong label:0
 response:


 13%|█▎        | 88/680 [04:04<14:50,  1.50s/it]

Wrong label:0
 response:


 15%|█▍        | 99/680 [04:28<17:11,  1.78s/it]

Wrong label:0
 response:


 18%|█▊        | 121/680 [05:18<17:52,  1.92s/it]

Wrong label:0
 response:


 19%|█▊        | 126/680 [05:29<17:36,  1.91s/it]

Wrong label:0
 response:


 19%|█▉        | 131/680 [05:38<16:47,  1.84s/it]

Wrong label:? 

### step


 25%|██▍       | 169/680 [07:06<16:22,  1.92s/it]

Wrong label:? 

### step


 25%|██▌       | 173/680 [07:17<19:50,  2.35s/it]

Wrong label:0
 response:


 26%|██▋       | 179/680 [07:33<23:29,  2.81s/it]

Wrong label:0
 response:


 26%|██▋       | 180/680 [07:35<20:52,  2.50s/it]

Wrong label:0
 response:


 27%|██▋       | 184/680 [07:43<17:17,  2.09s/it]

Wrong label:0
 response:


 28%|██▊       | 191/680 [08:00<17:22,  2.13s/it]

Wrong label:? 

### response


 31%|███       | 210/680 [09:00<23:10,  2.96s/it]

Wrong label:0
 response:


 33%|███▎      | 226/680 [09:33<12:06,  1.60s/it]

Wrong label:0
 response:


 35%|███▌      | 238/680 [10:04<15:34,  2.11s/it]

Wrong label:0
 response:


 41%|████▏     | 282/680 [11:28<09:30,  1.43s/it]

Wrong label:0
 response:


 54%|█████▎    | 365/680 [15:48<17:44,  3.38s/it]

Wrong label:0
 response:


 60%|██████    | 408/680 [19:53<16:10,  3.57s/it]

Wrong label:0
 response:


 61%|██████▏   | 417/680 [20:30<13:04,  2.98s/it]

Wrong label:? 

### step


 61%|██████▏   | 418/680 [20:32<10:39,  2.44s/it]

Wrong label:? 

### reason


 63%|██████▎   | 430/680 [21:11<09:38,  2.31s/it]

Wrong label:0
 response:


 69%|██████▊   | 467/680 [23:34<10:31,  2.96s/it]

Wrong label:? 

### response


 69%|██████▉   | 469/680 [23:40<09:30,  2.70s/it]

Wrong label:? 

### response


 70%|███████   | 479/680 [24:17<11:22,  3.40s/it]

Wrong label:0
 response:


 79%|███████▊  | 534/680 [27:50<06:12,  2.55s/it]

Wrong label:? 

### step


 79%|███████▉  | 540/680 [28:05<05:23,  2.31s/it]

Wrong label:0
 response:


 85%|████████▌ | 580/680 [30:18<05:58,  3.59s/it]

Wrong label:? 

### step


 92%|█████████▏| 628/680 [32:57<03:23,  3.92s/it]

Wrong label:0

### response


 99%|█████████▉| 673/680 [35:37<00:17,  2.55s/it]

Wrong label:0
 response:


100%|██████████| 680/680 [35:57<00:00,  3.17s/it]


In [9]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-label accuracy, a classification report and a
    confusion matrix.

    Label names come from the notebook-level ``df[label_name]``. Values that
    are not among them (such as the "none" fallback used for unparseable
    generations) are encoded as -1.

    Args:
        y_true: Gold label names.
        y_pred: Predicted label names.
    """
    labels = list(df[label_name].unique())
    class_ids = list(range(len(labels)))
    label_to_id = {name: position for position, name in enumerate(labels)}

    encode = np.vectorize(lambda value: label_to_id.get(value, -1))
    y_true_mapped = encode(y_true)
    y_pred_mapped = encode(y_pred)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the rows of each gold class
    for class_id in set(y_true_mapped):
        rows = y_true_mapped == class_id
        label_accuracy = accuracy_score(y_true_mapped[rows],
                                        y_pred_mapped[rows])
        print(f'Accuracy for label {labels[class_id]}: {label_accuracy:.3f}')

    # Classification report
    class_report = classification_report(y_true=y_true_mapped,
                                         y_pred=y_pred_mapped,
                                         target_names=labels,
                                         labels=class_ids)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped,
                                   y_pred=y_pred_mapped,
                                   labels=class_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)


# Score the predictions against the gold labels of the validation split.
y_true = validation[label_name]
evaluate(y_true, y_pred)

Accuracy: 0.341
Accuracy for label Direct Reply: 0.853
Accuracy for label Indirect: 0.013
Accuracy for label Direct Non-Reply: 0.562

Classification Report:
                  precision    recall  f1-score   support

    Direct Reply       0.38      0.85      0.53       218
        Indirect       0.33      0.01      0.02       389
Direct Non-Reply       0.30      0.56      0.39        73

       micro avg       0.36      0.34      0.35       680
       macro avg       0.34      0.48      0.31       680
    weighted avg       0.34      0.34      0.22       680


Confusion Matrix:
[[186   2  20]
 [284   5  77]
 [ 20   8  41]]


In [8]:
# Display the raw predictions; "none" marks generations in which no label
# name was found.
y_pred

['Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'none',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'none',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'none',
 'Indirect',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'Direct Reply',
 'Direct Non-Reply',
 'Direct Reply',
 'none',
 'Direct Reply',
 'Direct Reply',
 'Direct Reply',
 'Dire

In [52]:
# Show the first evaluation prompt.
# NOTE(review): the recorded output of this cell does not match the template
# in create_test_prompted_text, so it looks stale — re-run to refresh it.
dataset.iloc[0]["text"]

"Classify the interviewee's response into one of the following\n        categories: Indirect, Direct Reply, Direct Non-Reply. Analyze the part of the interview provided below, focusing\n        specifically on the interviewee's response to the marked question.\n\n ### Part of the interview ### \n Intervier:Q. Hi, I'm Jongjin Park of Money Today. First of all, I would like to ask a question to President Yoon Suk Yeol of the Republic of Korea. I heard that you stated that a new chapter has opened in our trilateral cooperation with the two countries. Compared to the previous summits, what would be the most significant outcome that you gained through this summit?And also, from the perspective of our people, what would be the benefit that the people of Korea would feel from these strengthening of ties?And now my question goes to President Biden. During this summit, the issues of detainees or prisoners of wars—and you mentioned that there will be further cooperation in these human rights iss

In [12]:
# Count unparseable generations ("none") as "Indirect" — the class with the
# largest support in the report above — and re-score. An equality test is
# used so that only the exact "none" marker is rewritten, never a label that
# merely contains that substring.
y_pred1 = ["Indirect" if label == "none" else label for label in y_pred]
evaluate(y_true, y_pred1)

Accuracy: 0.375
Accuracy for label Direct Reply: 0.853
Accuracy for label Indirect: 0.072
Accuracy for label Direct Non-Reply: 0.562

Classification Report:
                  precision    recall  f1-score   support

    Direct Reply       0.38      0.85      0.53       218
        Indirect       0.54      0.07      0.13       389
Direct Non-Reply       0.30      0.56      0.39        73

        accuracy                           0.38       680
       macro avg       0.41      0.50      0.35       680
    weighted avg       0.46      0.38      0.28       680


Confusion Matrix:
[[186  12  20]
 [284  28  77]
 [ 20  12  41]]
