In [None]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report, f1_score,confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import json
import numpy as np
from tqdm import tqdm
from typing import List
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
# --- Constants ---
# Checkpoint name passed to both AutoTokenizer and AutoModel.
MODEL_NAME = 'roberta-base'
# Output size of every per-task classification head.
NUM_LABELS = 3
# Annotation strings; a label's list position is its integer class id.
LABELS = ["No", "To some extent", "Yes"]
# Annotation string -> class id, and the inverse mapping.
LABEL2ID = {label: i for i, label in enumerate(LABELS)}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}
# Annotation dimensions predicted jointly; this order also fixes the column
# order of the label tensor built by the dataset.
TASKS = ['Mistake_Identification', 'Mistake_Location', 'Providing_Guidance', 'Actionability']

# --- Dataset Loader ---
class TutorDataset(Dataset):
    """Map-style dataset that tokenizes one tutor response with its context.

    Every sample dict must provide ``'conversation_history'``, ``'response'``
    and ``'annotation'`` (a mapping from each name in ``TASKS`` to one of the
    strings in ``LABEL2ID``).

    Args:
        data: List of sample dicts as described above.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, data: List[dict], tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and the per-task label vector for ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch dimension removed) and ``'labels'``,
            a ``torch.long`` tensor holding one class id per task, in
            ``TASKS`` order.

        Raises:
            KeyError: If a required key is missing or an annotation value is
                not a known label.
        """
        item = self.data[idx]
        text = item['conversation_history'] + "\nTutor Response: " + item['response']
        # NOTE(review): the response sits at the end of the text, so if the
        # tokenizer truncates on the right (presumably its default) a very
        # long history would cut the response off — confirm inputs fit.
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        label_ids = [LABEL2ID[item['annotation'][task]] for task in TASKS]

        # squeeze(0) drops only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis if it had size 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

# --- Model Definition ---
class PedagogicalModel(nn.Module):
    """Shared pretrained encoder with one linear classification head per task.

    The encoder is loaded from ``MODEL_NAME``; each entry of ``TASKS`` gets its
    own ``hidden_size -> NUM_LABELS`` linear layer, stored in ``self.heads``.
    """

    def __init__(self):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(0.2)

        hidden_dim = self.encoder.config.hidden_size
        task_heads = {}
        for task in TASKS:
            task_heads[task] = nn.Linear(hidden_dim, NUM_LABELS)
        self.heads = nn.ModuleDict(task_heads)

    def forward(self, input_ids, attention_mask):
        """Return a dict mapping each task name to that head's logits.

        The feature fed to every head is the encoder's last hidden state at
        sequence position 0, passed through dropout.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        features = self.dropout(first_token_state)

        logits_by_task = {}
        for task, head in self.heads.items():
            logits_by_task[task] = head(features)
        return logits_by_task

# --- Focal Loss ---
class FocalLoss(nn.Module):
    """Focal loss built on top of per-sample cross-entropy.

    Each sample's cross-entropy is scaled by ``alpha * (1 - p_t) ** gamma``,
    where ``p_t = exp(-cross_entropy)``, and the scaled values are averaged.

    Args:
        alpha: Constant multiplier applied to every sample's loss.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
    """

    def __init__(self, alpha=1.0, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and ids ``targets``."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability assigned to the target class.
        target_prob = torch.exp(-per_sample_ce)
        modulator = (1 - target_prob) ** self.gamma
        weighted = self.alpha * modulator * per_sample_ce
        return weighted.mean()

# --- Evaluation ---
def evaluate_model(model, dataloader):
    """Collect gold labels and argmax predictions for every task.

    Puts ``model`` in eval mode and runs it without gradient tracking on the
    device of its first parameter.

    Args:
        model: Module returning a dict of per-task logits keyed by task name.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` (one label column per task,
            in ``TASKS`` order).

    Returns:
        ``(y_true, y_pred)``: two dicts mapping each task name to a flat list
        of class ids accumulated over all batches.
    """
    model.eval()
    device = next(model.parameters()).device

    y_true = {}
    y_pred = {}
    for task in TASKS:
        y_true[task] = []
        y_pred[task] = []

    with torch.no_grad():
        for batch in dataloader:
            gold = batch['labels']  # stays on the CPU; only model inputs are moved
            outputs = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
            )
            for column, task in enumerate(TASKS):
                predicted = outputs[task].cpu().argmax(dim=-1)
                y_pred[task] += predicted.tolist()
                y_true[task] += gold[:, column].tolist()

    return y_true, y_pred


In [3]:
# --- MODIFIED: Load JSON and Separate Tutor Responses ---
def load_json_dataset_by_tutor(path):
    """Load the conversation JSON file and group its samples by tutor name.

    The file must contain a list of conversations, each with
    ``'conversation_id'``, ``'conversation_history'`` and a
    ``'tutor_responses'`` mapping of tutor name to a dict holding
    ``'response'`` and ``'annotation'``.

    Args:
        path: Path to the JSON file.

    Returns:
        Dict mapping each tutor name to a list of sample dicts with the keys
        ``'conversation_id'``, ``'conversation_history'``, ``'response'`` and
        ``'annotation'``, in file order.

    Raises:
        KeyError: If a conversation or tutor entry lacks a required key.
    """
    # Read explicitly as UTF-8 so decoding does not depend on the platform's
    # default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    tutor_data = {}
    for conv in raw:
        for tutor_name, info in conv['tutor_responses'].items():
            sample = {
                'conversation_id': conv['conversation_id'],
                'conversation_history': conv['conversation_history'],
                'response': info['response'],
                'annotation': info['annotation']
            }
            tutor_data.setdefault(tutor_name, []).append(sample)
    return tutor_data

# --- Evaluate Predictions: Exact & Lenient ---
def _print_metric_section(title, truth, preds, label_ids, label_names):
    """Print one report / confusion matrix / accuracy / F1 section."""
    print(title)
    print(classification_report(
        truth,
        preds,
        labels=label_ids,
        target_names=label_names,
        digits=3
    ))
    print("Confusion Matrix:")
    print(confusion_matrix(truth, preds, labels=label_ids))
    print("Accuracy:", accuracy_score(truth, preds))
    # f1_score receives no explicit `labels`, so it averages over the classes
    # it finds in the data; the report's "macro avg" row uses the fixed
    # `label_ids` list, which is why the two numbers can disagree.
    for caption, averaging in (("Macro F1:", 'macro'), ("Micro F1:", 'micro')):
        print(caption, f1_score(truth, preds, average=averaging))


def evaluate_predictions(y_pred, y_true):
    """Print exact (3-class) and lenient (2-class) metrics for every task.

    Note the argument order: predictions first, gold labels second.

    Args:
        y_pred: Dict mapping each task name to its list of predicted class ids.
        y_true: Dict mapping each task name to its list of gold class ids.
    """
    for task in TASKS:
        print(f"\n=== {task} ===")
        gold = y_true[task]
        guess = y_pred[task]

        _print_metric_section("-- Exact (3-Class) --", gold, guess, [0, 1, 2], LABELS)

        # Lenient view: class 0 ("No") stays 0, every other class becomes 1.
        gold_bin = [int(label != 0) for label in gold]
        guess_bin = [int(label != 0) for label in guess]

        _print_metric_section(
            "-- Lenient (2-Class) --", gold_bin, guess_bin, [0, 1], ["No", "Yes/TSE"]
        )

# --- Training Function ---
def train_model(model, train_loader, val_loader, epochs=25, lr=2e-5, use_focal=True, save_dir="models"):
    """Fine-tune ``model`` on all tasks jointly, then evaluate it once.

    The per-batch loss is the sum of the per-task losses. AdamW is used with
    a linear schedule (no warm-up) that decays over
    ``len(train_loader) * epochs`` steps. After every epoch the state dict is
    written to ``<save_dir>/model.pt``, overwriting the previous file.

    Args:
        model: Module returning a dict of per-task logits keyed by task name.
        train_loader: Loader of training batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        val_loader: Loader evaluated once after the last epoch.
        epochs: Number of passes over ``train_loader``.
        lr: Initial learning rate for AdamW.
        use_focal: Use ``FocalLoss`` when true, ``nn.CrossEntropyLoss`` otherwise.
        save_dir: Directory for the checkpoint; created if missing.

    Returns:
        ``(y_true, y_pred)`` as produced by ``evaluate_model`` on ``val_loader``.
    """
    import os
    os.makedirs(save_dir, exist_ok=True)
    checkpoint_path = os.path.join(save_dir, "model.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    steps_per_epoch = len(train_loader)
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=steps_per_epoch * epochs,
    )

    loss_fn = FocalLoss() if use_focal else nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            # Column i of `labels` holds the gold class ids for TASKS[i].
            loss = sum(loss_fn(outputs[task], labels[:, i]) for i, task in enumerate(TASKS))
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        # Guard against an empty loader, which would otherwise divide by zero.
        mean_loss = total_loss / max(steps_per_epoch, 1)
        print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

        # Save model after each epoch (same path every time, so only the
        # most recent weights are kept).
        torch.save(model.state_dict(), checkpoint_path)

    # Evaluate after all epochs (once per tutor)
    y_true, y_pred = evaluate_model(model, val_loader)
    print("\nFINAL EVALUATION ON VALIDATION SET")
    evaluate_predictions(y_pred, y_true)
    return y_true, y_pred

In [None]:
# --- Tutor-wise Training & Evaluation ---
if __name__ == '__main__':
    from sklearn.model_selection import train_test_split

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tutor_data = load_json_dataset_by_tutor('ai_tutors_dataset.json')
    tutor_scores = {}

    # Train and evaluate one independent model per tutor.
    for tutor_name, data in tutor_data.items():
        print(f"\n\n================= TUTOR: {tutor_name} =================")

        # Fixed-seed 70/30 train/validation split of this tutor's samples.
        train_data, val_data = train_test_split(data, test_size=0.3, random_state=42)
        train_ds = TutorDataset(train_data, tokenizer)
        val_ds = TutorDataset(val_data, tokenizer)

        train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=8)

        model = PedagogicalModel()
        # train_model already evaluates on val_loader after the final epoch and
        # returns those labels/predictions, so reuse them instead of running a
        # second, identical evaluation pass.
        # NOTE: train_model is called with its default save_dir, so every
        # tutor's checkpoint is written to the same file.
        y_true, y_pred = train_model(model, train_loader, val_loader, epochs=50, lr=2e-5, use_focal=True)

        tutor_scores[tutor_name] = {
            task: f1_score(y_true[task], y_pred[task], average='macro')
            for task in TASKS
        }

    # --- Display Summary ---
    print("\n\n========= SUMMARY: MACRO F1 SCORES BY TUTOR =========")
    for tutor, scores in tutor_scores.items():
        print(f"{tutor}:")
        for task in TASKS:
            print(f"  {task}: {scores[task]:.3f}")





Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:13<00:00,  2.20it/s]


Epoch 1 Loss: 1.6083


Training Epoch 2: 100%|██████████| 30/30 [00:10<00:00,  2.86it/s]


Epoch 2 Loss: 1.4304


Training Epoch 3: 100%|██████████| 30/30 [00:10<00:00,  2.87it/s]


Epoch 3 Loss: 1.4491


Training Epoch 4: 100%|██████████| 30/30 [00:10<00:00,  2.87it/s]


Epoch 4 Loss: 1.3627


Training Epoch 5: 100%|██████████| 30/30 [00:10<00:00,  2.90it/s]


Epoch 5 Loss: 1.3089


Training Epoch 6: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]


Epoch 6 Loss: 1.1901


Training Epoch 7: 100%|██████████| 30/30 [00:10<00:00,  3.00it/s]


Epoch 7 Loss: 0.9865


Training Epoch 8: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 8 Loss: 0.7570


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 9 Loss: 0.5198


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 10 Loss: 0.3814


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 11 Loss: 0.2730


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 12 Loss: 0.2456


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 13 Loss: 0.2186


Training Epoch 14: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 14 Loss: 0.1791


Training Epoch 15: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 15 Loss: 0.1344


Training Epoch 16: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 16 Loss: 0.1034


Training Epoch 17: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 17 Loss: 0.0915


Training Epoch 18: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 18 Loss: 0.0721


Training Epoch 19: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 19 Loss: 0.0562


Training Epoch 20: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s]


Epoch 20 Loss: 0.0528


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 21 Loss: 0.0385


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 22 Loss: 0.0275


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 23 Loss: 0.0205


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 24 Loss: 0.0259


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 25 Loss: 0.0216


Training Epoch 26: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 26 Loss: 0.0159


Training Epoch 27: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 27 Loss: 0.0099


Training Epoch 28: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 28 Loss: 0.0095


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 29 Loss: 0.0092


Training Epoch 30: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 30 Loss: 0.0078


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 31 Loss: 0.0065


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 32 Loss: 0.0077


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 33 Loss: 0.0063


Training Epoch 34: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s]


Epoch 34 Loss: 0.0050


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


Epoch 35 Loss: 0.0061


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 36 Loss: 0.0052


Training Epoch 37: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 37 Loss: 0.0047


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 38 Loss: 0.0047


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 39 Loss: 0.0047


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 40 Loss: 0.0049


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 41 Loss: 0.0040


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 42 Loss: 0.0039


Training Epoch 43: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 43 Loss: 0.0041


Training Epoch 44: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 44 Loss: 0.0037


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 45 Loss: 0.0040


Training Epoch 46: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 46 Loss: 0.0033


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 47 Loss: 0.0036


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 48 Loss: 0.0038


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 49 Loss: 0.0033


Training Epoch 50: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 50 Loss: 0.0038

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         3
           Yes      0.944     0.895     0.919        57

     micro avg      0.850     0.850     0.850        60
     macro avg      0.315     0.298     0.306        60
  weighted avg      0.897     0.850     0.873        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  3]
 [ 0  6 51]]
Accuracy: 0.85
Macro F1: 0.4594594594594595
Micro F1: 0.85
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     1.000     1.000        60

   micro avg      1.000     1.000     1.000        60
   macro avg      0.500     0.500     0.500        60
weighted avg      1.000     1.000     1.000        60

Confusion Matrix:
[[ 0  0]
 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 1 Loss: 1.7346


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 2 Loss: 1.7212


Training Epoch 3: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 3 Loss: 1.6099


Training Epoch 4: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 4 Loss: 1.5874


Training Epoch 5: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 5 Loss: 1.5446


Training Epoch 6: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 6 Loss: 1.4796


Training Epoch 7: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 7 Loss: 1.3941


Training Epoch 8: 100%|██████████| 30/30 [00:10<00:00,  2.75it/s]


Epoch 8 Loss: 1.2377


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 9 Loss: 1.0716


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 10 Loss: 0.8740


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 11 Loss: 0.7105


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 12 Loss: 0.5541


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 13 Loss: 0.3634


Training Epoch 14: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 14 Loss: 0.2248


Training Epoch 15: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 15 Loss: 0.1595


Training Epoch 16: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 16 Loss: 0.1551


Training Epoch 17: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 17 Loss: 0.1022


Training Epoch 18: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 18 Loss: 0.0800


Training Epoch 19: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 19 Loss: 0.0831


Training Epoch 20: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 20 Loss: 0.0499


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 21 Loss: 0.0387


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 22 Loss: 0.0344


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 23 Loss: 0.0352


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 24 Loss: 0.0263


Training Epoch 25: 100%|██████████| 30/30 [00:07<00:00,  3.77it/s]


Epoch 25 Loss: 0.0227


Training Epoch 26: 100%|██████████| 30/30 [00:10<00:00,  2.85it/s]


Epoch 26 Loss: 0.0206


Training Epoch 27: 100%|██████████| 30/30 [00:10<00:00,  2.86it/s]


Epoch 27 Loss: 0.0184


Training Epoch 28: 100%|██████████| 30/30 [00:10<00:00,  2.84it/s]


Epoch 28 Loss: 0.0153


Training Epoch 29: 100%|██████████| 30/30 [00:10<00:00,  2.85it/s]


Epoch 29 Loss: 0.0140


Training Epoch 30: 100%|██████████| 30/30 [00:09<00:00,  3.17it/s]


Epoch 30 Loss: 0.0118


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 31 Loss: 0.0139


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 32 Loss: 0.0138


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 33 Loss: 0.0111


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 34 Loss: 0.0111


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 35 Loss: 0.0092


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 36 Loss: 0.0083


Training Epoch 37: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 37 Loss: 0.0089


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 38 Loss: 0.0089


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 39 Loss: 0.0081


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 40 Loss: 0.0083


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 41 Loss: 0.0079


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 42 Loss: 0.0080


Training Epoch 43: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 43 Loss: 0.0073


Training Epoch 44: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 44 Loss: 0.0085


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 45 Loss: 0.0075


Training Epoch 46: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 46 Loss: 0.0062


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 47 Loss: 0.0061


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 48 Loss: 0.0058


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 49 Loss: 0.0063


Training Epoch 50: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 50 Loss: 0.0072

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         6
To some extent      0.000     0.000     0.000         4
           Yes      0.831     0.980     0.899        50

      accuracy                          0.817        60
     macro avg      0.277     0.327     0.300        60
  weighted avg      0.692     0.817     0.749        60

Confusion Matrix:
[[ 0  0  6]
 [ 0  0  4]
 [ 0  1 49]]
Accuracy: 0.8166666666666667
Macro F1: 0.2996941896024465
Micro F1: 0.8166666666666667
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         6
     Yes/TSE      0.900     1.000     0.947        54

    accuracy                          0.900        60
   macro avg      0.450     0.500     0.474        60
weighted avg      0.810     0.900     0.853        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 1 Loss: 1.2424


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 2 Loss: 0.9617


Training Epoch 3: 100%|██████████| 30/30 [00:16<00:00,  1.84it/s]


Epoch 3 Loss: 0.9691


Training Epoch 4: 100%|██████████| 30/30 [00:16<00:00,  1.78it/s]


Epoch 4 Loss: 0.9302


Training Epoch 5: 100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


Epoch 5 Loss: 0.9020


Training Epoch 6: 100%|██████████| 30/30 [00:17<00:00,  1.73it/s]


Epoch 6 Loss: 0.8049


Training Epoch 7: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 7 Loss: 0.7127


Training Epoch 8: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 8 Loss: 0.5826


Training Epoch 9: 100%|██████████| 30/30 [00:16<00:00,  1.82it/s]


Epoch 9 Loss: 0.4735


Training Epoch 10: 100%|██████████| 30/30 [00:16<00:00,  1.82it/s]


Epoch 10 Loss: 0.3382


Training Epoch 11: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 11 Loss: 0.2189


Training Epoch 12: 100%|██████████| 30/30 [00:16<00:00,  1.79it/s]


Epoch 12 Loss: 0.1805


Training Epoch 13: 100%|██████████| 30/30 [00:16<00:00,  1.82it/s]


Epoch 13 Loss: 0.1255


Training Epoch 14: 100%|██████████| 30/30 [00:16<00:00,  1.78it/s]


Epoch 14 Loss: 0.0782


Training Epoch 15: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 15 Loss: 0.0599


Training Epoch 16: 100%|██████████| 30/30 [00:16<00:00,  1.79it/s]


Epoch 16 Loss: 0.0564


Training Epoch 17: 100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


Epoch 17 Loss: 0.0276


Training Epoch 18: 100%|██████████| 30/30 [00:16<00:00,  1.78it/s]


Epoch 18 Loss: 0.0208


Training Epoch 19: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 19 Loss: 0.0144


Training Epoch 20: 100%|██████████| 30/30 [00:16<00:00,  1.84it/s]


Epoch 20 Loss: 0.0095


Training Epoch 21: 100%|██████████| 30/30 [00:14<00:00,  2.00it/s]


Epoch 21 Loss: 0.0088


Training Epoch 22: 100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


Epoch 22 Loss: 0.0068


Training Epoch 23: 100%|██████████| 30/30 [00:16<00:00,  1.81it/s]


Epoch 23 Loss: 0.0061


Training Epoch 24: 100%|██████████| 30/30 [00:16<00:00,  1.85it/s]


Epoch 24 Loss: 0.0063


Training Epoch 25: 100%|██████████| 30/30 [00:16<00:00,  1.79it/s]


Epoch 25 Loss: 0.0050


Training Epoch 26: 100%|██████████| 30/30 [00:16<00:00,  1.80it/s]


Epoch 26 Loss: 0.0048


Training Epoch 27: 100%|██████████| 30/30 [00:16<00:00,  1.82it/s]


Epoch 27 Loss: 0.0064


Training Epoch 28: 100%|██████████| 30/30 [00:16<00:00,  1.85it/s]


Epoch 28 Loss: 0.0055


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 29 Loss: 0.0048


Training Epoch 30: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 30 Loss: 0.0047


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 31 Loss: 0.0038


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.53it/s]


Epoch 32 Loss: 0.0031


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 33 Loss: 0.0031


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.52it/s]


Epoch 34 Loss: 0.0034


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 35 Loss: 0.0029


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 36 Loss: 0.0023


Training Epoch 37: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s]


Epoch 37 Loss: 0.0022


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 38 Loss: 0.0024


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 39 Loss: 0.0026


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 40 Loss: 0.0023


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 41 Loss: 0.0024


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 42 Loss: 0.0024


Training Epoch 43: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 43 Loss: 0.0023


Training Epoch 44: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 44 Loss: 0.0020


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.52it/s]


Epoch 45 Loss: 0.0020


Training Epoch 46: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 46 Loss: 0.0021


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 47 Loss: 0.0020


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 48 Loss: 0.0020


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 49 Loss: 0.0026


Training Epoch 50: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 50 Loss: 0.0025

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         0
           Yes      1.000     1.000     1.000        60

     micro avg      1.000     1.000     1.000        60
     macro avg      0.333     0.333     0.333        60
  weighted avg      1.000     1.000     1.000        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  0]
 [ 0  0 60]]
Accuracy: 1.0
Macro F1: 1.0
Micro F1: 1.0
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     1.000     1.000        60

   micro avg      1.000     1.000     1.000        60
   macro avg      0.500     0.500     0.500        60
weighted avg      1.000     1.000     1.000        60

Confusion Matrix:
[[ 0  0]
 [ 0 60]]
Accuracy

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 1 Loss: 1.2295


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 2 Loss: 0.9196


Training Epoch 3: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 3 Loss: 0.8917


Training Epoch 4: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 4 Loss: 0.8541


Training Epoch 5: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 5 Loss: 0.8314


Training Epoch 6: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 6 Loss: 0.7368


Training Epoch 7: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 7 Loss: 0.6701


Training Epoch 8: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 8 Loss: 0.5389


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 9 Loss: 0.4320


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 10 Loss: 0.3437


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 11 Loss: 0.2981


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 12 Loss: 0.2369


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 13 Loss: 0.1665


Training Epoch 14: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 14 Loss: 0.1200


Training Epoch 15: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 15 Loss: 0.1091


Training Epoch 16: 100%|██████████| 30/30 [00:08<00:00,  3.40it/s]


Epoch 16 Loss: 0.0738


Training Epoch 17: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]


Epoch 17 Loss: 0.0376


Training Epoch 18: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]


Epoch 18 Loss: 0.0437


Training Epoch 19: 100%|██████████| 30/30 [00:10<00:00,  2.87it/s]


Epoch 19 Loss: 0.0258


Training Epoch 20: 100%|██████████| 30/30 [00:10<00:00,  2.87it/s]


Epoch 20 Loss: 0.0170


Training Epoch 21: 100%|██████████| 30/30 [00:09<00:00,  3.09it/s]


Epoch 21 Loss: 0.0126


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 22 Loss: 0.0098


Training Epoch 23: 100%|██████████| 30/30 [00:12<00:00,  2.50it/s]


Epoch 23 Loss: 0.0071


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 24 Loss: 0.0065


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 25 Loss: 0.0059


Training Epoch 26: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 26 Loss: 0.0073


Training Epoch 27: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 27 Loss: 0.0075


Training Epoch 28: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 28 Loss: 0.0044


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 29 Loss: 0.0042


Training Epoch 30: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 30 Loss: 0.0042


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


Epoch 31 Loss: 0.0041


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 32 Loss: 0.0036


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 33 Loss: 0.0050


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 34 Loss: 0.0030


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 35 Loss: 0.0029


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 36 Loss: 0.0028


Training Epoch 37: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 37 Loss: 0.0026


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 38 Loss: 0.0027


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 39 Loss: 0.0028


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 40 Loss: 0.0023


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 41 Loss: 0.0024


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 42 Loss: 0.0021


Training Epoch 43: 100%|██████████| 30/30 [00:11<00:00,  2.53it/s]


Epoch 43 Loss: 0.0023


Training Epoch 44: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 44 Loss: 0.0023


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 45 Loss: 0.0019


Training Epoch 46: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 46 Loss: 0.0019


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 47 Loss: 0.0020


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 48 Loss: 0.0023


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 49 Loss: 0.0018


Training Epoch 50: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 50 Loss: 0.0021

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         3
To some extent      0.000     0.000     0.000         0
           Yes      0.950     1.000     0.974        57

     micro avg      0.950     0.950     0.950        60
     macro avg      0.317     0.333     0.325        60
  weighted avg      0.902     0.950     0.926        60

Confusion Matrix:
[[ 0  0  3]
 [ 0  0  0]
 [ 0  0 57]]
Accuracy: 0.95
Macro F1: 0.48717948717948717
Micro F1: 0.9500000000000001
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         3
     Yes/TSE      0.950     1.000     0.974        57

    accuracy                          0.950        60
   macro avg      0.475     0.500     0.487        60
weighted avg      0.902     0.950     0.926        60

Confusion Mat

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 1 Loss: 1.4585


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 2 Loss: 1.3334


Training Epoch 3: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 3 Loss: 1.2749


Training Epoch 4: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 4 Loss: 1.2519


Training Epoch 5: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 5 Loss: 1.2174


Training Epoch 6: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 6 Loss: 1.0736


Training Epoch 7: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 7 Loss: 0.9208


Training Epoch 8: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 8 Loss: 0.6972


Training Epoch 9: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s]


Epoch 9 Loss: 0.4484


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 10 Loss: 0.2985


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 11 Loss: 0.2271


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 12 Loss: 0.1631


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 13 Loss: 0.0956


Training Epoch 14: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s]


Epoch 14 Loss: 0.0679


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 15 Loss: 0.0481


Training Epoch 16: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 16 Loss: 0.0327


Training Epoch 17: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 17 Loss: 0.0268


Training Epoch 18: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 18 Loss: 0.0249


Training Epoch 19: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 19 Loss: 0.0188


Training Epoch 20: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 20 Loss: 0.0160


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 21 Loss: 0.0140


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 22 Loss: 0.0122


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 23 Loss: 0.0113


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 24 Loss: 0.0075


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 25 Loss: 0.0065


Training Epoch 26: 100%|██████████| 30/30 [00:12<00:00,  2.49it/s]


Epoch 26 Loss: 0.0074


Training Epoch 27: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 27 Loss: 0.0055


Training Epoch 28: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 28 Loss: 0.0060


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 29 Loss: 0.0070


Training Epoch 30: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 30 Loss: 0.0059


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


Epoch 31 Loss: 0.0059


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 32 Loss: 0.0057


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 33 Loss: 0.0042


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 34 Loss: 0.0041


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.52it/s]


Epoch 35 Loss: 0.0043


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 36 Loss: 0.0041


Training Epoch 37: 100%|██████████| 30/30 [00:12<00:00,  2.49it/s]


Epoch 37 Loss: 0.0044


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 38 Loss: 0.0040


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 39 Loss: 0.0036


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


Epoch 40 Loss: 0.0040


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 41 Loss: 0.0033


Training Epoch 42: 100%|██████████| 30/30 [00:08<00:00,  3.37it/s]


Epoch 42 Loss: 0.0031


Training Epoch 43: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]


Epoch 43 Loss: 0.0037


Training Epoch 44: 100%|██████████| 30/30 [00:10<00:00,  2.78it/s]


Epoch 44 Loss: 0.0031


Training Epoch 45: 100%|██████████| 30/30 [00:10<00:00,  2.80it/s]


Epoch 45 Loss: 0.0031


Training Epoch 46: 100%|██████████| 30/30 [00:10<00:00,  2.88it/s]


Epoch 46 Loss: 0.0030


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 47 Loss: 0.0031


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 48 Loss: 0.0030


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 49 Loss: 0.0031


Training Epoch 50: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 50 Loss: 0.0028

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         1
           Yes      0.982     0.932     0.957        59

      accuracy                          0.917        60
     macro avg      0.327     0.311     0.319        60
  weighted avg      0.966     0.917     0.941        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  1]
 [ 1  3 55]]
Accuracy: 0.9166666666666666
Macro F1: 0.31884057971014496
Micro F1: 0.9166666666666666
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     0.983     0.992        60

    accuracy                          0.983        60
   macro avg      0.500     0.492     0.496        60
weighted avg      1.000     0.983     0.992        60


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 1 Loss: 1.4868


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 2 Loss: 1.4811


Training Epoch 3: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 3 Loss: 1.3887


Training Epoch 4: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 4 Loss: 1.3508


Training Epoch 5: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 5 Loss: 1.3753


Training Epoch 6: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 6 Loss: 1.2967


Training Epoch 7: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 7 Loss: 1.2508


Training Epoch 8: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Epoch 8 Loss: 1.2167


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 9 Loss: 1.1214


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 10 Loss: 1.0572


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 11 Loss: 0.9444


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 12 Loss: 0.7266


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 13 Loss: 0.6206


Training Epoch 14: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 14 Loss: 0.5225


Training Epoch 15: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 15 Loss: 0.4196


Training Epoch 16: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 16 Loss: 0.3234


Training Epoch 17: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 17 Loss: 0.2688


Training Epoch 18: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 18 Loss: 0.2431


Training Epoch 19: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 19 Loss: 0.1942


Training Epoch 20: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 20 Loss: 0.1872


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 21 Loss: 0.1636


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 22 Loss: 0.1398


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 23 Loss: 0.1104


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 24 Loss: 0.1042


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 25 Loss: 0.0873


Training Epoch 26: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s]


Epoch 26 Loss: 0.0783


Training Epoch 27: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 27 Loss: 0.0600


Training Epoch 28: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 28 Loss: 0.0650


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 29 Loss: 0.0599


Training Epoch 30: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 30 Loss: 0.0574


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 31 Loss: 0.0494


Training Epoch 32: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 32 Loss: 0.0476


Training Epoch 33: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 33 Loss: 0.0391


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 34 Loss: 0.0411


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 35 Loss: 0.0337


Training Epoch 36: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 36 Loss: 0.0315


Training Epoch 37: 100%|██████████| 30/30 [00:11<00:00,  2.73it/s]


Epoch 37 Loss: 0.0337


Training Epoch 38: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 38 Loss: 0.0275


Training Epoch 39: 100%|██████████| 30/30 [00:11<00:00,  2.70it/s]


Epoch 39 Loss: 0.0312


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 40 Loss: 0.0273


Training Epoch 41: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 41 Loss: 0.0214


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.72it/s]


Epoch 42 Loss: 0.0238


Training Epoch 43: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 43 Loss: 0.0263


Training Epoch 44: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 44 Loss: 0.0189


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 45 Loss: 0.0195


Training Epoch 46: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 46 Loss: 0.0200


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.66it/s]


Epoch 47 Loss: 0.0199


Training Epoch 48: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 48 Loss: 0.0201


Training Epoch 49: 100%|██████████| 30/30 [00:11<00:00,  2.63it/s]


Epoch 49 Loss: 0.0180


Training Epoch 50: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 50 Loss: 0.0168

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         2
To some extent      0.000     0.000     0.000         9
           Yes      0.784     0.816     0.800        49

      accuracy                          0.667        60
     macro avg      0.261     0.272     0.267        60
  weighted avg      0.641     0.667     0.653        60

Confusion Matrix:
[[ 0  0  2]
 [ 0  0  9]
 [ 1  8 40]]
Accuracy: 0.6666666666666666
Macro F1: 0.26666666666666666
Micro F1: 0.6666666666666666
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         2
     Yes/TSE      0.966     0.983     0.974        58

    accuracy                          0.950        60
   macro avg      0.483     0.491     0.487        60
weighted avg      0.934     0.950     0.942        60


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 1 Loss: 1.5493


Training Epoch 2: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 2 Loss: 1.4558


Training Epoch 3: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 3 Loss: 1.4630


Training Epoch 4: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 4 Loss: 1.3970


Training Epoch 5: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 5 Loss: 1.4290


Training Epoch 6: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 6 Loss: 1.3668


Training Epoch 7: 100%|██████████| 30/30 [00:11<00:00,  2.52it/s]


Epoch 7 Loss: 1.2771


Training Epoch 8: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 8 Loss: 1.1693


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.69it/s]


Epoch 9 Loss: 1.0150


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 10 Loss: 0.7517


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 11 Loss: 0.5641


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.48it/s]


Epoch 12 Loss: 0.4614


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.64it/s]


Epoch 13 Loss: 0.3906


Training Epoch 14: 100%|██████████| 30/30 [00:11<00:00,  2.62it/s]


Epoch 14 Loss: 0.2683


Training Epoch 15: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 15 Loss: 0.1765


Training Epoch 16: 100%|██████████| 30/30 [00:10<00:00,  2.74it/s]


Epoch 16 Loss: 0.1646


Training Epoch 17: 100%|██████████| 30/30 [00:11<00:00,  2.71it/s]


Epoch 17 Loss: 0.1358


Training Epoch 18: 100%|██████████| 30/30 [00:11<00:00,  2.67it/s]


Epoch 18 Loss: 0.0786


Training Epoch 19: 100%|██████████| 30/30 [00:10<00:00,  2.73it/s]


Epoch 19 Loss: 0.0590


Training Epoch 20: 100%|██████████| 30/30 [00:07<00:00,  3.76it/s]


Epoch 20 Loss: 0.0547


Training Epoch 21: 100%|██████████| 30/30 [00:10<00:00,  2.89it/s]


Epoch 21 Loss: 0.0428


Training Epoch 22: 100%|██████████| 30/30 [00:10<00:00,  2.90it/s]


Epoch 22 Loss: 0.0374


Training Epoch 23: 100%|██████████| 30/30 [00:10<00:00,  2.84it/s]


Epoch 23 Loss: 0.0474


Training Epoch 24: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Epoch 24 Loss: 0.0248


Training Epoch 25: 100%|██████████| 30/30 [00:10<00:00,  2.89it/s]


Epoch 25 Loss: 0.0222


Training Epoch 26: 100%|██████████| 30/30 [00:10<00:00,  2.77it/s]


Epoch 26 Loss: 0.0169


Training Epoch 27: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 27 Loss: 0.0169


Training Epoch 28: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


Epoch 28 Loss: 0.0149


Training Epoch 29: 100%|██████████| 30/30 [00:11<00:00,  2.52it/s]


Epoch 29 Loss: 0.0125


Training Epoch 30: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 30 Loss: 0.0140


Training Epoch 31: 100%|██████████| 30/30 [00:11<00:00,  2.65it/s]


Epoch 31 Loss: 0.0109


Training Epoch 32: 100%|██████████| 30/30 [00:12<00:00,  2.50it/s]


Epoch 32 Loss: 0.0114


Training Epoch 33: 100%|██████████| 30/30 [00:12<00:00,  2.34it/s]


Epoch 33 Loss: 0.0104


Training Epoch 34: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 34 Loss: 0.0079


Training Epoch 35: 100%|██████████| 30/30 [00:12<00:00,  2.48it/s]


Epoch 35 Loss: 0.0077


Training Epoch 36: 100%|██████████| 30/30 [00:12<00:00,  2.48it/s]


Epoch 36 Loss: 0.0090


Training Epoch 37: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 37 Loss: 0.0065


Training Epoch 38: 100%|██████████| 30/30 [00:12<00:00,  2.31it/s]


Epoch 38 Loss: 0.0065


Training Epoch 39: 100%|██████████| 30/30 [00:12<00:00,  2.33it/s]


Epoch 39 Loss: 0.0065


Training Epoch 40: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s]


Epoch 40 Loss: 0.0065


Training Epoch 41: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 41 Loss: 0.0075


Training Epoch 42: 100%|██████████| 30/30 [00:11<00:00,  2.51it/s]


Epoch 42 Loss: 0.0057


Training Epoch 43: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


Epoch 43 Loss: 0.0052


Training Epoch 44: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 44 Loss: 0.0057


Training Epoch 45: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 45 Loss: 0.0047


Training Epoch 46: 100%|██████████| 30/30 [00:12<00:00,  2.32it/s]


Epoch 46 Loss: 0.0051


Training Epoch 47: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


Epoch 47 Loss: 0.0053


Training Epoch 48: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 48 Loss: 0.0057


Training Epoch 49: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 49 Loss: 0.0051


Training Epoch 50: 100%|██████████| 30/30 [00:12<00:00,  2.36it/s]


Epoch 50 Loss: 0.0052

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         6
To some extent      0.000     0.000     0.000         0
           Yes      0.898     0.981     0.938        54

     micro avg      0.883     0.883     0.883        60
     macro avg      0.299     0.327     0.313        60
  weighted avg      0.808     0.883     0.844        60

Confusion Matrix:
[[ 0  0  6]
 [ 0  0  0]
 [ 1  0 53]]
Accuracy: 0.8833333333333333
Macro F1: 0.4690265486725664
Micro F1: 0.8833333333333333
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         6
     Yes/TSE      0.898     0.981     0.938        54

    accuracy                          0.883        60
   macro avg      0.449     0.491     0.469        60
weighted avg      0.808     0.883     0.844        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.34it/s]


Epoch 1 Loss: 1.1790


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.37it/s]


Epoch 2 Loss: 0.8058


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.49it/s]


Epoch 3 Loss: 0.7493


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 4 Loss: 0.7027


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 5 Loss: 0.6351


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 6 Loss: 0.5933


Training Epoch 7: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s]


Epoch 7 Loss: 0.4924


Training Epoch 8: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 8 Loss: 0.3942


Training Epoch 9: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 9 Loss: 0.3239


Training Epoch 10: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s]


Epoch 10 Loss: 0.2062


Training Epoch 11: 100%|██████████| 30/30 [00:13<00:00,  2.30it/s]


Epoch 11 Loss: 0.1594


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 12 Loss: 0.1079


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s]


Epoch 13 Loss: 0.0747


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 14 Loss: 0.0507


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 15 Loss: 0.0316


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.47it/s]


Epoch 16 Loss: 0.0187


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 17 Loss: 0.0229


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 18 Loss: 0.0233


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 19 Loss: 0.0146


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s]


Epoch 20 Loss: 0.0114


Training Epoch 21: 100%|██████████| 30/30 [00:12<00:00,  2.37it/s]


Epoch 21 Loss: 0.0129


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.50it/s]


Epoch 22 Loss: 0.0076


Training Epoch 23: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 23 Loss: 0.0119


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 24 Loss: 0.0110


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 25 Loss: 0.0071


Training Epoch 26: 100%|██████████| 30/30 [00:12<00:00,  2.35it/s]


Epoch 26 Loss: 0.0074


Training Epoch 27: 100%|██████████| 30/30 [00:12<00:00,  2.36it/s]


Epoch 27 Loss: 0.0043


Training Epoch 28: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


Epoch 28 Loss: 0.0087


Training Epoch 29: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 29 Loss: 0.0035


Training Epoch 30: 100%|██████████| 30/30 [00:12<00:00,  2.34it/s]


Epoch 30 Loss: 0.0044


Training Epoch 31: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 31 Loss: 0.0039


Training Epoch 32: 100%|██████████| 30/30 [00:13<00:00,  2.29it/s]


Epoch 32 Loss: 0.0054


Training Epoch 33: 100%|██████████| 30/30 [00:12<00:00,  2.49it/s]


Epoch 33 Loss: 0.0041


Training Epoch 34: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 34 Loss: 0.0088


Training Epoch 35: 100%|██████████| 30/30 [00:11<00:00,  2.51it/s]


Epoch 35 Loss: 0.0040


Training Epoch 36: 100%|██████████| 30/30 [00:12<00:00,  2.32it/s]


Epoch 36 Loss: 0.0054


Training Epoch 37: 100%|██████████| 30/30 [00:12<00:00,  2.36it/s]


Epoch 37 Loss: 0.0049


Training Epoch 38: 100%|██████████| 30/30 [00:12<00:00,  2.47it/s]


Epoch 38 Loss: 0.0034


Training Epoch 39: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 39 Loss: 0.0030


Training Epoch 40: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 40 Loss: 0.0028


Training Epoch 41: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 41 Loss: 0.0080


Training Epoch 42: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 42 Loss: 0.0054


Training Epoch 43: 100%|██████████| 30/30 [00:12<00:00,  2.34it/s]


Epoch 43 Loss: 0.0021


Training Epoch 44: 100%|██████████| 30/30 [00:12<00:00,  2.37it/s]


Epoch 44 Loss: 0.0044


Training Epoch 45: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 45 Loss: 0.0044


Training Epoch 46: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 46 Loss: 0.0026


Training Epoch 47: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 47 Loss: 0.0034


Training Epoch 48: 100%|██████████| 30/30 [00:13<00:00,  2.24it/s]


Epoch 48 Loss: 0.0035


Training Epoch 49: 100%|██████████| 30/30 [00:12<00:00,  2.33it/s]


Epoch 49 Loss: 0.0040


Training Epoch 50: 100%|██████████| 30/30 [00:12<00:00,  2.34it/s]


Epoch 50 Loss: 0.0049

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.929     1.000     0.963        39
To some extent      0.000     0.000     0.000         0
           Yes      1.000     0.857     0.923        21

     micro avg      0.950     0.950     0.950        60
     macro avg      0.643     0.619     0.629        60
  weighted avg      0.954     0.950     0.949        60

Confusion Matrix:
[[39  0  0]
 [ 0  0  0]
 [ 3  0 18]]
Accuracy: 0.95
Macro F1: 0.9430199430199431
Micro F1: 0.9500000000000001
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.929     1.000     0.963        39
     Yes/TSE      1.000     0.857     0.923        21

    accuracy                          0.950        60
   macro avg      0.964     0.929     0.943        60
weighted avg      0.954     0.950     0.949        60

Confusion Matr

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 8/8 [00:03<00:00,  2.58it/s]


Epoch 1 Loss: 1.6156


Training Epoch 2: 100%|██████████| 8/8 [00:03<00:00,  2.34it/s]


Epoch 2 Loss: 1.4090


Training Epoch 3: 100%|██████████| 8/8 [00:03<00:00,  2.43it/s]


Epoch 3 Loss: 1.2625


Training Epoch 4: 100%|██████████| 8/8 [00:02<00:00,  2.70it/s]


Epoch 4 Loss: 1.2754


Training Epoch 5: 100%|██████████| 8/8 [00:02<00:00,  2.96it/s]


Epoch 5 Loss: 1.2816


Training Epoch 6: 100%|██████████| 8/8 [00:03<00:00,  2.67it/s]


Epoch 6 Loss: 1.0956


Training Epoch 7: 100%|██████████| 8/8 [00:02<00:00,  2.76it/s]


Epoch 7 Loss: 1.0778


Training Epoch 8: 100%|██████████| 8/8 [00:02<00:00,  2.67it/s]


Epoch 8 Loss: 0.9414


Training Epoch 9: 100%|██████████| 8/8 [00:02<00:00,  2.93it/s]


Epoch 9 Loss: 0.8575


Training Epoch 10: 100%|██████████| 8/8 [00:02<00:00,  2.74it/s]


Epoch 10 Loss: 0.6839


Training Epoch 11: 100%|██████████| 8/8 [00:02<00:00,  2.80it/s]


Epoch 11 Loss: 0.5777


Training Epoch 12: 100%|██████████| 8/8 [00:02<00:00,  3.82it/s]


Epoch 12 Loss: 0.4092


Training Epoch 13: 100%|██████████| 8/8 [00:02<00:00,  2.97it/s]


Epoch 13 Loss: 0.2616


Training Epoch 14: 100%|██████████| 8/8 [00:02<00:00,  2.89it/s]


Epoch 14 Loss: 0.2006


Training Epoch 15: 100%|██████████| 8/8 [00:02<00:00,  3.03it/s]


Epoch 15 Loss: 0.1361


Training Epoch 16: 100%|██████████| 8/8 [00:02<00:00,  3.01it/s]


Epoch 16 Loss: 0.0988


Training Epoch 17: 100%|██████████| 8/8 [00:02<00:00,  3.07it/s]


Epoch 17 Loss: 0.0749


Training Epoch 18: 100%|██████████| 8/8 [00:02<00:00,  3.06it/s]


Epoch 18 Loss: 0.0580


Training Epoch 19: 100%|██████████| 8/8 [00:02<00:00,  3.07it/s]


Epoch 19 Loss: 0.0540


Training Epoch 20: 100%|██████████| 8/8 [00:02<00:00,  3.87it/s]


Epoch 20 Loss: 0.0384


Training Epoch 21: 100%|██████████| 8/8 [00:03<00:00,  2.57it/s]


Epoch 21 Loss: 0.0373


Training Epoch 22: 100%|██████████| 8/8 [00:02<00:00,  3.28it/s]


Epoch 22 Loss: 0.0292


Training Epoch 23: 100%|██████████| 8/8 [00:02<00:00,  2.96it/s]


Epoch 23 Loss: 0.0277


Training Epoch 24: 100%|██████████| 8/8 [00:02<00:00,  3.35it/s]


Epoch 24 Loss: 0.0173


Training Epoch 25: 100%|██████████| 8/8 [00:02<00:00,  3.15it/s]


Epoch 25 Loss: 0.0166


Training Epoch 26: 100%|██████████| 8/8 [00:02<00:00,  3.25it/s]


Epoch 26 Loss: 0.0152


Training Epoch 27: 100%|██████████| 8/8 [00:03<00:00,  2.56it/s]


Epoch 27 Loss: 0.0147


Training Epoch 28: 100%|██████████| 8/8 [00:03<00:00,  2.59it/s]


Epoch 28 Loss: 0.0137


Training Epoch 29: 100%|██████████| 8/8 [00:02<00:00,  2.81it/s]


Epoch 29 Loss: 0.0128


Training Epoch 30: 100%|██████████| 8/8 [00:02<00:00,  2.88it/s]


Epoch 30 Loss: 0.0121


Training Epoch 31: 100%|██████████| 8/8 [00:02<00:00,  3.26it/s]


Epoch 31 Loss: 0.0177


Training Epoch 32: 100%|██████████| 8/8 [00:02<00:00,  2.80it/s]


Epoch 32 Loss: 0.0173


Training Epoch 33: 100%|██████████| 8/8 [00:02<00:00,  3.04it/s]


Epoch 33 Loss: 0.0109


Training Epoch 34: 100%|██████████| 8/8 [00:02<00:00,  2.70it/s]


Epoch 34 Loss: 0.0086


Training Epoch 35: 100%|██████████| 8/8 [00:03<00:00,  2.58it/s]


Epoch 35 Loss: 0.0080


Training Epoch 36: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s]


Epoch 36 Loss: 0.0152


Training Epoch 37: 100%|██████████| 8/8 [00:03<00:00,  2.42it/s]


Epoch 37 Loss: 0.0079


Training Epoch 38: 100%|██████████| 8/8 [00:02<00:00,  2.97it/s]


Epoch 38 Loss: 0.0120


Training Epoch 39: 100%|██████████| 8/8 [00:02<00:00,  3.08it/s]


Epoch 39 Loss: 0.0095


Training Epoch 40: 100%|██████████| 8/8 [00:02<00:00,  3.32it/s]


Epoch 40 Loss: 0.0088


Training Epoch 41: 100%|██████████| 8/8 [00:02<00:00,  2.73it/s]


Epoch 41 Loss: 0.0088


Training Epoch 42: 100%|██████████| 8/8 [00:03<00:00,  2.49it/s]


Epoch 42 Loss: 0.0075


Training Epoch 43: 100%|██████████| 8/8 [00:02<00:00,  2.98it/s]


Epoch 43 Loss: 0.0071


Training Epoch 44: 100%|██████████| 8/8 [00:02<00:00,  3.02it/s]


Epoch 44 Loss: 0.0076


Training Epoch 45: 100%|██████████| 8/8 [00:02<00:00,  2.83it/s]


Epoch 45 Loss: 0.0102


Training Epoch 46: 100%|██████████| 8/8 [00:02<00:00,  2.67it/s]


Epoch 46 Loss: 0.0084


Training Epoch 47: 100%|██████████| 8/8 [00:02<00:00,  3.10it/s]


Epoch 47 Loss: 0.0085


Training Epoch 48: 100%|██████████| 8/8 [00:03<00:00,  2.51it/s]


Epoch 48 Loss: 0.0082


Training Epoch 49: 100%|██████████| 8/8 [00:02<00:00,  3.24it/s]


Epoch 49 Loss: 0.0066


Training Epoch 50: 100%|██████████| 8/8 [00:02<00:00,  2.95it/s]


Epoch 50 Loss: 0.0063

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         2
To some extent      0.400     0.667     0.500         3
           Yes      0.889     0.727     0.800        11

      accuracy                          0.625        16
     macro avg      0.430     0.465     0.433        16
  weighted avg      0.686     0.625     0.644        16

Confusion Matrix:
[[0 2 0]
 [0 2 1]
 [2 1 8]]
Accuracy: 0.625
Macro F1: 0.4333333333333333
Micro F1: 0.625
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         2
     Yes/TSE      0.857     0.857     0.857        14

    accuracy                          0.750        16
   macro avg      0.429     0.429     0.429        16
weighted avg      0.750     0.750     0.750        16

Confusion Matrix:
[[ 0  2]
 [ 2 12]

In [None]:
# --- Tutor-wise Training & Evaluation ---
# For every tutor in the dataset: split that tutor's examples 80/20, train a
# fresh PedagogicalModel on the 80%, evaluate on the 20%, and record one macro
# F1 per task. A summary table is printed once all tutors are done.
if __name__ == '__main__':
    from sklearn.model_selection import train_test_split

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Iterated below as {tutor_name: examples}; the examples are passed straight
    # to TutorDataset.
    tutor_data = load_json_dataset_by_tutor('ai_tutors_dataset.json')
    tutor_scores = {}

    # Score every tutor over the same fixed label set. Without `labels=`,
    # f1_score averages only over the classes that happen to appear in that
    # tutor's validation fold, so one tutor is averaged over 2 classes and
    # another over 3, and the summary numbers are not comparable.
    # NOTE(review): assumes evaluate_model returns integer label ids
    # (0..NUM_LABELS-1, as produced by LABEL2ID) -- confirm.
    all_label_ids = list(range(NUM_LABELS))

    for tutor_name, data in tutor_data.items():
        print(f"\n\n================= TUTOR: {tutor_name} =================")

        # train_test_split raises when either side of the split would be empty;
        # skip such tutors instead of aborting and losing earlier results.
        if len(data) < 2:
            print(f"Skipping {tutor_name}: need at least 2 examples to split, got {len(data)}")
            continue

        # Fixed seed so the per-tutor split is reproducible across runs.
        train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
        train_ds = TutorDataset(train_data, tokenizer)
        val_ds = TutorDataset(val_data, tokenizer)

        train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=8)

        # A new model per tutor so no weights carry over between tutors.
        model = PedagogicalModel()
        train_model(model, train_loader, val_loader, epochs=25, lr=2e-5, use_focal=True)

        # y_true / y_pred are indexed by task name below.
        y_true, y_pred = evaluate_model(model, val_loader)
        tutor_scores[tutor_name] = {
            task: f1_score(
                y_true[task],
                y_pred[task],
                labels=all_label_ids,
                average='macro',
                # A class with no true and no predicted samples contributes 0.
                zero_division=0,
            )
            for task in TASKS
        }

    # --- Display Summary ---
    print("\n\n========= SUMMARY: MACRO F1 SCORES BY TUTOR =========")
    for tutor, scores in tutor_scores.items():
        print(f"{tutor}:")
        for task in TASKS:
            print(f"  {task}: {scores[task]:.3f}")






Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 1 Loss: 1.5855


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 2 Loss: 1.5351


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 3 Loss: 1.4445


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 4 Loss: 1.4214


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 5 Loss: 1.3743


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 6 Loss: 1.3594


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 7 Loss: 1.2838


Training Epoch 8: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 8 Loss: 1.2273


Training Epoch 9: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 9 Loss: 1.0010


Training Epoch 10: 100%|██████████| 30/30 [00:07<00:00,  3.76it/s]


Epoch 10 Loss: 0.7282


Training Epoch 11: 100%|██████████| 30/30 [00:07<00:00,  3.77it/s]


Epoch 11 Loss: 0.5814


Training Epoch 12: 100%|██████████| 30/30 [00:09<00:00,  3.00it/s]


Epoch 12 Loss: 0.4499


Training Epoch 13: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 13 Loss: 0.2928


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 14 Loss: 0.2599


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 15 Loss: 0.1882


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 16 Loss: 0.1469


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 17 Loss: 0.1252


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 18 Loss: 0.1045


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 19 Loss: 0.0809


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 20 Loss: 0.0775


Training Epoch 21: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 21 Loss: 0.0810


Training Epoch 22: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 22 Loss: 0.0599


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 23 Loss: 0.0516


Training Epoch 24: 100%|██████████| 30/30 [00:07<00:00,  3.78it/s]


Epoch 24 Loss: 0.0534


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.59it/s]


Epoch 25 Loss: 0.0519

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         3
           Yes      0.944     0.895     0.919        57

     micro avg      0.850     0.850     0.850        60
     macro avg      0.315     0.298     0.306        60
  weighted avg      0.897     0.850     0.873        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  3]
 [ 0  6 51]]
Accuracy: 0.85
Macro F1: 0.4594594594594595
Micro F1: 0.85
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     1.000     1.000        60

   micro avg      1.000     1.000     1.000        60
   macro avg      0.500     0.500     0.500        60
weighted avg      1.000     1.000     1.000        60

Confusion Matrix:
[[ 0  0]
 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 1 Loss: 1.7942


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 2 Loss: 1.6641


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 3 Loss: 1.5826


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 4 Loss: 1.5481


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 5 Loss: 1.5315


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 6 Loss: 1.4554


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 7 Loss: 1.3511


Training Epoch 8: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 8 Loss: 1.2387


Training Epoch 9: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 9 Loss: 1.0138


Training Epoch 10: 100%|██████████| 30/30 [00:10<00:00,  2.99it/s]


Epoch 10 Loss: 0.8636


Training Epoch 11: 100%|██████████| 30/30 [00:08<00:00,  3.38it/s]


Epoch 11 Loss: 0.6853


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 12 Loss: 0.4919


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 13 Loss: 0.4042


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 14 Loss: 0.2988


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 15 Loss: 0.2322


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 16 Loss: 0.1847


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 17 Loss: 0.1473


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 18 Loss: 0.1310


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 19 Loss: 0.1202


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 20 Loss: 0.1030


Training Epoch 21: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 21 Loss: 0.0874


Training Epoch 22: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 22 Loss: 0.0869


Training Epoch 23: 100%|██████████| 30/30 [00:08<00:00,  3.61it/s]


Epoch 23 Loss: 0.0798


Training Epoch 24: 100%|██████████| 30/30 [00:09<00:00,  3.14it/s]


Epoch 24 Loss: 0.0681


Training Epoch 25: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 25 Loss: 0.0705

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         6
To some extent      0.000     0.000     0.000         4
           Yes      0.831     0.980     0.899        50

      accuracy                          0.817        60
     macro avg      0.277     0.327     0.300        60
  weighted avg      0.692     0.817     0.749        60

Confusion Matrix:
[[ 0  0  6]
 [ 0  0  4]
 [ 0  1 49]]
Accuracy: 0.8166666666666667
Macro F1: 0.2996941896024465
Micro F1: 0.8166666666666667
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         6
     Yes/TSE      0.900     1.000     0.947        54

    accuracy                          0.900        60
   macro avg      0.450     0.500     0.474        60
weighted avg      0.810     0.900     0.853        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 1 Loss: 1.1096


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 2 Loss: 0.9398


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 3 Loss: 0.9041


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 4 Loss: 0.8958


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 5 Loss: 0.8819


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 6 Loss: 0.7482


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 7 Loss: 0.6239


Training Epoch 8: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 8 Loss: 0.5594


Training Epoch 9: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 9 Loss: 0.4442


Training Epoch 10: 100%|██████████| 30/30 [00:08<00:00,  3.60it/s]


Epoch 10 Loss: 0.3911


Training Epoch 11: 100%|██████████| 30/30 [00:10<00:00,  2.84it/s]


Epoch 11 Loss: 0.2660


Training Epoch 12: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 12 Loss: 0.1966


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 13 Loss: 0.1261


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 14 Loss: 0.0962


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 15 Loss: 0.0853


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 16 Loss: 0.0826


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 17 Loss: 0.0503


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 18 Loss: 0.0451


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 19 Loss: 0.0360


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 20 Loss: 0.0291


Training Epoch 21: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 21 Loss: 0.0255


Training Epoch 22: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Epoch 22 Loss: 0.0245


Training Epoch 23: 100%|██████████| 30/30 [00:08<00:00,  3.68it/s]


Epoch 23 Loss: 0.0284


Training Epoch 24: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 24 Loss: 0.0202


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 25 Loss: 0.0230

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         0
           Yes      1.000     0.967     0.983        60

      accuracy                          0.967        60
     macro avg      0.333     0.322     0.328        60
  weighted avg      1.000     0.967     0.983        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  0]
 [ 1  1 58]]
Accuracy: 0.9666666666666667
Macro F1: 0.327683615819209
Micro F1: 0.9666666666666667
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     0.983     0.992        60

    accuracy                          0.983        60
   macro avg      0.500     0.492     0.496        60
weighted avg      1.000     0.983     0.992        60

C

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 1 Loss: 1.1634


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 2 Loss: 0.9358


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 3 Loss: 0.9286


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 4 Loss: 0.8804


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 5 Loss: 0.8298


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 6 Loss: 0.7975


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 7 Loss: 0.7552


Training Epoch 8: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 8 Loss: 0.6124


Training Epoch 9: 100%|██████████| 30/30 [00:08<00:00,  3.39it/s]


Epoch 9 Loss: 0.4304


Training Epoch 10: 100%|██████████| 30/30 [00:09<00:00,  3.02it/s]


Epoch 10 Loss: 0.3797


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 11 Loss: 0.3486


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 12 Loss: 0.2361


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 13 Loss: 0.1835


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 14 Loss: 0.1242


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 15 Loss: 0.0825


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 16 Loss: 0.0664


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 17 Loss: 0.0511


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 18 Loss: 0.0388


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 19 Loss: 0.0263


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 20 Loss: 0.0240


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.68it/s]


Epoch 21 Loss: 0.0231


Training Epoch 22: 100%|██████████| 30/30 [00:08<00:00,  3.69it/s]


Epoch 22 Loss: 0.0173


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 23 Loss: 0.0158


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 24 Loss: 0.0170


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 25 Loss: 0.0185

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         3
To some extent      0.000     0.000     0.000         0
           Yes      0.949     0.982     0.966        57

     micro avg      0.933     0.933     0.933        60
     macro avg      0.316     0.327     0.322        60
  weighted avg      0.902     0.933     0.917        60

Confusion Matrix:
[[ 0  0  3]
 [ 0  0  0]
 [ 1  0 56]]
Accuracy: 0.9333333333333333
Macro F1: 0.48275862068965514
Micro F1: 0.9333333333333333
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         3
     Yes/TSE      0.949     0.982     0.966        57

    accuracy                          0.933        60
   macro avg      0.475     0.491     0.483        60
weighted avg      0.902     0.933     0.917        60


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 1 Loss: 1.4074


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 2 Loss: 1.3031


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 3 Loss: 1.3270


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 4 Loss: 1.2828


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 5 Loss: 1.1807


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 6 Loss: 1.1417


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 7 Loss: 1.0290


Training Epoch 8: 100%|██████████| 30/30 [00:10<00:00,  2.99it/s]


Epoch 8 Loss: 0.8766


Training Epoch 9: 100%|██████████| 30/30 [00:09<00:00,  3.25it/s]


Epoch 9 Loss: 0.7392


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 10 Loss: 0.5117


Training Epoch 11: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 11 Loss: 0.3804


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 12 Loss: 0.2787


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 13 Loss: 0.1934


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 14 Loss: 0.1625


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 15 Loss: 0.1370


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 16 Loss: 0.1094


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 17 Loss: 0.0767


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 18 Loss: 0.0617


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 19 Loss: 0.0542


Training Epoch 20: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 20 Loss: 0.0450


Training Epoch 21: 100%|██████████| 30/30 [00:07<00:00,  3.78it/s]


Epoch 21 Loss: 0.0399


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.61it/s]


Epoch 22 Loss: 0.0311


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.56it/s]


Epoch 23 Loss: 0.0313


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 24 Loss: 0.0348


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 25 Loss: 0.0339

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         0
To some extent      0.000     0.000     0.000         1
           Yes      0.982     0.949     0.966        59

      accuracy                          0.933        60
     macro avg      0.327     0.316     0.322        60
  weighted avg      0.966     0.933     0.949        60

Confusion Matrix:
[[ 0  0  0]
 [ 0  0  1]
 [ 2  1 56]]
Accuracy: 0.9333333333333333
Macro F1: 0.3218390804597701
Micro F1: 0.9333333333333333
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         0
     Yes/TSE      1.000     0.967     0.983        60

    accuracy                          0.967        60
   macro avg      0.500     0.483     0.492        60
weighted avg      1.000     0.967     0.983        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.38it/s]


Epoch 1 Loss: 1.5055


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 2 Loss: 1.4047


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 3 Loss: 1.3112


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 4 Loss: 1.2926


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 5 Loss: 1.2553


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 6 Loss: 1.0479


Training Epoch 7: 100%|██████████| 30/30 [00:09<00:00,  3.04it/s]


Epoch 7 Loss: 0.8667


Training Epoch 8: 100%|██████████| 30/30 [00:07<00:00,  3.78it/s]


Epoch 8 Loss: 0.7586


Training Epoch 9: 100%|██████████| 30/30 [00:07<00:00,  3.78it/s]


Epoch 9 Loss: 0.6223


Training Epoch 10: 100%|██████████| 30/30 [00:10<00:00,  2.75it/s]


Epoch 10 Loss: 0.4108


Training Epoch 11: 100%|██████████| 30/30 [00:11<00:00,  2.57it/s]


Epoch 11 Loss: 0.3114


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 12 Loss: 0.2793


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 13 Loss: 0.2384


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 14 Loss: 0.1810


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 15 Loss: 0.1615


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 16 Loss: 0.1345


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 17 Loss: 0.1168


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 18 Loss: 0.0811


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 19 Loss: 0.0780


Training Epoch 20: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 20 Loss: 0.0812


Training Epoch 21: 100%|██████████| 30/30 [00:10<00:00,  2.83it/s]


Epoch 21 Loss: 0.0650


Training Epoch 22: 100%|██████████| 30/30 [00:08<00:00,  3.49it/s]


Epoch 22 Loss: 0.0685


Training Epoch 23: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 23 Loss: 0.0566


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 24 Loss: 0.0548


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.46it/s]


Epoch 25 Loss: 0.0487

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         2
To some extent      0.000     0.000     0.000         9
           Yes      0.788     0.837     0.812        49

      accuracy                          0.683        60
     macro avg      0.263     0.279     0.271        60
  weighted avg      0.644     0.683     0.663        60

Confusion Matrix:
[[ 0  0  2]
 [ 0  0  9]
 [ 1  7 41]]
Accuracy: 0.6833333333333333
Macro F1: 0.2706270627062706
Micro F1: 0.6833333333333333
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         2
     Yes/TSE      0.966     0.983     0.974        58

    accuracy                          0.950        60
   macro avg      0.483     0.491     0.487        60
weighted avg      0.934     0.950     0.942        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 1 Loss: 1.7017


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 2 Loss: 1.5160


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 3 Loss: 1.4532


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 4 Loss: 1.4055


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 5 Loss: 1.4349


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 6 Loss: 1.3192


Training Epoch 7: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 7 Loss: 1.2275


Training Epoch 8: 100%|██████████| 30/30 [00:09<00:00,  3.33it/s]


Epoch 8 Loss: 1.0033


Training Epoch 9: 100%|██████████| 30/30 [00:10<00:00,  2.91it/s]


Epoch 9 Loss: 0.8535


Training Epoch 10: 100%|██████████| 30/30 [00:11<00:00,  2.58it/s]


Epoch 10 Loss: 0.6326


Training Epoch 11: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 11 Loss: 0.4652


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.43it/s]


Epoch 12 Loss: 0.3384


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 13 Loss: 0.2937


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 14 Loss: 0.1867


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 15 Loss: 0.1440


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 16 Loss: 0.1172


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 17 Loss: 0.1067


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 18 Loss: 0.0872


Training Epoch 19: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 19 Loss: 0.0783


Training Epoch 20: 100%|██████████| 30/30 [00:10<00:00,  2.83it/s]


Epoch 20 Loss: 0.0653


Training Epoch 21: 100%|██████████| 30/30 [00:08<00:00,  3.44it/s]


Epoch 21 Loss: 0.0633


Training Epoch 22: 100%|██████████| 30/30 [00:11<00:00,  2.60it/s]


Epoch 22 Loss: 0.0580


Training Epoch 23: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 23 Loss: 0.0551


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 24 Loss: 0.0477


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 25 Loss: 0.0460

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         6
To some extent      0.000     0.000     0.000         0
           Yes      0.898     0.981     0.938        54

     micro avg      0.883     0.883     0.883        60
     macro avg      0.299     0.327     0.313        60
  weighted avg      0.808     0.883     0.844        60

Confusion Matrix:
[[ 0  0  6]
 [ 0  0  0]
 [ 1  0 53]]
Accuracy: 0.8833333333333333
Macro F1: 0.4690265486725664
Micro F1: 0.8833333333333333
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         6
     Yes/TSE      0.898     0.981     0.938        54

    accuracy                          0.883        60
   macro avg      0.449     0.491     0.469        60
weighted avg      0.808     0.883     0.844        60



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 1 Loss: 1.2314


Training Epoch 2: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 2 Loss: 0.7881


Training Epoch 3: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 3 Loss: 0.6979


Training Epoch 4: 100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch 4 Loss: 0.6540


Training Epoch 5: 100%|██████████| 30/30 [00:12<00:00,  2.44it/s]


Epoch 5 Loss: 0.6046


Training Epoch 6: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 6 Loss: 0.5840


Training Epoch 7: 100%|██████████| 30/30 [00:09<00:00,  3.31it/s]


Epoch 7 Loss: 0.4674


Training Epoch 8: 100%|██████████| 30/30 [00:10<00:00,  2.98it/s]


Epoch 8 Loss: 0.4588


Training Epoch 9: 100%|██████████| 30/30 [00:11<00:00,  2.54it/s]


Epoch 9 Loss: 0.3685


Training Epoch 10: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 10 Loss: 0.3208


Training Epoch 11: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 11 Loss: 0.2147


Training Epoch 12: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 12 Loss: 0.1918


Training Epoch 13: 100%|██████████| 30/30 [00:12<00:00,  2.41it/s]


Epoch 13 Loss: 0.1527


Training Epoch 14: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 14 Loss: 0.1254


Training Epoch 15: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 15 Loss: 0.0894


Training Epoch 16: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 16 Loss: 0.0627


Training Epoch 17: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 17 Loss: 0.0505


Training Epoch 18: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 18 Loss: 0.0476


Training Epoch 19: 100%|██████████| 30/30 [00:10<00:00,  2.79it/s]


Epoch 19 Loss: 0.0457


Training Epoch 20: 100%|██████████| 30/30 [00:08<00:00,  3.54it/s]


Epoch 20 Loss: 0.0351


Training Epoch 21: 100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


Epoch 21 Loss: 0.0295


Training Epoch 22: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 22 Loss: 0.0218


Training Epoch 23: 100%|██████████| 30/30 [00:12<00:00,  2.40it/s]


Epoch 23 Loss: 0.0264


Training Epoch 24: 100%|██████████| 30/30 [00:12<00:00,  2.45it/s]


Epoch 24 Loss: 0.0253


Training Epoch 25: 100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch 25 Loss: 0.0254

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.929     1.000     0.963        39
To some extent      0.000     0.000     0.000         0
           Yes      1.000     0.857     0.923        21

     micro avg      0.950     0.950     0.950        60
     macro avg      0.643     0.619     0.629        60
  weighted avg      0.954     0.950     0.949        60

Confusion Matrix:
[[39  0  0]
 [ 0  0  0]
 [ 3  0 18]]
Accuracy: 0.95
Macro F1: 0.9430199430199431
Micro F1: 0.9500000000000001
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.929     1.000     0.963        39
     Yes/TSE      1.000     0.857     0.923        21

    accuracy                          0.950        60
   macro avg      0.964     0.929     0.943        60
weighted avg      0.954     0.950     0.949        60

Confusion Matr

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 8/8 [00:02<00:00,  2.76it/s]


Epoch 1 Loss: 1.6089


Training Epoch 2: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 2 Loss: 1.2158


Training Epoch 3: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 3 Loss: 1.1866


Training Epoch 4: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 4 Loss: 1.1802


Training Epoch 5: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 5 Loss: 1.0681


Training Epoch 6: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 6 Loss: 1.1152


Training Epoch 7: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 7 Loss: 1.0268


Training Epoch 8: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 8 Loss: 1.1123


Training Epoch 9: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 9 Loss: 0.8996


Training Epoch 10: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 10 Loss: 0.7698


Training Epoch 11: 100%|██████████| 8/8 [00:03<00:00,  2.66it/s]


Epoch 11 Loss: 0.6466


Training Epoch 12: 100%|██████████| 8/8 [00:02<00:00,  2.77it/s]


Epoch 12 Loss: 0.6234


Training Epoch 13: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 13 Loss: 0.4593


Training Epoch 14: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 14 Loss: 0.4218


Training Epoch 15: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 15 Loss: 0.3537


Training Epoch 16: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 16 Loss: 0.2362


Training Epoch 17: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 17 Loss: 0.2466


Training Epoch 18: 100%|██████████| 8/8 [00:03<00:00,  2.55it/s]


Epoch 18 Loss: 0.2243


Training Epoch 19: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 19 Loss: 0.1804


Training Epoch 20: 100%|██████████| 8/8 [00:03<00:00,  2.54it/s]


Epoch 20 Loss: 0.1759


Training Epoch 21: 100%|██████████| 8/8 [00:02<00:00,  3.39it/s]


Epoch 21 Loss: 0.1600


Training Epoch 22: 100%|██████████| 8/8 [00:01<00:00,  4.05it/s]


Epoch 22 Loss: 0.1665


Training Epoch 23: 100%|██████████| 8/8 [00:01<00:00,  4.08it/s]


Epoch 23 Loss: 0.1153


Training Epoch 24: 100%|██████████| 8/8 [00:01<00:00,  4.08it/s]


Epoch 24 Loss: 0.1143


Training Epoch 25: 100%|██████████| 8/8 [00:01<00:00,  4.07it/s]


Epoch 25 Loss: 0.1244

FINAL EVALUATION ON VALIDATION SET

=== Mistake_Identification ===
-- Exact (3-Class) --
                precision    recall  f1-score   support

            No      0.000     0.000     0.000         2
To some extent      0.286     0.667     0.400         3
           Yes      1.000     0.636     0.778        11

      accuracy                          0.562        16
     macro avg      0.429     0.434     0.393        16
  weighted avg      0.741     0.562     0.610        16

Confusion Matrix:
[[0 2 0]
 [1 2 0]
 [1 3 7]]
Accuracy: 0.5625
Macro F1: 0.3925925925925926
Micro F1: 0.5625
-- Lenient (2-Class) --
              precision    recall  f1-score   support

          No      0.000     0.000     0.000         2
     Yes/TSE      0.857     0.857     0.857        14

    accuracy                          0.750        16
   macro avg      0.429     0.429     0.429        16
weighted avg      0.750     0.750     0.750        16

Confusion Matrix:
[[ 0  2]
 [ 2 1