In [1]:
!pip install -q transformers datasets accelerate scikit-learn tqdm sentence-transformers


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


In [1]:
# cell: imports and utils
import os, json, time, math
from datetime import datetime
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import random

# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# transformers
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)

# reproducibility
SEED = 42
def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
seed_everything()
print("Torch device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


Torch device: cuda


In [2]:
# Step 1: Clone the repo (only once per fresh runtime)
!git clone https://github.com/kaushal0494/UnifyingAITutorEvaluation.git

# Step 2: Change directory into the dataset folder
%cd UnifyingAITutorEvaluation/IndoML_Datathon/data

# Step 3: Check files
!ls -lh


Cloning into 'UnifyingAITutorEvaluation'...
remote: Enumerating objects: 106, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (86/86), done.[K
remote: Total 106 (delta 31), reused 80 (delta 16), pack-reused 0 (from 0)[K
Receiving objects: 100% (106/106), 3.35 MiB | 28.10 MiB/s, done.
Resolving deltas: 100% (31/31), done.
/content/UnifyingAITutorEvaluation/IndoML_Datathon/data
total 1.8M
-rw-r--r-- 1 root root 132K Oct  3 08:25 dev_testset.json
-rw-r--r-- 1 root root 499K Oct  3 08:25 testset.json
-rw-r--r-- 1 root root 1.2M Oct  3 08:25 trainset.json


In [3]:
# =====================================================
# COMPLETE WORKING CODE WITH FIXED ENSEMBLE & PER-FOLD METRICS
# =====================================================
import json, os, random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", torch.cuda.is_available())

# =====================================================
# 1. Load & Flatten Training JSON
# =====================================================
train_data_path = "/content/UnifyingAITutorEvaluation/IndoML_Datathon/data/trainset.json"
with open(train_data_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)

rows = []
for item in train_data:
    conv_id = item.get("conversation_id", "")
    history = item.get("conversation_history", "")
    tutor_responses = item.get("tutor_responses", {})
    for tutor_name, info in tutor_responses.items():
        response_text = info.get("response", "")
        annotation = info.get("annotation", {})
        mi = annotation.get("Mistake_Identification", None)
        if mi is not None:
            rows.append({
                "conversation_id": conv_id,
                "conversation_history": history,
                "tutor": tutor_name,
                "response": response_text,
                "Mistake_Identification": mi
            })

df = pd.DataFrame(rows)
print("Training dataset shape:", df.shape)
print(df["Mistake_Identification"].value_counts())

# =====================================================
# 2. Load Dev Test JSON
# =====================================================
dev_test_path = "/content/UnifyingAITutorEvaluation/IndoML_Datathon/data/testset.json"
with open(dev_test_path, "r", encoding="utf-8") as f:
    dev_test_data = json.load(f)

dev_test_rows = []
for item in dev_test_data:
    conv_id = item.get("conversation_id", "")
    history = item.get("conversation_history", "")
    tutor_responses = item.get("tutor_responses", {})
    for tutor_name, info in tutor_responses.items():
        response_text = info.get("response", "")
        dev_test_rows.append({
            "conversation_id": conv_id,
            "conversation_history": history,
            "tutor": tutor_name,
            "response": response_text
        })

dev_test_df = pd.DataFrame(dev_test_rows)
print("Dev test dataset shape:", dev_test_df.shape)

# =====================================================
# 3. MINIMAL AUGMENTATION
# =====================================================
def minimal_augment(df):
    """Simple but effective augmentation for minority class"""
    rows = df.to_dict(orient="records")
    partial = df[df["Mistake_Identification"] == "To some extent"]

    print(f"Original 'To some extent' samples: {len(partial)}")

    # Add just 2x the partial samples with simple modifications
    for _, row in partial.iterrows():
        # Version 1: Add uncertainty
        aug1 = row.copy()
        aug1['response'] = row['response'] + " I think."
        rows.append(aug1.to_dict())

        # Version 2: Add qualifier
        aug2 = row.copy()
        aug2['response'] = "Somewhat, " + row['response'].lower()
        rows.append(aug2.to_dict())

    result_df = pd.DataFrame(rows)
    print(f"After minimal augmentation: {result_df['Mistake_Identification'].value_counts()}")
    return result_df

df_aug = minimal_augment(df)

# =====================================================
# 4. Label encoding
# =====================================================
le = LabelEncoder()
df_aug["label_enc"] = le.fit_transform(df_aug["Mistake_Identification"])
classes_unique = np.unique(df_aug["label_enc"])
print("Label mapping:", {i: le.inverse_transform([i])[0] for i in classes_unique})

# =====================================================
# 5. Text preprocessing
# =====================================================
def concat_text(row, max_history_turns=3):
    history_lines = row['conversation_history'].strip().split("\n")
    if len(history_lines) > max_history_turns:
        history_lines = history_lines[-max_history_turns:]
    context = " [SEP] ".join([l.strip() for l in history_lines if l.strip()])
    return context + " [SEP] " + row['response'].strip()

texts = df_aug.apply(concat_text, axis=1).tolist()
labels = df_aug["label_enc"].values
dev_test_texts = dev_test_df.apply(concat_text, axis=1).tolist()

# =====================================================
# 6. Dataset class
# =====================================================
TRANSFORMER_MODEL = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL)

class ResponseDataset(Dataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(list(texts), truncation=True, padding="max_length", max_length=384)
        self.labels = labels if labels is not None else None

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(int(self.labels[idx]))
        return item

# =====================================================
# 7. Simple Trainer with weighted cross-entropy
# =====================================================
class SimpleTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = F.cross_entropy(logits, labels, weight=self.class_weights.to(model.device))
        return (loss, outputs) if return_outputs else loss

# =====================================================
# 8. Class weights calculation
# =====================================================
class_weights = compute_class_weight("balanced", classes=classes_unique, y=labels)
# Moderate boost for minority class
class_weights[1] *= 1.5  # "To some extent" class
class_weights = np.clip(class_weights, 0.8, 3.0)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

print("Class weights:", dict(zip(classes_unique, class_weights)))

# =====================================================
# 9. FIXED STRATIFIED K-FOLD WITH PROPER ENSEMBLE & PER-FOLD METRICS
# =====================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
all_val_probs = []
all_val_labels = []
all_test_probs = []

# Per-class metrics storage
class_wise_precision = {i: [] for i in classes_unique}
class_wise_recall = {i: [] for i in classes_unique}
class_wise_f1 = {i: [] for i in classes_unique}

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"\n=== FOLD {fold+1} ===")
    train_texts_fold = [texts[i] for i in train_idx]
    val_texts_fold = [texts[i] for i in val_idx]
    train_labels_fold = labels[train_idx]
    val_labels_fold = labels[val_idx]

    unique, counts = np.unique(val_labels_fold, return_counts=True)
    print("Validation fold class distribution:", dict(zip(unique, counts)))

    train_dataset = ResponseDataset(train_texts_fold, train_labels_fold)
    val_dataset = ResponseDataset(val_texts_fold, val_labels_fold)
    test_dataset = ResponseDataset(dev_test_texts)

    model = AutoModelForSequenceClassification.from_pretrained(
        TRANSFORMER_MODEL, num_labels=len(classes_unique)
    )

    training_args = TrainingArguments(
        output_dir=f"/content/simple_fold_{fold}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        warmup_steps=100,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        save_total_limit=2,
        seed=SEED
    )

    trainer = SimpleTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=lambda p: {
            "f1_macro": f1_score(p.label_ids, np.argmax(p.predictions, axis=-1), average="macro"),
            "f1_weighted": f1_score(p.label_ids, np.argmax(p.predictions, axis=-1), average="weighted"),
            "accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=-1))
        },
        class_weights=class_weights_tensor,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    # ===========================
    # Validation & test predictions
    # ===========================
    val_out = trainer.predict(val_dataset)
    val_probs = F.softmax(torch.tensor(val_out.predictions), dim=1).numpy()
    all_val_probs.append(val_probs)
    all_val_labels.append(val_labels_fold)

    test_out = trainer.predict(test_dataset)
    test_probs = F.softmax(torch.tensor(test_out.predictions), dim=1).numpy()
    all_test_probs.append(test_probs)

    val_preds = np.argmax(val_probs, axis=1)

    # ===========================
    # Per-fold overall metrics
    # ===========================
    val_precision_macro = precision_score(val_labels_fold, val_preds, average="macro", zero_division=0)
    val_recall_macro = recall_score(val_labels_fold, val_preds, average="macro", zero_division=0)
    val_f1_macro = f1_score(val_labels_fold, val_preds, average="macro")
    val_acc = accuracy_score(val_labels_fold, val_preds)

    print(f"\nFold {fold+1} overall metrics:")
    print(f"  Accuracy:          {val_acc:.4f}")
    print(f"  Precision (macro): {val_precision_macro:.4f}")
    print(f"  Recall (macro):    {val_recall_macro:.4f}")
    print(f"  F1 Score (macro):  {val_f1_macro:.4f}")

    # ===========================
    # Per-class metrics
    # ===========================
    precision_per_class = precision_score(val_labels_fold, val_preds, average=None, zero_division=0)
    recall_per_class = recall_score(val_labels_fold, val_preds, average=None, zero_division=0)
    f1_per_class = f1_score(val_labels_fold, val_preds, average=None, zero_division=0)

    print("\nPer-class metrics:")
    print(f"{'Class':<20} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
    print("-"*70)
    for class_idx in classes_unique:
        class_name = le.inverse_transform([class_idx])[0]
        support = (val_labels_fold == class_idx).sum()
        print(f"{class_name:<20} {precision_per_class[class_idx]:<12.4f} {recall_per_class[class_idx]:<12.4f} {f1_per_class[class_idx]:<12.4f} {support:<10}")
        class_wise_precision[class_idx].append(precision_per_class[class_idx])
        class_wise_recall[class_idx].append(recall_per_class[class_idx])
        class_wise_f1[class_idx].append(f1_per_class[class_idx])

# =====================================================
# 10. ENSEMBLE TEST PREDICTIONS
# =====================================================
print("\nEnsembling predictions from all folds...")
ensemble_probs = np.mean(all_test_probs, axis=0)
optimal_threshold = 0.5

final_preds = []
for p in ensemble_probs:
    if p[1] > optimal_threshold:
        final_preds.append(1)
    else:
        final_preds.append(np.argmax(p))

final_labels = le.inverse_transform(final_preds)
print(f"\nFinal prediction distribution (threshold={optimal_threshold}):")
print(pd.Series(final_labels).value_counts())

# =====================================================
# 11. Generate predictions JSON
# =====================================================
predictions = {}
for idx, row in dev_test_df.iterrows():
    conv_id = row['conversation_id']
    tutor = row['tutor']
    predicted_label = final_labels[idx]

    if conv_id not in predictions:
        predictions[conv_id] = {
            "conversation_id": conv_id,
            "conversation_history": row['conversation_history'],
            "tutor_responses": {}
        }

    predictions[conv_id]["tutor_responses"][tutor] = {
        "response": row['response'],
        "annotation": {"Mistake_Identification": predicted_label}
    }

pred_list = list(predictions.values())
with open("/content/working_predictions.json", "w", encoding="utf-8") as f:
    json.dump(pred_list, f, indent=2, ensure_ascii=False)

print(f"\nGenerated working predictions for {len(pred_list)} conversations")
print("Saved as: /content/working_predictions.json")

# =====================================================
# 12. Average validation F1 across folds
# =====================================================
avg_val_f1 = np.mean([f1_score(labels, np.argmax(probs, axis=1), average="macro")
                      for probs, labels in zip(all_val_probs, all_val_labels)])
print(f"Average validation F1 across folds: {avg_val_f1:.4f}")

# =====================================================
# 13. Cross-fold per-class metrics
# =====================================================
print("\n=== CROSS-FOLD AVERAGE METRICS ===")
avg_acc = np.mean([accuracy_score(labels, np.argmax(probs, axis=1)) for probs, labels in zip(all_val_probs, all_val_labels)])
avg_f1_macro = np.mean([f1_score(labels, np.argmax(probs, axis=1), average="macro") for probs, labels in zip(all_val_probs, all_val_labels)])
avg_prec_macro = np.mean([precision_score(labels, np.argmax(probs, axis=1), average="macro", zero_division=0) for probs, labels in zip(all_val_probs, all_val_labels)])
avg_rec_macro = np.mean([recall_score(labels, np.argmax(probs, axis=1), average="macro", zero_division=0) for probs, labels in zip(all_val_probs, all_val_labels)])

print(f"Average Accuracy (all folds):          {avg_acc:.4f}")
print(f"Average Precision (macro, all folds): {avg_prec_macro:.4f}")
print(f"Average Recall (macro, all folds):    {avg_rec_macro:.4f}")
print(f"Average F1 Score (macro, all folds):  {avg_f1_macro:.4f}")

print("\nPer-class average metrics across folds:")
print(f"{'Class':<20} {'Precision':<20} {'Recall':<20} {'F1-Score':<20}")
print("-"*80)
for class_idx in classes_unique:
    class_name = le.inverse_transform([class_idx])[0]
    print(f"{class_name:<20} {np.mean(class_wise_precision[class_idx]):<.4f} ± {np.std(class_wise_precision[class_idx]):.4f}    "
          f"{np.mean(class_wise_recall[class_idx]):<.4f} ± {np.std(class_wise_recall[class_idx]):.4f}    "
          f"{np.mean(class_wise_f1[class_idx]):<.4f} ± {np.std(class_wise_f1[class_idx]):.4f}")

# =====================================================
# 14. Validation threshold analysis
# =====================================================
print("\n=== VALIDATION THRESHOLD ANALYSIS ===")
all_val_probs_combined = np.vstack(all_val_probs)
all_val_labels_combined = np.concatenate(all_val_labels)

print("Threshold analysis on validation data:")
for thresh in [0.3, 0.35, 0.4, 0.45, 0.5]:
    preds = []
    for p in all_val_probs_combined:
        if p[1] > thresh:
            preds.append(1)
        else:
            preds.append(np.argmax(p))
    f1 = f1_score(all_val_labels_combined, preds, average="macro")
    print(f"Threshold {thresh}: F1 = {f1:.4f}")


CUDA available: True
Training dataset shape: (2476, 5)
Mistake_Identification
Yes               1932
No                 370
To some extent     174
Name: count, dtype: int64
Dev test dataset shape: (1214, 4)
Original 'To some extent' samples: 174
After minimal augmentation: Mistake_Identification
Yes               1932
To some extent     522
No                 370
Name: count, dtype: int64
Label mapping: {np.int64(0): 'No', np.int64(1): 'To some extent', np.int64(2): 'Yes'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Class weights: {np.int64(0): np.float64(2.5441441441441444), np.int64(1): np.float64(2.704980842911877), np.int64(2): np.float64(0.8)}

=== FOLD 1 ===
Validation fold class distribution: {np.int64(0): np.int64(74), np.int64(1): np.int64(104), np.int64(2): np.int64(387)}


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtitulovesmam2215[0m ([33mtitulovesmam2215-iiser-bhopal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,0.9892,0.744915,0.745993,0.82497,0.819469
2,0.4867,0.626987,0.769787,0.850867,0.851327
3,0.3315,0.6783,0.794845,0.863192,0.867257



Fold 1 overall metrics:
  Accuracy:          0.8673
  Precision (macro): 0.8293
  Recall (macro):    0.7693
  F1 Score (macro):  0.7948

Per-class metrics:
Class                Precision    Recall       F1-Score     Support   
----------------------------------------------------------------------
No                   0.7206       0.6622       0.6901       74        
To some extent       0.8780       0.6923       0.7742       104       
Yes                  0.8892       0.9535       0.9202       387       

=== FOLD 2 ===
Validation fold class distribution: {np.int64(0): np.int64(74), np.int64(1): np.int64(104), np.int64(2): np.int64(387)}


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,1.0666,0.633288,0.776629,0.847057,0.840708
2,0.5534,0.498243,0.844467,0.889878,0.893805
3,0.3798,0.433666,0.849097,0.891283,0.892035



Fold 2 overall metrics:
  Accuracy:          0.8920
  Precision (macro): 0.8574
  Recall (macro):    0.8412
  F1 Score (macro):  0.8491

Per-class metrics:
Class                Precision    Recall       F1-Score     Support   
----------------------------------------------------------------------
No                   0.8310       0.7973       0.8138       74        
To some extent       0.8200       0.7885       0.8039       104       
Yes                  0.9213       0.9380       0.9296       387       

=== FOLD 3 ===
Validation fold class distribution: {np.int64(0): np.int64(74), np.int64(1): np.int64(105), np.int64(2): np.int64(386)}


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,1.0424,0.680284,0.748748,0.822495,0.814159
2,0.537,0.806004,0.719314,0.801514,0.780531
3,0.3671,0.639996,0.784126,0.855895,0.854867



Fold 3 overall metrics:
  Accuracy:          0.8549
  Precision (macro): 0.7833
  Recall (macro):    0.7863
  F1 Score (macro):  0.7841

Per-class metrics:
Class                Precision    Recall       F1-Score     Support   
----------------------------------------------------------------------
No                   0.6173       0.6757       0.6452       74        
To some extent       0.8182       0.7714       0.7941       105       
Yes                  0.9143       0.9119       0.9131       386       

=== FOLD 4 ===
Validation fold class distribution: {np.int64(0): np.int64(74), np.int64(1): np.int64(105), np.int64(2): np.int64(386)}


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,1.0461,0.794809,0.687467,0.772751,0.769912
2,0.5418,0.519108,0.818321,0.87283,0.876106
3,0.4182,0.522295,0.836575,0.882764,0.884956



Fold 4 overall metrics:
  Accuracy:          0.8850
  Precision (macro): 0.8657
  Recall (macro):    0.8133
  F1 Score (macro):  0.8366

Per-class metrics:
Class                Precision    Recall       F1-Score     Support   
----------------------------------------------------------------------
No                   0.8852       0.7297       0.8000       74        
To some extent       0.8081       0.7619       0.7843       105       
Yes                  0.9037       0.9482       0.9254       386       

=== FOLD 5 ===
Validation fold class distribution: {np.int64(0): np.int64(74), np.int64(1): np.int64(104), np.int64(2): np.int64(386)}


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Accuracy
1,1.0351,0.765065,0.727171,0.825121,0.842199
2,0.5666,0.541771,0.79678,0.855768,0.851064
3,0.3654,0.563168,0.816303,0.871076,0.870567



Fold 5 overall metrics:
  Accuracy:          0.8706
  Precision (macro): 0.8133
  Recall (macro):    0.8199
  F1 Score (macro):  0.8163

Per-class metrics:
Class                Precision    Recall       F1-Score     Support   
----------------------------------------------------------------------
No                   0.7089       0.7568       0.7320       74        
To some extent       0.8119       0.7885       0.8000       104       
Yes                  0.9193       0.9145       0.9169       386       

Ensembling predictions from all folds...

Final prediction distribution (threshold=0.5):
Yes               942
No                218
To some extent     54
Name: count, dtype: int64

Generated working predictions for 150 conversations
Saved as: /content/working_predictions.json
Average validation F1 across folds: 0.8162

=== CROSS-FOLD AVERAGE METRICS ===
Average Accuracy (all folds):          0.8739
Average Precision (macro, all folds): 0.8298
Average Recall (macro, all folds):    0