In [1]:
import os

# Set before the transformers import in the next cell so the flag is already
# in the environment if TensorFlow gets loaded.
# NOTE(review): presumably switches off TensorFlow's oneDNN custom ops — confirm.
os.environ.update({'TF_ENABLE_ONEDNN_OPTS': '0'})

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed
)
from datasets import load_dataset
import evaluate
import numpy as np





In [3]:
# --- Model -------------------------------------------------------------
CHECKPOINT = 'microsoft/deberta-base'
# Alternative backbone: CHECKPOINT = 'bert-large-uncased'
NUM_CLASSES = 3      # labels 0, 1 and 2 remain once unlabeled (-1) rows are filtered out

# --- Tokenization ------------------------------------------------------
MAX_LENGTH = 128     # premise + hypothesis pairs are truncated to this many tokens

# --- Optimisation ------------------------------------------------------
LEARNING_RATE = 2e-5
BATCH_SIZE = 16      # used for both the train and eval per-device batch size
NUM_EPOCHS = 2

# --- Bookkeeping -------------------------------------------------------
EVAL_STEPS = 500     # shared cadence for evaluation, logging and checkpointing

# Seed the RNGs up front so the run is repeatable.
set_seed(42)

In [4]:
# Load every split of SNLI (train / validation / test).
dataset = load_dataset(path='snli')

In [5]:
from collections import Counter

def verify_label_distribution(dataset):
    """Print the per-label example count for every split.

    Args:
        dataset: Mapping from split name to an object whose ``'label'``
            entry is an iterable of labels (e.g. a ``DatasetDict``).

    Returns:
        None. One line per split is written to stdout.
    """
    for split_name, split_data in dataset.items():
        tally = Counter(split_data['label'])
        print(f"Label distribution in '{split_name}' split: {tally}")

# Inspect the raw label counts of every split before any filtering.
verify_label_distribution(dataset)

Label distribution in 'test' split: Counter({0: 3368, 2: 3237, 1: 3219, -1: 176})
Label distribution in 'validation' split: Counter({0: 3329, 2: 3278, 1: 3235, -1: 158})
Label distribution in 'train' split: Counter({0: 183416, 2: 183187, 1: 182764, -1: 785})


In [6]:
def remove_no_label(example):
    """Filter predicate: keep an example unless its label is -1.

    Args:
        example: Mapping with a ``'label'`` entry.

    Returns:
        True when the label differs from -1, False otherwise.
    """
    has_label = example['label'] != -1
    return has_label

# Drop the rows labelled -1 from every split, then re-print the counts to
# confirm that only labels 0, 1 and 2 remain.
dataset = dataset.filter(function=remove_no_label)
verify_label_distribution(dataset)

Label distribution in 'test' split: Counter({0: 3368, 2: 3237, 1: 3219})
Label distribution in 'validation' split: Counter({0: 3329, 2: 3278, 1: 3235})
Label distribution in 'train' split: Counter({0: 183416, 2: 183187, 1: 182764})


In [7]:
# Optional: uncomment to train on only the first 6400 training examples (quick smoke test).
# dataset['train'] = dataset['train'].select(range(6400))
# dataset

In [8]:
# Tokenizer matching the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=CHECKPOINT)

def tokenize_function(examples):
    """Tokenize a batch of premise/hypothesis pairs.

    Sequences are truncated to ``MAX_LENGTH`` tokens but deliberately NOT
    padded here. Padding every example to ``MAX_LENGTH`` at map time makes the
    ``DataCollatorWithPadding`` handed to the Trainer a no-op and spends
    compute on pad tokens; leaving the sequences unpadded lets the collator
    pad each batch only as far as its longest member.

    Args:
        examples: Batch mapping with ``'premise'`` and ``'hypothesis'``
            entries (lists of strings when used with ``map(batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair batch.
    """
    return tokenizer(
        examples['premise'],
        examples['hypothesis'],
        max_length=MAX_LENGTH,
        truncation=True,
    )

# Tokenize every split in batches, rename the target column to 'labels',
# and drop the raw text columns plus 'token_type_ids'.
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column('label', 'labels')
    .remove_columns(['premise', 'hypothesis', 'token_type_ids'])
)

In [9]:
# Collator that pads the examples of a batch (using the tokenizer's pad token)
# when the Trainer assembles it.
data_collator = DataCollatorWithPadding(tokenizer)

In [10]:
# Pretrained encoder with a sequence-classification head sized for NUM_CLASSES.
# The loader reports the classifier/pooler weights as newly initialized
# (see the message printed below), so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    num_labels=NUM_CLASSES,
)
# Bare expression: displays the module tree as the cell output.
model

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [11]:
# Cache for the accuracy metric so it is loaded once, not on every evaluation.
_accuracy_metric = None


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    The metric object is loaded on first use and reused afterwards; the
    Trainer calls this function at every evaluation (every ``EVAL_STEPS``
    steps), so reloading it each time is wasted work.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for the
        predicted classes against ``labels``.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load('accuracy')

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

In [12]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=3,
    # Evaluate, log and checkpoint on the same step cadence.
    evaluation_strategy='steps',
    eval_steps=EVAL_STEPS,
    logging_steps=EVAL_STEPS,
    save_steps=EVAL_STEPS,
    # Optimisation schedule.
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.005,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,
    # Model selection: reload the checkpoint with the highest eval accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    # Reproducibility.
    seed=42,
)



In [13]:
# Wire model, data, batching and metrics together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Train on the train split; monitor accuracy on the validation split.
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
)

: 

In [None]:
# Start fine-tuning with the settings in training_args.
trainer.train()
# To resume from a saved checkpoint instead, replace the call above with:
# checkpoint_path = './results/checkpoint-2500'
# trainer.train(resume_from_checkpoint=checkpoint_path)

  0%|          | 0/68672 [00:00<?, ?it/s]

In [15]:
# Run inference over the held-out test split.
test_results = trainer.predict(test_dataset=tokenized_dataset['test'])

  0%|          | 0/307 [00:00<?, ?it/s]

In [16]:
# Gold labels and raw scores returned by trainer.predict.
true_labels = test_results.label_ids
logits = test_results.predictions

# Predicted class = index of the highest score along the last axis.
predictions = np.argmax(logits, axis=-1)

In [17]:
# Positions in the test split where the predicted class differs from the gold label.
is_wrong = predictions != true_labels
misclassified_indices = np.flatnonzero(is_wrong)
print(f"Number of misclassified examples: {len(misclassified_indices)}")

Number of misclassified examples: 811


In [18]:
# Pull the original (untokenized) rows for the misclassified positions.
# tokenized_dataset was mapped from this same filtered `dataset`.
misclassified_raw_examples = dataset['test'].select(indices=misclassified_indices)

In [19]:
# Displays the number of misclassified indices (same count as printed earlier).
len(misclassified_indices)

811

In [21]:
import csv
# Export every misclassified test example to CSV for manual error analysis.
# Columns: position in the test split, the raw sentence pair, and the gold
# and predicted integer class ids.
test_split = dataset['test']  # look the split up once instead of on every row
fieldnames = ['index', 'premise', 'hypothesis', 'true_label', 'predicted_label']

with open('misclassified_examples.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # .tolist() yields plain Python ints, which the dataset accepts as row indices.
    for idx in misclassified_indices.tolist():
        # Fetch the row once; the original indexed the dataset twice per row.
        example = test_split[idx]
        writer.writerow({
            'index': idx,
            'premise': example['premise'],
            'hypothesis': example['hypothesis'],
            # Write plain ints rather than NumPy scalars.
            'true_label': int(true_labels[idx]),
            'predicted_label': int(predictions[idx]),
        })