In [1]:
!pip install -U transformers huggingface_hub

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface_hub
  Downloading huggingface_hub-1.1.2-py3-none-any.whl.metadata (13 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinu

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup,
    set_seed
)
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from tqdm import tqdm
import os
import json

In [3]:
def set_all_seeds(seed=42):
    set_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

set_all_seeds(42)

2025-11-07 04:59:40.664949: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762491580.860505      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762491580.914386      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
class Config:
    model_name = "bert-large-uncased"
    num_labels = 3
    max_length = 512  # Increased to accommodate few-shot examples
    
    batch_size = 16  # Reduced due to longer sequences
    gradient_accumulation_steps = 4  # Increased to maintain effective batch size
    epochs = 5
    learning_rate = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_grad_norm = 1.0
    
    patience = 3
    min_delta = 0.001
    adam_epsilon = 1e-8
    
    hidden_dropout_prob = 0.1
    attention_probs_dropout_prob = 0.1
    
    output_dir = './outputs_fewshot_cot'
    save_dir = './fine_tuned_nli_fewshot_cot'
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fp16 = torch.cuda.is_available()

In [5]:
config = Config()

print(f"Using device: {config.device}")
print(f"Mixed precision (fp16): {config.fp16}")

Using device: cuda
Mixed precision (fp16): True


In [6]:
# Few-shot CoT prompt template
FEW_SHOT_COT_PROMPT = """Task: Natural Language Inference - Determine if a hypothesis is entailed by, neutral to, or contradicts a premise.

Example 1:
Premise: A man in a blue shirt is playing guitar on stage.
Hypothesis: A musician is performing.
Reasoning: The premise states a man is playing guitar on stage. Playing an instrument on stage indicates performing, and someone who plays guitar is a musician. Therefore, the hypothesis is supported by the premise.
Label: ENTAILMENT

Example 2:
Premise: Children are playing soccer in a park.
Hypothesis: The children are indoors.
Reasoning: The premise explicitly states the children are in a park, which is an outdoor location. The hypothesis claims they are indoors. These are contradictory locations.
Label: CONTRADICTION

Example 3:
Premise: A woman is reading a book at a table.
Hypothesis: The woman is reading a mystery novel.
Reasoning: The premise confirms a woman is reading a book, but it doesn't specify what type of book. It could be a mystery novel, but it could also be any other genre. We cannot determine this from the premise alone.
Label: NEUTRAL

Now analyze:
Premise: {premise}"""

In [7]:
def create_prompted_input(premise, hypothesis):
    """Create few-shot CoT prompted input"""
    prompted_premise = FEW_SHOT_COT_PROMPT.format(premise=premise)
    return prompted_premise, hypothesis

In [8]:
print("\nLoading ANLI dataset...")
ds = load_dataset("facebook/anli")

train_data = ds['train_r2']
dev_data = ds['dev_r2']
test_data = ds['test_r2']

print(f"\nDataset sizes:")
print(f"Train (R2): {len(train_data)}")
print(f"Dev (R2): {len(dev_data)}")
print(f"Test (R2): {len(test_data)}")


Loading ANLI dataset...


README.md: 0.00B [00:00, ?B/s]

plain_text/train_r1-00000-of-00001.parqu(…):   0%|          | 0.00/3.14M [00:00<?, ?B/s]

plain_text/dev_r1-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

plain_text/test_r1-00000-of-00001.parque(…):   0%|          | 0.00/353k [00:00<?, ?B/s]

plain_text/train_r2-00000-of-00001.parqu(…):   0%|          | 0.00/6.53M [00:00<?, ?B/s]

plain_text/dev_r2-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

plain_text/test_r2-00000-of-00001.parque(…):   0%|          | 0.00/362k [00:00<?, ?B/s]

plain_text/train_r3-00000-of-00001.parqu(…):   0%|          | 0.00/14.3M [00:00<?, ?B/s]

plain_text/dev_r3-00000-of-00001.parquet:   0%|          | 0.00/434k [00:00<?, ?B/s]

plain_text/test_r3-00000-of-00001.parque(…):   0%|          | 0.00/435k [00:00<?, ?B/s]

Generating train_r1 split:   0%|          | 0/16946 [00:00<?, ? examples/s]

Generating dev_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r2 split:   0%|          | 0/45460 [00:00<?, ? examples/s]

Generating dev_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r3 split:   0%|          | 0/100459 [00:00<?, ? examples/s]

Generating dev_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating test_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]


Dataset sizes:
Train (R2): 45460
Dev (R2): 1000
Test (R2): 1000


In [9]:
def convert_to_df(dataset):
    return pd.DataFrame({
        'premise': [p.lower().strip() for p in dataset['premise']],
        'hypothesis': [h.lower().strip() for h in dataset['hypothesis']],
        'label': dataset['label']
    })

In [10]:
train_df = convert_to_df(train_data)
val_df = convert_to_df(dev_data)
test_df = convert_to_df(test_data)

print(f"\nTrain label distribution:")
print(train_df['label'].value_counts().sort_index())


Train label distribution:
label
0    14448
1    20959
2    10053
Name: count, dtype: int64


In [11]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=config.num_labels,
    hidden_dropout_prob=config.hidden_dropout_prob,
    attention_probs_dropout_prob=config.attention_probs_dropout_prob
)
model.to(config.device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [12]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


Total parameters: 335,144,963
Trainable parameters: 335,144,963


In [13]:
class NLIDatasetWithCoT(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        premise = str(self.data.loc[idx, 'premise']).strip()
        hypothesis = str(self.data.loc[idx, 'hypothesis']).strip()
        label = int(self.data.loc[idx, 'label'])
        
        prompted_premise, hypothesis = create_prompted_input(premise, hypothesis)
        
        encoding = self.tokenizer(
            prompted_premise,
            hypothesis,
            max_length=self.max_length,
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [14]:
train_dataset = NLIDatasetWithCoT(train_df, tokenizer, config.max_length)
val_dataset = NLIDatasetWithCoT(val_df, tokenizer, config.max_length)
test_dataset = NLIDatasetWithCoT(test_df, tokenizer, config.max_length)

print("\nExample of prompted input:")
sample = train_dataset[0]
decoded = tokenizer.decode(sample['input_ids'], skip_special_tokens=False)
print(f"\nTokenized length: {sample['attention_mask'].sum().item()} tokens")
print(f"\nFirst 500 chars of input:\n{decoded[:500]}...")


Example of prompted input:

Tokenized length: 308 tokens

First 500 chars of input:
[CLS] task : natural language inference - determine if a hypothesis is entailed by, neutral to, or contradicts a premise. example 1 : premise : a man in a blue shirt is playing guitar on stage. hypothesis : a musician is performing. reasoning : the premise states a man is playing guitar on stage. playing an instrument on stage indicates performing, and someone who plays guitar is a musician. therefore, the hypothesis is supported by the premise. label : entailment example 2 : premise : children ...


In [15]:
train_loader = DataLoader(
    train_dataset, 
    batch_size=config.batch_size, 
    shuffle=True,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=config.batch_size,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=config.batch_size,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)

In [16]:
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': config.weight_decay
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=config.learning_rate,
    eps=config.adam_epsilon
)

total_steps = len(train_loader) * config.epochs // config.gradient_accumulation_steps
warmup_steps = int(total_steps * config.warmup_ratio)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

scaler = torch.amp.GradScaler('cuda') if config.fp16 else None

In [17]:
def train_epoch(model, dataloader, optimizer, scheduler, device, scaler=None):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc="Training")
    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        if scaler:
            with torch.amp.autocast('cuda'):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / config.gradient_accumulation_steps
            
            scaler.scale(loss).backward()
            
            if (step + 1) % config.gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / config.gradient_accumulation_steps
            loss.backward()
            
            if (step + 1) % config.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        
        total_loss += loss.item() * config.gradient_accumulation_steps
        
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({
            'loss': loss.item() * config.gradient_accumulation_steps,
            'lr': scheduler.get_last_lr()[0]
        })
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    
    return avg_loss, accuracy, f1_macro

In [18]:
def validate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            if config.fp16:
                with torch.amp.autocast('cuda'):
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
            else:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            
            loss = outputs.loss
            logits = outputs.logits
            
            total_loss += loss.item()
            
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    f1_weighted = f1_score(true_labels, predictions, average='weighted')
    
    return avg_loss, accuracy, f1_macro, f1_weighted, predictions, true_labels

In [19]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        
    def __call__(self, val_score):
        if self.best_score is None:
            self.best_score = val_score
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.counter = 0

early_stopping = EarlyStopping(patience=config.patience, min_delta=config.min_delta)

In [20]:
best_val_f1 = 0
training_history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': []
}

os.makedirs(config.output_dir, exist_ok=True)

for epoch in range(config.epochs):
    print(f"\n{'='*70}")
    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"{'='*70}")
    
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, optimizer, scheduler, config.device, scaler
    )
    print(f"Train - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")
    
    val_loss, val_acc, val_f1_macro, val_f1_weighted, val_preds, val_labels = validate(
        model, val_loader, config.device
    )
    print(f"Val   - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    print(f"Val F1 (Macro): {val_f1_macro:.4f}, F1 (Weighted): {val_f1_weighted:.4f}")
    
    training_history['train_loss'].append(train_loss)
    training_history['train_acc'].append(train_acc)
    training_history['train_f1'].append(train_f1)
    training_history['val_loss'].append(val_loss)
    training_history['val_acc'].append(val_acc)
    training_history['val_f1'].append(val_f1_macro)
    
    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_f1': val_f1_macro,
            'val_acc': val_acc,
        }, os.path.join(config.output_dir, 'best_model.pt'))
        print(f"Best model saved with F1: {val_f1_macro:.4f}")
    
    early_stopping(val_f1_macro)
    if early_stopping.early_stop:
        print(f"\nEarly stopping triggered after epoch {epoch + 1}")
        break


Epoch 1/5


Training:   0%|          | 0/2842 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Train - Loss: 0.7879, Accuracy: 0.6271, F1: 0.5732


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Val   - Loss: 1.3780, Accuracy: 0.4090
Val F1 (Macro): 0.3920, F1 (Weighted): 0.3920
Best model saved with F1: 0.3920

Epoch 2/5


Training:   0%|          | 0/2842 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Train - Loss: 0.4148, Accuracy: 0.8389, F1: 0.8178


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Val   - Loss: 1.5179, Accuracy: 0.4410
Val F1 (Macro): 0.4301, F1 (Weighted): 0.4301
Best model saved with F1: 0.4301

Epoch 3/5


Training:   0%|          | 0/2842 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Train - Loss: 0.2344, Accuracy: 0.9155, F1: 0.9032


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Val   - Loss: 1.7508, Accuracy: 0.4630
Val F1 (Macro): 0.4608, F1 (Weighted): 0.4608
Best model saved with F1: 0.4608

Epoch 4/5


Training:   0%|          | 0/2842 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Train - Loss: 0.1287, Accuracy: 0.9556, F1: 0.9488


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Val   - Loss: 2.2201, Accuracy: 0.4630
Val F1 (Macro): 0.4622, F1 (Weighted): 0.4622
Best model saved with F1: 0.4622

Epoch 5/5


Training:   0%|          | 0/2842 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Train - Loss: 0.0742, Accuracy: 0.9755, F1: 0.9716


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.

Val   - Loss: 2.6975, Accuracy: 0.4630
Val F1 (Macro): 0.4593, F1 (Weighted): 0.4593





In [21]:
checkpoint = torch.load(os.path.join(config.output_dir, "best_model.pt"), weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"\nLoaded best model from epoch {checkpoint['epoch'] + 1}")

test_loss, test_acc, test_f1_macro, test_f1_weighted, test_preds, test_labels = validate(
    model, test_loader, config.device
)

print("\n" + "="*70)
print("FINAL TEST SET EVALUATION (with Few-Shot CoT)")
print("="*70)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 (Macro): {test_f1_macro:.4f}")
print(f"Test F1 (Weighted): {test_f1_weighted:.4f}")


Loaded best model from epoch 4


Validating:   0%|          | 0/63 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used.


FINAL TEST SET EVALUATION (with Few-Shot CoT)
Test Loss: 2.3098
Test Accuracy: 0.4540
Test F1 (Macro): 0.4495
Test F1 (Weighted): 0.4496





In [22]:
print("\nClassification Report:")
print(classification_report(
    test_labels, 
    test_preds, 
    target_names=['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION'],
    digits=4
))

print("\nConfusion Matrix:")
cm = confusion_matrix(test_labels, test_preds)
print(cm)

for i, label_name in enumerate(['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']):
    class_acc = cm[i, i] / cm[i].sum() if cm[i].sum() > 0 else 0
    print(f"{label_name} Accuracy: {class_acc:.4f}")

os.makedirs(config.save_dir, exist_ok=True)
model.save_pretrained(config.save_dir)
tokenizer.save_pretrained(config.save_dir)


Classification Report:
               precision    recall  f1-score   support

   ENTAILMENT     0.4437    0.5659    0.4974       334
      NEUTRAL     0.4951    0.4595    0.4766       333
CONTRADICTION     0.4226    0.3363    0.3746       333

     accuracy                         0.4540      1000
    macro avg     0.4538    0.4539    0.4495      1000
 weighted avg     0.4538    0.4540    0.4496      1000


Confusion Matrix:
[[189  79  66]
 [ 93 153  87]
 [144  77 112]]
ENTAILMENT Accuracy: 0.5659
NEUTRAL Accuracy: 0.4595
CONTRADICTION Accuracy: 0.3363


('./fine_tuned_nli_fewshot_cot/tokenizer_config.json',
 './fine_tuned_nli_fewshot_cot/special_tokens_map.json',
 './fine_tuned_nli_fewshot_cot/vocab.txt',
 './fine_tuned_nli_fewshot_cot/added_tokens.json',
 './fine_tuned_nli_fewshot_cot/tokenizer.json')

In [23]:
with open(os.path.join(config.save_dir, 'training_history.json'), 'w') as f:
    json.dump(training_history, f, indent=2)

with open(os.path.join(config.save_dir, 'few_shot_prompt.txt'), 'w') as f:
    f.write(FEW_SHOT_COT_PROMPT)

print(f"\nModel saved to {config.save_dir}")


Model saved to ./fine_tuned_nli_fewshot_cot


In [24]:
def predict_nli(premise, hypothesis, model, tokenizer, device, return_probs=False):
    model.eval()
    
    prompted_premise, hypothesis = create_prompted_input(premise, hypothesis)
    
    encoding = tokenizer(
        prompted_premise,
        hypothesis,
        max_length=config.max_length,
        padding='max_length',
        truncation='longest_first',
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
        prediction = torch.argmax(logits, dim=1).item()
    
    labels = {0: "ENTAILMENT", 1: "NEUTRAL", 2: "CONTRADICTION"}
    
    if return_probs:
        prob_dict = {labels[i]: float(probs[i]) for i in range(3)}
        return labels[prediction], prob_dict
    
    return labels[prediction]

examples = [
    ("A person is riding a bike.", "Someone is cycling."),
    ("The sky is blue.", "It is raining."),
    ("A dog is running in the park.", "An animal is outside.")
]

print("\n" + "="*70)
print("EXAMPLE PREDICTIONS (with Few-Shot CoT)")
print("="*70)
for premise, hypothesis in examples:
    prediction, probs = predict_nli(premise, hypothesis, model, tokenizer, config.device, return_probs=True)
    print(f"\nPremise: {premise}")
    print(f"Hypothesis: {hypothesis}")
    print(f"Prediction: {prediction}")
    print(f"Confidence scores: {probs}")


EXAMPLE PREDICTIONS (with Few-Shot CoT)

Premise: A person is riding a bike.
Hypothesis: Someone is cycling.
Prediction: CONTRADICTION
Confidence scores: {'ENTAILMENT': 0.04756396636366844, 'NEUTRAL': 0.06240096688270569, 'CONTRADICTION': 0.8900350332260132}

Premise: The sky is blue.
Hypothesis: It is raining.
Prediction: NEUTRAL
Confidence scores: {'ENTAILMENT': 0.0023270025849342346, 'NEUTRAL': 0.9770240187644958, 'CONTRADICTION': 0.02064901776611805}

Premise: A dog is running in the park.
Hypothesis: An animal is outside.
Prediction: ENTAILMENT
Confidence scores: {'ENTAILMENT': 0.9734551906585693, 'NEUTRAL': 0.0024965349584817886, 'CONTRADICTION': 0.024048173800110817}


In [25]:
print("\n" + "="*70)
print("TRAINING COMPLETE")
print("="*70)
print(f"Best Validation F1: {best_val_f1:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 (Macro): {test_f1_macro:.4f}")


TRAINING COMPLETE
Best Validation F1: 0.4622
Test Accuracy: 0.4540
Test F1 (Macro): 0.4495
