In [1]:
# !pip install -U transformers huggingface_hub

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup,
    set_seed
)
from torch.optim import AdamW  # Changed: Import from torch instead of transformers
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
from tqdm import tqdm
import os
import json
from datetime import datetime

In [3]:
# Set seeds for reproducibility
def set_all_seeds(seed=42):
    set_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

set_all_seeds(42)

2025-11-06 22:22:11.327008: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762467731.349785     335 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762467731.356735     335 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Configuration class
class Config:
    # Model
    model_name = "bert-base-uncased"
    num_labels = 3
    max_length = 256
    
    # Training
    batch_size = 64
    gradient_accumulation_steps = 2
    epochs = 5
    learning_rate = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_grad_norm = 1.0
    
    # Early stopping
    patience = 3
    min_delta = 0.001
    
    # Optimizer
    adam_epsilon = 1e-8
    
    # Dropout
    hidden_dropout_prob = 0.1
    attention_probs_dropout_prob = 0.1
    
    # Paths
    output_dir = './outputs'
    save_dir = './fine_tuned_nli_model'
    
    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    fp16 = torch.cuda.is_available()

config = Config()

print(f"Using device: {config.device}")
print(f"Mixed precision (fp16): {config.fp16}")

Using device: cuda
Mixed precision (fp16): True


In [5]:
# Load ANLI dataset
print("\nLoading ANLI dataset...")
ds = load_dataset("facebook/anli")

# Extract only train_r2, dev_r2, and test_r2
train_data = ds['train_r2']
dev_data = ds['dev_r2']
test_data = ds['test_r2']

print(f"\nDataset sizes:")
print(f"Train (R2): {len(train_data)}")
print(f"Dev (R2): {len(dev_data)}")
print(f"Test (R2): {len(test_data)}")


Loading ANLI dataset...

Dataset sizes:
Train (R2): 45460
Dev (R2): 1000
Test (R2): 1000


In [6]:
# Convert to pandas DataFrames for easier handling
def convert_to_df(dataset):
    return pd.DataFrame({
        'premise': [p.lower().strip() for p in dataset['premise']],
        'hypothesis': [h.lower().strip() for h in dataset['hypothesis']],
        'label': dataset['label']
    })
    
train_df = convert_to_df(train_data)
val_df = convert_to_df(dev_data)
test_df = convert_to_df(test_data)

In [7]:
# Check label distribution
print(f"\nTrain label distribution:")
print(train_df['label'].value_counts().sort_index())
print(f"\nVal label distribution:")
print(val_df['label'].value_counts().sort_index())


Train label distribution:
label
0    14448
1    20959
2    10053
Name: count, dtype: int64

Val label distribution:
label
0    334
1    333
2    333
Name: count, dtype: int64


In [8]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    config.model_name,
    num_labels=config.num_labels,
    hidden_dropout_prob=config.hidden_dropout_prob,
    attention_probs_dropout_prob=config.attention_probs_dropout_prob
)
model.to(config.device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


Total parameters: 109,484,547
Trainable parameters: 109,484,547


In [10]:
# Custom Dataset
class NLIDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        premise = str(self.data.loc[idx, 'premise']).strip()
        hypothesis = str(self.data.loc[idx, 'hypothesis']).strip()
        label = int(self.data.loc[idx, 'label'])
        
        encoding = self.tokenizer(
            premise,
            hypothesis,
            max_length=self.max_length,
            padding='max_length',
            truncation='longest_first',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
# Create datasets
train_dataset = NLIDataset(train_df, tokenizer, config.max_length)
val_dataset = NLIDataset(val_df, tokenizer, config.max_length)
test_dataset = NLIDataset(test_df, tokenizer, config.max_length)

In [12]:
# DataLoaders
train_loader = DataLoader(
    train_dataset, 
    batch_size=config.batch_size, 
    shuffle=True,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)
val_loader = DataLoader(
    val_dataset, 
    batch_size=config.batch_size,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)
test_loader = DataLoader(
    test_dataset, 
    batch_size=config.batch_size,
    num_workers=4,
    pin_memory=True if config.device.type == 'cuda' else False
)

In [13]:
# Optimizer with weight decay
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': config.weight_decay
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }
]

In [14]:
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=config.learning_rate,
    eps=config.adam_epsilon
)

In [15]:
# Scheduler
total_steps = len(train_loader) * config.epochs // config.gradient_accumulation_steps
warmup_steps = int(total_steps * config.warmup_ratio)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

In [16]:
# Mixed precision scaler
scaler = torch.cuda.amp.GradScaler() if config.fp16 else None

  scaler = torch.cuda.amp.GradScaler() if config.fp16 else None


In [17]:
# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device, scaler=None):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    optimizer.zero_grad()
    
    progress_bar = tqdm(dataloader, desc="Training")
    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        if scaler:
            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / config.gradient_accumulation_steps
            
            scaler.scale(loss).backward()
            
            if (step + 1) % config.gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / config.gradient_accumulation_steps
            loss.backward()
            
            if (step + 1) % config.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        
        total_loss += loss.item() * config.gradient_accumulation_steps
        
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())
        
        progress_bar.set_postfix({
            'loss': loss.item() * config.gradient_accumulation_steps,
            'lr': scheduler.get_last_lr()[0]
        })
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    
    return avg_loss, accuracy, f1_macro

In [18]:
# Validation function
def validate(model, dataloader, device):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            if config.fp16:
                with torch.cuda.amp.autocast():
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
            else:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
            
            loss = outputs.loss
            logits = outputs.logits
            
            total_loss += loss.item()
            
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)
    f1_macro = f1_score(true_labels, predictions, average='macro')
    f1_weighted = f1_score(true_labels, predictions, average='weighted')
    
    return avg_loss, accuracy, f1_macro, f1_weighted, predictions, true_labels

In [19]:
# Early stopping
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        
    def __call__(self, val_score):
        if self.best_score is None:
            self.best_score = val_score
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.counter = 0

early_stopping = EarlyStopping(patience=config.patience, min_delta=config.min_delta)

In [20]:
# Training loop
best_val_f1 = 0
training_history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [],
    'val_loss': [], 'val_acc': [], 'val_f1': []
}

os.makedirs(config.output_dir, exist_ok=True)

for epoch in range(config.epochs):
    print(f"\n{'='*70}")
    print(f"Epoch {epoch + 1}/{config.epochs}")
    print(f"{'='*70}")
    
    # Train
    train_loss, train_acc, train_f1 = train_epoch(
        model, train_loader, optimizer, scheduler, config.device, scaler
    )
    print(f"Train - Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, F1: {train_f1:.4f}")
    
    # Validate
    val_loss, val_acc, val_f1_macro, val_f1_weighted, val_preds, val_labels = validate(
        model, val_loader, config.device
    )
    print(f"Val   - Loss: {val_loss:.4f}, Accuracy: {val_acc:.4f}")
    print(f"Val F1 (Macro): {val_f1_macro:.4f}, F1 (Weighted): {val_f1_weighted:.4f}")
    
    # Save metrics
    training_history['train_loss'].append(train_loss)
    training_history['train_acc'].append(train_acc)
    training_history['train_f1'].append(train_f1)
    training_history['val_loss'].append(val_loss)
    training_history['val_acc'].append(val_acc)
    training_history['val_f1'].append(val_f1_macro)
    
    # Save best model
    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_f1': val_f1_macro,
            'val_acc': val_acc,
        }, os.path.join(config.output_dir, 'best_model.pt'))
        print(f"Best model saved with F1: {val_f1_macro:.4f}")
    
    # Early stopping
    early_stopping(val_f1_macro)
    if early_stopping.early_stop:
        print(f"\nEarly stopping triggered after epoch {epoch + 1}")
        break



Epoch 1/5


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 711/711 [07:58<00:00,  1.49it/s, loss=0.621, lr=1.78e-5]


Train - Loss: 0.8723, Accuracy: 0.5836, F1: 0.5052


  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:03<00:00,  4.04it/s]


Val   - Loss: 1.2974, Accuracy: 0.4130
Val F1 (Macro): 0.3962, F1 (Weighted): 0.3963
Best model saved with F1: 0.3962

Epoch 2/5


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 711/711 [07:57<00:00,  1.49it/s, loss=0.566, lr=1.33e-5]


Train - Loss: 0.5531, Accuracy: 0.7758, F1: 0.7449


  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:04<00:00,  3.99it/s]


Val   - Loss: 1.3550, Accuracy: 0.4360
Val F1 (Macro): 0.4266, F1 (Weighted): 0.4266
Best model saved with F1: 0.4266

Epoch 3/5


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 711/711 [07:57<00:00,  1.49it/s, loss=0.247, lr=8.9e-6] 


Train - Loss: 0.4110, Accuracy: 0.8397, F1: 0.8172


  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:03<00:00,  4.07it/s]


Val   - Loss: 1.4944, Accuracy: 0.4410
Val F1 (Macro): 0.4387, F1 (Weighted): 0.4387
Best model saved with F1: 0.4387

Epoch 4/5


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 711/711 [07:57<00:00,  1.49it/s, loss=0.534, lr=4.46e-6]


Train - Loss: 0.3231, Accuracy: 0.8797, F1: 0.8622


  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:03<00:00,  4.02it/s]


Val   - Loss: 1.6463, Accuracy: 0.4330
Val F1 (Macro): 0.4285, F1 (Weighted): 0.4285

Epoch 5/5


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 711/711 [07:56<00:00,  1.49it/s, loss=0.0637, lr=2.5e-8]


Train - Loss: 0.2683, Accuracy: 0.9031, F1: 0.8885


  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:04<00:00,  3.99it/s]

Val   - Loss: 1.7620, Accuracy: 0.4370
Val F1 (Macro): 0.4325, F1 (Weighted): 0.4325





In [27]:
checkpoint_path = os.path.join(config.output_dir, "best_model.pt")

# Load full checkpoint (not just weights)
checkpoint = torch.load(checkpoint_path, weights_only=False)  # <-- allow full loading
model.load_state_dict(checkpoint['model_state_dict'])
print(f"\nLoaded best model from epoch {checkpoint['epoch'] + 1}")


Loaded best model from epoch 3


In [30]:
# Final evaluation on test set
test_loss, test_acc, test_f1_macro, test_f1_weighted, test_preds, test_labels = validate(
    model, test_loader, config.device
)

print("\n" + "="*70)
print("FINAL TEST SET EVALUATION (R2)")
print("="*70)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 (Macro): {test_f1_macro:.4f}")
print(f"Test F1 (Weighted): {test_f1_weighted:.4f}")

print("\nClassification Report:")
print(classification_report(
    test_labels, 
    test_preds, 
    target_names=['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION'],
    digits=4
))

  with torch.cuda.amp.autocast():
Validating: 100%|██████████| 16/16 [00:04<00:00,  3.65it/s]



FINAL TEST SET EVALUATION (R2)
Test Loss: 1.5675
Test Accuracy: 0.4310
Test F1 (Macro): 0.4270
Test F1 (Weighted): 0.4271

Classification Report:
               precision    recall  f1-score   support

   ENTAILMENT     0.4108    0.5240    0.4605       334
      NEUTRAL     0.4571    0.4474    0.4522       333
CONTRADICTION     0.4315    0.3213    0.3683       333

     accuracy                         0.4310      1000
    macro avg     0.4331    0.4309    0.4270      1000
 weighted avg     0.4331    0.4310    0.4271      1000



In [31]:
print("\nConfusion Matrix:")
cm = confusion_matrix(test_labels, test_preds)
print(cm)


Confusion Matrix:
[[175  89  70]
 [113 149  71]
 [138  88 107]]


In [32]:
# Per-class accuracy
for i, label_name in enumerate(['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']):
    class_acc = cm[i, i] / cm[i].sum() if cm[i].sum() > 0 else 0
    print(f"{label_name} Accuracy: {class_acc:.4f}")

ENTAILMENT Accuracy: 0.5240
NEUTRAL Accuracy: 0.4474
CONTRADICTION Accuracy: 0.3213


In [33]:
# Save final model and tokenizer
os.makedirs(config.save_dir, exist_ok=True)
model.save_pretrained(config.save_dir)
tokenizer.save_pretrained(config.save_dir)

('./fine_tuned_nli_model/tokenizer_config.json',
 './fine_tuned_nli_model/special_tokens_map.json',
 './fine_tuned_nli_model/vocab.txt',
 './fine_tuned_nli_model/added_tokens.json',
 './fine_tuned_nli_model/tokenizer.json')

In [34]:
# Save training history
with open(os.path.join(config.save_dir, 'training_history.json'), 'w') as f:
    json.dump(training_history, f, indent=2)

# Save configuration
with open(os.path.join(config.save_dir, 'config.json'), 'w') as f:
    json.dump(vars(config), f, indent=2, default=str)

print(f"\nModel saved to {config.save_dir}")


Model saved to ./fine_tuned_nli_model


In [35]:
# Enhanced inference function
def predict_nli(premise, hypothesis, model, tokenizer, device, return_probs=False):
    """
    Predict NLI label with optional probability scores
    """
    model.eval()
    
    encoding = tokenizer(
        premise,
        hypothesis,
        max_length=config.max_length,
        padding='max_length',
        truncation='longest_first',
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
        prediction = torch.argmax(logits, dim=1).item()
    
    labels = {0: "ENTAILMENT", 1: "NEUTRAL", 2: "CONTRADICTION"}
    
    if return_probs:
        prob_dict = {labels[i]: float(probs[i]) for i in range(3)}
        return labels[prediction], prob_dict
    
    return labels[prediction]

In [36]:
# Example predictions
examples = [
    ("A person is riding a bike.", "Someone is cycling."),
    ("The sky is blue.", "It is raining."),
    ("A dog is running in the park.", "An animal is outside.")
]

print("\n" + "="*70)
print("EXAMPLE PREDICTIONS")
print("="*70)
for premise, hypothesis in examples:
    prediction, probs = predict_nli(premise, hypothesis, model, tokenizer, config.device, return_probs=True)
    print(f"\nPremise: {premise}")
    print(f"Hypothesis: {hypothesis}")
    print(f"Prediction: {prediction}")
    print(f"Confidence scores: {probs}")

print("\n" + "="*70)
print("TRAINING COMPLETE")
print("="*70)
print(f"Best Validation F1: {best_val_f1:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 (Macro): {test_f1_macro:.4f}")


EXAMPLE PREDICTIONS

Premise: A person is riding a bike.
Hypothesis: Someone is cycling.
Prediction: ENTAILMENT
Confidence scores: {'ENTAILMENT': 0.8905145525932312, 'NEUTRAL': 0.045931700617074966, 'CONTRADICTION': 0.06355368345975876}

Premise: The sky is blue.
Hypothesis: It is raining.
Prediction: NEUTRAL
Confidence scores: {'ENTAILMENT': 0.05453810095787048, 'NEUTRAL': 0.8963657021522522, 'CONTRADICTION': 0.049096208065748215}

Premise: A dog is running in the park.
Hypothesis: An animal is outside.
Prediction: ENTAILMENT
Confidence scores: {'ENTAILMENT': 0.5328313112258911, 'NEUTRAL': 0.3241102993488312, 'CONTRADICTION': 0.14305846393108368}

TRAINING COMPLETE
Best Validation F1: 0.4387
Test Accuracy: 0.4310
Test F1 (Macro): 0.4270
