                                                                NLP ASSIGNMENT - 4

In [1]:
import transformers
import accelerate
import datasets

# Print the installed library versions so the run can be reproduced later.
print(f"Transformers: {transformers.__version__}")
print(f"Accelerate: {accelerate.__version__}")
print(f"Datasets: {datasets.__version__}")



W0907 17:31:06.833000 12544 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Transformers: 4.37.2
Accelerate: 0.27.2
Datasets: 2.19.1


In [None]:
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)
from sklearn.metrics import f1_score, accuracy_score
import random
import os

# Define a custom metric computation function
# The Trainer API requires a function that takes EvalPrediction and returns a dictionary
def compute_metrics(eval_pred):
    """
    Score a batch of model outputs against the reference labels.

    Args:
        eval_pred (EvalPrediction): A pair that unpacks into
            ``(logits, labels)``.

    Returns:
        dict: ``{"f1": ..., "accuracy": ...}`` where F1 is computed with
        ``average='binary'``.
    """
    raw_scores, references = eval_pred

    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)

    return {
        "f1": f1_score(references, predicted_classes, average='binary'),
        "accuracy": accuracy_score(references, predicted_classes),
    }

def tokenize_function(examples, tokenizer):
    """
    Run ``tokenizer`` over the ``"text"`` column of a batch.

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.
        tokenizer: A callable tokenizer; it is invoked with
            ``padding="max_length"`` and ``truncation=True``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

def finetune_and_evaluate(model_name, dataset, tokenizer, training_args, subset_size=None):
    """
    Fine-tunes and evaluates a model on a given dataset.

    After training, the model held by the Trainer is saved to
    ``training_args.output_dir`` so that it can later be reloaded with
    ``from_pretrained(training_args.output_dir)``.

    Args:
        model_name (str): The name of the model from the Hugging Face Hub.
        dataset (DatasetDict): The dataset to use; must provide "train" and
            "test" splits with a "text" column.
        tokenizer (PreTrainedTokenizer): The tokenizer to use.
        training_args (TrainingArguments): The training arguments.
        subset_size (int, optional): The number of training examples to use.
            The evaluation subset is 20% of this size. Both sizes are clamped
            to the available split lengths. If None (or 0), uses the full
            dataset.

    Returns:
        dict: The evaluation metrics reported by ``Trainer.evaluate()``
        (keys are prefixed with ``eval_``, e.g. ``eval_f1``). If the model
        cannot be loaded, ``{"eval_f1": 0.0, "eval_accuracy": 0.0}`` is
        returned so the failure path uses the same key names as the
        success path.
    """
    print(f"\n--- Fine-tuning and evaluating {model_name} ---")

    # Load the model for sequence classification.
    # The broad ``except`` is a deliberate best-effort: a candidate that cannot
    # be loaded is reported and skipped rather than aborting the whole
    # comparison run.
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2,
            ignore_mismatched_sizes=True,
        )
    except Exception as e:
        print(f"Could not load model {model_name}. Skipping. Error: {e}")
        return {"eval_f1": 0.0, "eval_accuracy": 0.0}

    # Prepare the subset if specified
    if subset_size:
        # Clamp so that select() never indexes past the end of a split, and
        # keep at least one evaluation example for very small subsets.
        train_count = min(subset_size, len(dataset["train"]))
        eval_count = min(max(1, int(subset_size * 0.2)), len(dataset["test"]))
        train_dataset = dataset["train"].shuffle(seed=42).select(range(train_count))
        eval_dataset = dataset["test"].shuffle(seed=42).select(range(eval_count))
    else:
        train_dataset = dataset["train"]
        eval_dataset = dataset["test"]

    # Tokenize the datasets
    def _tokenize(batch):
        return tokenize_function(batch, tokenizer)

    tokenized_train_dataset = train_dataset.map(_tokenize, batched=True)
    tokenized_eval_dataset = eval_dataset.map(_tokenize, batched=True)

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    # Persist the trained model at the root of output_dir. Without this only
    # "checkpoint-<step>" subdirectories exist, and loading the model from
    # output_dir itself fails.
    trainer.save_model()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print(f"Evaluation results for {model_name}: {eval_results}")

    return eval_results

def main():
    """
    Run the complete IMDB sentiment-classification experiment.

    Steps:
        1. Load the IMDB dataset.
        2. Fine-tune every candidate model on a 5000-example subset.
        3. Select the candidate with the highest ``eval_f1``.
        4. Fine-tune that candidate on the entire dataset.
        5. Run inference on 10 shuffled test reviews and print the results.

    Raises:
        RuntimeError: If none of the candidate models could be fine-tuned
            on the subset.
    """
    # Imported here because it is only needed to locate a checkpoint for the
    # final inference step.
    from transformers.trainer_utils import get_last_checkpoint

    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Step 1: Load the dataset
    print("Loading IMDB dataset...")
    imdb_dataset = load_dataset("imdb")

    # Step 2: Define models to test and their tokenizers.
    # Each key is the Hub id used for BOTH the model and its tokenizer, so a
    # model can never be paired with a different checkpoint's tokenizer or be
    # looked up under a name that is not a Hub id.
    model_ids = [
        "distilbert-base-uncased",
        "bert-base-uncased",
        "roberta-base",
        "google/electra-small-discriminator",
        "TaylorAI/gte-tiny",
    ]
    models_to_test = {
        model_id: AutoTokenizer.from_pretrained(model_id) for model_id in model_ids
    }

    # Define training arguments for the subset fine-tuning
    subset_train_args = TrainingArguments(
        output_dir="./subset_results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir='./subset_logs',
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # Step 3: Fine-tune and evaluate models on a subset
    subset_results = {}
    subset_size = 5000  # Use a subset of 5000 examples

    for model_name, tokenizer in models_to_test.items():
        results = finetune_and_evaluate(model_name, imdb_dataset, tokenizer, subset_train_args, subset_size)
        subset_results[model_name] = results

    # Only models that were actually evaluated report an "eval_loss"; a model
    # that failed to load must not be selected for the full run.
    trained_results = {
        name: metrics for name, metrics in subset_results.items() if "eval_loss" in metrics
    }
    if not trained_results:
        raise RuntimeError("None of the candidate models could be fine-tuned on the subset.")

    # Find the best model based on F1 score
    best_model_name = max(trained_results, key=lambda k: trained_results[k].get("eval_f1", 0))
    print(f"\n--- Best model from subset fine-tuning is: {best_model_name} with F1 score: {subset_results[best_model_name].get('eval_f1')} ---")

    # Step 4: Fine-tune the best model on the entire dataset
    print(f"\n--- Fine-tuning {best_model_name} on the entire dataset ---")

    # Define training arguments for full dataset fine-tuning
    full_train_args = TrainingArguments(
        output_dir=f"./full_dataset_results_{best_model_name.replace('/', '_')}",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./full_dataset_logs',
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # Load the tokenizer for the best model
    best_model_tokenizer = models_to_test[best_model_name]

    # Perform the final fine-tuning
    finetune_and_evaluate(best_model_name, imdb_dataset, best_model_tokenizer, full_train_args)

    # Step 5: Inference on 10 random samples
    print("\n--- Running inference on 10 random samples from the test set ---")

    # Load the fine-tuned model for inference. The Trainer writes epoch
    # checkpoints into "checkpoint-<step>" subdirectories; if no model was
    # saved at the root of output_dir, fall back to the most recent
    # checkpoint (which is not necessarily the best-scoring one).
    inference_model_path = full_train_args.output_dir
    if not os.path.isfile(os.path.join(inference_model_path, "config.json")):
        last_checkpoint = get_last_checkpoint(inference_model_path)
        if last_checkpoint is not None:
            inference_model_path = last_checkpoint
    final_model = AutoModelForSequenceClassification.from_pretrained(inference_model_path)
    final_tokenizer = best_model_tokenizer

    # Create a text classification pipeline. The default already returns only
    # the top-scoring label, so the deprecated ``return_all_scores`` argument
    # is not passed.
    classifier = pipeline(
        "text-classification",
        model=final_model,
        tokenizer=final_tokenizer,
        device=0 if torch.cuda.is_available() else -1,
    )

    # Get 10 random samples from the test set
    test_samples = imdb_dataset["test"].shuffle(seed=42).select(range(10))

    label_map = {0: "Negative", 1: "Positive"}
    # The model config names its classes generically (e.g. "LABEL_1");
    # label2id converts those names back to the integer class ids.
    label2id = final_model.config.label2id

    for i, sample in enumerate(test_samples):
        review = sample["text"]
        true_label = label_map[sample["label"]]

        # Run inference. Reviews can be longer than the model's maximum
        # sequence length, so they are truncated the same way as in training.
        prediction = classifier(review, truncation=True)[0]
        raw_label = prediction['label']
        # Show the prediction in the same vocabulary as the true label; keep
        # the raw name if it cannot be mapped.
        predicted_label = label_map.get(label2id.get(raw_label), raw_label)
        confidence = prediction['score']

        print(f"\n--- Review {i+1} ---")
        print(f"Review: {review[:200]}...")
        print(f"True Label: {true_label}")
        print(f"Predicted Label: {predicted_label} (Confidence: {confidence:.4f})")

# Run the experiment only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Using device: cpu
Loading IMDB dataset...


Downloading readme: 0.00B [00:00, ?B/s]




--- Fine-tuning and evaluating distilbert-base-uncased ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]



{'loss': 0.3965, 'learning_rate': 3e-05, 'epoch': 0.8}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.28543055057525635, 'eval_f1': 0.8986693961105424, 'eval_accuracy': 0.901, 'eval_runtime': 212.8047, 'eval_samples_per_second': 4.699, 'eval_steps_per_second': 0.587, 'epoch': 1.0}




{'loss': 0.1817, 'learning_rate': 1e-05, 'epoch': 1.6}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.41365498304367065, 'eval_f1': 0.901010101010101, 'eval_accuracy': 0.902, 'eval_runtime': 211.7146, 'eval_samples_per_second': 4.723, 'eval_steps_per_second': 0.59, 'epoch': 2.0}
{'train_runtime': 10117.8051, 'train_samples_per_second': 0.988, 'train_steps_per_second': 0.124, 'train_loss': 0.26440618896484375, 'epoch': 2.0}




  0%|          | 0/125 [00:00<?, ?it/s]

Evaluation results for distilbert-base-uncased: {'eval_loss': 0.28543055057525635, 'eval_f1': 0.8986693961105424, 'eval_accuracy': 0.901, 'eval_runtime': 216.6654, 'eval_samples_per_second': 4.615, 'eval_steps_per_second': 0.577, 'epoch': 2.0}

--- Fine-tuning and evaluating bert-base-uncased ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

