# DiffusionBERT Model Evaluation

This notebook provides a comprehensive evaluation of the DiffusionBERT model.

## 0.1 Mount Google Drive

In [1]:
from google.colab import drive

# Unmount (if previously mounted) and then mount the Drive
drive.mount('/content/drive')

# Set working directory to your Drive folder
%cd /content/drive/MyDrive/DiffusionBERT   # Update this path if necessary

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: '/content/drive/MyDrive/DiffusionBERT # Update this path if necessary'
/content


## 0.2 Clone Repository and Install Dependencies

In [2]:
# Clone the repository if it doesn't exist
!git clone https://github.com/KfirCohen-PyLab/Diffusion-BERT.git
%cd Diffusion-BERT

# Install compatible versions
!pip install -q transformers fastNLP nltk
!pip install -q torch==2.0.1
!pip install -q numpy==1.24.3
!pip install -q pandas==2.2.2
!pip install -q matplotlib==3.7.1
!pip install -q seaborn==0.12.2
!pip install -q tqdm==4.65.0
!pip install -q tensorboard==2.13.0

# Reinstall the exact same NumPy version to prevent dependency breakage
!pip install --upgrade --force-reinstall numpy==1.26


# Verify installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")

Cloning into 'Diffusion-BERT'...
remote: Enumerating objects: 7129, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 7129 (delta 49), reused 58 (delta 42), pack-reused 7057 (from 3)[K
Receiving objects: 100% (7129/7129), 48.41 MiB | 16.84 MiB/s, done.
Resolving deltas: 100% (511/511), done.
Updating files: 100% (6462/6462), done.
/content/Diffusion-BERT/Diffusion-BERT
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m126.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.4.0 requires rich<14,>=12.4.4, but you have rich 11.2.0 which is incompatible.
pymc 5.22.0 requires numpy>=1.25.0, but you have numpy 1.24.3 which is incompatible.
pymc 5.22.0 requires rich>=13.7.1, but you have rich 11.2.0 which is in

## 0.3 Setup File Structure

In [3]:
import shutil
import os

# Create necessary directories
!mkdir -p word_freq
!mkdir -p checkpoints
!mkdir -p evaluation_results

# Copy files from Drive
drive_paths = {
    'checkpoint': '/content/drive/MyDrive/DiffusionBERT/diffusion_bert_lm1b_final.pt',
    'word_freq': '/content/drive/MyDrive/DiffusionBERT/word_freq.pt',
    'config': '/content/drive/MyDrive/DiffusionBERT/word_freq.json'
}

local_paths = {
    'checkpoint': 'diffusion_bert_lm1b_final.pt',
    'word_freq': 'word_freq.pt',
    'config': 'word_freq.json',
}

for key in drive_paths:
    if os.path.exists(drive_paths[key]):
        shutil.copy2(drive_paths[key], local_paths[key])
        print(f"Copied {key} file successfully")
    else:
        print(f"Warning: {key} file not found in Drive")

Copied checkpoint file successfully
Copied word_freq file successfully
Copied config file successfully


## 1. Setup and Imports

In [4]:
# -- Imports --
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import json
import logging
from datetime import datetime
from tqdm.notebook import tqdm

try:
    from transformers import AutoConfig, AutoModel, AutoTokenizer
except ImportError:
    !pip install transformers
    from transformers import AutoConfig, AutoModel, AutoTokenizer

# If your custom model is in a local file, make sure the path is correct
# e.g., /content/drive/MyDrive/DiffusionBERT/models/modeling_diffusion_bert.py
try:
    from models.modeling_diffusion_bert import DiffusionBertForMaskedLM
except ImportError as e:
    raise ImportError(
        "Could not import DiffusionBertForMaskedLM. Make sure your models/ directory "
        "is in the current working directory and contains modeling_diffusion_bert.py."
    ) from e

# -- Logging setup --
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -- Set seeds --
torch.manual_seed(42)
np.random.seed(42)

# -- Device setup --
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.0
    Uninstalling numpy-1.26.0:
      Successfully uninstalled numpy-1.26.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.18.0 requires tensorboard<2.19,>=2.18, but you have tensorboard 2.13.0 which is incompatible.
bigframes 2.4.0 requires rich<14,>=12.4.4, but you have rich 11.2.0 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.0.1 which is incompatible.
numba 0.60.0 requires numpy

  if dtype.type == np.bool:


Using device: cuda


## 2. Configuration

In [11]:
config = {
    'model_name': 'bert-base-uncased',
    'model_checkpoint_path': 'diffusion_bert_lm1b_final.pt',
    'word_freq_path': 'word_freq.pt',
    'output_dir': 'evaluation_results',
    'max_position_embeddings': 512,
    'max_seq_length': 128,
    'batch_size': 32,
    'num_samples': 1000,
    'temperature': 1.0,
    'top_k': 50,
    'top_p': 0.9
}

## 3. Model Loading

In [12]:
def load_model_and_tokenizer(config):
    try:
        # Register custom model
        AutoConfig.register("diffusion-bert", DiffusionBertForMaskedLM)
        AutoModel.register(DiffusionBertForMaskedLM, "diffusion-bert")

        # Load tokenizer and config
        tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
        model_config = AutoConfig.from_pretrained(config['model_name'])
        model_config.max_position_embeddings = config['max_position_embeddings']

        # Initialize and load model
        model = DiffusionBertForMaskedLM(model_config)
        checkpoint = torch.load(config['model_checkpoint_path'], map_location='cpu')
        model.load_state_dict(checkpoint['model'])

        # Move to device
        model = model.to(device)
        model.eval()

        return model, tokenizer

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise

model, tokenizer = load_model_and_tokenizer(config)
print("Model loaded successfully")

ERROR:__main__:Error loading model: 'model'


KeyError: 'model'

## 4. Load Word Frequencies

In [None]:
def load_word_frequencies(config):
    try:
        word_freq = torch.load(config['word_freq_path'])
        word_freq = word_freq + 1  # Add smoothing
        word_freq = word_freq.log()
        word_freq = word_freq / word_freq.max()
        return word_freq.to(device)
    except Exception as e:
        logger.error(f"Error loading word frequencies: {str(e)}")
        raise

word_freq = load_word_frequencies(config)
print(f"Word frequencies loaded with shape: {word_freq.shape}")

## 5. Evaluation Functions

In [None]:
def evaluate_model(model, tokenizer, word_freq, config):
    results = {
        'perplexities': [],
        'word_freq_scores': [],
        'samples': []
    }

    with torch.no_grad():
        for _ in tqdm(range(config['num_samples']), desc="Evaluating"):
            # Generate sample
            input_ids = torch.randint(100, 1000, (1, config['max_seq_length'])).to(device)
            attention_mask = torch.ones_like(input_ids)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=config['max_seq_length'],
                temperature=config['temperature'],
                top_k=config['top_k'],
                top_p=config['top_p'],
                do_sample=True
            )

            # Compute metrics
            loss = model(input_ids=outputs, labels=outputs).loss
            perplexity = torch.exp(loss).item()
            word_freq_score = word_freq.gather(0, outputs.view(-1)).mean().item()

            # Store results
            results['perplexities'].append(perplexity)
            results['word_freq_scores'].append(word_freq_score)
            results['samples'].append({
                'input': tokenizer.decode(input_ids[0]),
                'generated': tokenizer.decode(outputs[0]),
                'perplexity': perplexity,
                'word_freq_score': word_freq_score
            })

    return results

## 6. Run Evaluation

In [6]:
print("Starting evaluation...")
results = evaluate_model(model, tokenizer, word_freq, config)

# Compute metrics
metrics = {
    'avg_perplexity': np.mean(results['perplexities']),
    'std_perplexity': np.std(results['perplexities']),
    'avg_word_freq_score': np.mean(results['word_freq_scores']),
    'std_word_freq_score': np.std(results['word_freq_scores'])
}

results['metrics'] = metrics
print("\nEvaluation Results:")
print(f"Average Perplexity: {metrics['avg_perplexity']:.2f} ± {metrics['std_perplexity']:.2f}")
print(f"Word Freq Score: {metrics['avg_word_freq_score']:.4f} ± {metrics['std_word_freq_score']:.4f}")

Starting evaluation...


NameError: name 'evaluate_model' is not defined

## 7. Save Results

In [None]:
# Save results
output_dir = Path(config['output_dir'])
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
output_file = output_dir / f"eval_results_{timestamp}.json"

with open(output_file, 'w') as f:
    json.dump({
        'config': config,
        'metrics': metrics,
        'samples': results['samples'][:10]  # Save first 10 samples
    }, f, indent=2)

print(f"Results saved to {output_file}")