# 1. Setup

In [3]:
%pip install -q -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import sys

# Make the parent of the current working directory importable so that the
# `src.synthid_text` package used below can be resolved. The membership check
# keeps the cell idempotent: re-running it does not add the same entry to
# sys.path a second time.
_REPO_ROOT = os.path.abspath("..")
if _REPO_ROOT not in sys.path:
  sys.path.append(_REPO_ROOT)

In [34]:
from collections.abc import Sequence
import enum
import gc

import datasets
import huggingface_hub

from src.synthid_text import detector_mean
from src.synthid_text import logits_processing
from src.synthid_text import synthid_mixin
from src.synthid_text import detector_bayesian
import tensorflow as tf
import torch
import tqdm
import transformers
import accelerate
import matplotlib as plt
import numpy as np

In [21]:
class ModelName(enum.Enum):
  """Checkpoint identifiers of the models this notebook can load.

  Each member's value is the string handed to `from_pretrained`.
  """

  GPT2 = 'gpt2'
  GEMMA_2B = 'google/gemma-2b-it'
  GEMMA_7B = 'google/gemma-7b-it'
  OLMO = 'allenai/OLMo-1B'
  LLAMA = 'meta-llama/Llama-3.2-1B'


# The four models compared in this notebook, numbered in the order in which
# they are processed.
MODEL_NAME_1 = ModelName.LLAMA
MODEL_NAME_2 = ModelName.OLMO
MODEL_NAME_3 = ModelName.GEMMA_7B
MODEL_NAME_4 = ModelName.GPT2

# Show the Hugging Face login widget in the notebook. Presumably required to
# download gated checkpoints (e.g. Llama, Gemma) — confirm for your account.
huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cpu')

In [23]:
# Use the library's default watermarking configuration as-is. It is unpacked as
# keyword arguments when the SynthID logits processor is built, and its
# 'ngram_len' entry is read when masks are computed.
CONFIG = synthid_mixin.DEFAULT_WATERMARKING_CONFIG
CONFIG

immutabledict({'ngram_len': 5, 'keys': [654, 400, 836, 123, 340, 443, 597, 160, 57, 29, 590, 639, 13, 715, 468, 990, 966, 226, 324, 585, 118, 504, 421, 521, 129, 669, 732, 225, 90, 960], 'sampling_table_size': 65536, 'sampling_table_seed': 0, 'context_history_size': 1024, 'device': device(type='cpu')})

In [36]:
# Sampling / generation settings.
BATCH_SIZE = 8
NUM_BATCHES = 320
OUTPUTS_LEN = 1024
TEMPERATURE = 0.1
TOP_K = 40
TOP_P = 0.99


def _load_left_padded_tokenizer(model_name):
  """Loads the tokenizer for `model_name`, set up for batched generation.

  The EOS token is reused as the padding token and padding is applied on the
  left, so that all prompts in a batch end at the same position and the
  generated tokens can be sliced off after the prompt length.

  Args:
    model_name: A `ModelName` member; its value is the checkpoint identifier.

  Returns:
    The configured tokenizer.
  """
  tokenizer = transformers.AutoTokenizer.from_pretrained(model_name.value)
  tokenizer.pad_token = tokenizer.eos_token
  tokenizer.padding_side = "left"
  return tokenizer


# One tokenizer per benchmarked model (same numbering as MODEL_NAME_*).
tokenizer_1 = _load_left_padded_tokenizer(MODEL_NAME_1)
tokenizer_2 = _load_left_padded_tokenizer(MODEL_NAME_2)
tokenizer_3 = _load_left_padded_tokenizer(MODEL_NAME_3)
tokenizer_4 = _load_left_padded_tokenizer(MODEL_NAME_4)

# Processor used to compute masks and g-values from generated token ids.
logits_processor = logits_processing.SynthIDLogitsProcessor(
    **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
)

In [29]:
from hf_olmo import OLMoForCausalLM

def load_model(
    model_name: ModelName,
    expected_device: torch.device,
    enable_watermarking: bool = False,
) -> transformers.PreTrainedModel:
  """Loads the causal LM for `model_name`, optionally in its SynthID variant.

  Args:
    model_name: Which checkpoint to load.
    expected_device: Device the loaded model must report through
      `model.device`.
    enable_watermarking: If True, the `synthid_mixin` model class is used
      instead of the plain one.

  Returns:
    The loaded model.

  Raises:
    ValueError: If the loaded model's device differs from `expected_device`.
  """
  # Every branch loads with device_map='auto'; the branches only pick the
  # model class and add their extra keyword arguments.
  load_kwargs = {'device_map': 'auto'}

  if model_name == ModelName.GPT2:
    # GPT-2 is the only model loaded without an explicit dtype.
    if enable_watermarking:
      model_cls = synthid_mixin.SynthIDGPT2LMHeadModel
    else:
      model_cls = transformers.GPT2LMHeadModel
  elif model_name in (ModelName.GEMMA_2B, ModelName.GEMMA_7B):
    if enable_watermarking:
      model_cls = synthid_mixin.SynthIDGemmaForCausalLM
    else:
      model_cls = transformers.GemmaForCausalLM
    load_kwargs['torch_dtype'] = torch.bfloat16
  elif model_name == ModelName.OLMO:
    if enable_watermarking:
      model_cls = synthid_mixin.SynthIDOlmoForCausalLM
    else:
      model_cls = OLMoForCausalLM
    # OLMo is pinned to one specific checkpoint revision.
    load_kwargs['revision'] = "step20000-tokens84B"
    load_kwargs['torch_dtype'] = torch.bfloat16
  else:
    # Any other model goes through the generic auto classes.
    if enable_watermarking:
      model_cls = synthid_mixin.SynthIDAutoModelForCausalLM
    else:
      model_cls = transformers.AutoModelForCausalLM
    load_kwargs['torch_dtype'] = torch.bfloat16

  model = model_cls.from_pretrained(model_name.value, **load_kwargs)

  # Devices are compared through their string form.
  if str(model.device) == str(expected_device):
    return model
  raise ValueError('Model device not as expected.')

## Benchmarking watermark detection with Mean Detector
To measure the impact of the watermarking algorithm on the different models, we proceed as follows:
For each model, we vary the sampling temperature from 0.1 to 1.0 and, at each temperature, generate responses both with and without watermarking and compute the Mean score of each response.
We then plot the Mean scores against the temperature, for watermarked and unwatermarked responses, to see whether there is a visible difference between the two.

In [30]:
# Settings for negative examples.
NUM_NEGATIVES = 10_000
NEG_BATCH_SIZE = 32
# Settings for positive examples.
NUM_POS_BATCHES = 313
POS_BATCH_SIZE = 32
# Outputs are truncated to these lengths for training.
POS_TRUNCATION_LENGTH = 200
NEG_TRUNCATION_LENGTH = 200
# Truncated outputs are padded to this length so that all batches share one
# shape.
MAX_PADDED_LENGTH = 1_000
# NOTE: this rebinds the TEMPERATURE defined with the generation settings
# earlier in the notebook (0.1 there).
TEMPERATURE = 1.0

In [32]:
def generate_responses(example_inputs, enable_watermarking, tokenizer, model_name):
  """Samples continuations for the prompts and scores the generated tokens.

  A model is loaded on every call. The sampling settings (`OUTPUTS_LEN`,
  `TEMPERATURE`, `TOP_K`, `TOP_P`), `DEVICE`, `CONFIG` and the
  `logits_processor` are read from module-level names at call time.

  Args:
    example_inputs: Prompts, passed unchanged to `tokenizer`.
    enable_watermarking: Forwarded to `load_model`.
    tokenizer: Tokenizer to encode the prompts with.
    model_name: `ModelName` member of the model to load.

  Returns:
    A `(g_values, combined_mask)` tuple computed on the generated tokens only
    (the prompt part of each sequence is cut off).
  """
  encoded = tokenizer(
      example_inputs,
      return_tensors='pt',
      padding=True,
      return_token_type_ids=False,
  ).to(DEVICE)

  # Free what can be freed before another model is loaded.
  gc.collect()
  torch.cuda.empty_cache()

  model = load_model(
      model_name,
      expected_device=DEVICE,
      enable_watermarking=enable_watermarking,
  )
  # Fixed seed: every call samples from the same RNG state.
  torch.manual_seed(0)
  _, prompt_len = encoded['input_ids'].shape

  generated = model.generate(
      **encoded,
      do_sample=True,
      max_length=prompt_len + OUTPUTS_LEN,
      temperature=TEMPERATURE,
      top_k=TOP_K,
      top_p=TOP_P,
      pad_token_id=tokenizer.eos_token_id,
  )

  # Keep only what was generated after the prompt.
  continuations = generated[:, prompt_len:]

  # EOS mask over the continuations, shape [batch_size, output_len]; its first
  # ngram_len - 1 positions are then dropped.
  full_eos_mask = logits_processor.compute_eos_token_mask(
      input_ids=continuations,
      eos_token_id=tokenizer.eos_token_id,
  )
  eos_token_mask = full_eos_mask[:, CONFIG['ngram_len'] - 1 :]

  # Context repetition mask, shape [batch_size, output_len - (ngram_len - 1)].
  context_repetition_mask = logits_processor.compute_context_repetition_mask(
      input_ids=continuations,
  )

  combined_mask = context_repetition_mask * eos_token_mask

  # g-values, shape [batch_size, output_len - (ngram_len - 1), depth].
  g_values = logits_processor.compute_g_values(
      input_ids=continuations,
  )

  return g_values, combined_mask

example_inputs = [
    'I enjoy walking with my cute dog',
    'I am from New York',
    'The test was not so very hard after all',
    "I don't think they can score twice in so short a time",
]


def _generate_watermarked_and_plain(tokenizer, model_name):
  """Runs `generate_responses` for one model: first watermarked, then not.

  Args:
    tokenizer: Tokenizer belonging to `model_name`.
    model_name: `ModelName` member of the model to generate with.

  Returns:
    `(wm_g_values, wm_mask, uwm_g_values, uwm_mask)` for `example_inputs`.
  """
  wm_g_values, wm_mask = generate_responses(
      example_inputs,
      enable_watermarking=True,
      tokenizer=tokenizer,
      model_name=model_name,
  )
  uwm_g_values, uwm_mask = generate_responses(
      example_inputs,
      enable_watermarking=False,
      tokenizer=tokenizer,
      model_name=model_name,
  )
  return wm_g_values, wm_mask, uwm_g_values, uwm_mask


# Same calls, in the same order, for each of the four models; the numeric
# suffix matches MODEL_NAME_* / tokenizer_*.
wm_g_values_1, wm_mask_1, uwm_g_values_1, uwm_mask_1 = (
    _generate_watermarked_and_plain(tokenizer_1, MODEL_NAME_1)
)
wm_g_values_2, wm_mask_2, uwm_g_values_2, uwm_mask_2 = (
    _generate_watermarked_and_plain(tokenizer_2, MODEL_NAME_2)
)
wm_g_values_3, wm_mask_3, uwm_g_values_3, uwm_mask_3 = (
    _generate_watermarked_and_plain(tokenizer_3, MODEL_NAME_3)
)
wm_g_values_4, wm_mask_4, uwm_g_values_4, uwm_mask_4 = (
    _generate_watermarked_and_plain(tokenizer_4, MODEL_NAME_4)
)

KeyboardInterrupt: 

In [None]:
# Watermarked responses tend to have higher Mean scores than unwatermarked
# responses. To classify responses you can set a score threshold, but this will
# depend on the distribution of scores for your use-case and your desired false
# positive / false negative rates.
#
# Only the first model's outputs (suffix `_1`) are scored in this cell.

wm_mean_scores_1 = detector_mean.mean_score(
    wm_g_values_1.cpu().numpy(), wm_mask_1.cpu().numpy()
)
uwm_mean_scores_1 = detector_mean.mean_score(
    uwm_g_values_1.cpu().numpy(), uwm_mask_1.cpu().numpy()
)

print('Mean scores for watermarked responses: ', wm_mean_scores_1)
print('Mean scores for unwatermarked responses: ', uwm_mean_scores_1)


# You may find that the Weighted Mean scoring function gives better
# classification performance than the Mean scoring function (in particular,
# higher scores for watermarked responses). See the paper for full details.

wm_weighted_mean_scores_1 = detector_mean.weighted_mean_score(
    wm_g_values_1.cpu().numpy(), wm_mask_1.cpu().numpy()
)
# Named with the `_1` suffix so that it matches the name printed below; the
# unsuffixed name used previously made the final print raise NameError.
uwm_weighted_mean_scores_1 = detector_mean.weighted_mean_score(
    uwm_g_values_1.cpu().numpy(), uwm_mask_1.cpu().numpy()
)

print(
    'Weighted Mean scores for watermarked responses: ', wm_weighted_mean_scores_1
)
print(
    'Weighted Mean scores for unwatermarked responses: ',
    uwm_weighted_mean_scores_1,
)

NameError: name 'wm_g_values' is not defined

## Visualizing the results

In [37]:
def plot_benchmarking_results(results, temperature_values):
    """Plots mean scores against temperature, two curves per model.

    For every entry of `results`, the watermarked scores are drawn dashed and
    the unwatermarked scores solid, on one shared figure.

    Args:
        results: Mapping from a model label to a dict with the keys
            "watermarked" and "unwatermarked"; each holds a sequence of
            scores aligned with `temperature_values`.
        temperature_values: X-axis values shared by all curves.
    """
    # Imported locally on purpose: the notebook's top-level
    # `import matplotlib as plt` binds `plt` to the matplotlib package itself,
    # which does not provide the pyplot functions (`figure`, `plot`, ...) used
    # here.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 8))

    for model_name, scores in results.items():
        plt.plot(
            temperature_values,
            scores["watermarked"],
            label=f"{model_name} (Watermarked)",
            linestyle='--'
        )
        plt.plot(
            temperature_values,
            scores["unwatermarked"],
            label=f"{model_name} (Unwatermarked)"
        )

    plt.xlabel("Temperature")
    plt.ylabel("Mean Scores")
    plt.title("Benchmarking Mean Scores by Temperature and Watermarking")
    plt.legend()
    plt.grid()
    plt.show()

In [38]:
# Ten evenly spaced temperatures from 0.1 to 1.0 (both ends included).
temperature_values = np.linspace(0.1, 1.0, 10)
results = {}

model_tokenizer_pairs = zip(
    (MODEL_NAME_1, MODEL_NAME_2, MODEL_NAME_3, MODEL_NAME_4),
    (tokenizer_1, tokenizer_2, tokenizer_3, tokenizer_4),
)

for model_name, tokenizer in model_tokenizer_pairs:
    wm_scores, uwm_scores = [], []

    for temp in temperature_values:
        # generate_responses reads TEMPERATURE and logits_processor as
        # module-level names, so both are rebound before each pair of calls.
        TEMPERATURE = temp
        logits_processor = logits_processing.SynthIDLogitsProcessor(
            **CONFIG, top_k=TOP_K, temperature=TEMPERATURE
        )

        wm_g_values, wm_mask = generate_responses(
            example_inputs,
            enable_watermarking=True,
            tokenizer=tokenizer,
            model_name=model_name,
        )
        uwm_g_values, uwm_mask = generate_responses(
            example_inputs,
            enable_watermarking=False,
            tokenizer=tokenizer,
            model_name=model_name,
        )

        wm_mean_scores = detector_mean.mean_score(
            wm_g_values.cpu().numpy(), wm_mask.cpu().numpy()
        )
        uwm_mean_scores = detector_mean.mean_score(
            uwm_g_values.cpu().numpy(), uwm_mask.cpu().numpy()
        )

        # One point per temperature: the per-response scores averaged.
        wm_scores.append(np.mean(wm_mean_scores))
        uwm_scores.append(np.mean(uwm_mean_scores))

    results[model_name.value] = dict(
        watermarked=wm_scores, unwatermarked=uwm_scores
    )

    # Called inside the model loop: the figure is drawn again after each model
    # with all results collected so far.
    plot_benchmarking_results(results, temperature_values)

KeyboardInterrupt: 