In [1]:
!git clone https://github.com/jthickstun/watermark.git

Cloning into 'watermark'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (65/65), done.[K
remote: Total 80 (delta 25), reused 67 (delta 14), pack-reused 0[K
Unpacking objects: 100% (80/80), 264.81 KiB | 4.81 MiB/s, done.


In [2]:
import os
os.chdir("/kaggle/working/watermark/demo")

In [3]:
import os
import argparse
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from mersenne import mersenne_rng
import pandas as pd
from tqdm import tqdm
import numpy as np  # Import numpy for array slicing

def generate_shift(model, prompt, vocab_size, n, m, key):
    rng = mersenne_rng(key)
    xi = torch.tensor([rng.rand() for _ in range(n*vocab_size)]).view(n, vocab_size)
    shift = torch.randint(n, (1,))

    inputs = prompt.to(model.device)
    attn = torch.ones_like(inputs)
    past = None
    for i in range(m):
        with torch.no_grad():
            if past:
                output = model(inputs[:, -1:], past_key_values=past, attention_mask=attn)
            else:
                output = model(inputs)

        probs = torch.nn.functional.softmax(output.logits[:, -1, :vocab_size], dim=-1).cpu()
        token = exp_sampling(probs, xi[(shift+i)%n, :]).to(model.device)
        inputs = torch.cat([inputs, token], dim=-1)

        past = output.past_key_values
        attn = torch.cat([attn, attn.new_ones((attn.shape[0], 1))], dim=-1)

    return inputs.detach().cpu()

def exp_sampling(probs, u):
    return torch.argmax(u ** (1/probs), axis=1).unsqueeze(-1)

def main():
    torch.manual_seed(0)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model_name = "facebook/opt-350m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Load the Sunbird dataset
    data_path = '/kaggle/input/sunbird-english-prompts/english_prompts.csv'
    df = pd.read_csv(data_path)

    # Generate watermarked text for each prompt
    watermarked_texts = []

    i = 2
    loop_ranges = [(0, 834), (834, 1667), (1667, 2500)]
    start = loop_ranges[i][0]
    end = loop_ranges[i][1]  # Ensure end doesn't exceed dataframe length
    df_slice = df.iloc[start:end]  # Slice the dataframe for this iteration

    for idx, row in tqdm(df_slice.iterrows(), total=len(df_slice)):
        tokens = tokenizer.encode(row['Text'], return_tensors='pt', truncation=True, max_length=2048).to(device)
        watermarked_tokens = generate_shift(model, tokens, len(tokenizer), 256, 80, 42)[0]
        watermarked_text = tokenizer.decode(watermarked_tokens, skip_special_tokens=True)
        watermarked_texts.append(watermarked_text)
    #  print(watermarked_texts)


    # Save the output to a .csv file
    df_output = pd.DataFrame({'Generated Text': watermarked_texts})
    df_output.to_csv('watermarked_texts.csv', index=False)

if __name__ == '__main__':
    main()


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

100%|██████████| 833/833 [10:23:48<00:00, 44.93s/it]
