In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found.
# The middle element of each os.walk tuple (the sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Clone the watermark repository into the current working directory; a later cell
# changes into its demo/ sub-directory before importing helper modules.
!git clone https://github.com/jthickstun/watermark.git

Cloning into 'watermark'...
remote: Enumerating objects: 80, done.
remote: Counting objects: 100% (80/80), done.
remote: Compressing objects: 100% (65/65), done.
remote: Total 80 (delta 25), reused 67 (delta 14), pack-reused 0
Unpacking objects: 100% (80/80), 264.81 KiB | 1.64 MiB/s, done.


In [3]:
import os
# Switch the working directory to the cloned repository's demo folder.
# NOTE(review): presumably this is what lets the later `from mersenne import ...`
# and `from levenshtein import ...` statements resolve — confirm against the repo layout.
os.chdir("/kaggle/working/watermark/demo")

In [4]:
import os, argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from mersenne import mersenne_rng

def generate_shift(model,prompt,vocab_size,n,m,key):
    """Sample ``m`` tokens after ``prompt`` using a keyed pseudo-random table.

    ``mersenne_rng(key)`` supplies ``n * vocab_size`` values that are arranged as
    an ``(n, vocab_size)`` table ``xi``. A starting row ``shift`` is drawn with
    ``torch.randint`` (so it follows torch's global RNG state), and step ``i``
    passes row ``(shift + i) % n`` together with the model's next-token
    probabilities to ``exp_sampling`` to choose the token.

    Args:
        model: Callable language model exposing ``.device``; its output must
            provide ``.logits`` and ``.past_key_values``.
        prompt: 2-D tensor of token ids (it is indexed as ``[:, position]``).
        vocab_size: Number of leading logit columns to sample from; also the
            row width of ``xi``.
        n: Number of rows in ``xi``.
        m: Number of tokens to generate.
        key: Seed handed to ``mersenne_rng``.

    Returns:
        A detached CPU tensor holding the prompt followed by the ``m`` sampled tokens.
    """
    rng = mersenne_rng(key)
    xi = torch.tensor([rng.rand() for _ in range(n*vocab_size)]).view(n,vocab_size)
    shift = torch.randint(n, (1,))

    inputs = prompt.to(model.device)
    attn = torch.ones_like(inputs)
    past = None
    for i in range(m):
        with torch.no_grad():
            # Compare against None explicitly: the cache returned by the model may be
            # an object with its own truthiness rules, and only "no cache yet" should
            # trigger the full forward pass.
            if past is not None:
                # Incremental step: feed just the newest token plus the cached state.
                output = model(inputs[:,-1:], past_key_values=past, attention_mask=attn)
            else:
                output = model(inputs)

        # Probabilities for the last position, restricted to the first vocab_size logits.
        probs = torch.nn.functional.softmax(output.logits[:,-1, :vocab_size], dim=-1).cpu()
        token = exp_sampling(probs,xi[(shift+i)%n,:]).to(model.device)
        inputs = torch.cat([inputs, token], dim=-1)

        past = output.past_key_values
        # Grow the attention mask by one column so it covers the token just appended.
        attn = torch.cat([attn, attn.new_ones((attn.shape[0], 1))], dim=-1)

    return inputs.detach().cpu()

def exp_sampling(probs,u):
    """Return, for each row of ``probs``, the index that maximises ``u ** (1/probs)``.

    The result keeps a trailing singleton dimension so it can be concatenated
    onto a token tensor along the last axis.
    """
    exponents = 1 / probs
    scores = torch.pow(u, exponents)
    winners = torch.argmax(scores, dim=1)
    return torch.unsqueeze(winners, -1)

def main():
    """Generate a watermarked continuation of a fixed prompt with TinyLlama and print it."""
    # Seed torch's global RNG; generate_shift draws its starting row with torch.randint.
    torch.manual_seed(0)
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to(device)

    prompt_ids = tokenizer.encode(
        "A good thesis",
        return_tensors='pt',
        truncation=True,
        max_length=2048,
    )

    # Positional arguments after the vocabulary size: n=256 table rows, m=80 new tokens, key=42.
    generated = generate_shift(model, prompt_ids, len(tokenizer), 256, 80, 42)
    first_sequence = generated[0]
    watermarked_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

    print(watermarked_text)

# Run the generation demo only when this code is executed directly (as a notebook
# cell or script), not when it is imported as a module.
if __name__ == '__main__':
    main()

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

A good thesis statement, by definition, tells your reader what your essay is about in a clear and concise way. Do not write "An essay about living in a suburb" or "An essay about floating.` A thesis statement is the last sentence in your rough draft. It should summarize your essay topic and answer the question of what you plan to discuss. Before you write your


In [5]:
import os, sys, argparse, time

import numpy as np
from transformers import AutoTokenizer
from mersenne import mersenne_rng

import pyximport
# Install the pyximport import hook so Cython sources are compiled at import time.
# numpy's header directory is passed to the build via include_dirs, and the Cython
# language level is set to the running interpreter's major version.
pyximport.install(reload_support=True, language_level=sys.version_info[0],
                  setup_args={'include_dirs':np.get_include()})
# NOTE(review): presumably resolved from a levenshtein.pyx in the working directory
# and built by the hook above — confirm against the cloned repository.
from levenshtein import levenshtein

def permutation_test(tokens,key,n,k,vocab_size,n_runs=100,seed=None):
    """Estimate a p-value for ``tokens`` against the sequence generated from ``key``.

    The statistic returned by ``detect`` for the keyed table is compared with the
    statistic obtained from ``n_runs`` freshly drawn uniform tables of the same
    shape.

    Args:
        tokens: Token ids to test.
        key: Seed passed to ``mersenne_rng`` to rebuild the keyed table.
        n: Number of rows in each table.
        k: Window length forwarded to ``detect``.
        vocab_size: Number of columns in each table.
        n_runs: Number of null tables to draw.
        seed: Optional seed for the null tables. ``None`` (the default) keeps the
            previous behaviour of drawing from numpy's global random state, so the
            result varies between runs unless that state was seeded by the caller.

    Returns:
        ``(count + 1) / (n_runs + 1)``, where ``count`` is the number of null
        runs whose statistic is less than or equal to the keyed statistic.
    """
    rng = mersenne_rng(key)
    xi = np.array([rng.rand() for _ in range(n*vocab_size)], dtype=np.float32).reshape(n,vocab_size)
    test_result = detect(tokens,n,k,xi)

    # A private generator makes the null draws repeatable without touching global state.
    sampler = np.random if seed is None else np.random.RandomState(seed)

    p_val = 0
    for _ in range(n_runs):
        xi_alternative = sampler.rand(n, vocab_size).astype(np.float32)
        null_result = detect(tokens,n,k,xi_alternative)

        # assuming lower test values indicate presence of watermark
        p_val += null_result <= test_result

    # The +1 terms keep the estimate strictly positive.
    return (p_val+1.0)/(n_runs+1.0)


def detect(tokens,n,k,xi,gamma=0.0):
    """Return the smallest ``levenshtein`` value over all token windows and table offsets.

    Every length-``k`` window of ``tokens`` is compared with every cyclic run of
    ``k`` consecutive rows of ``xi``; the minimum over all pairs is returned.

    Args:
        tokens: Sequence of token ids supporting ``len`` and slicing.
        n: Unused; kept for call compatibility. The cycle length is always
            ``len(xi)``.
        k: Window length.
        xi: Array whose first axis holds the rows that are indexed cyclically.
        gamma: Forwarded unchanged to ``levenshtein``.

    Returns:
        The minimum entry of the (windows x offsets) cost matrix.

    Raises:
        ValueError: If ``tokens`` holds fewer than ``k`` items, so no window exists.
    """
    m = len(tokens)
    n = len(xi)  # the table itself defines the cycle length

    num_windows = m-(k-1)
    if num_windows < 1:
        raise ValueError(f"detect needs at least k={k} tokens, got {m}")

    offsets = np.arange(k)
    A = np.empty((num_windows,n))
    for j in range(n):
        # The k rows starting at offset j (wrapping around) are the same for every
        # token window, so gather them once per offset.
        xi_block = xi[(j+offsets)%n]
        for i in range(num_windows):
            A[i, j] = levenshtein(tokens[i:i+k],xi_block,gamma)

    return np.min(A)


def main():
    """Run the permutation test on a hard-coded sample text and print the p-value and timing."""
    # The document is inlined here rather than read from a file.
    text = '''A good thesis statement, by definition, tells your reader what your essay is about in a clear and concise way. Do not write "An essay about living in a suburb" or "An essay about floating.` A thesis statement is the last sentence in your rough draft. It should summarize your essay topic and answer the question of what you plan to discuss. Before you write your'''

    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    encoded = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=2048)
    tokens = encoded.numpy()[0]

    started = time.time()
    # key=42 and n=256; the window length k is the whole token sequence.
    pval = permutation_test(tokens, 42, 256, len(tokens), len(tokenizer))
    print('p-value: ', pval)
    elapsed = time.time() - started
    print(f'(elapsed time: {elapsed}s)')


# Run the detection demo only when this code is executed directly (as a notebook
# cell or script), not when it is imported as a module.
if __name__ == '__main__':
    main()

In file included from /opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarraytypes.h:1929,
                 from /opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /opt/conda/lib/python3.10/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /root/.pyxbld/temp.linux-x86_64-cpython-310/kaggle/working/watermark/demo/levenshtein.c:1205:
      |  ^~~~~~~


p-value:  0.009900990099009901
(elapsed time: 86.51769661903381s)


In [9]:
def main():
    """Generate and print a continuation of the prompt with the model's own ``generate``.

    No keyed table or ``generate_shift`` call is involved here; the text comes from
    the standard Hugging Face generation loop, capped at ``max_length=2048`` tokens.
    """
    torch.manual_seed(0)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Move the encoded prompt to the same device as the model; leaving it on the
    # CPU fails once the model has been placed on a GPU.
    inputs = tokenizer("A good thesis", return_tensors = "pt").to(device)
    # Pass the attention mask explicitly instead of letting generate() infer it.
    generate_ids = model.generate(
        inputs.input_ids, attention_mask=inputs.attention_mask, max_length=2048
    )
    # batch_decode returns one string per sequence; only one prompt was supplied.
    answer = tokenizer.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    print(answer)

# Run the plain-generation demo only when this code is executed directly (as a
# notebook cell or script), not when it is imported as a module.
if __name__ == '__main__':
    main()

A good thesis statement should be clear, concise, and specific. It should clearly state the main idea of your essay and provide a clear focus for your argument. A good thesis statement should also be well-supported by evidence and arguments. Avoid using vague or general statements, and make sure your thesis statement is specific enough to guide your essay.

2. Use a clear and concise language: Your thesis statement should be written in a clear and concise language. Avoid using jargon, technical terms, or complex language that may confuse your reader. Use simple and straightforward language that is easy to understand.

3. Be specific: Your thesis statement should be specific enough to guide your essay. Avoid using vague or general statements, and make sure your thesis statement is specific enough to guide your argument.

4. Use examples: Use examples to support your thesis statement. This will help your reader understand your argument and see how it applies to real-world situations.

5.

In [10]:
import os, sys, argparse, time

import numpy as np
from transformers import AutoTokenizer
from mersenne import mersenne_rng

import pyximport
# Install the pyximport import hook so Cython sources are compiled at import time.
# numpy's header directory is passed to the build via include_dirs, and the Cython
# language level is set to the running interpreter's major version.
pyximport.install(reload_support=True, language_level=sys.version_info[0],
                  setup_args={'include_dirs':np.get_include()})
# NOTE(review): presumably resolved from a levenshtein.pyx in the working directory
# and built by the hook above — confirm against the cloned repository.
from levenshtein import levenshtein

def permutation_test(tokens,key,n,k,vocab_size,n_runs=100,seed=None):
    """Estimate a p-value for ``tokens`` against the sequence generated from ``key``.

    The statistic returned by ``detect`` for the keyed table is compared with the
    statistic obtained from ``n_runs`` freshly drawn uniform tables of the same
    shape.

    Args:
        tokens: Token ids to test.
        key: Seed passed to ``mersenne_rng`` to rebuild the keyed table.
        n: Number of rows in each table.
        k: Window length forwarded to ``detect``.
        vocab_size: Number of columns in each table.
        n_runs: Number of null tables to draw.
        seed: Optional seed for the null tables. ``None`` (the default) keeps the
            previous behaviour of drawing from numpy's global random state, so the
            result varies between runs unless that state was seeded by the caller.

    Returns:
        ``(count + 1) / (n_runs + 1)``, where ``count`` is the number of null
        runs whose statistic is less than or equal to the keyed statistic.
    """
    rng = mersenne_rng(key)
    xi = np.array([rng.rand() for _ in range(n*vocab_size)], dtype=np.float32).reshape(n,vocab_size)
    test_result = detect(tokens,n,k,xi)

    # A private generator makes the null draws repeatable without touching global state.
    sampler = np.random if seed is None else np.random.RandomState(seed)

    p_val = 0
    for _ in range(n_runs):
        xi_alternative = sampler.rand(n, vocab_size).astype(np.float32)
        null_result = detect(tokens,n,k,xi_alternative)

        # assuming lower test values indicate presence of watermark
        p_val += null_result <= test_result

    # The +1 terms keep the estimate strictly positive.
    return (p_val+1.0)/(n_runs+1.0)


def detect(tokens,n,k,xi,gamma=0.0):
    """Return the smallest ``levenshtein`` value over all token windows and table offsets.

    Every length-``k`` window of ``tokens`` is compared with every cyclic run of
    ``k`` consecutive rows of ``xi``; the minimum over all pairs is returned.

    Args:
        tokens: Sequence of token ids supporting ``len`` and slicing.
        n: Unused; kept for call compatibility. The cycle length is always
            ``len(xi)``.
        k: Window length.
        xi: Array whose first axis holds the rows that are indexed cyclically.
        gamma: Forwarded unchanged to ``levenshtein``.

    Returns:
        The minimum entry of the (windows x offsets) cost matrix.

    Raises:
        ValueError: If ``tokens`` holds fewer than ``k`` items, so no window exists.
    """
    m = len(tokens)
    n = len(xi)  # the table itself defines the cycle length

    num_windows = m-(k-1)
    if num_windows < 1:
        raise ValueError(f"detect needs at least k={k} tokens, got {m}")

    offsets = np.arange(k)
    A = np.empty((num_windows,n))
    for j in range(n):
        # The k rows starting at offset j (wrapping around) are the same for every
        # token window, so gather them once per offset.
        xi_block = xi[(j+offsets)%n]
        for i in range(num_windows):
            A[i, j] = levenshtein(tokens[i:i+k],xi_block,gamma)

    return np.min(A)


def main():
    """Run the permutation test on a second hard-coded sample text and print the p-value and timing."""
    # The document is inlined here rather than read from a file.
    # NOTE(review): this looks like the output of the plain model.generate cell — confirm.
    text = '''A good thesis statement should be clear, concise, and specific. It should clearly state the main idea of your essay and provide a clear focus for your argument. A good thesis statement should also be well-supported by evidence and arguments. Avoid using vague or general statements, and make sure your thesis statement is specific enough to guide your essay.

2. Use a clear and concise language: Your thesis statement should be written in a clear and concise language. Avoid using jargon, technical terms, or complex language that may confuse your reader. Use simple and straightforward language that is easy to understand.

3. Be specific: Your thesis statement should be specific enough to guide your essay. Avoid using vague or general statements, and make sure your thesis statement is specific enough to guide your argument.

4. Use examples: Use examples to support your thesis statement. This will help your reader understand your argument and see how it applies to real-world situations.

5. Be consistent: Your thesis statement should be consistent throughout your essay. Make sure your thesis statement is repeated at the beginning of each paragraph and at the end of your essay.

6. Edit and revise: Before submitting your essay, edit and revise it to ensure that your thesis statement is clear, concise, and specific. Make sure your thesis statement is well-supported by evidence and arguments.

7. Use a strong conclusion: Your thesis statement should be followed by a strong and clear conclusion. This will help your reader understand the main idea of your essay and how it applies to real-world situations.'''

    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    encoded = tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=2048)
    tokens = encoded.numpy()[0]

    started = time.time()
    # key=42 and n=256; the window length k is the whole token sequence.
    pval = permutation_test(tokens, 42, 256, len(tokens), len(tokenizer))
    print('p-value: ', pval)
    elapsed = time.time() - started
    print(f'(elapsed time: {elapsed}s)')


# Run the detection demo only when this code is executed directly (as a notebook
# cell or script), not when it is imported as a module.
if __name__ == '__main__':
    main()

p-value:  0.6831683168316832
(elapsed time: 397.0473415851593s)
