In [24]:
import chromadb

In [25]:
# import
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from IPython.display import Markdown, display


In [26]:
from langchain.llms import Ollama
from langchain.embeddings import OllamaEmbeddings

In [27]:
# Ollama connection settings shared by the LLM and the embedding model, so
# both always target the same server and the same model.
OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_MODEL = "llama2"

# LangChain wrappers around the Ollama server: `llm` is the language model and
# `embed_model` produces the text embeddings.
llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)
embed_model = OllamaEmbeddings(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)

In [28]:
# Open the Chroma database persisted under ./chroma_db and fetch the
# "quickstart" collection, creating it only if it does not exist yet.
db = chromadb.PersistentClient(
    path="./chroma_db",
)
chroma_collection = db.get_or_create_collection(
    name="quickstart",
)

In [29]:
# Wire the pieces together for LlamaIndex:
#   - the vector store adapter wraps the Chroma collection,
#   - the storage context wraps that vector store,
#   - the service context carries the Ollama LLM and embedding model.
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

In [46]:
# Read the contents of the local "data" directory into LlamaIndex documents.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [47]:
# Embed and store the documents only while the persistent collection is still
# empty. The collection lives on disk, so re-running this cell unguarded would
# re-embed every document and append duplicate vectors to it.
# To force a re-ingest after the files in "data" change, delete ./chroma_db
# (or the "quickstart" collection) first.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, service_context=service_context
    )
else:
    # Vectors already exist on disk: attach to them instead of re-ingesting.
    index = VectorStoreIndex.from_vector_store(
        vector_store, service_context=service_context
    )

In [48]:
# Re-open the persisted Chroma database with a fresh client and rebuild the
# index directly from the stored vectors; no documents are read in this cell.
db2 = chromadb.PersistentClient(
    path="./chroma_db",
)
chroma_collection = db2.get_or_create_collection(
    name="quickstart",
)
vector_store = ChromaVectorStore(
    chroma_collection=chroma_collection,
)

index2 = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    service_context=service_context,
)

In [49]:
# Build a query engine (default settings) on top of the index that was
# reloaded from the Chroma vector store.
query_engine = index2.as_query_engine()

In [33]:
# Ask about the watermark z hypothesis score; the result is kept in `response`.
response = query_engine.query("What is a watermark z hypothesis score?")

In [34]:
# Echo the Response object from the last query so the notebook renders its repr.
response

Response(response='Based on the provided context information, a watermark Z hypothesis score refers to a metric used to evaluate the performance of a language model in detecting or resisting attacks on its ability to generate coherent and natural-sounding text. The term "watermark" likely refers to the idea of embedding a hidden message or signature within the language model\'s output, which can be detected later to assess its authenticity or tampering.\n\nThe "Z" in the hypothesis score may represent the experimenter\'s z-score, which is a statistical measure used to compare the observed frequency of an event to the expected frequency under a null hypothesis. In this context, the Z score could indicate how many standard deviations the language model\'s performance deviates from the expected performance when tested against various attacks or manipulations.\n\nWithout prior knowledge of the specific context or experiment, it is difficult to provide a more detailed explanation of the wat

In [35]:
# Ask who introduced the soft watermark; the result is kept in `response`.
response = query_engine.query("Who is the main author who introduced soft watermark?")

In [36]:
# Echo the Response object from the last query so the notebook renders its repr.
response

Response(response='The main author who introduced soft watermark is Wilson, A. Soft watermarking is discussed in their paper "A Watermark for Large Language Models" published in 2014.', source_nodes=[NodeWithScore(node=TextNode(id_='67d12342-c9c1-4c34-8991-a798946b36e7', embedding=None, metadata={'file_path': 'data/harsha.txt', 'file_name': 'harsha.txt', 'file_type': 'text/plain', 'file_size': 418, 'creation_date': '2023-11-19', 'last_modified_date': '2023-11-19', 'last_accessed_date': '2023-11-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8f119c43-f062-48bf-81cc-44a15bc44210', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/harsha.txt', 'file_name': 'harsha.txt', 'file_type': 'te

In [44]:
# Ask what Kirchenbauer's algorithm does, then echo the Response object so the
# notebook renders its repr.
response = query_engine.query("What does the algorithm introduced by Kirchenbauer do?")
response

Response(response='The algorithm introduced by Kirchenbauer is a hard watermarking algorithm, which means it embeds a hidden signature or mark into the image or audio that cannot be removed or tampered with. The algorithm uses a Z-score hypothesis to ensure the watermark is robust and resistant to attacks. The Z-score hypothesis is a statistical measure that calculates the number of standard deviations an observed value is away from the mean. In the context of hard watermarking, the algorithm embeds the watermark in a way that makes it highly likely to be detected even if the image or audio is modified or distorted.', source_nodes=[NodeWithScore(node=TextNode(id_='4ab55a27-3868-4ca8-a493-1f03ec359d76', embedding=None, metadata={'page_label': '17', 'file_name': 'A Watermark for Large Language Models.pdf', 'file_path': 'data/A Watermark for Large Language Models.pdf', 'file_type': 'application/pdf', 'file_size': 1472891, 'creation_date': '2023-11-19', 'last_modified_date': '2023-11-04', 

In [45]:
# Ask a question about Harsha, then echo the Response object so the notebook
# renders its repr.
response = query_engine.query("What does Harsha love to do?")
response

Response(response='Based on the context provided, we can infer that Harsha loves to trek in the Himalayas and go on adventures. Specifically, he had a recent adventure to Kedarkantha and plans to go to Manaslu circuit in 2027. Additionally, the passage mentions that Harsha was seeing the Watermarking algorithm in his trip to Manali, which suggests that he may have been working on or interested in this topic related to language models during his travels. However, there is no direct mention of what Harsha loves to do in his free time.', source_nodes=[NodeWithScore(node=TextNode(id_='67d12342-c9c1-4c34-8991-a798946b36e7', embedding=None, metadata={'file_path': 'data/harsha.txt', 'file_name': 'harsha.txt', 'file_type': 'text/plain', 'file_size': 418, 'creation_date': '2023-11-19', 'last_modified_date': '2023-11-19', 'last_accessed_date': '2023-11-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded

In [50]:
# Ask about the distortion-free watermarking improvement, then echo the
# Response object so the notebook renders its repr.
response = query_engine.query("What was the improvement in watermarking that was distortion free?")
response

Response(response='Based on the provided context information, the article "A Watermark for Large Language Models" by Wilson et al. (2014) mentions an improvement in watermarking that is distortion-free. Specifically, the authors propose a method for linguistic steganography on Twitter that uses hierarchical language modeling with manual interaction. They claim that their approach is able to embed a watermark into a text message without introducing any distortions or degradations in the message\'s meaning or content.\n\nThe article Zellers et al. (2019) also discusses the use of neural network-based methods for defending against neural fake news, and mentions that their approach is able to achieve a high degree of accuracy without introducing any distortions into the input data.\n\nOverall, these articles suggest that there have been improvements in watermarking and other natural language processing techniques that allow for the embedding of information into texts or other forms of medi

In [51]:
# Request a summary of the 'Robust Distortion-free watermarking' paper, then
# echo the Response object so the notebook renders its repr.
response = query_engine.query("Summarize what the paper 'Robust Distortion-free watermarking' proposes")
response

Response(response='Based on the context provided in the text, it seems that Harsha is a software engineer who loves to trek in the Himalayas. He has recently been on a trip to Manali where he encountered the Watermarking algorithm. A shopkeeper asked him to explain Kirchenbauer\'s Z score hypothesis for hard watermarking, which suggests a method for embedding watermarks into images in a way that makes them robust to distortions.\n\nThe paper "Robust Distortion-free Watermarking" proposes a new approach to watermarking that is designed to be robust against various types of attacks, including cropping, scaling, and rotation. The proposed method uses a combination of data embedding and spread spectrum techniques to create a hidden signature that can be detected even if the image is distorted.\n\nIn summary, the paper "Robust Distortion-free Watermarking" proposes a new method for watermarking that is designed to be robust against various types of attacks, and can be used to protect images

In [52]:
# Ask the engine to generate two questions about watermarking, then echo the
# Response object so the notebook renders its repr.
response = query_engine.query("Create 2 questions from the concepts of watermarking")
response

Response(response='Sure! Here are two possible questions that could be generated based on the concepts of watermarking discussed in the provided text:\n\nQuestion 1: What is the optimal value of γ for a pareto-optimal small green list in a watermarking scheme, and why?\n\nThis question is prompted by the statement in the text that "a small green list, γ=.1is pareto-optimal." The question asks for the reason behind this optimization and the optimal value of γ that leads to the best performance.\n\nQuestion 2: How does beam search affect the tradeoff between watermark strength and accuracy, and what are the implications of this interaction for the design of a watermarking scheme?\n\nThis question is inspired by the observation in the text that "beam search has a synergistic interaction with the soft watermarking rule." The question seeks to understand the nature of this interaction and how it impacts the performance of the watermarking scheme.', source_nodes=[NodeWithScore(node=TextNode(