In [24]:
import chromadb

In [25]:
# import
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from IPython.display import Markdown, display


In [26]:
from langchain.llms import Ollama
from langchain.embeddings import OllamaEmbeddings

In [27]:
# Ollama connection settings, defined once and shared by the LLM and the
# embedding model so the two cannot drift to different servers or models.
OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_MODEL = "llama2"
# Both objects are later handed to ServiceContext.from_defaults as the
# `llm` and `embed_model` arguments.
llm = Ollama(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)
embed_model = OllamaEmbeddings(base_url=OLLAMA_BASE_URL, model=OLLAMA_MODEL)

In [28]:
# Open the on-disk Chroma database under ./chroma_db, then fetch the
# "quickstart" collection (it is created only if it does not exist yet).
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection(name="quickstart")

In [29]:
# Service context: carries the Ollama LLM and embedding model defined above.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)
# Storage side: wrap the Chroma collection as a vector store and register it
# in a storage context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [30]:
# Read the files found in the local "data" directory into document objects.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [31]:
# Build the index, embedding the documents only while the persistent Chroma
# collection is still empty. `from_documents` inserts vectors on every call,
# so re-running this cell (or the whole notebook) against the same
# ./chroma_db would otherwise store the same chunks again.
# NOTE: after changing the files under data/, clear the collection (or use a
# different collection name) to force re-ingestion.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, service_context=service_context
    )
else:
    # Vectors are already on disk: attach to them instead of re-embedding.
    index = VectorStoreIndex.from_vector_store(
        vector_store, service_context=service_context
    )

In [None]:
# Optional: load the index from the existing Chroma collection on disk
# instead of re-embedding the documents. Uncomment the lines below to use it.
# The two client lines (marked with a double "#") are only needed when
# `chroma_collection` is not already defined in the running kernel.
#
# # db2 = chromadb.PersistentClient(path="./chroma_db")
# # chroma_collection = db2.get_or_create_collection("quickstart")
#
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# index = VectorStoreIndex.from_vector_store(
#     vector_store,
#     service_context=service_context,
# )

In [32]:
# Create a query engine from the index; no arguments are passed, so the
# library defaults apply.
query_engine = index.as_query_engine()

In [33]:
# Question 1: ask the query engine about the watermark z hypothesis score.
response = query_engine.query(
    "What is a watermark z hypothesis score?"
)

In [34]:
# Bare expression: the notebook shows the repr of the Response object
# (answer text plus the retrieved source nodes).
response

Response(response='Based on the provided context information, a watermark Z hypothesis score refers to a metric used to evaluate the performance of a language model in detecting or resisting attacks on its ability to generate coherent and natural-sounding text. The term "watermark" likely refers to the idea of embedding a hidden message or signature within the language model\'s output, which can be detected later to assess its authenticity or tampering.\n\nThe "Z" in the hypothesis score may represent the experimenter\'s z-score, which is a statistical measure used to compare the observed frequency of an event to the expected frequency under a null hypothesis. In this context, the Z score could indicate how many standard deviations the language model\'s performance deviates from the expected performance when tested against various attacks or manipulations.\n\nWithout prior knowledge of the specific context or experiment, it is difficult to provide a more detailed explanation of the wat

In [35]:
# Question 2: ask who introduced the soft watermark.
response = query_engine.query(
    "Who is the main author who introduced soft watermark?"
)

In [36]:
# Bare expression: the notebook shows the repr of the Response object
# (answer text plus the retrieved source nodes).
response

Response(response='The main author who introduced soft watermark is Wilson, A. Soft watermarking is discussed in their paper "A Watermark for Large Language Models" published in 2014.', source_nodes=[NodeWithScore(node=TextNode(id_='67d12342-c9c1-4c34-8991-a798946b36e7', embedding=None, metadata={'file_path': 'data/harsha.txt', 'file_name': 'harsha.txt', 'file_type': 'text/plain', 'file_size': 418, 'creation_date': '2023-11-19', 'last_modified_date': '2023-11-19', 'last_accessed_date': '2023-11-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8f119c43-f062-48bf-81cc-44a15bc44210', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data/harsha.txt', 'file_name': 'harsha.txt', 'file_type': 'te

In [37]:
# Question 3: ask what Kirchenbauer's algorithm does, then display the
# Response object. The query text is embedded and sent to the LLM as
# written, so the misspelling "algoroithm" has been corrected.
response = query_engine.query("What does the algorithm introduced by Kirchenbauer do?")
response

Response(response='The algorithm introduced by Kirchenbauer is a hard watermarking algorithm, which means it embeds a watermark into the media in a way that makes it difficult to remove or tamper with. The algorithm uses a Z-score hypothesis to generate the watermark, which is a statistical technique used to detect and identify potential tampering of the media.\n\nIn simple terms, the algorithm takes a media sample (such as an image or audio file) and calculates a Z-score for each pixel or sample. The Z-score represents how many standard deviations away from the mean the sample is. The algorithm then uses these Z-scores to generate a watermark that is embedded into the media in a way that makes it difficult to remove or tamper with.\n\nThe idea behind this algorithm is that any attempt to remove or tamper with the watermark will alter the Z-scores of the media, making it possible to detect the tampering. This provides a high level of security and authenticity to the media, ensuring tha