In [74]:
import os

while not os.getcwd().endswith("hackathon"):
    os.chdir("..")
    print(f"Now in {os.getcwd()}")

In [75]:
from hackathon.utils.settings.settings_provider import SettingsProvider
import warnings
from langchain_chroma.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

with warnings.catch_warnings(action="ignore"):
    from langchain_ibm import ChatWatsonx
import dotenv
from langchain.schema import Document
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS


dotenv.load_dotenv();

In [76]:
settings_provider = SettingsProvider()

In [77]:
llm = ChatWatsonx(
    model_id=settings_provider.get_ibm_model_name(),  # type: ignore
    url=settings_provider.get_ibm_endpoint_url(),  # type: ignore
    project_id=settings_provider.get_ibm_project_id(),  # type: ignore
)

In [78]:
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}

embeddings = HuggingFaceEmbeddings(
            model_name=settings_provider.get_embeddings_model_name(),
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )

In [20]:
# import faiss
# from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_community.vectorstores import FAISS

# index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# vectorstore = FAISS(
#     embedding_function=embeddings,
#     index=index,
#     docstore=InMemoryDocstore(),
#     index_to_docstore_id={},
# )

In [None]:
chunk_page_content = "ciao"
chunk_metadata = {"ciao": ["ciao"]}

doc = Document(
    page_content=chunk_page_content,
    metadata=chunk_metadata,
)

In [12]:
vector_store = FAISS.from_documents([doc], embeddings)

In [16]:
vector_store.save_local("data/debug_vectorstore")

In [79]:
new_vector_store = FAISS.load_local(
    "data/debug_vectorstore", embeddings, allow_dangerous_deserialization=True
)

In [47]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocalate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet", "options": ["pancakes", "eggs"]},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news", "options": ["weather", "forecast"]},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet", "options": ["LangChain", "project"]},
)

documents = [
    document_1,
    document_2,
    document_3,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)
vector_store.save_local(folder_path="data/debug_vectorstore")

In [95]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [107]:
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=3,
    filter=lambda x: "LangChain" in x["options"]
)

[Document(id='a830722e-2cd2-45e6-b606-390ab6f62a8d', metadata={'source': 'tweet', 'options': ['LangChain', 'project']}, page_content='Building an exciting new project with LangChain - come check it out!')]

In [101]:
# Alternative: If you want exact array match
retriever.invoke(
    "Building with LangChain",
    filter={"options": "LangChain"},
)

[]

In [46]:
query_ex = "Quali piatti del ristorante Cosmica Essenza sono preparati utilizzando i Funghi dell’Etere o la tecnica di Marinatura Temporale Sincronizzata?"

In [82]:
import numpy as np


embedding_dim = vector_store.index.d
dummy_vector = np.zeros(embedding_dim)

# Retrieve all documents by setting k to the total number of documents
docs_and_scores = vector_store.similarity_search_with_score_by_vector(
    dummy_vector,
    k=vector_store.index.ntotal
)

docs_and_scores

[(Document(id='a830722e-2cd2-45e6-b606-390ab6f62a8d', metadata={'source': 'tweet', 'options': ['LangChain', 'project']}, page_content='Building an exciting new project with LangChain - come check it out!'),
  0.9999999),
 (Document(id='61309ad3-30f4-4f7a-a262-ac17d866e39b', metadata={'source': 'news', 'options': ['weather', 'forecast']}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
  1.0),
 (Document(id='59efcc11-7164-43e9-b99b-0d3f7cacc0a5', metadata={'source': 'tweet', 'options': ['pancakes', 'eggs']}, page_content='I had chocalate chip pancakes and scrambled eggs for breakfast this morning.'),
  1.0000001)]

[]

In [36]:
import chromadb
chroma_client = chromadb.Client()

In [37]:
collection = chroma_client.create_collection(name="my_collection")

In [46]:
collection.add(
    documents=[
        "This is a document about pineapple",
        "This is a document about oranges"
    ],
    ids=["id3", "id5"],
    metadatas=[{"fruit": ["pineapple"]}, {"fruit": "orange"}],
)

ValueError: Expected metadata value to be a str, int, float or bool, got ['pineapple'] which is a list in add.

In [34]:


# Example documents
documents = [
    "Document about regression and classification",
    "Document about CNN architecture",
    "Document about both regression and CNN"
]

# Method 1: Using lists in metadata (Recommended)
metadata_list = [
    {
        "techniques": ["regression", "classification"],
        "difficulty": "intermediate"
    },
    {
        "techniques": ["CNN"],
        "difficulty": "advanced"
    },
    {
        "techniques": ["regression", "CNN"],
        "difficulty": "advanced"
    }
]

# Add documents with metadata using lists
collection.add(
    documents=documents,
    ids=["doc1", "doc2", "doc3"],
    metadatas=metadata_list
)

# Query examples
# 1. Find documents about regression
results = collection.query(
    query_texts=["regression"],
    where={"techniques": {"$in": ["regression"]}}
)

# 2. Find advanced documents about CNN
results_advanced_cnn = collection.query(
    query_texts=["CNN architecture"],
    where={
        "$and": [
            {"techniques": {"$in": ["CNN"]}},
            {"difficulty": "advanced"}
        ]
    }
)