In [None]:
"""
%pip install -Uqq langchain-weaviate langchain-community
%pip install langchain-openai tiktoken langchain pypdf"""

In [3]:
import weaviate
from weaviate.classes.init import Auth
import google.generativeai as genai
from typing import List, Dict
import os
from typing import List, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from weaviate.classes import query as wvc
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [6]:
# Connect to Weaviate Cloud
client = weaviate.connect_to_weaviate_cloud(
    cluster_url= WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

print(client.is_ready())

True


In [42]:
huggingface_key = huggingface_api_key
headers = {
    "X-HuggingFace-Api-Key": huggingface_key,
}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,                       # `weaviate_url`:  Weaviate URL
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),      # `weaviate_key`:  Weaviate API key
    headers=headers
)

In [None]:
"""# Correct way to set up headers for HuggingFace
headers = {
    "X-HuggingFace-Api-Key": huggingface_api_key,
    "Authorization": f"Bearer {huggingface_api_key}"  # Add proper authorization
}

# Initialize Weaviate client with correct authentication
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers=headers
)"""

            Please make sure to close the connection using `client.close()`.


In [7]:

genai.configure(api_key=gemini_api_key)

In [43]:


print("Client is Ready?", client.is_ready())

Client is Ready? True


In [None]:
from weaviate import classes as wvc

"""# lets make sure its vectorizer is what the one we want
collection = client.collections.create(
    name="WikipediaLangChain",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.config.Configure.Generative.openai(),
)"""

'# lets make sure its vectorizer is what the one we want\ncollection = client.collections.create(\n    name="WikipediaLangChain",\n    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),\n    generative_config=wvc.config.Configure.Generative.openai(),\n)'

https://weaviate.io/developers/weaviate/model-providers/huggingface/embeddings

In [60]:
client.collections.delete("WikipediaLangChain")

In [61]:
from weaviate.classes.config import Configure

client.collections.create(
    "WikipediaLangChain",
    vectorizer_config= None,

)

"""# Create collection with correct model configuration
client.collections.create(
    "WikipediaLangChain",
    vectorizer_config=Configure.Vectorizer.none(),  # We'll use our own vectorizer
)"""

'# Create collection with correct model configuration\nclient.collections.create(\n    "WikipediaLangChain",\n    vectorizer_config=Configure.Vectorizer.none(),  # We\'ll use our own vectorizer\n)'

In [10]:
"""embedding_model_name = "sentence-transformers/all-mpnet-base-v2"

embeddings = HuggingFaceEmbeddings(
    model_name = embedding_model_name
)"""

embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",  # Google's text embedding model
        google_api_key= gemini_api_key
    )

In [11]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)


In [62]:



# import first article
loader = PyPDFLoader("brazil-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Brazil")
db = WeaviateVectorStore.from_documents(docs, embeddings, client=client, index_name="WikipediaLangChain")


# import second article
loader = PyPDFLoader("netherlands-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Netherlands")
db = WeaviateVectorStore.from_documents(docs, embeddings, client=client, index_name="WikipediaLangChain")

GOT 247 docs for Brazil
GOT 274 docs for Netherlands


In [63]:
# Create vector store
vector_store = WeaviateVectorStore(
    client=client,
    index_name="WikipediaLangChain",
    text_key="text",
    embedding=embeddings,  
    attributes=["source"]
)

In [64]:
vector_store.add_documents(docs)


['da6c19e1-68d8-45be-a8a4-71bde6b34e77',
 '787711b5-9696-4489-9f3f-3142443e4b10',
 '2e528f8d-d081-4bc2-a80b-f5599a8d8f5d',
 '88919295-5a81-4e60-b436-9fac1875d768',
 'f5fd5870-0013-43a4-8d98-cd8dbde8930e',
 '137dc2b0-8be9-44b4-a89c-9d151080189d',
 'a7493614-cefe-4361-bab8-d1319acc26d3',
 'b5f24e1a-c33f-4f6a-851e-2c1371ce92a6',
 'c7fd1ea2-9500-4ba1-8741-015ce5529427',
 '45e69180-8f00-44c1-842f-725f412963a4',
 '65b9a76c-06b5-4242-8226-2b4ade9346b0',
 '023861f3-aea5-4822-8b0c-be8be267d7d4',
 '06e26714-64dc-4613-b56e-0f63e43e52b7',
 'a14303cc-4b66-4f35-8b63-2eb8641c3da4',
 'f870d840-26d3-4efb-9211-ff92c1499f55',
 '1b12b9f1-9bcd-45e4-a6c6-4820579ba9d1',
 '9f388af2-88cc-4777-ad30-302bafde2649',
 '692912b1-5017-4d04-95d8-118ddc3ef1f0',
 '8d6d75a3-63bb-48b9-8026-47c359622371',
 '21e951f2-05f3-4224-b37a-a2217eb2d49f',
 '404a0683-abbd-4140-a751-6d04f5e697df',
 '911a309c-4eda-499c-afb4-560af82f3f33',
 'c615e8b3-e56e-4b4e-96d8-f99033d29f3c',
 '11e7681e-eff8-4191-aad8-9a3ea556f7b9',
 'd97c8938-4af3-

In [66]:
collection = client.collections.get("WikipediaLangChain")

dir(collection)

['_CollectionBase__consistency_level',
 '_CollectionBase__properties',
 '_CollectionBase__references',
 '_CollectionBase__tenant',
 '_Collection__cluster',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__orig_class__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_config',
 '_connection',
 '_is_protocol',
 '_query',
 '_validate_arguments',
 'aggregate',
 'backup',
 'batch',
 'config',
 'consistency_level',
 'data',
 'exists',
 'generate',
 'iterator',
 'name',
 'query',
 'shards',
 'tenant',
 'tenants',
 'with_consistency_level',
 'with_tenant']

In [67]:
# collection
collection = client.collections.get("WikipediaLangChain")

let's count how many objects we have in total

In [68]:
response = collection.aggregate.over_all(total_count=True)
print(response)

AggregateReturn(properties={}, total_count=795)


In [69]:
# Group by source
response = collection.aggregate.over_all(group_by="source")
for group in response.groups:
    print(group.grouped_by.value, group.total_count)

netherlands-wikipedia-article-text.pdf 548
brazil-wikipedia-article-text.pdf 247


In [70]:
# View object properties
object = collection.query.fetch_objects(limit=1).objects[0]
print(object.properties.keys())
print(object.properties.get("source"))
print(object.properties.get("page"))
print(object.properties.get("text"))

dict_keys(['text', 'page', 'source'])
netherlands-wikipedia-article-text.pdf
6.0
with 1.30 m (4.3 ft) of additional flood protection. Climate change will not only threaten the Netherlands from the seaside but could
also alter rainfall patterns and river run-off. To protect the country from river flooding, another programme is already being executed.


In [71]:
# Query in French using Gemini
generateTask = "Quelle est la nourriture traditionnelle de ce pays?"
source_file = "brazil-wikipedia-article-text.pdf"

In [72]:
model = ChatGoogleGenerativeAI(
    model="gemini-pro", 
    google_api_key= gemini_api_key
)

In [73]:
dir(wvc)

['ConsistencyLevel',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'aggregate',
 'batch',
 'config',
 'data',
 'generics',
 'init',
 'query',
 'rbac',
 'tenants']

In [74]:

generateTask = "Quelle est la nourriture traditionnelle de ce pays?"
# lets filter it out, and only use this specific file
source_file = "brazil-wikipedia-article-text.pdf"
#source_file = "netherlands-wikipedia-article-text.pdf"

In [78]:
query_embedding = embeddings.embed_query("traditional food")
query = collection.query.near_vector(
    near_vector=query_embedding,
    limit=10,
    return_metadata=True
)

WeaviateInvalidInputError: Invalid input provided: Argument 'return_metadata' must be one of: [typing.Sequence[str], <class 'weaviate.collections.classes.grpc.MetadataQuery'>, None], but got <class 'bool'>.

those were some of the objects used for this generation

Note that we used a filter, so the content will be searched and generated only for that specific pdf.
Let's change the filter to the second pdf file.

In [13]:
# We can filter it out, now for Netherlands
generateTask = "Qual é a comida tradicional deste país?. Answer in english"
# now generating the answer using Wikipedia
source_file = "netherlands-wikipedia-article-text.pdf"

query = collection.generate.near_text(
    query="tradicional food",
    filters=wvc.query.Filter.by_property("source").equal(source_file),
    limit=10,
    grouped_task=generateTask
)

print(query.generated)

The traditional food of the Netherlands typically consists of potatoes, meat, and seasonal vegetables for dinner. The diet was historically high in carbohydrates and fat, reflecting the needs of laborers. Some typical Dutch foods include mayonnaise, whole-grain mustards, chocolate, buttermilk, seafood like herring and mussels, and pastries like stroopwafel and gevulde koek. The cuisine varies by region, with different specialties in the north, south, and western parts of the country. Beer and Jenever are traditional alcoholic beverages in the region.


In [14]:
# We can filter it out for multilpe sources
generateTask = "What is in common on the food of thouse two countries?"
# now generating the answer using Wikipedia
source_files = ["netherlands-wikipedia-article-text.pdf", "brazil-wikipedia-article-text.pdf"]

query = collection.generate.near_text(
    query="tradicional food",
    filters=wvc.query.Filter.by_property("source").contains_any(source_files),
    limit=10,
    grouped_task=generateTask
)

print(query.generated)

Both Brazil and the Netherlands have a variety of fried foods in their cuisine. In Brazil, fried potatoes, fried cassava, fried banana, fried meat, and fried cheese are commonly eaten, while in the Netherlands, fried fish dishes like kibbeling and lekkerbek are popular. Additionally, both countries have a tradition of using flour in their dishes, such as in Brazilian farofa and Dutch cookies and pastries.


In [32]:

db = WeaviateVectorStore(embedding=embeddings, client=client, index_name="WikipediaLangChain", text_key="text")

In [None]:
# similarity search on all objects
docs = db.similarity_search("traditional food")
print(docs)

[Document(metadata={'page': 14.0, 'source': 'netherlands-wikipedia-article-text.pdf'}, page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (a roll with a sausage of ground beef, literally translates into sausage\nbread) being the most popular. The traditional alcoholic beverage of the region is beer. There are many local brands, ranging from'), Document(metadata={'page': 14.0, 'source': 'netherlands-wikipedia-article-text.pdf'}, page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (a roll with a sausage of ground beef, literally translates into sausage\nbread) being the most popular. The traditional alcoholic beverage of the region is beer. There are many local bra

In [34]:
# change bellow to get chunks per different files / countries
source_file = "brazil-wikipedia-article-text.pdf"
#source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = wvc.query.Filter.by_property("source").equal(source_file)
docs = db.similarity_search("traditional food", filters=where_filter)
print(docs)

[Document(metadata={'page': 12.0, 'source': 'brazil-wikipedia-article-text.pdf'}, page_content="Japanese, Jewish and Arab immigrants who arrived in large numbers in the South and Southeast of Brazil during the 19th\nand 20th centuries. The indigenous Amerindians influenced Brazil's language and cuisine; and the Africans influenced\nlanguage, cuisine, music, dance and religion."), Document(metadata={'page': 13.0, 'source': 'brazil-wikipedia-article-text.pdf'}, page_content='flour (farofa). Fried potatoes, fried cassava, fried banana, fried meat and fried cheese are very often eaten in lunch and\nserved in most typical restaurants. Popular snacks are pastel (a fried pastry); coxinha (a variation of chicken croquete); pão\nde queijo (cheese bread and cassava flour / tapioca); pamonha (corn and milk paste); esfirra (a variation of Lebanese'), Document(metadata={'page': 13.0, 'source': 'brazil-wikipedia-article-text.pdf'}, page_content="acarajé (from African cuisine).\nThe national beverage

In [35]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


from weaviate.classes.query import Filter

In [38]:
models = genai.list_models()


print("Available Gemini Models:")
for model in models:
    if "gemini" in model.name.lower():
        print(f"- {model.name}")

Available Gemini Models:
- models/gemini-1.0-pro-latest
- models/gemini-1.0-pro
- models/gemini-pro
- models/gemini-1.0-pro-001
- models/gemini-1.0-pro-vision-latest
- models/gemini-pro-vision
- models/gemini-1.5-pro-latest
- models/gemini-1.5-pro-001
- models/gemini-1.5-pro-002
- models/gemini-1.5-pro
- models/gemini-1.5-pro-exp-0801
- models/gemini-1.5-pro-exp-0827
- models/gemini-1.5-flash-latest
- models/gemini-1.5-flash-001
- models/gemini-1.5-flash-001-tuning
- models/gemini-1.5-flash
- models/gemini-1.5-flash-exp-0827
- models/gemini-1.5-flash-002
- models/gemini-1.5-flash-8b
- models/gemini-1.5-flash-8b-001
- models/gemini-1.5-flash-8b-latest
- models/gemini-1.5-flash-8b-exp-0827
- models/gemini-1.5-flash-8b-exp-0924
- models/gemini-2.0-flash-exp
- models/gemini-exp-1206
- models/gemini-exp-1121
- models/gemini-exp-1114
- models/gemini-2.0-flash-thinking-exp
- models/gemini-2.0-flash-thinking-exp-1219


In [None]:


# client = weaviate.connect_to_weaviate_cloud(...)

db = WeaviateVectorStore.from_documents([], embeddings, client=client, index_name="WikipediaLangChain")

source_file = "brazil-wikipedia-article-text.pdf"
#source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = Filter.by_property("source").equal(source_file)

# retriever to filter the results
retriever = db.as_retriever(search_kwargs={"filters": where_filter})

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)



In [40]:
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-flash", 
    google_api_key= gemini_api_key
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "What is he traditional food of this country?"})
print(response["answer"])

Feijoada is considered Brazil's national dish.  A typical meal often includes rice and beans with beef, salad, french fries, and a fried egg.  Regional variations exist, reflecting the country's diverse population.



In [79]:
#lets close our embedded server
client.close()