In [3]:
# Run this cell and paste the API key in the prompt
import os
import getpass

# Only prompt when the key is not already available, so re-running this cell
# (or starting the kernel with GOOGLE_API_KEY exported) neither asks again nor
# overwrites a key that is already configured.
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass.getpass('Gemini API Key:')

In [5]:

from langchain import PromptTemplate
from langchain import hub
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.schema import StrOutputParser
from langchain.schema.prompt_template import format_document
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import Chroma


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [60]:
# Download the EEA "Plastics" topic page and parse it into LangChain documents.
page_url = "https://www.eea.europa.eu/en/topics/in-depth/plastics"
loader = WebBaseLoader(page_url)
docs = loader.load()

# Display the first loaded document (metadata plus page text).
docs[0]

Document(metadata={'source': 'https://www.eea.europa.eu/en/topics/in-depth/plastics', 'title': "Plastics | European Environment Agency's home page", 'description': 'Plastics are everywhere — from food packaging to healthcare, construction materials, furniture and textiles. They are, unfortunately, also bad for the environment. They are not only polluting the seas and land, but also contributing to climate change and air emissions.', 'language': 'en'}, page_content="\nPlastics | European Environment Agency's home pageSkip to main contentSkip to navigationSkip to footerAn official website of the European Union | How do you know?All official European Union website addresses are in the europa.eu domain.See all EU institutions and bodiesEnvironmental information systemsEuropean Environment Agency websiteWISE marine - Marine information system for EuropeWISE freshwater - Freshwater information system for EuropeBISE - Biodiversity information system for EuropeFISE - Forest information system 

In [61]:
# Print the string form of the first loaded document (it starts with
# `page_content=`), as opposed to the repr shown by a bare `docs[0]`.
print(docs[0])

page_content='
 More about human exposure to bisphenol A in EuropeHow to move towards sustainable plastics?Circular and sustainability practices throughout the lifecycle of plastics can help reduce greenhouse gas emissions, pollution and waste. Many such good practice examples already exist and would need to be scaled up to enable a circular plastics economy in Europe.Our briefing 'Pathways towards circular plastics in Europe' aims to inspire businesses, policymakers and citizens to make the production and consumption of plastics more circular and sustainable. The briefing is based on an underpinning technical report by the EEA’s European Topic Centre on Circular Economy and Resource Use.More in EEA briefing on plasticsJacek Dylag on UnsplashMore informationOther topics you might be interested in:Buildings and constructionChemicalsCircular economyEconomy and resourcesProduction and consumptionResource use and materialsTextilesWaste and recyclingExternal links:EU plastics strategyCircul

In [62]:
# Extract the text from the website data document
text_content = docs[0].page_content

# NOTE(review): the whole page text is used as-is, including the navigation and
# footer boilerplate visible in the output of the load cell. If only the article
# body is wanted, trim `text_content` here (e.g. with `str.split()` on marker
# substrings taken from this page) before wrapping it.

# Convert the text to LangChain's `Document` format.
# This rebinds `docs`: from here on it is a one-element list holding the wrapped
# text, and the loader's metadata is replaced by {"source": "local"}.
docs =  [Document(page_content=text_content, metadata={"source": "local"})]

print(text_content)


 More about human exposure to bisphenol A in EuropeHow to move towards sustainable plastics?Circular and sustainability practices throughout the lifecycle of plastics can help reduce greenhouse gas emissions, pollution and waste. Many such good practice examples already exist and would need to be scaled up to enable a circular plastics economy in Europe.Our briefing 'Pathways towards circular plastics in Europe' aims to inspire businesses, policymakers and citizens to make the production and consumption of plastics more circular and sustainable. The briefing is based on an underpinning technical report by the EEA’s European Topic Centre on Circular Economy and Resource Use.More in EEA briefing on plasticsJacek Dylag on UnsplashMore informationOther topics you might be interested in:Buildings and constructionChemicalsCircular economyEconomy and resourcesProduction and consumptionResource use and materialsTextilesWaste and recyclingExternal links:EU plastics strategyCircular economy act

In [63]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model used to vectorise the page text.
#
# When the API key is not available as an environment variable, pass it
# explicitly through the `google_api_key` parameter of
# `GoogleGenerativeAIEmbeddings`, e.g. `google_api_key = "key"`.
embedding_model_name = "models/embedding-001"
gemini_embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)
     

In [64]:

# Save to disk.
#
# The persist directory survives kernel restarts, and `from_documents` adds to
# whatever the collection already holds. Without a reset, every re-run (or a
# change of source page) leaves old or duplicate documents behind, and the
# retriever may return those instead of the current page. Drop the existing
# collection first so the store contains exactly the documents built above.
Chroma(
    persist_directory="./chroma_db",       # Directory of db
    embedding_function=gemini_embeddings   # Embedding model
).delete_collection()

vectorstore = Chroma.from_documents(
                     documents=docs,                 # Data
                     embedding=gemini_embeddings,    # Embedding model
                     persist_directory="./chroma_db" # Directory to save data
                     )

In [65]:
# Re-open the persisted Chroma store from disk, using the same embedding model
# that was used to write it.
chroma_location = "./chroma_db"
vectorstore_disk = Chroma(
    embedding_function=gemini_embeddings,
    persist_directory=chroma_location,
)

# Expose the store through the Retriever interface for later use: given an
# unstructured query, a retriever returns documents.
# More on retrievers:
# https://python.langchain.com/docs/modules/data_connection/retrievers/
#
# The store holds a single document, so `k` is lowered from Chroma's default
# similarity-search value of 4 to 1; leaving it out produces a warning.
retrieval_options = {"k": 1}
retriever = vectorstore_disk.as_retriever(search_kwargs=retrieval_options)

# Check that the retriever works by fetching the documents most relevant to a
# term that appears in the loaded page. A length greater than zero means the
# retriever is functioning.
# `invoke` is the Runnable entry point (the same interface used for
# `rag_chain.invoke` below) and replaces the deprecated
# `get_relevant_documents`.
print(len(retriever.invoke("plastics")))

1


In [66]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that will answer the questions.
#
# When the API key is not available as an environment variable, pass it
# explicitly through the `google_api_key` parameter of
# `ChatGoogleGenerativeAI`, e.g. `google_api_key="key"`.
sampling_settings = {"temperature": 0.7, "top_p": 0.85}
llm = ChatGoogleGenerativeAI(model="gemini-pro", **sampling_settings)
     

In [67]:
# Prompt template to query Gemini.
# Written as adjacent string literals so every newline in the final prompt is
# explicit; `{question}` and `{context}` are filled in when the chain runs.
llm_prompt_template = (
    "You are an assistant for question-answering tasks.\n"
    "Use the following context to answer the question.\n"
    "If you don't know the answer, just say that you don't know.\n"
    "Use five sentences maximum and keep the answer concise.\n\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

llm_prompt = PromptTemplate.from_template(llm_prompt_template)

print(llm_prompt)

input_variables=['context', 'question'] template="You are an assistant for question-answering tasks.\nUse the following context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse five sentences maximum and keep the answer concise.\n\nQuestion: {question} \nContext: {context} \nAnswer:"


In [68]:
# Combine data from documents to readable string format.
def format_docs(docs):
    """Return the `page_content` of every document joined by a blank line.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` yields nothing.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# Build the "stuff documents" chain with LCEL.
#
# It is a chain in the literal sense: separate elements are linked together
# around the LLM. Here the context retrieved for the question, the prompt, the
# LLM and the output parser are piped into one another.
#
# Pipeline:
# 1. The retriever pulls the website data relevant to the question out of the
#    Chroma vector store; `format_docs` turns it into the `context` string.
# 2. `RunnablePassthrough` forwards the value given to `invoke` unchanged as
#    `question`.
# 3. `context` and `question` fill the matching placeholders of the prompt.
# 4. The filled-in prompt is sent to the LLM (`gemini-pro`).
# 5. The LLM output goes through an output parser that structures the model's
#    response.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | llm_prompt | llm | StrOutputParser()
  

In [15]:
# NOTE(review): this cell was last executed as In [15], before the plastics
# page was loaded (In [60]); the recorded answer below reflects an earlier data
# source. Re-run it against the current store or remove the cell.
rag_chain.invoke("What is Gemini?")

"Gemini is Google's largest and most capable AI model. It is designed to be natively multimodal, pre-trained from the start on different modalities. Gemini 1.0 was trained to recognize and understand text, images, audio and more at the same time. It can also understand, explain and generate high-quality code in the world’s most popular programming languages. Gemini is also our most flexible model yet — able to efficiently run on everything from data centers to mobile devices."

In [19]:
# NOTE(review): last executed as In [19], before the plastics page was loaded
# (In [60]); the recorded answer below is stale. Re-run or remove.
rag_chain.invoke("What are u?")

"I am Gemini, Google's largest and most capable AI model. I am able to efficiently run on everything from data centers to mobile devices. My state-of-the-art capabilities will significantly enhance the way developers and enterprise customers build and scale with AI."

In [30]:
# NOTE(review): last executed as In [30], before the plastics page was loaded
# (In [60]); the recorded answer below describes a different text. Re-run or
# remove.
rag_chain.invoke("What does the provided text say?")

"The provided text introduces Gemini, Google's largest and most capable AI model to date. Gemini is designed to be natively multimodal, pre-trained from the start on different modalities, and fine-tuned with additional multimodal data. It has sophisticated multimodal reasoning capabilities and can understand and reason about all kinds of inputs from the ground up. Gemini excels in several coding benchmarks, including HumanEval and Natural2Code, and can be used as the engine for more advanced coding systems like AlphaCode 2. Gemini is trained at scale on Google's AI-optimized infrastructure using Tensor Processing Units (TPUs) v4 and v5e, and is designed to be reliable, scalable, and efficient."

In [54]:
# NOTE(review): last executed as In [54], before the plastics page was loaded
# (In [60]); the recorded answer below comes from an earlier data source.
# Re-run or remove.
rag_chain.invoke("What is Hirakud?")

'Hirakud Dam is a man-made structure of earth, concrete, and masonry, and the longest major earthen dam in the world, measuring 25.8 km (16.0 mi) including dykes, and stands across the river Mahanadi.'

In [71]:
# Ask a question about the loaded EEA plastics page; the chain returns the
# model's answer as a plain string.
rag_chain.invoke("What is a plastic?")

'Plastic is a relatively new invention in human history with some of the first examples dating back to the late 1800s. It is a versatile and low-cost material, which has made it one of the most widely used materials of modern times. Plastic is used in a wide variety of applications, including food packaging, healthcare, construction materials, furniture, and textiles.'