In [1]:
import os
import pickle
import time

import faiss
import langchain
from dotenv import load_dotenv
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.document_loaders import UnstructuredURLLoader
from langchain.globals import set_debug
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key the Gemini clients
# below read from the environment — confirm the expected variable name.
load_dotenv()

True

In [3]:
# Build the Gemini LLM wrapper, then send one throwaway prompt to confirm that
# credentials and connectivity work before building the rest of the pipeline.
llm = GoogleGenerativeAI(model="gemini-1.5-pro")
smoke_test_prompt = 'What is llm?'
llm.invoke(smoke_test_prompt)

'LLM stands for **Large Language Model**.\n\nIt\'s a type of artificial intelligence (AI) model designed to understand and generate human-like text.  These models are trained on massive amounts of text data, allowing them to learn patterns, grammar, and even some reasoning abilities.  They can perform various tasks, including:\n\n* **Text generation:** Writing stories, poems, articles, summaries, and more.\n* **Translation:** Converting text from one language to another.\n* **Question answering:** Providing answers based on the information they\'ve been trained on.\n* **Dialogue:** Engaging in conversations with users.\n* **Code generation:** Writing code in various programming languages.\n\nLLMs achieve these feats through complex algorithms and architectures, often based on transformer networks.  These networks allow the models to process and understand the relationships between words and phrases in a given text.  The "large" in LLM refers to the sheer size of these models, both in t

In [4]:
# News articles that form the knowledge base for the question-answering chain.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

# Fetch the pages and parse them into LangChain Document objects.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()

In [5]:
# Display the Document objects returned by the loader (source URL in metadata,
# scraped page text in page_content).
data

[Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹15 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topics\n\nSensex TodayHEG Share PriceIndus Towers sh

### (2) Split data to create chunks

In [6]:
# Split the scraped pages into overlapping chunks so each piece is small enough
# to embed and to pass to the LLM; the overlap keeps context that would
# otherwise be cut at a chunk boundary.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(data)

In [7]:
# Display the chunked Document objects produced by the splitter.
docs

[Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nLoans up to ₹15 LAKHS\n\nFixed Deposits\n\nCredit CardsLifetime Free\n\nCredit Score\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topics\n\nSensex TodayHEG Share PriceIndus Towers sh

In [8]:
# Number of chunks the splitter produced from the loaded pages.
len(docs)

16

In [9]:
# Check that the first chunk's metadata carries a 'source' entry (the article
# URL); the sources-aware QA chain reports this value alongside its answer.
docs[0].metadata['source']

'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'

### (3) Create embeddings for these chunks and save them to a FAISS index

In [10]:
# Embed every chunk with Google's embedding model and index the resulting
# vectors in an in-memory FAISS store.
embeddings = GoogleGenerativeAIEmbeddings(model='models/embedding-001')
vectorindex = FAISS.from_documents(documents=docs, embedding=embeddings)

In [11]:
# Display the repr of the freshly built in-memory FAISS vector store.
vectorindex

<langchain_community.vectorstores.faiss.FAISS at 0x28553eec9a0>

In [12]:
# Saving
vectorindex.save_local("faiss_index")

# Loading
from langchain_community.vectorstores import FAISS
loaded_vectorindex = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [13]:
# Repr of the original in-memory store, shown for comparison with the reloaded
# store displayed in the next cell.
vectorindex

<langchain_community.vectorstores.faiss.FAISS at 0x28553eec9a0>

In [14]:
# Repr of the store reloaded from disk; it is a separate object from
# `vectorindex`, as the differing memory addresses in the outputs show.
loaded_vectorindex

<langchain_community.vectorstores.faiss.FAISS at 0x28553eefb20>

### (4) Retrieve the chunks most similar to a given question and call the LLM to produce the final answer

In [15]:
# Expose the reloaded FAISS store as a retriever, then wire it together with the
# LLM into a question-answering chain that also reports its sources.
retriever = loaded_vectorindex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

In [16]:
# Inspect the assembled chain: its repr shows the combine-documents strategy,
# the prompt templates, and the LLM it is configured with.
chain

RetrievalQAWithSourcesChain(verbose=False, combine_documents_chain=MapReduceDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Use the following portion of a long document to see if any of the text is relevant to answer the question. \nReturn any relevant text verbatim.\n{context}\nQuestion: {question}\nRelevant text, if any:'), llm=GoogleGenerativeAI(model='gemini-1.5-pro', google_api_key=SecretStr('**********'), client=genai.GenerativeModel(
    model_name='models/gemini-1.5-pro',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
    model_name='models/gemini-1.5-pro',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_con

In [18]:
query = "what is the price of Tiago iCNG?"

# Turn on LangChain's global debug tracing so every intermediate chain and LLM
# call is printed. set_debug() is the supported API; assigning to
# `langchain.debug` directly is the legacy mechanism.
set_debug(True)

# Calling the chain object directly (`chain({...})`) is deprecated; invoke() is
# the supported entry point and accepts the same input dict.
# return_only_outputs=True drops the echoed input keys from the result, leaving
# just 'answer' and 'sources'.
chain.invoke({'question': query}, return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nDiscover 

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:GoogleGenerativeAI] [16.84s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh\n",
        "generation_info": {
          "usage_metadata": {
            "prompt_token_count": 292,
            "candidates_token_count": 23,
            "total_token_count": 315,
            "cached_content_token_count": 0
          }
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null,
  "type": "LLMResult"
}
[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:GoogleGenerativeAI] [16.84s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The provided text mentions the price of the *Tata Punch iCNG* starts at Rs 7.1 lakh.  It does *not* mention th

Retrying langchain_google_genai.llms._completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:GoogleGenerativeAI] [8.03s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\n\nSOURCES: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html\n",
        "generation_info": {
          "usage_metadata": {
            "prompt_token_count": 1822,
            "candidates_token_count": 77,
            "total_token_count": 1899,
            "cached_content_token_count": 0
          }
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null,
  "type": "LLMResult"
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] [8.03s] Exiting Chain run with output:
[0m{
  "text": "The Tiago iCNG is priced between Rs 6.55

{'answer': 'The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.\n\n',
 'sources': 'https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html'}