In [2]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [37]:
## Configure the OpenAI credential without hardcoding it in the notebook.
## A key that is already exported in the environment is kept as-is; the
## hidden prompt is shown only when no non-empty key is present, so an
## existing credential is never clobbered with an empty string.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass  # local import keeps this cell self-contained
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [38]:
## Language model that will later answer questions over the retrieved chunks.
## temperature and max_tokens are handed straight to the OpenAI wrapper.
llm = OpenAI(temperature=0.9, max_tokens=500)

## The news articles that make up the knowledge base for this notebook.
article_urls = [
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
]

## Fetch the URLs and load them as documents; the cell displays how many
## documents came back.
loaders = UnstructuredURLLoader(urls=article_urls)
data = loaders.load()
len(data)

2

In [39]:
## Chunking: the loaded articles are cut into smaller segments so that long
## documents can be handled by the model.
## Overlap: neighbouring chunks share some text so that nothing is lost at
## a chunk boundary.
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

## `data` already holds document objects, so split_documents is used
## directly instead of split_text.
docs = text_splitter.split_documents(data)
print(f"Number of chunks: {len(docs)}")

Number of chunks: 34


In [40]:
## Peek at a single chunk to sanity-check what the splitter produced.
sample_index = 9
docs[sample_index]

Document(page_content='PRO\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHome\n\nNews\n\nBusiness\n\nMarkets\n\nWall Street rises as Tesla soars on AI optimism\n\nTesla (TSLA.O) rallied 10% after Morgan Stanley upgraded the electric car maker to "overweight" from "equal-weight," saying its Dojo supercomputer could boost the company\'s market value by nearly $600 billion.\n\nReuters\n\nSeptember 12, 2023 / 06:56 AM IST\n\n\n\n\n\n\n\n\n\n\n\n\n\nWall Street rises as Tesla soars on AI optimism\n\nThe Nasdaq closed sharply higher on Monday as Tesla surged on optimism around artificial intelligence and investors awaited inflation data due later this week.\n\nTesla\xa0(TSLA.O)\xa0rallied 10% after Morgan Stanley\xa0upgraded\xa0the electric car maker to "overweight" from "equal-weight," saying its Dojo supercomputer could boost the company\'s market value by nearly $600 billion.\n\nStory continues below Advertisement\n\nRemove Ad\n\nOth

In [None]:
## Build the vector index over the article chunks.
## FAISS.from_documents takes two things:
## - the chunks/docs created above
## - the embedding object used to vectorise them
## The resulting index acts like a small database that knows our articles.
embedding_model = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model)

vectorindex_openai = FAISS.from_documents(docs, embeddings)

In [None]:
## Persist the freshly built vector index to a local pickle file so it can
## be reloaded without recomputing the embeddings.
## NOTE(review): FAISS stores also expose save_local/load_local; consider
## that route if pickling fails on the installed langchain version -- confirm.
file_path = "vector_index.pkl"
with open(file_path, "wb") as index_file:
    pickle.dump(vectorindex_openai, index_file)

In [None]:
## Loading the pkl file into memory
## Fail loudly when the index file is missing: the cells below use
## `vectorIndex` unconditionally, so silently skipping the load would only
## surface later as a confusing NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"Vector index not found at {file_path!r}; run the cell that saves it first."
    )

## SECURITY: pickle.load can execute arbitrary code embedded in the file.
## Only load a pickle that this notebook wrote itself.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [None]:
## Question-answering chain that also reports its sources.
## The retriever decides how chunks are fetched from the vector database.
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)
chain

In [None]:
## Run one question through the chain. As described by the original notes:
## 1. the relevant chunks are retrieved from the vector db
## 2. each chunk is filtered down to the part relevant to the question
## 3. the 4 chunks are each run through an LLM call
## 4. the 4 partial answers are combined into one summary answer

## Turn on langchain's debug output for the run below.
langchain.debug = True

query = 'What is the price of Tiago iCNG?'
chain_inputs = {"question": query}

chain(chain_inputs, return_only_outputs=True)