In [1]:
import os
import streamlit as st
import pickle
import time

from langchain.chat_models import ChatOpenAI  # ✅ Replaces 'OpenAI'
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


In [2]:
import langchain

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
llm = ChatOpenAI(temperature=0.9, max_tokens=500, model_name="gpt-3.5-turbo")


In [13]:
loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
])
data = loaders.load() 
len(data)

2

In [14]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [16]:
 # Attach source URLs to chunks
for i, doc in enumerate(docs):
    doc.metadata["source"] = urls[i % len(urls)] 

NameError: name 'urls' is not defined

In [7]:
# Create the embeddings of the chunks using openAIEmbeddings
embeddings = OpenAIEmbeddings()

# Pass the documents and embeddings inorder to create FAISS vector index
vectorindex_openai = FAISS.from_documents(docs, embeddings)

In [8]:

# Storing vector index create in local
file_path="vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex_openai, f)

In [9]:
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)


In [10]:
chain = RetrievalQA.from_chain_type(
    llm=llm,  # ChatOpenAI model
    chain_type="stuff",  # "stuff" is simple, works fine; can also use "map_reduce" or "refine"
    retriever=vectorIndex.as_retriever()
)

In [12]:
query = "what is the price of Tiago iCNG?"

result = chain.invoke({"query": query})

print("Answer:", result["result"])

# Get sources (URLs)
source_docs = result.get("source_documents", [])
print("\nSources:")
for doc in source_docs:
    source = doc.metadata.get("source", "Unknown Source")
    print(source)


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "what is the price of Tiago iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?",
  "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nfirst published: Aug 4, 2023 02:17 pm\n\nBusiness News,\n\nBudget 2025 News

In [21]:
import os
import pickle
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Load API key from .env file (OPENAI_API_KEY=your_key_here)
load_dotenv()

# Step 1: URLs to Load
urls = [
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-tiago-and-tigor-icng-twin-cylinder-tech-prices-start-at-rs-6-55-lakh-11098752.html",
    "https://www.business-standard.com/industry/automobile/tata-motors-launches-new-cng-models-check-prices-and-other-details-123080401130_1.html"
]

file_path = "faiss_store.pkl"

# Step 2: Load Data
loader = UnstructuredURLLoader(urls=urls)
print("Loading data from URLs...")
data = loader.load()

# Step 3: Text Splitting
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=1000
)
docs = text_splitter.split_documents(data)

# Step 4: Embedding & FAISS
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)

# Save Vectorstore
with open(file_path, "wb") as f:
    pickle.dump(vectorstore, f)

print("Vectorstore built and saved ✅")

# Step 5: Question Answering
query = "Tata Motors launches Punch iCNG, price starts at"

with open(file_path, "rb") as f:
    vectorstore = pickle.load(f)

llm = ChatOpenAI(temperature=0.9, max_tokens=500, model_name="gpt-3.5-turbo")
chain = RetrievalQA.from_chain_type(
    llm=llm, retriever=vectorstore.as_retriever(), return_source_documents=True
)

result = chain.invoke({"query": query})

# Print Answer
print("\nAnswer:")
print(result["result"])

# Print Sources (Deduplicated)
print("\nSources:")
source_docs = result.get("source_documents", [])
unique_sources = set()

for doc in source_docs:
    if hasattr(doc.metadata, 'get'):
        source = doc.metadata.get("source", "")
        if source:
            unique_sources.add(source)

if unique_sources:
    for src in unique_sources:
        print(src)
else:
    print("No sources found.")


Loading data from URLs...
Vectorstore built and saved ✅
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Tata Motors launches Punch iCNG, price starts at"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Tata Motors launches Punch iCNG, price starts at",
  "context": "Home\n\nNews\n\nBusiness\n\nTata Motors launches Punch iCNG, price starts at Rs 7.1 lakh\n\nTrending Topics\n\nSensex Live\n\nSmartworks Coworking Spaces IPO\n\nVedanta Share Price\n\nCryogenic OGS Shares\n\nTravel Food Services IPO allotment\n\nTata Motors launches Punch iCNG, price starts at Rs 7.1 lakh\n\nThe Punch iCNG is equipped with the company's proprietary twin-cylinder technology with enhanced safety features like a mi

In [24]:
import os
import pickle
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# ✅ Load API key from .env (must contain OPENAI_API_KEY)
load_dotenv()

# ✅ Step 1: URLs to Load
urls = [
    "https://www.moneycontrol.com/news/business/tata-motors-mahindra-gain-certificates-for-production-linked-payouts-11281691.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-tiago-and-tigor-icng-twin-cylinder-tech-prices-start-at-rs-6-55-lakh-11098752.html",
    "https://www.business-standard.com/industry/automobile/tata-motors-launches-new-cng-models-check-prices-and-other-details-123080401130_1.html"
]

file_path = "faiss_store.pkl"

# ✅ Step 2: Load Data from URLs
loader = UnstructuredURLLoader(urls=urls)
print("Loading data from URLs...")
data = loader.load()

# ✅ Step 3: Text Splitting
text_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ','],
    chunk_size=1000
)
docs = text_splitter.split_documents(data)

# ✅ Step 4: Embedding & FAISS Vector Store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)

# ✅ Save Vectorstore to file
with open(file_path, "wb") as f:
    pickle.dump(vectorstore, f)

print("✅ Vectorstore built and saved successfully!")

# ✅ Step 5: Load Vectorstore & Answer Query
query = "What is the starting price of Punch iCNG?"

with open(file_path, "rb") as f:
    vectorstore = pickle.load(f)

llm = ChatOpenAI(temperature=0.0, max_tokens=500, model_name="gpt-3.5-turbo")
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)

# ✅ Run Query
result = chain.invoke({"query": query})

# ✅ Print Answer
print("\nAnswer:")
print(result["result"])

# ✅ Print Sources (Unique)
print("\nSources:")
source_docs = result.get("source_documents", [])
unique_sources = set()

for doc in source_docs:
    source = doc.metadata.get("source", "")
    if source:
        unique_sources.add(source)

if unique_sources:
    for src in unique_sources:
        print(src)
else:
    print("No sources found.")


Loading data from URLs...
✅ Vectorstore built and saved successfully!
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is the starting price of Punch iCNG?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is the starting price of Punch iCNG?",
  "context": "Launched in 2020, India's $24 billion production-linked incentive programme covers 14 sectors, ranging from electronic products to autos, and is crucial to boosting jobs in manufacturing, an area where India has struggled.\n\nStory continues below Advertisement\n\nRemove Ad\n\nReuters reported this month that India's top bureaucrat reviewed the scheme, amid a push from industry for faster payouts.\n\nThe Indian government said o

In [None]:
import os
import streamlit as st
import pickle
import time

from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Load API key from .env
load_dotenv()

st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

# Input URLs
urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    urls.append(url)

process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store.pkl"

main_placeholder = st.empty()
llm = ChatOpenAI(temperature=0.9, max_tokens=500, model_name="gpt-3.5-turbo")

if process_url_clicked:
    # Load data from URLs
    loader = UnstructuredURLLoader(urls=urls)
    main_placeholder.text("Loading data from URLs... ✅")
    data = loader.load()

    # Text Splitting
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Splitting text... ✅")
    docs = text_splitter.split_documents(data)

    # Embedding & FAISS
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)
    main_placeholder.text("Building vectorstore... ✅")
    time.sleep(2)

    # Save vectorstore
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore, f)

query = main_placeholder.text_input("Ask Your Question:")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        chain = RetrievalQA.from_chain_type(
            llm=llm, retriever=vectorstore.as_retriever(), return_source_documents=True
        )

        result = chain.invoke({"query": query})

        # Show Answer
        st.header("Answer")
        st.write(result["result"])

        # Show Unique Sources
        source_docs = result.get("source_documents", [])
        unique_sources = set()
        for doc in source_docs:
            source = doc.metadata.get("source", "")
            if source:
                unique_sources.add(source)

        if unique_sources:
            st.subheader("Sources:")
            for src in unique_sources:
                st.write(src)
        else:
            st.info("No sources found.")
