In [1]:
import getpass
import os

import langchain
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import MapReduceDocumentsChain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import create_retrieval_chain
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI

In [2]:
# Credentials are read from the environment so that no secret is ever written
# into (or committed with) the notebook. When GROQ_API_KEY is not set, fall back
# to an interactive prompt; getpass does not echo what is typed.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")
#gemini_api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")

# Chat model used by the question-answering chain built further down.
# NOTE(review): confirm that this model id is still served by Groq before re-running.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.5, groq_api_key=groq_api_key)
#llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.5, google_api_key = gemini_api_key)

## Load data

In [3]:
# Article sets that have been used with this notebook, keyed by a short name.
# Change ACTIVE_URL_SET to index a different set instead of commenting code in
# and out. The questions asked at the bottom of the notebook are written for
# "champions_trophy_2025" and need to be adapted for the other sets.
URL_SETS = {
    "moneycontrol_markets": [
        "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
        "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
    ],
    "t20_world_cup_2024": [
        "https://www.thehindu.com/sport/cricket/2024-t20-world-cup-analysis-of-a-famous-indian-victory/article68354339.ece",
        "https://www.thehindu.com/opinion/editorial/new-beginnings-the-hindu-editorial-on-indias-icc-t20-world-cup-2024-win/article68351915.ece",
    ],
    "champions_trophy_2025": [
        "https://sportstar.thehindu.com/cricket/champions-trophy/india-wins-champions-trophy-2025-ind-vs-nz-final-match-report-score-highlights/article69310397.ece",
        "https://apnews.com/article/india-new-zealand-cricket-champions-trophy-final-d36fb7f4ec4845c02daddce01c9a696a",
        "https://indianexpress.com/section/sports/cricket/live-score/india-vs-new-zealand-final-odi-live-score-full-scorecard-highlights-icc-champions-trophy-2025-innz03092025255197/",
    ],
}
ACTIVE_URL_SET = "champions_trophy_2025"

loader = UnstructuredURLLoader(urls=URL_SETS[ACTIVE_URL_SET])

# Fetch the pages; the recorded run produced one Document per URL (3 of 3).
# NOTE(review): a smaller count would suggest that a page failed to load - check
# the loader's failure handling before trusting the answers.
data = loader.load()
len(data)

3

In [4]:
# Show the loaded Documents (source metadata plus the extracted page text) to
# check what was actually scraped.
# NOTE(review): this prints every page in full, including the sites' navigation
# menus; a short slice such as data[0].page_content[:500] would keep the saved
# notebook much smaller.
data

[Document(metadata={'source': 'https://sportstar.thehindu.com/cricket/champions-trophy/india-wins-champions-trophy-2025-ind-vs-nz-final-match-report-score-highlights/article69310397.ece'}, page_content='Cricket\n\nFootball\n\nChampions Trophy\n\nWomen\'s Cricket\n\nACES 2025\n\nHockey\n\nInd vs Nz CT Final live\n\nShorts\n\nCricket\n\nFootball\n\nChampions Trophy\n\nWomen\'s Cricket\n\nACES 2025\n\nHockey\n\nInd vs Nz CT Final live\n\nShorts\n\nMagazineBuy Print\n\nLoginAccountSubscribe\n\nPREMIUM\n\nMagazine\n\nPosters\n\nColumns\n\nSpecial Editions\n\nStar Life\n\nStatsman\n\nPLAYGROUNDS\n\nArchery\n\nAthletics\n\nBadminton\n\nBasketball\n\nBoxing\n\nChess\n\nCricket\n\nCue Sport\n\nESPORTS\n\nFootball\n\nGolf\n\nHockey\n\nKabaddi\n\nMMA\n\nMotorsport\n\nShooting\n\nSquash\n\nSwimming\n\nTable Tennis\n\nTennis\n\nVolleyball\n\nWrestling\n\nMULTIMEDIA\n\nVideos\n\nGallery\n\nPodcast\n\nSportoon\n\nON STAGE\n\nACES Awards\n\nSportstar Conclave\n\nLatest News\n\nPREMIUM\n\nMagazine\n\nP

## Split data to create chunks

In [5]:
# Chunking parameters, kept as named constants so they are easy to tune.
# NOTE(review): the embedding model used below (all-mpnet-base-v2) is documented
# to truncate long inputs - confirm that chunks of this size are embedded in full.
CHUNK_SIZE = 3000
CHUNK_OVERLAP = 500

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every loaded page into overlapping chunks; these are what get embedded.
docs = text_splitter.split_documents(data)

In [6]:
# Number of chunks produced by the splitter (68 in the recorded run).
len(docs)

68

In [7]:
# Inspect the first chunk. In the recorded run it starts with the site's
# navigation menu text rather than with the article itself.
docs[0]

Document(metadata={'source': 'https://sportstar.thehindu.com/cricket/champions-trophy/india-wins-champions-trophy-2025-ind-vs-nz-final-match-report-score-highlights/article69310397.ece'}, page_content="Cricket\n\nFootball\n\nChampions Trophy\n\nWomen's Cricket\n\nACES 2025\n\nHockey\n\nInd vs Nz CT Final live\n\nShorts\n\nCricket\n\nFootball\n\nChampions Trophy\n\nWomen's Cricket\n\nACES 2025\n\nHockey\n\nInd vs Nz CT Final live\n\nShorts\n\nMagazineBuy Print\n\nLoginAccountSubscribe\n\nPREMIUM\n\nMagazine\n\nPosters\n\nColumns\n\nSpecial Editions\n\nStar Life\n\nStatsman\n\nPLAYGROUNDS\n\nArchery\n\nAthletics\n\nBadminton\n\nBasketball\n\nBoxing\n\nChess\n\nCricket\n\nCue Sport\n\nESPORTS\n\nFootball\n\nGolf\n\nHockey\n\nKabaddi\n\nMMA\n\nMotorsport\n\nShooting\n\nSquash\n\nSwimming\n\nTable Tennis\n\nTennis\n\nVolleyball\n\nWrestling\n\nMULTIMEDIA\n\nVideos\n\nGallery\n\nPodcast\n\nSportoon\n\nON STAGE\n\nACES Awards\n\nSportstar Conclave\n\nLatest News\n\nPREMIUM\n\nMagazine\n\nPost

## Create embeddings for these chunks and save them to a FAISS index

In [8]:
# Embedding backend. The Gemini variant is kept for reference and needs gemini_api_key:
#embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key = gemini_api_key)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Embed the chunks and build a FAISS index from them.
vector_db = FAISS.from_documents(
    docs,
    embeddings,
)

# Retriever with the vector store's default search settings.
# To control how many chunks are returned per question, use instead:
#retriever  = vector_db.as_retriever(search_kwargs={"k": 5})
retriever = vector_db.as_retriever()

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Save the FAISS index to a local folder so that it can be loaded again later.
FAISS_INDEX_DIR = "faiss_index"
vector_db.save_local(FAISS_INDEX_DIR)

# Loading it back: recent LangChain releases refuse to unpickle a saved index
# unless the caller opts in explicitly. Only do this for an index that you
# created yourself, because unpickling untrusted files can run arbitrary code.
#vector_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

## Retrieve the most similar chunks for a given question and call the LLM to produce the final answer

In [10]:
# Question-answering chain that returns an answer together with its sources.
# It combines the chat model configured above with the FAISS-backed retriever.
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=retriever,
)
chain



In [11]:
# Switch on LangChain's verbose tracing so that the intermediate chain steps are
# printed for this one run.
langchain.debug=True

# Question for the currently indexed articles. An earlier article set was
# queried with: "what is the price of Tiago iCNG?"
query = "Who won the ICC Champions Trophy 2025?"

chain.invoke(
    {"question": query},
    return_only_outputs=True,
)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Who won the ICC Champions Trophy 2025?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "to , The Kiwis were without the ever-reliable Matt Henry to set the tone in their defence of 251, and his absence was felt as Rohit Sharma took the attack to them. He threw caution to the wind, while Shubman Gill played the supporting role. The Black Caps needed a moment of inspiration to break the flourishing opening stand, and it came from Glenn Phillips, adding another stunner to his highlights reel.\n\nto , A game truly worthy of being called a final, where the pendulum swung one 

Token indices sequence length is longer than the specified maximum sequence length for this model (2031 > 1024). Running this sequence through the model will result in indexing errors


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who won the ICC Champions Trophy 2025?",
  "summaries": "Content: The relevant text is:\n\n\"After faltering at the final hurdle in 2017, they have now accomplished their mission and reclaimed the title. Despair for New Zealand, but they can hold their heads high after a spirited campaign that saw them battle fiercely till the very end, something you expect from any Kiwi side, never willing to back down.\n\n...\n\nIndia win the game by 4 wickets to clinch the Champions Trophy 2025!\"\n\nSo, the answer is: India won the ICC Champions Trophy 2025.\nSource: https://indianexpress.com/section/sports/cricket/live-score/india-vs-new-zealand-final-odi-live-score-full-scorecard-highlights-icc-champions-trophy-2025-innz03092025255197/\n\nContent: There is no relevant text that answers the question of who won the ICC Champions 

{'answer': 'FINAL ANSWER: India won the ICC Champions Trophy 2025.\n',
 'sources': 'https://indianexpress.com/section/sports/cricket/live-score/india-vs-new-zealand-final-odi-live-score-full-scorecard-highlights-icc-champions-trophy-2025-innz03092025255197/, https://sportstar.thehindu.com/cricket/champions-trophy/india-wins-champions-trophy-2025-ind-vs-nz-final-match-report-score-highlights/article69310397.ece'}

In [12]:
# Turn the verbose tracing off again and repeat the question asked in the
# previous cell (`query` is reused), this time showing only the answer text.
langchain.debug=False

response = chain.invoke({"question": query}, return_only_outputs=True)
answer = response['answer']
answer.strip()

'FINAL ANSWER: India won the ICC Champions Trophy 2025.'

In [13]:
# Follow-up question about the venue; only the answer text is displayed.
query = "Where was the ICC Champions Trophy 2025 final played?"

response = chain.invoke({"question": query}, return_only_outputs=True)
answer = response['answer']
answer.strip()

'FINAL ANSWER: The ICC Champions Trophy 2025 final was played at the Dubai International Stadium in Dubai.'

In [14]:
# Follow-up question about the Player of the Match; only the answer text is displayed.
query = "Who was the Player of the Match in the final?"

response = chain.invoke({"question": query}, return_only_outputs=True)
answer = response['answer']
answer.strip()

'FINAL ANSWER: Rohit Sharma was the Player of the Match for his knock of 76 runs off 83 balls.'