In [35]:
!pip uninstall -y sentence-transformers huggingface_hub
!pip uninstall -y langchain langchain-community langchain-core

!pip install langchain==0.0.353
!pip install faiss-cpu transformers
!pip install scikit-learn
!pip install unstructured beautifulsoup4 accelerate


[0mFound existing installation: huggingface_hub 0.36.2
Uninstalling huggingface_hub-0.36.2:
  Successfully uninstalled huggingface_hub-0.36.2
Found existing installation: langchain 0.0.353
Uninstalling langchain-0.0.353:
  Successfully uninstalled langchain-0.0.353
Found existing installation: langchain-community 0.0.20
Uninstalling langchain-community-0.0.20:
  Successfully uninstalled langchain-community-0.0.20
Found existing installation: langchain-core 0.1.23
Uninstalling langchain-core-0.1.23:
  Successfully uninstalled langchain-core-0.1.23
Collecting langchain==0.0.353
  Using cached langchain-0.0.353-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community<0.1,>=0.0.2 (from langchain==0.0.353)
  Using cached langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2,>=0.1.4 (from langchain==0.0.353)
  Using cached langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
INFO: pip is looking at multiple versions of langchain-community to

In [36]:
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline


In [37]:
# Source pages for the QA index, one constant per match page.
UAE_NZ_LIVE_SCORES_URL = "https://www.cricbuzz.com/live-cricket-scores/139062/uae-vs-nz-11th-match-group-d-icc-mens-t20-world-cup-2026"
NAMIBIA_NETHERLANDS_SCORECARD_URL = "https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2025-26-1502138/namibia-vs-netherlands-10th-match-group-a-1512728/full-scorecard"

# List consumed by the loader cell; order matches the constants above.
urls = [UAE_NZ_LIVE_SCORES_URL, NAMIBIA_NETHERLANDS_SCORECARD_URL]


In [38]:
# Build a loader over the configured URLs and load them into `documents`.
loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()

# Report how many documents came back.
document_count = len(documents)
print("Documents loaded:", document_count)


Documents loaded: 2


In [39]:
# Splitter configured for 500-character chunks with a 100-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

# Split the loaded documents, then keep only each chunk's text.
chunks = splitter.split_documents(documents)
texts = [piece.page_content for piece in chunks]

chunk_count = len(texts)
print("Chunks created:", chunk_count)


Chunks created: 34


In [40]:
# Fit TF-IDF on the chunk texts and materialise the dense document matrix.
# NOTE(review): `vectorizer` is assigned again in a later cell, and
# `embeddings` is not referenced by any other cell visible in this notebook.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(texts)
embeddings = tfidf_matrix.toarray()


In [41]:
class TfidfEmbedding:
    """Adapter that exposes a vectorizer through an embedding-style interface.

    Offers ``embed_documents`` and ``embed_query``, and is also callable so it
    can be handed to code that expects a plain ``text -> vector`` function.
    """

    def __init__(self, vectorizer):
        """Keep a reference to the vectorizer used for every transform.

        Args:
            vectorizer: Object whose ``transform(list_of_str)`` result offers
                ``toarray()``, yielding one row per input text (for example a
                fitted ``TfidfVectorizer``).
        """
        self.vectorizer = vectorizer

    def embed_documents(self, texts):
        """Embed a batch of texts.

        Args:
            texts: Iterable of strings; it is materialised into a list first,
                so generators are accepted.

        Returns:
            A list with one vector (list of numbers) per input text, or ``[]``
            when ``texts`` is empty.
        """
        batch = list(texts)
        if not batch:
            # Short-circuit: an empty batch has no vectors, and the vectorizer
            # is not guaranteed to accept zero-sample input.
            return []
        return self.vectorizer.transform(batch).toarray().tolist()

    def embed_query(self, text):
        """Embed a single string and return its vector as a flat list."""
        return self.vectorizer.transform([text]).toarray()[0].tolist()

    # Makes the instance usable as a function. NOTE(review): presumably needed
    # because the vector store invokes the embedding object directly when it
    # is not one of its own embedding classes -- confirm against the
    # installed LangChain version.
    def __call__(self, text):
        """Alias for :meth:`embed_query`."""
        return self.embed_query(text)


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the TF-IDF vectorizer and fit it on the chunk texts in one step
# (`fit` hands back the fitted vectorizer itself).
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english").fit(texts)

# Wrap the fitted vectorizer in the embedding adapter.
embedding_function = TfidfEmbedding(vectorizer)

# Build the FAISS vector store from the chunk texts using that adapter.
vectorstore = FAISS.from_texts(texts, embedding_function)




In [43]:
# Retriever over the vector store, configured with k=4.
search_options = {"k": 4}
retriever = vectorstore.as_retriever(search_kwargs=search_options)


In [44]:
# Text2text generation pipeline for google/flan-t5-large; max_length=512 is
# forwarded to the pipeline as given.
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-large",
    max_length=512,
)

# Wrap the pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cuda:0


In [45]:
# Retrieval QA chain of type "stuff", combining the LLM with the retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)


In [47]:
# Ask the chain one question; the returned answer is the cell's output.
question = "what is todays scorecard of uae vs nz match in t20 worldcup"
qa_chain.run(question)


'UAE 173/6(20) NZ 175/0(15.2)'