Downloading the required dependencies

In [30]:
# Requirement
!pip install openai -q
!pip install langchain -q
!pip install chromadb -q
!pip install tiktoken -q
!pip install pypdf -q
!pip install unstructured[local-inference] -q
!pip install gradio -q
!pip install requests
!pip install beautifulsoup4



Web Scraping

In [31]:
import requests
from bs4 import BeautifulSoup

# URL of the page we want to scrape
url = 'https://en.wikipedia.org/wiki/2023_Cricket_World_Cup'

# Start from an empty list so later cells never hit a NameError (or silently
# reuse paragraphs left over from an earlier run) when the download fails.
paragraphs = []

# Send a GET request; the timeout keeps the cell from hanging indefinitely
# if the server never answers.
response = requests.get(url, timeout=30)

# Checking if the request was successful (status code 200)
if response.status_code == 200:
    # If successful, parse the HTML content and collect every <p> element
    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [33]:
# Sanity check: show the raw HTML of the first five scraped <p> elements
preview = paragraphs[:5]
for tag in preview:
    print(tag)

<p class="mw-empty-elt">
</p>
<p><b>The 2023 Cricket World Cup</b>, officially known as the <b>2023 ICC Men's Cricket World Cup</b>, was the 13th edition of the <a href="/wiki/Cricket_World_Cup" title="Cricket World Cup">Cricket World Cup</a>. It started on 5 October and concluded on 19 November 2023, with <a href="/wiki/Australia_national_cricket_team" title="Australia national cricket team">Australia</a> winning the tournament.<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup> A quadrennial <a href="/wiki/One_Day_International" title="One Day International">One Day International</a> (ODI) <a href="/wiki/Cricket" title="Cricket">cricket</a> tournament contested by national teams, it was organised by the <a href="/wiki/International_Cricket_Council" title="International Cricket Council">International Cricket Council</a> (ICC). Ten national teams participated in the tournament.
</p>
<p>It was the first men's Cricket World Cup which India hosted solely. The tourn

Creating tokens and chunks for each paragraph

In [34]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain

In [35]:
import os
from getpass import getpass

from langchain.chat_models import ChatOpenAI

# Never hardcode the key in the notebook. Reuse the key already present in the
# environment and only prompt (without echoing) when it is missing, instead of
# overwriting a valid key with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Deterministic (temperature 0) GPT-4 chat model
llm = ChatOpenAI(temperature=0, model_name="gpt-4")

In [36]:
pip install pymupdf



In [37]:
from bs4 import BeautifulSoup, Tag
from langchain.schema import Document

# Extract the plain text of every scraped paragraph.
# `paragraphs` comes from `soup.find_all('p')`, so its items are expected to be
# bs4 Tags; plain strings are still accepted, anything else is reported and skipped.
paragraph_texts = []
for index, paragraph in enumerate(paragraphs):
    if isinstance(paragraph, Tag):
        text = paragraph.get_text()
    elif isinstance(paragraph, str):
        text = paragraph
    else:
        print(f"Skipping empty or non-BeautifulSoup paragraph at index {index}")
        continue

    # Drop surrounding whitespace so placeholder paragraphs that contain only a
    # newline do not become empty documents.
    text = text.strip()
    if text:
        paragraph_texts.append(text)

# Wrap each paragraph in a LangChain Document, the type `split_documents`
# expects (it reads `page_content` and `metadata`).
document_objects = [Document(page_content=text) for text in paragraph_texts]

# Chunk the documents; the chunks are embedded in a later cell
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(document_objects)


In [38]:
# Embed the chunks with OpenAI and index them in a Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)

# Initialise LangChain's Conversational Retrieval Chain from a temperature-0
# chat model and a retriever over the vector store
chat_model = ChatOpenAI(temperature=0)
retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(chat_model, retriever)

Input queries and model response

In [39]:
# Running transcript of (question, answer) pairs. It is passed back to the chain
# on every turn so follow-up questions ("where was it held?") can be resolved
# against the earlier conversation instead of being answered without context.
chat_history = []

while True:

    user_question = input("Ask a question (type 'no' to exit): ")

    # Check if the user wants to exit (ignore surrounding whitespace)
    if user_question.strip().lower() == 'no':
        print("Thank You!")
        break

    # Do not spend an API call on an empty question
    if not user_question.strip():
        continue

    # Get response from QA chain, including the conversation so far
    response = qa({"question": user_question, "chat_history": chat_history})

    # Extract the answer from the response
    answer = response.get("answer", "No answer found.")

    # Remember this turn for the next question
    chat_history.append((user_question, answer))

    # Print the response
    print(f"User Question: {user_question}\nAnswer: {answer}\n")


Ask a question (type 'no' to exit): who won the icc world cup 2023
User Question: who won the icc world cup 2023
Answer: Australia won the ICC World Cup 2023.

Ask a question (type 'no' to exit): where was it held?
User Question: where was it held?
Answer: The tournament was held in ten different stadiums, situated in ten different cities across India. The final took place at Narendra Modi Stadium in Ahmedabad.

Ask a question (type 'no' to exit): who were the indian players
User Question: who were the indian players
Answer: The context does not provide information about the specific Indian players in the cricket World Cup. Therefore, I don't know the answer to your question.

Ask a question (type 'no' to exit): who was the captain of the indian team?
User Question: who was the captain of the indian team?
Answer: Rohit Sharma was the captain of the Indian team in the tournament.

Ask a question (type 'no' to exit): who were the batsmen for india
User Question: who were the batsmen for 