# **Importing Packages & Dependencies**

In [None]:
! pip install langchain
! pip install tiktoken
! pip install -qU langchain-openai
! pip install pypdf
! pip install sentencepiece
! pip install sentence-transformers
! pip install chromadb

In [None]:
import tiktoken
from langchain_openai import OpenAIEmbeddings
import os
import getpass
from langchain.document_loaders import PyPDFLoader
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter
import nltk
nltk.download('punkt')
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_openai.llms import OpenAI
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma

In [3]:
# Never store an API key in the notebook itself. Use the key already exported
# in the environment if there is one; otherwise prompt for it interactively so
# the secret is never written to the file or echoed in the output.
# NOTE: any key that was previously committed in this cell must be treated as
# compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# **PDF Loader**

In [4]:
# Path of the source PDF (Colab-style absolute path).
pdf_path = "/content/World Cup 2023.pdf"

# Build the loader, then load the PDF and split it into document objects.
loader = PyPDFLoader(pdf_path)
pdf_pages = loader.load_and_split()

# **Splitter**

In [5]:
# Collect the raw text and the metadata of every loaded page in parallel lists.
# Each page's metadata dict is tagged in place with a zero-based "page_no"
# label ("page_0", "page_1", ...) before it is recorded.
documents, metadatas = [], []

for page_index, page in enumerate(pdf_pages):
    page.metadata["page_no"] = f"page_{page_index}"
    documents.append(page.page_content)
    metadatas.append(page.metadata)

In [6]:
# Splitter settings: break on single spaces, measure length with `len`,
# target a chunk size of 2000 with an overlap of 150 between neighbours.
splitter_options = {
    "separator": " ",
    "chunk_size": 2000,
    "chunk_overlap": 150,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# Split every page text into chunks, passing the per-page metadata alongside.
text_chunks = text_splitter.create_documents(documents, metadatas=metadatas)

In [7]:
# Metadata of every chunk, in chunk order. This is bound to its own name rather
# than rebinding `metadatas`: the splitter cell reads the page-level `metadatas`
# list, so overwriting it here would make re-running that cell pair page texts
# with chunk-level metadata.
chunk_metadatas = [chunk.metadata for chunk in text_chunks]

# **Embedding**

In [None]:
# Identifier of the sentence-transformers model used to embed the chunks.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = SentenceTransformerEmbeddings(
    model_name=model_name,
)

# **Vector Stores**

In [9]:
# Directory in which the Chroma collection is persisted.
save_to_dir = "/content/pdf-chroma_db"

# Embed the chunks and store them in a Chroma vector store.
# NOTE(review): re-running this cell against an existing directory may add the
# same chunks again — confirm and clear the directory first if so.
vector_db = Chroma.from_documents(
    documents=text_chunks,
    embedding=embedding_llm,
    persist_directory=save_to_dir,
)

# **Q/A Chain | Stuff**

In [10]:
# Completion model that writes the final answers; temperature is set to 0.5.
llms = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0.5,
)

In [11]:
# Prompt layout: instructions, then the retrieved context, then the question,
# then an answer marker for the model to complete. Trailing spaces inside the
# literals are intentional and part of the template text.
qna_template = (
    "Answer the question below based on the given context\n"
    "If you couldn't find the answer in that context just say 'The answer is not available' \n"
    "### CONTEXT\n"
    "{context}\n"
    "\n"
    "### Question\n"
    "{question}\n"
    "\n"
    "### ANSWER : "
)

# The template has exactly two placeholders: {context} and {question}.
qna_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=qna_template,
)

In [12]:
# Question-answering chain of type "stuff", driven by the custom Q&A prompt.
stuff_chain = load_qa_chain(
    llm=llms,
    chain_type="stuff",
    prompt=qna_prompt,
)

In [13]:
def get_answer(question, k=5):
    """Answer a question from the indexed PDF.

    Runs a similarity search for `question` against `vector_db`, then passes
    the retrieved documents and the question to `stuff_chain`.

    Args:
        question: The question text to answer.
        k: Number of documents to request from the similarity search.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        The "output_text" entry of the chain's result.
    """
    similar_documents = vector_db.similarity_search(question, k=k)
    answer = stuff_chain(
        {"input_documents": similar_documents, "question": question},
        return_only_outputs=True,
    )
    return answer["output_text"]

In [18]:
# Open-ended query about the 2026 tournament.
question_2026 = "what do u know about the world cup 2026 ?"
print(get_answer(question_2026))

The 2026 World Cup was a celebration of the beautiful game and its ability to unite people from all corners of the globe. It was won by Brazil, who emerged triumphant as the champions, reaffirming their status as footballing royalty. The tournament showcased the best of football and its power to inspire, unite, and transcend boundaries. The group stage saw drama and upsets, with teams battling for survival and a chance to etch their names in footballing history. The knockout stage brought intense matches and unexpected results, with underdogs Japan and Sweden pulling off stunning upsets. The final match between Brazil and Germany was a titanic battle for supremacy, with Brazil ultimately emerging victorious. The tournament also saw traditional powerhouses like Spain, Portugal, and Argentina clash with upstart challengers. The 2026 World Cup was a thrilling ride, with every match bringing new twists and turns, and its memories will endure for generations to come.


In [15]:
# Expected answer: Brazil
question_winner = "Who won the world cup ?"
print(get_answer(question_winner))

 Brazil


In [19]:
# Expected answer: the match between Sweden and Argentina had the highest
# number of goals, with a total of 5 goals scored.
question_most_goals = "Which match had the highest number of goals in it?"
print(get_answer(question_most_goals))

 The match between Sweden and Argentina had the highest number of goals with a total of 5 goals.
