In [13]:
# Install the OpenAI library, which provides access to OpenAI's powerful language models
!pip install -q openai

# Install the Enai library
!pip install -q langchain

# Install faiss-cpu (Facebook AI Similarity Search), a library for efficient similarity search
!pip install -q faiss-cpu

# Install tiktoken, a Python library for counting the number of tokens in a text string
!pip install -q tiktoken

# Install PyMuPDF, a Python library for working with PDF documents. We will use it to extract text.
!pip install -q PyMuPDF

# Install docx2txt, a library for extracting text from Microsoft Word (.docx) documents. Useful for working with Word documents.
!pip install -q docx2txt

In [14]:
#import necessary libraries
import fitz
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import docx2txt

In [15]:
!pip install python-dotenv



In [16]:
from dotenv import load_dotenv

In [None]:
# Load environment variables from the .env file
env_file_path = '/content/token.env'
load_dotenv(dotenv_path=env_file_path)

In [30]:
#We started by converting the .pdf format to .txt format so that we will be able to add the markdown character to text titles and divide it into chuncks
from langchain_community.document_loaders import TextLoader
#we loaded the text file and visualize it in order to determine the best splitting approach
loader = TextLoader("./Scrum-Guide-2020.txt")
document = loader.load()[0].page_content
document

'      # Ken Schwaber & Jeff Sutherland\n\nThe Scrum Guide\n\n The Definitive Guide to Scrum: The Rules of the Game\n\n                November 2020\n# Purpose of the Scrum Guide\n\nWe developed Scrum in the early 1990s. We wrote the first version of the Scrum Guide in 2010 to help\npeople worldwide understand Scrum. We have evolved the Guide since then through small, functional\nupdates. Together, we stand behind it.\nThe Scrum Guide contains the definition of Scrum. Each element of the framework serves a specific\npurpose that is essential to the overall value and results realized with Scrum. Changing the core design\nor ideas of Scrum, leaving out elements, or not following the rules of Scrum, covers up problems and\nlimits the benefits of Scrum, potentially even rendering it useless.\nWe follow the growing use of Scrum within an ever-growing complex world. We are humbled to see\nScrum being adopted in many domains holding essentially complex work, beyond software product\ndevelopme

In [20]:
#Split the text based on titles and with reference to the markdown header splitter

from langchain.text_splitter import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Title"),
]

# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=True
)
md_header_splits = markdown_splitter.split_text(document)

In [21]:
#visualize the results
for doc in md_header_splits:
  print(doc)

page_content='The Scrum Guide  \nThe Definitive Guide to Scrum: The Rules of the Game  \nNovember 2020' metadata={'Title': 'Ken Schwaber & Jeff Sutherland'}
page_content='We developed Scrum in the early 1990s. We wrote the first version of the Scrum Guide in 2010 to help\npeople worldwide understand Scrum. We have evolved the Guide since then through small, functional\nupdates. Together, we stand behind it.\nThe Scrum Guide contains the definition of Scrum. Each element of the framework serves a specific\npurpose that is essential to the overall value and results realized with Scrum. Changing the core design\nor ideas of Scrum, leaving out elements, or not following the rules of Scrum, covers up problems and\nlimits the benefits of Scrum, potentially even rendering it useless.\nWe follow the growing use of Scrum within an ever-growing complex world. We are humbled to see\nScrum being adopted in many domains holding essentially complex work, beyond software product\ndevelopment where Sc

In [24]:
"""splitting the resulted chuncks from the previous split into new chuncks
based on the character text splitter in order to reduce the chunck size and enhance
the retrieval accuracy
****the choice of CharacterTextSplitte arguments was made after we tried different params
using this chunckresizer tool :
https://chunkerizer.streamlit.app/
"""

text_splitter=CharacterTextSplitter(
    separator="\n",
    chunk_size= 350,
    chunk_overlap=50,
    length_function= len,
)

splits = text_splitter.split_documents(md_header_splits)
splits

[Document(page_content='The Scrum Guide  \nThe Definitive Guide to Scrum: The Rules of the Game  \nNovember 2020', metadata={'Title': 'Ken Schwaber & Jeff Sutherland'}),
 Document(page_content='We developed Scrum in the early 1990s. We wrote the first version of the Scrum Guide in 2010 to help\npeople worldwide understand Scrum. We have evolved the Guide since then through small, functional\nupdates. Together, we stand behind it.\nThe Scrum Guide contains the definition of Scrum. Each element of the framework serves a specific', metadata={'Title': 'Purpose of the Scrum Guide'}),
 Document(page_content='purpose that is essential to the overall value and results realized with Scrum. Changing the core design\nor ideas of Scrum, leaving out elements, or not following the rules of Scrum, covers up problems and\nlimits the benefits of Scrum, potentially even rendering it useless.', metadata={'Title': 'Purpose of the Scrum Guide'}),
 Document(page_content='We follow the growing use of Scrum w

In [25]:
#calling the embedding system to vectorize the following text
embeddings = OpenAIEmbeddings()

  warn_deprecated(


ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [26]:
#create an index for document search based on embeddings
docsearch = FAISS.from_texts(splits, embeddings)

NameError: name 'embeddings' is not defined

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [27]:
#creating a question answering chain by loading the openAI model
chain= load_qa_chain(OpenAI(), chain_type="stuff")

NameError: name 'load_qa_chain' is not defined

In [28]:
#retrieve relevant documents based on the query and generate a response using the Q&A chain
query= "Summarize this document."
docs= docsearch.similarity_search(query)

chain.run(input_documents=docs, question=query)

NameError: name 'docsearch' is not defined

In [None]:
#retrieve relevant documents based on the query and generate a response using the Q&A chain
query= "the purpose of Scrum Events"
docs= docsearch.similarity_search(query)

chain.run(input_documents=docs, question=query)

In [29]:
#retrieve relevant documents based on the query and generate a response using the Q&A chain
query= "What are the three artifacts of SCRUM?"
docs= docsearch.similarity_search(query)

chain.run(input_documents=docs, question=query)

NameError: name 'docsearch' is not defined

In [None]:
#retrieve relevant documents based on the query and generate a response using the Q&A chain
query= "What are the topics discussed during the sprint planning?"
docs= docsearch.similarity_search(query)

chain.run(input_documents=docs, question=query)

In [None]:
#retrieve relevant documents based on the query and generate a response using the Q&A chain
query= "What are the tasks of the product owner?"
docs= docsearch.similarity_search(query)

chain.run(input_documents=docs, question=query)