In [None]:
# Sanity check that the kernel executes cells (presumably a leftover smoke test).
print("OK")

In [2]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

In [3]:
# Read the PDF files from the folder
# `file_dir` is a relative path, so it is resolved against the kernel's
# current working directory.

file_dir = "../dataset"
loader = PyPDFDirectoryLoader(file_dir)
# `docs` holds every Document the loader produced; the cells below treat each
# entry as one page (see the `for page in docs` loop).
docs = loader.load()

In [None]:
# Number of Document objects returned by the loader.
len(docs)

In [None]:
# Spot-check the extracted text of one document. The index 11 is hard-coded,
# so this fails when fewer than 12 documents were loaded.
docs[11].page_content

In [6]:
# Extract the text from the PDF
# Pages are joined with a newline so the last word of one page does not fuse
# with the first word of the next, and `str.join` builds the result in a
# single pass instead of re-copying the string on every `+=`.
pdf_str = "\n".join(page.page_content for page in docs)

In [None]:
# Preview the first 25 characters of the combined text.
pdf_str[0:25]

In [None]:
# Total length of the combined text, in characters.
len(pdf_str)

In [9]:
# Split the documents into chunks
# First pass: cut the combined text into large chunks; these become the `doc`
# list that is later passed to the question-generation chain.
# NOTE(review): `model` holds a tokenizer model name here but is rebound to a
# ChatOllama instance further down — the two uses share one global name.
model = "gpt-3.5-turbo"

# NOTE(review): token counts are measured with the tokenizer selected by
# `model_name`, not with the Ollama model used for generation, and 10000-token
# chunks may exceed the context window Ollama is configured with — confirm.
text_splitter = TokenTextSplitter(
    model_name = model,
    chunk_size = 10000,
    chunk_overlap = 200
)
# `split_text` receives the plain string; the resulting chunks are wrapped in
# Document objects a few cells below.
pdf_doc = text_splitter.split_text(pdf_str)

In [None]:
# Display every chunk produced by the first split (output can be very large).
pdf_doc

In [None]:
# Number of chunks produced by the first split.
len(pdf_doc)

In [None]:
# Type of a single chunk; the next cell wraps each chunk in a Document.
type(pdf_doc[0])

In [13]:
# Convert string to langchain documents
from langchain.docstore.document import Document

# Wrap every raw text chunk in a Document so that document-based APIs
# (`split_documents`, the question-generation chain) can consume it.
doc = [Document(page_content=chunk) for chunk in pdf_doc]

In [None]:
# Text of the first wrapped chunk.
doc[0].page_content

In [None]:
# Type of the first element of `doc` after wrapping.
type(doc[0])

In [16]:
# Split the documents into chunks
# Second pass: re-split the large `doc` chunks into smaller pieces; `pdf_docs`
# is what gets embedded into the FAISS index further down.
# NOTE(review): this repeats the splitter setup of the first pass with
# different sizes and rebinds both `model` and `text_splitter`.
model = "gpt-3.5-turbo"

text_splitter = TokenTextSplitter(
    model_name = model,
    chunk_size = 1000,
    chunk_overlap = 100
)
# `split_documents` takes the Document list, unlike the first pass, which
# split a plain string with `split_text`.
pdf_docs = text_splitter.split_documents(doc)

In [None]:
# Text of the first small chunk.
pdf_docs[0].page_content

In [None]:
# Number of small chunks produced by the second split.
len(pdf_docs)

In [19]:
# Import ollama chat model
from langchain_community.chat_models import ChatOllama

In [None]:
# Define llm model
# `model_name` is reused below for the embeddings and for the answering LLM.
model_name = "llama3.2:1b"

# Chat model handed to the question-generation chain (`ques_chain`).
# NOTE(review): this rebinds `model`, which the splitter cells use for the
# tokenizer model-name string; re-running a splitter cell afterwards turns
# `model` back into a str.
model = ChatOllama(
    model = model_name,
    temperature = 0.3
)

In [21]:
# Prompt text that asks the model to write exam-style questions about the
# chunk substituted for `{text}`. It is passed as `question_prompt` to the
# question-generation chain below.
prompt_template = """
You are an expert at creating questions based on coding materials and documentation.
Your goal is to prepare a coder or programmer for their exam and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the coders or programmers for their tests.
Make sure not to lose any important information.

QUESTIONS:
"""

In [22]:
# Designing a ChatPrompt Template
from langchain_core.prompts import PromptTemplate

In [23]:
# Wrap the initial question-generation prompt; `text` is its only placeholder.
prompt_questions = PromptTemplate(input_variables=["text"], template=prompt_template)

In [24]:
# Prompt text passed as `refine_prompt` to the question-generation chain:
# `{existing_answer}` is filled with the questions produced so far and
# `{text}` with additional source material.
refine_template = ("""
You are an expert at creating practice questions based on coding material and documentation.
Your goal is to help a coder or programmer prepare for a coding test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.

------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.
QUESTIONS:
"""
)

In [25]:
# Template for the refinement prompt; it needs both the questions written so
# far (`existing_answer`) and a chunk of source material (`text`).
refine_prompt_questions = PromptTemplate(
    template=refine_template,
    input_variables=["existing_answer", "text"],
)

In [26]:
# Create chat
from langchain.chains.summarize import load_summarize_chain

In [27]:
# Question-generation chain: a "refine"-type summarize chain driven by the
# two custom prompts defined above instead of summarization prompts.
# `verbose = True` makes the chain log its intermediate steps while running.
ques_chain = load_summarize_chain(
    llm = model,
    chain_type = "refine",
    verbose = True,
    question_prompt = prompt_questions,
    refine_prompt = refine_prompt_questions
)

In [None]:
# Generate the questions. This runs over `doc` (the large first-pass chunks),
# not over the smaller `pdf_docs` that are used for retrieval.
# NOTE(review): `Chain.run` is deprecated in recent LangChain releases in
# favour of `invoke` — confirm against the installed version.
ques = ques_chain.run(doc)

In [None]:
# Print the generated questions so that line breaks are rendered.
print(ques)

In [None]:
# Embedding model for the vector store; it uses the same Ollama model name
# (`model_name`) as the chat models.
embeds = OllamaEmbeddings(
    model = model_name
)

In [31]:
# Embed the small chunks (`pdf_docs`) and build a FAISS index from them.
db = FAISS.from_documents(pdf_docs, embeds)

In [None]:
# Persist the FAISS index to disk so it can be reloaded without re-embedding.
# The variable name `foler_path` (sic) is kept because the load cell below
# reads it under that name.
foler_path = "./vectorstore"
file_name = "embeds"
# `FAISS.save_local` names its second parameter `index_name`; it has no
# `file_name` keyword, so passing one raises TypeError.
db.save_local(folder_path=foler_path, index_name=file_name)

In [None]:
# Reload the persisted index. `load_local` requires the embedding object that
# will embed future queries (`embeddings`) and locates the files through
# `index_name`, which must match the name the index was saved under.
# SECURITY: `allow_dangerous_deserialization=True` lets LangChain unpickle the
# stored docstore; only load index files that this notebook wrote itself.
new_db = FAISS.load_local(
    folder_path=foler_path,
    embeddings=embeds,
    index_name=file_name,
    allow_dangerous_deserialization=True,
)

In [32]:
# LLM used for answering: same Ollama model as the question generator, but
# with a lower temperature (0.1 instead of 0.3).
llm = ChatOllama(model = model_name, temperature = 0.1)

In [None]:
# Display the raw generated-questions string.
ques

In [43]:
# One entry per line of the LLM output; blank lines and non-question lines
# may still be present at this point.
ques_list = ques.split("\n")

In [None]:
# Display the unfiltered list of output lines.
ques_list

In [None]:
# Number of lines in the LLM output before filtering.
len(ques_list)

In [None]:
# Keep only the lines that end in "?" or "." (i.e. look like complete
# questions or sentences); everything else is dropped.
filtered_ques_list = [line for line in ques_list if line.endswith(("?", "."))]

In [None]:
# Display the lines that survived the filter.
filtered_ques_list

In [None]:
# Number of lines that survived the filter.
len(filtered_ques_list)

In [45]:
from langchain.chains import RetrievalQA

In [46]:
# Retrieval QA chain: the "stuff" chain type with the FAISS index as retriever
# and `llm` as the answering model.
# Note: this uses the in-session `db`, not the `new_db` reloaded from disk.
answer_generation_chain = RetrievalQA.from_chain_type(llm=llm, 
                                               chain_type="stuff", 
                                               retriever=db.as_retriever()
                                            )

In [None]:
# Answer each question and save to a file
#
# * Iterates `filtered_ques_list` so blank lines and headings in the raw LLM
#   output are not sent to the retrieval chain as "questions".
# * Processes up to MAX_QUESTIONS entries; there is deliberately no `break`
#   inside the loop, so every selected question is answered.
# * Uses real newline escapes ("\n") so console and file output contain line
#   breaks rather than a literal backslash followed by "n".
# * Opens answers.txt once, in append mode with an explicit encoding, rather
#   than reopening it for every question.
MAX_QUESTIONS = 11  # number of questions to answer (indices 0..10)
SEPARATOR = "--------------------------------------------------"

with open("answers.txt", "a", encoding="utf-8") as f:
    for i, question in enumerate(filtered_ques_list[:MAX_QUESTIONS]):
        print(f"{i}. Question: {question}")
        print(SEPARATOR)
        answer = answer_generation_chain.run(question)
        print("Answer: ", answer)
        print(SEPARATOR + "\n\n")
        # Save answer to file
        f.write("Question: " + question + "\n")
        f.write("Answer: " + answer + "\n")
        f.write(SEPARATOR + "\n\n")