# Math Knowledge Worker

In [31]:
# system imports
import os
import re
from dotenv import load_dotenv
from glob import glob
from pathlib import Path
from openai import OpenAI 
import chromadb
from sentence_transformers import SentenceTransformer

# ui imports
from IPython.display import display, Markdown
import gradio as gr

In [32]:
!pip install pypdf langchain langchain-community langchain-openai langchain-chroma -q

In [33]:
!pip install -q sentence_transformers

In [34]:
# Langchain RAG imports

from langchain.document_loaders import TextLoader,PyPDFLoader, CSVLoader, JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.schema import Document
# from langchain_openai import OpenAIEmbeddings, ChatOpenAI
# from langchain_chroma import Chroma
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain


In [35]:
# setup env
load_dotenv()
print("OpenAI key Looks Good")if os.getenv("OPENAI_API_KEY")[:2]=="sk"  else print("Error retreiving OpenAI API Key")
openai = OpenAI()

gptModel = "gpt-4o-mini"
db_name = "vector_db"
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
collection_name = "math_rag"

OpenAI key Looks Good


### Load and chunk

In [39]:
# Read uploaded file

def load_files_from(files):
    documents=[]
    for file in files:
        # file = file.name
        print(file)
        file_type = Path(file).suffix
        if file_type in ['.txt', '.md']:
            loader = TextLoader(file)
        elif file_type in ['.pdf']:
            loader = PyPDFLoader(file)
        elif file_type in ['.csv']:
            loader = CSVLoader(file)
        elif file_type in ['.json']:
            loader = JSONLoader(file)
        else:
            return "#### Unrecognzed File Type"
        document = loader.load()
        documents.extend(document)
    return recursive_text_splitter(documents)


In [5]:
# Chunk files

def recursive_text_splitter(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.split_documents(documents)
    return chunks

In [84]:
pdf_path = "/Users/admin/Projects/RAG/testgen2/sources/pdf"
files = glob(f"{pdf_path}/*.pdf")
# print(type(files))
# for file in files:
#     print(type(file))
documents = load_files_from(files)
print(type(documents))
for file in documents:
    print(type(file))

/Users/admin/Projects/RAG/testgen2/sources/pdf/8th-grade-algebra-i-textbook.pdf
<class 'list'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.

In [86]:
print (documents[0])

page_content='PGC
Education
Algebra I
California
Student TextbookStudent Textbook
California Standards-Driven ProgramCalifornia Standards-Driven Program' metadata={'source': '/Users/admin/Projects/RAG/testgen2/sources/pdf/8th-grade-algebra-i-textbook.pdf', 'page': 0}


In [37]:
chunks = recursive_text_splitter(documents)
print(documents[2])

page_content='iiiiiiiiii
This Textbook provides comprehensive coverage of all the California Algebra I Standards.  The Textbook is divided
into eight Chapters.  Each of the Chapters is broken down into small, manageable Topics and each Topic covers a
specific Standard or part of a Standard.
Section 1.1 — Sets and ExprSection 1.1 — Sets and ExprSection 1.1 — Sets and ExprSection 1.1 — Sets and ExprSection 1.1 — Sets and Expr essionsessionsessionsessionsessions
Topic 1.1.1 The Basics of Sets .................................................................................................. ............. 2
Topic 1.1.2 Subsets of the Real Numbers ......................................................................................... ..... 5
Topic 1.1.3 Unions and Intersections ............................................................................................ .......... 7
Topic 1.1.4 Algebraic and Numeric Expressions ...............................................................

### Vector Embedding

In [47]:
def create_collection(chunks, embedding_model=embedding_model):
    client = chromadb.PersistentClient(path=db_name)
    existing_collection_names = [collection.name for collection in client.list_collections()]
    if collection_name in existing_collection_names:
        client.delete_collection(collection_name)
        print(f"Deleted existing collection: {collection_name}")

    collection = client.create_collection(collection_name)

    for i, chunk in enumerate(chunks):
        collection.add(
            ids=[f"doc_{i}"],
            documents=[chunk.page_content],
            embeddings=[embedding_model.encode(chunk.page_content)],
            metadatas=[chunk.metadata]
        )
    print(f"Added {len(chunks)} chunks to collection: {collection.name}")
    return collection

### Retreival Augementation

In [71]:
# Create custom prompt template
custom_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Chat History: {chat_history}

Current Time: {current_time}
User Question: {question}

Please provide your response in a helpful and informative manner:"""

In [79]:
def collection_from(files):
    document = load_files_from(files)
    chunks = recursive_text_splitter(document)
    collection = create_collection(chunks)
    return collection, gr.update(visible=True), gr.update(visible=True),  gr.update(visible=False)

In [51]:
pdf_path = "/Users/admin/Projects/RAG/testgen2/sources/pdf"
files = glob(f"{pdf_path}/*.pdf")
collection = collection_from(files)

/Users/admin/Projects/RAG/testgen2/sources/pdf/8th-grade-algebra-i-textbook.pdf
Deleted existing collection: math_rag
Added 1278 chunks to collection: math_rag


In [52]:
def find_similars(prompt, n=5):
    results = collection.query(query_embeddings=embedding_model.encode(prompt).astype(float).tolist(), n_results=n)
    documents = results['documents'][0][:]
    return documents

In [67]:
def messages_for(question, similars):
    system_message = "You write a question for a math test. If you are not sure say you don't know."
    user_prompt = "The subject of the question is: "
    user_prompt += question
    user_prompt += "To make context for the question, here is a piece of text from a math textbook:"
    for similar in similars:
        user_prompt += similar
    print(similars)
    return [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": "The question you are writing is:"},
    ]

In [69]:
# The function for gpt-4o-mini

def gpt_4o_mini_rag(question):
    documents = find_similars(question)
    stream = openai.chat.completions.create(
        model="gpt-4o-mini", 
        messages=messages_for(question, documents),
        stream=True
    )
    result = ""
    for chunk in stream:
        result += chunk.choices[0].delta.content or ""
        yield result

In [72]:
# view = gr.Interface(
#     fn=gpt_4o_mini_rag,
#     inputs=[gr.Textbox(label="Your message:")],
#     outputs=[gr.Markdown(label="Response:")],
#     flagging_mode="never"
# )
# view.launch()

In [80]:
# Gradio interface

with gr.Blocks() as ui:
    gr.Markdown("## Custom Personal Knowledge Assistant")
    interim =  gr.Markdown("#### Upload a File to continue", visible=True)
    with gr.Row():
            file_uploader = gr.File(file_count="multiple")
    # with gr.Row():
    #     rag_button = gr.Button("Create Your Custom Assistant")
    with gr.Row():

        chatbot = gr.Chatbot(type='messages', visible=False)
        output = gr.Markdown(label="Response:")
    with gr.Row():
        chat_entry = gr.Textbox(visible=False)
    collection = gr.State()
    file_uploader.change(collection_from, inputs=[file_uploader], outputs=[collection, chatbot, chat_entry, interim])
    chat_entry.submit(gpt_4o_mini_rag, inputs=[chat_entry], outputs=[output])


ui.launch()
        

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on local URL:  http://127.0.0.1:7887

To create a public link, set `share=True` in `launch()`.




/private/var/folders/sn/z9t3m2s11ml9z7s806xf76_m0000gn/T/gradio/e044d7a4cf955c7799a4f2904f5ff1f567df3699e3c3a6a3cea772b79aa02d96/further.pdf
Deleted existing collection: math_rag
Added 775 chunks to collection: math_rag


Traceback (most recent call last):
  File "/Users/admin/anaconda3/envs/testg/lib/python3.11/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/admin/anaconda3/envs/testg/lib/python3.11/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/admin/anaconda3/envs/testg/lib/python3.11/site-packages/gradio/blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/admin/anaconda3/envs/testg/lib/python3.11/site-packages/gradio/blocks.py", line 1606, in call_function
    prediction = await utils.async_iteration(iterator)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/admin/anaconda3/envs/testg/lib/python3.11/site-packages/gradio/utils.py", line 714,