In [1]:
import torch
torch.cuda.empty_cache()

# Preprocessing the scanned textbook with Pix2Text

In [None]:
!pip install -qU pix2text langchain jq tiktoken langchain_community langchain_chroma langchain-huggingface huggingface-hub sentence_transformers
!pip uninstall onnxruntime
!pip install -qU onnxruntime-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.6/163.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m737.4/737.4 kB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pix2text import Pix2Text
from pix2text.latex_ocr import *

img_fp = '/content/chapter1.pdf'
p2t = Pix2Text.from_config()


## Page by page


In [None]:
#page by page
from langchain_core.documents import Document
pages=[]
for i in range(0,25):
  doc = p2t.recognize_pdf(img_fp,table_as_image=False,page_numbers=[i])
  pages.append(Document(page_content=doc.to_markdown('Page'), metadata={'page':i}))

In [None]:
print(pages)

# Initialize the Embeddings Model

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cuda:0"}
encode_kwargs = {"normalize_embeddings": True}

hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Define the Chunking Function

In [None]:
document = pages

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_document_into_chunks(doc):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_documents(doc)
    return chunks

chunked_document = split_document_into_chunks(document)

# Add Embeddings and Store in ChromaDB

In [None]:
from google.colab import drive
drive.mount('/content/MyDrive')

In [None]:
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(documents=chunked_document, embedding=hf, persist_directory="/content/drive/MyDrive/CITS5553_Capstone/vector")

In [None]:
from langchain.vectorstores import Chroma

# Load the vector store
vectorstore = Chroma(
    persist_directory="/content/drive/MyDrive/CITS5553_Capstone/vector",
    embedding_function=hf
)

# Initialize the Language Model & Creating self-querying retriever

In [None]:
!pip uninstall lark
!pip install lark

In [None]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.schema import AttributeInfo
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline, HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, HuggingFaceHubEmbeddings
from langchain_huggingface import HuggingFacePipeline

# Define metadata fields based on what is available (page numbers)
metadata_field_info = [
    AttributeInfo(
        name="page",
        description="The page number of the document",
        type="integer",
    ),
]

document_content_description = "Textbook content split by page"

# Initialize HuggingFace model
llm = HuggingFacePipeline.from_model_id(
    model_id="microsoft/Phi-3-mini-4k-instruct",
    task="text-generation",
    device=None,  # Use CPU
    pipeline_kwargs={
        "max_new_tokens": 100,
        "top_k": 50,
        "temperature": 0.1,
    },
)

# Initialize Self-Query Retriever
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)

# Intialize KNN for Retrieval

In [None]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Create a Prompt Template

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# Define the prompt template
template = """Answer the question based on the following context along with page number:
{context}

Question: {question}
"""

# Create a ChatPromptTemplate from the template
prompt = ChatPromptTemplate.from_template(template)

# Function to format documents for inclusion in the context
def format_docs(docs):
    """
    Formats a list of documents into a string with content and metadata.

    Args:
        docs (list): A list of document objects with 'page_content' and 'metadata' attributes.

    Returns:
        str: A formatted string with each document's content and metadata.
    """
    return "\n\n".join(f"{doc.page_content}\n\nMetadata: {doc.metadata}" for doc in docs)

# Create a chatbot Question & Answer chain from the retriever
rag_chain_from_docs = (
    RunnablePassthrough.assign(
        context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

# Example query
question = "What is Ideal gas law?"

# Perform the retrieval and generate the response
response = rag_chain_with_source.invoke(question)

# Display the response
print(response)

In [None]:
for doc in response:
    print(f"id: {doc.metadata['_id']}\n")
    print(f"text: {doc.page_content[:256]}\n")
    print("-" * 80)
    print()

# Reranking

# Additional code for fixing token of 70 pages (Ignored for now)

In [None]:
import bs4, getpass, os, tiktoken
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.prompts import ChatPromptTemplate

# Load, chunk and index the contents of the blog.
#loader = TextLoader("/content/output/output.md")
#docs = loader.load()

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
#num_tokens_from_string(question, "cl100k_base")
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)
#similarity = cosine_similarity(query_result, document_result)

In [None]:
#TEST IGNORE
num_tokens_from_string(document, "cl100k_base")
query_result = hf.embed_query(question)
document_result = hf.embed_query(docs)
len(query_result)

AttributeError: 'list' object has no attribute 'replace'