In [None]:
import os
import re
from pprint import pprint

import chromadb
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from openai import OpenAI
from pypdf import PdfReader

# Take the API key from the environment rather than embedding it in the notebook,
# so the secret never ends up in version control or in shared copies.
# os.environ.get returns None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Persistent Chroma client created with the relative path 'db'.
chroma_db = chromadb.PersistentClient(path='db')

# Text Extraction

In [None]:
# Location of the source PDF, relative to this notebook.
ipcc_pdf = "../data/IPCC_AR6_WGII_TechnicalSummary.pdf"
reader = PdfReader(ipcc_pdf)

# Extract the text page by page; ipcc_texts[i] holds the text of page i.
ipcc_texts = [
    pdf_page.extract_text()
    for pdf_page in reader.pages
]

## Show the first page

In [None]:
# Pretty-print the extracted text of the first page (index 0).
pprint(ipcc_texts[0])

## Filtering out beginning and end of document

In [None]:
# Inspect page index 4: the last leading page excluded by the [5:-5] slice applied below.
pprint(ipcc_texts[4])

In [None]:
# Inspect the fifth page from the end: the first trailing page excluded by the [5:-5] slice applied below.
pprint(ipcc_texts[-5])

In [None]:
# Drop the first five and the last five pages, keeping only the pages in between.
ipcc_texts_filtered = ipcc_texts[5:-5]

In [None]:
## Strip the running header: a number, a line break, then the words "Technical Summary"
page_header_pattern = re.compile(r'\d+\nTechnical Summary')

# Delete every occurrence of that header from each remaining page.
ipcc_without_header_footer = [
    page_header_pattern.sub('', page_text) for page_text in ipcc_texts_filtered
]

In [None]:
# Check the first kept page after the "Technical Summary" header removal.
pprint(ipcc_without_header_footer[0])

In [None]:
# Raw (uncleaned) text of pages 5-9, i.e. the first five pages kept by the [5:-5] slice.
ipcc_texts[5:10]

In [None]:
## Remove the stray "TS" marker that appears directly after or directly before a line break.
# The lookarounds restrict the match to a standalone "TS": a line-leading label such as
# "TS.B.1" (TS followed by a dot or word character) and a word that merely ends in "TS"
# before a line break (TS preceded by a word character) are left intact instead of
# being truncated.
ts_marker_pattern = re.compile(r'\nTS(?![\w.])|(?<![\w.])TS\n')

ipcc_without_header_footer = [ts_marker_pattern.sub('', s) for s in ipcc_without_header_footer]

In [None]:
# Spot-check one cleaned page (index 5) after the header and "TS" removal steps.
pprint(ipcc_without_header_footer[5])

## Splitting the text

In [None]:
# chunk_size and chunk_overlap are both counted in characters, so a fractional
# overlap such as 0.2 behaves like no overlap at all. The 0.2 was presumably meant
# as "20% of the chunk size"; express that as an integer character count.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = CHUNK_SIZE // 5  # 200 characters = 20% of CHUNK_SIZE

# Separators are tried in order: paragraph break, line break, sentence end, space,
# and finally individual characters.
char_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [None]:
# Stitch the cleaned pages back into one document, with a blank line between pages.
ipcc_full_text = "\n\n".join(ipcc_without_header_footer)

# Character-based split of the whole document.
texts_char_split = char_splitter.split_text(ipcc_full_text)

# Number of character-level chunks produced
len(texts_char_split)

## Token Split

In [None]:
# Token-based splitter used as a second pass over the character-level chunks:
# tokens_per_chunk=256 caps the chunk length and chunk_overlap=2 sets the overlap.
# NOTE(review): no model_name is passed, so the splitter's default tokenizer applies —
# confirm it matches the embedding model used by the Chroma collection.
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=2, tokens_per_chunk=256)

In [None]:
# Run the token splitter over every character-level chunk and flatten the
# per-chunk results into a single list, preserving order.
texts_token_splitted = [
    token_chunk
    for char_chunk in texts_char_split
    for token_chunk in token_splitter.split_text(char_chunk)
]

In [None]:
## Print the number of chunks after the token-level split
print(f"Number of chunks: {len(texts_token_splitted)}")

In [None]:
## Display the first token-level chunk
texts_token_splitted[0]

# Vector Database

In [None]:
# Persistent Chroma client created with the relative path "db".
# NOTE(review): chroma_db is already created with the same path in the first code cell;
# this second assignment looks redundant — confirm and consider keeping only one.
chroma_db = chromadb.PersistentClient(path="db")

In [None]:
# Fetch the "ipcc" collection, creating it if it does not exist yet.
chroma_collection=chroma_db.get_or_create_collection("ipcc")

## Adding Documents

In [None]:
# One string ID per chunk, derived from the chunk's position in the list.
ids = [str(i) for i in range(len(texts_token_splitted))]

# upsert instead of add: the collection lives in a persistent database, so these
# positional IDs already exist whenever the notebook is re-run. upsert overwrites
# them with the current chunk text, which keeps this cell safe to execute repeatedly.
# NOTE: if a later run produces fewer chunks, entries with higher IDs from an earlier
# run remain in the collection and have to be deleted separately.
chroma_collection.upsert(documents=texts_token_splitted, ids=ids)

## Query

In [None]:
# Sample question for a plain retrieval check, before any LLM is involved.
query = "What is the impact of climate change on the ocean"

# Ask the collection for the 5 best-matching chunks for this single query text.
res = chroma_collection.query(query_texts=[query], n_results=5)

# Matched chunk texts: one inner list per query text.
res['documents']

# RAG

In [None]:
def rag(query, num_results=5, model="gpt-4o-mini-2024-07-18"):
    """Answer a question using chunks retrieved from the Chroma collection.

    The question is used to query ``chroma_collection``; the returned chunks are
    joined with ";" and passed, together with the question, to an OpenAI chat
    model that is instructed to answer using only that information.

    Args:
        query: The user's question, as a string.
        num_results: Number of chunks to request from the collection.
        model: Name of the OpenAI chat model used to generate the answer.

    Returns:
        The message content of the first completion choice.
    """
    res = chroma_collection.query(query_texts=[query], n_results=num_results)

    # A single query text was sent, so its matches are in the first inner list.
    documents = res['documents'][0]

    joined_info = ";".join(documents)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert on climate change. Your users are asking questions about information contained in attached information. You will be shown the user's question, and the relevant information. Answer the user's question using only this information."
        },
        {
            "role": "user",
            "content": f"Question: {query} \n Information: {joined_info} \n Answer:"
        }
    ]

    # The client is constructed without arguments, so its credentials come from
    # the library's default configuration.
    openai_bot = OpenAI()
    bot_response = openai_bot.chat.completions.create(
        model=model,
        messages=messages
    )

    content = bot_response.choices[0].message.content
    return content

## Test

In [None]:
# End-to-end check: retrieve context and generate an answer for a sample question.
query = "What is the impact of climate change on the ocean?"
rag(query)