# RAG application for "A Guide for First-Time Parents"

## Install tika and parse PDF file

- Install libraries
- Download PDF from the website [The Asian Parent](https://th.theasianparent.com/%E0%B8%84%E0%B8%B9%E0%B9%88%E0%B8%A1%E0%B8%B7%E0%B8%AD%E0%B8%94%E0%B8%B9%E0%B9%81%E0%B8%A5%E0%B8%A5%E0%B8%B9%E0%B8%81)
- Parse a PDF file using `tika`
- Clean the text (using a simple custom function)

In [28]:
import pandas as pd
import numpy as np
import faiss

import tika
tika.initVM()
from tika import parser
from unidecode import unidecode
import json

In [24]:
# parsed_book = parser.from_file("baby_0_3.pdf")

# n_pages = int(parsed_book["metadata"]["xmpTPg:NPages"])
# print(n_pages)

In [95]:
# Load the text files from ../data. Every line of a file becomes one entry in
# `contents`, and the file's name (without extension) is recorded at the same
# position in `intents`, so the two lists stay aligned index by index.
files = ["chief complaint", "making appointment", "medical experts"]
contents = []
intents = []
for file in files:
    with open(f"../data/{file}.txt", "r", encoding="utf-8") as f:
        file_contents = f.readlines()
    # extend() needs the lines that were just read; calling it without an
    # argument raises TypeError and leaves `contents` empty.
    contents.extend(file_contents)
    intents.extend([file] * len(file_contents))

In [25]:
def clean_text(text: str):
    """Repair known mis-encoded Thai glyphs in extracted text before embedding.

    The substitutions are applied one after another, in the order listed,
    and the repaired string is returned.
    """
    # NOTE(review): the first character of the second and third patterns shows
    # up as the Unicode replacement character in this file -- confirm that it
    # is really the glyph produced by the extraction step.
    substitutions = (
        ("\uf70a", "่"),
        ("�ำ", "ำ"),
        ("�า", "ำ"),
    )
    for broken, fixed in substitutions:
        text = text.replace(broken, fixed)
    return text

In [96]:
# content = parsed_book["content"]
# content_processed = clean_text(content)
# pages = content_processed.split("\n\n\n\n")

In [97]:
# pages_strip = [" ".join(page.split()) for page in pages]  # strip extra spaces from page

In [99]:
# Run every loaded text through clean_text, keeping the original order.
contents = list(map(clean_text, contents))

## Perform RAG for each page in the book

- From skimming the book, each page already covers a single topic
- Split each page into chunks of at most `chunk_size` characters (default 2048)

In [None]:
def convert_page_to_chunk(page_text, chunk_size: int = 2048):
    """Split ``page_text`` into consecutive slices of at most ``chunk_size`` items.

    The slices do not overlap; the last one may be shorter than
    ``chunk_size``. An empty ``page_text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(page_text), chunk_size):
        stop = start + chunk_size
        pieces.append(page_text[start:stop])
    return pieces

In [None]:
# Split every cleaned text into fixed-size chunks and collect them in one flat list.
# `pages_strip` is only assigned in a commented-out cell, so iterating it raises
# NameError; the cleaned texts that actually exist are held in `contents`.
chunks = []
for text in contents:
    chunks.extend(convert_page_to_chunk(text))

In [None]:
# Display how many chunks were produced.
len(chunks)

In [None]:
import json

# Persist the chunks as a JSON array. ensure_ascii=False writes non-ASCII
# characters verbatim, so the file is opened explicitly as UTF-8 instead of
# relying on the platform's locale encoding, which may not be able to encode them.
with open("baby_0_3.json", "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=4, ensure_ascii=False)

## Prompting using RAG

- Embed the text chunks with the embedding model and store the vectors using `faiss`
- Embed the query using the same embedding function
- Find the text chunks closest to the query
- Add the retrieved chunks to the prompt and generate the answer (RAG)

In [165]:
from google.oauth2 import service_account
from google.cloud import aiplatform
from vertexai.language_models import TextGenerationModel, TextEmbeddingModel
from llama_index.legacy.llms.vertex import Vertex
from langchain.chat_models import ChatVertexAI
import os
from dotenv import load_dotenv

In [166]:
# Model names and service-account setup for Vertex AI.
emb_model_name = "textembedding-gecko-multilingual@001"
gen_model_name = "text-bison"
# Build the key path with os.path.join: a hard-coded backslash separator only
# resolves on Windows, while this yields the same path there and a valid one
# on POSIX systems.
service_account_path = os.path.join("credentials", "vertex-test-417403-ce72ad032af7.json")

credentials = service_account.Credentials.from_service_account_file(service_account_path)
aiplatform.init(project=credentials.project_id, credentials=credentials)
# Alternative clients, currently disabled:
# emb_model = TextEmbeddingModel.from_pretrained(emb_model_name)
# gen_model = TextGenerationModel.from_pretrained(gen_model_name)

# vertex_ai = Vertex(model="text-bison", project=credentials.project_id, location= "asia-southeast1", credentials=credentials, temperature=0.2)

# Chat model client used by get_completion.
chat_vertex_ai = ChatVertexAI(
    model_name="chat-bison-32k",
    project=credentials.project_id,
    location="asia-southeast1",
    credentials=credentials,
    temperature=0.2,
    max_output_tokens=8192,  # max for bison 32k
)

  warn_deprecated(


In [113]:
def get_embedding(texts: list[str], model, batch_size=128):
    """Embed ``texts`` with ``model`` and return one vector per text.

    Newlines in each text are replaced by spaces, the texts are sent to
    ``model.get_embeddings`` in slices of ``batch_size``, and the ``values``
    attribute of every returned embedding is collected in input order.
    """
    single_line = [t.replace("\n", " ") for t in texts]
    vectors = []
    for start in range(0, len(single_line), batch_size):
        batch = single_line[start:start + batch_size]
        vectors.extend(item.values for item in model.get_embeddings(batch))
    return vectors

In [171]:
def get_completion(prompt: str, temperature: float = 0.0, top_p: float = 0.95, top_k: int = 40, max_output_tokens: int = 2048):
    """Send ``prompt`` to the module-level ``chat_vertex_ai`` client and return its prediction.

    The sampling settings are forwarded to ``predict`` as keyword arguments.
    """
    # The text-generation variant (gen_model.predict(...).text) is not used here.
    return chat_vertex_ai.predict(
        prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        max_output_tokens=max_output_tokens,
    )

In [176]:
def get_intent_from_chat(text:str):
    """Classify ``text`` into one of the known intents using the chat model.

    Returns 'medical experts', 'making appointment' or 'chief complaint' when
    the model's answer matches one of them, and 'unknown' otherwise.
    """
    prompt = f"Your task is to retrive intention of a given text. You should answer only 'medical experts' when the text is about finding medical expert, 'making appointment' when the text is about making appointment to the medical expert, 'chief complaint' when the text is about symptom. If the text is not related to what previous sentence mentioned, please answer 'unknown'. Text: `{text}`"
    # The prompt shows the labels wrapped in single quotes, so the model may
    # echo them back quoted, back-ticked, capitalised or with a trailing period.
    # Normalise those variations before matching so a correct answer is not
    # discarded as "unknown".
    intent = get_completion(prompt).strip().strip("'\"`.").strip().lower()
    if intent in ("medical experts", "making appointment", "chief complaint"):
        return intent
    return "unknown"

In [177]:
# Try the classifier on a Thai query, roughly: "I want to find a doctor who can
# investigate the cause of a fast heartbeat and getting tired easily."
get_intent_from_chat("อยากหาหมอที่ตรวจหาสาเหตุของอาการหัวใจเต้นเร็วและเหนื่อยง่าย")

'medical experts'