In [None]:
import os
import dotenv
from pathlib import Path

import sys
import re
import os
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_cohere import CohereEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores import VectorStore
from openai import OpenAI

# Load environment variables via python-dotenv; TYPHOON_API_KEY is read
# from the environment with os.getenv further down in this notebook.
dotenv.load_dotenv()

True

In [None]:
# Load docs
# Each path in doc_paths is loaded with PyMuPDFLoader and the resulting
# Document objects are accumulated in `docs`. Missing or unreadable files are
# reported and skipped so one bad path does not abort the whole cell.

doc_paths = [
    "/content/darft.pdf",
    "/content/draft2.pdf",
]

docs = []
for doc_file in doc_paths:
    file_path = Path(doc_file)

    # A Path object is always truthy, so check for the file on disk instead.
    if not file_path.is_file():
        print(f"Skipping {file_path.name}: no file found at {file_path}")
        continue

    try:
        loader = PyMuPDFLoader(file_path=str(file_path))
        docs.extend(loader.load_and_split())
    except Exception as e:
        # doc_file is a plain str (no .name attribute); use the Path for the label.
        print(f"Error loading document {file_path.name}: {e}")

if not docs:
    # Nothing could be loaded; downstream cells need at least one document.
    print('Upload a PDF file')

In [None]:
# Split the loaded documents into chunks.
# The splitter is configured with a blank-line separator, a chunk_size of 1024
# and no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1024,
    separator="\n\n",
)
texts = text_splitter.split_documents(docs)

[Document(metadata={'source': 'docs\\test_rag.pdf', 'page': 0}, page_content='My favorite food is margarita pizza.  \nThere are 47588 bottles in the tr uck.'), Document(metadata={'source': 'docs\\test_rag.docx'}, page_content='My favorite food is margarita pizza.\n\nThere are 47588 bottles in the truck.'), Document(metadata={'source': 'https://docs.streamlit.io/develop/quick-reference/release-notes', 'title': 'Release notes - Streamlit Docs', 'description': 'A changelog of highlights and fixes for each version of Streamlit.', 'language': 'No language found.'}, page_content="Release notes - Streamlit DocsDocumentationsearchSearchrocket_launchGet startedInstallationaddFundamentalsaddFirst stepsaddcodeDevelopConceptsaddAPI referenceaddTutorialsaddQuick referenceremoveCheat sheetRelease notesremove202420232022202120202019Pre-release featuresRoadmapopen_in_newweb_assetDeployConceptsaddStreamlit Community CloudaddSnowflakeOther platformsaddschoolKnowledge baseFAQInstalling dependenciesDeploy

In [None]:
# Remove Unicode bidirectional embedding/override control characters
# (U+202A through U+202E) from every chunk, keeping each chunk's metadata.
BIDI_CONTROL_PATTERN = re.compile(r'[\u202a-\u202e]')
for i, chunk in enumerate(texts):
    cleaned_content = BIDI_CONTROL_PATTERN.sub('', chunk.page_content)
    texts[i] = Document(metadata=chunk.metadata, page_content=cleaned_content)

# Preview one cleaned chunk. Index 140 is used when it exists; otherwise fall
# back to the last chunk so a small corpus does not raise IndexError.
PREVIEW_INDEX = 140
if texts:
    print(texts[min(PREVIEW_INDEX, len(texts) - 1)])
else:
    print("No text chunks available to preview.")

In [None]:
# Embedding model configuration for HuggingFaceEmbeddings:
# the BAAI/bge-m3 model, run on CPU, with normalize_embeddings enabled.
model_name = "BAAI/bge-m3"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Build a FAISS vector store from the cleaned chunks and their embeddings.
vectorstore = FAISS.from_documents(texts, embeddings)

# Expose the store as a retriever (default settings); it is used later to
# fetch the chunks relevant to a user question.
retriever = vectorstore.as_retriever()

In [None]:
# Prompt template filled with the retrieved context and the user's question
# before being sent to the LLM. It asks for a Thai-language answer structured
# as attraction name, description, opening hours and additional information.
typhoon_prompt = PromptTemplate(
    template="""
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    You should answer the question in Thai language only.
    Context: {context}

    You are an expert travel guide specializing in tourist attractions in Pathum Thani, Thailand.
    The user has the following question:
    Question: {question}

    Please provide a helpful response with the following details:
    1. Name of the attraction
    2. Description of the place (e.g., unique features, activities available)
    3. Opening and closing hours
    4. Additional information (e.g., transportation tips, entrance fees, or special advice)

    If you don't know the answer, simply say, "I don't know."
    """,
    input_variables=["context", "question"],
)

In [None]:
# Read the Typhoon API key from the environment.
typhoon_token = os.getenv("TYPHOON_API_KEY")
if not typhoon_token:
    # Fail fast: with api_key=None the OpenAI client falls back to the
    # OPENAI_API_KEY environment variable, which would then be sent to the
    # Typhoon endpoint instead of the intended credential.
    raise RuntimeError(
        "TYPHOON_API_KEY is not set. Add it to your environment or .env file."
    )

# Initialize the Typhoon client (OpenAI-compatible API at a custom base URL)
client = OpenAI(
    api_key=typhoon_token,
    base_url='https://api.opentyphoon.ai/v1'
)

# Define a function to generate a response using the LLM
def generate_response(context, question):
    """Ask the Typhoon chat model to answer `question` given `context`.

    Args:
        context: Text substituted into the prompt's {context} placeholder.
        question: Text substituted into the prompt's {question} placeholder.

    Returns:
        The message content of the first choice in the chat completion.
    """
    user_message = {
        "role": "user",
        "content": typhoon_prompt.format(context=context, question=question),
    }
    completion = client.chat.completions.create(
        model="typhoon-v1.5x-70b-instruct",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Main function to handle user query
def answer_question(user_question):
    """Answer a user question using retrieved context and the LLM.

    Args:
        user_question: The question text; used both as the retrieval query
            and as the question placed in the prompt.

    Returns:
        The value returned by generate_response for the retrieved context
        and the question.
    """
    # Retrieve relevant context from the vectorstore. `invoke` is the current
    # retriever entry point; `get_relevant_documents` is deprecated in LangChain.
    retrieved_contexts = retriever.invoke(user_question)
    context = "\n".join(doc.page_content for doc in retrieved_contexts)

    # Generate response using the context and question
    return generate_response(context=context, question=user_question)

In [None]:
# Interactive entry point: greet the user, read one question from stdin,
# run it through the retrieval + generation pipeline, and show the answer.
print("Welcome to the Pathum Thani Travel Guide!")
user_question = input("Please ask your question: ")
answer = answer_question(user_question)
print("\nResponse:", answer, sep="\n")