# Setup

In [1]:
import os
import openai
import sys
import pandas as pd
# from datasets import Dataset
sys.path.append('../..')

# Read the .env file located by find_dotenv() so its variables land in os.environ
from dotenv import find_dotenv, load_dotenv
_ = load_dotenv(dotenv_path=find_dotenv())  # return value intentionally ignored

# Hand the key to the openai module; a missing OPENAI_API_KEY raises KeyError here
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Define the chat model used by the question-answering chain further down.
from langchain.llms import OpenAI  # NOTE(review): unused in the visible cells; kept in case other code relies on it
# ChatOpenAI is imported from langchain_openai (the same package the prompt cell
# imports it from): the langchain.chat_models path is deprecated, and this cell's
# recorded output shows a deprecation warning.
from langchain_openai import ChatOpenAI

# Single shared model instance: gpt-4o with a sampling temperature of 0.5
llm = ChatOpenAI(model="gpt-4o", temperature=0.5)

  warn_deprecated(


# Web Scrape

In [6]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

async def scrape_page_content(url):
    """Load a page in a Playwright-driven Chromium browser and extract its text.

    Args:
        url: Address of the page to open.

    Returns:
        dict with three keys:
            'url': the URL that was passed in,
            'title': the string of the <title> tag, or 'No Title' when the tag
                is missing or has no plain-string content,
            'paragraphs': list with the text of every <p> element found.

    Raises:
        Whatever Playwright raises for launch/navigation failures; the browser
        is closed before the exception propagates.
    """
    async with async_playwright() as playwright:
        # Launch a browser and open a new page
        browser = await playwright.chromium.launch()
        try:
            page = await browser.new_page()
            # Navigate to the given URL
            await page.goto(url)
            # Retrieve the rendered page content
            html = await page.content()
        finally:
            # Always release the browser, even if navigation or rendering fails
            await browser.close()

    # Parse the HTML content with BeautifulSoup (only needs the html string,
    # so it runs after the Playwright context has been released)
    soup = BeautifulSoup(html, "html.parser")

    # `.string` is None for an empty <title> or one with nested tags; fall back
    # to the placeholder so callers never receive None as a title
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        title = title_tag.string
    else:
        title = 'No Title'

    # Extract all paragraph texts
    paragraphs = [paragraph.get_text() for paragraph in soup.find_all('p')]

    return {
        'url': url,
        'title': title,
        'paragraphs': paragraphs
    }

# Page scraped for this demo
url_to_scrape = "https://pricai.org/2024/"

# Top-level `await` is valid here only because the code runs in a Jupyter/IPython cell;
# in a plain script the coroutine would have to be driven by asyncio.run(...)
scraped_data = await scrape_page_content(url_to_scrape)

# Show the url and title, then a preview limited to the first two paragraphs
print(f"URL: {scraped_data['url']}")
print(f"Title: {scraped_data['title']}")
print("Paragraphs:")
for paragraph in scraped_data['paragraphs'][:2]:  
    print(paragraph)
print("---")

URL: https://pricai.org/2024/
Title: HOME
Paragraphs:

The Pacific Rim International Conference on Artificial Intelligence (PRICAI) is an annual international event which concentrates on AI theories, technologies and their applications in the areas of scientific, social, and economic importance for countries in the Pacific Rim. In the past, the conferences have been held in Nagoya (1990), Seoul (1992), Beijing (1994), Cairns (1996), Singapore (1998), Melbourne (2000), Tokyo (2002), Auckland (2004), Guilin (2006), Hanoi (2008), Daegu (2010), Kuching (2012), Gold Coast (2014), Phuket (2016), Nanjing (2018), Fiji (2019), Yokohama (2020, online), Hanoi (2021, online), Shanghai (2022, hybrid) and Jakarta (2023, hybrid).
---


# RAG Pipeline 

In [9]:
from langchain.schema import Document

# Accept either one scrape result (a dict) or a list of them: a lone dict is
# wrapped in a list so the comprehension below can treat both cases the same way
if isinstance(scraped_data, dict):
    scraped_data = [scraped_data]

# One Document per scraped page: the paragraphs are joined with single spaces to
# form the body text, while the page url and title are carried as metadata
documents = [
    Document(
        page_content=" ".join(page['paragraphs']),
        metadata={"url": page['url'], "title": page['title']},
    )
    for page in scraped_data
]

In [10]:
# text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 2000 and overlap 300, both measured with
# len(); separators are not interpreted as regular expressions
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=300,
    length_function=len, is_separator_regex=False,
)

# Split the scraped documents into the chunks that get embedded and indexed
chunk_docs = text_splitter.split_documents(documents)

In [11]:
# Text embedding
from langchain_openai import OpenAIEmbeddings

# Embedding model created with its default settings; it is handed to Chroma
# in the next cell to vectorise the chunks
embeddings = OpenAIEmbeddings()

In [12]:
# Vector Store
from langchain_chroma import Chroma

# Embed every chunk and index it in a Chroma vector store
db_chroma = Chroma.from_documents(documents=chunk_docs, embedding=embeddings)

In [13]:
# Retriever
# MMR (maximal marginal relevance) search. k=10 mirrors the result count that
# generate_response asks for on each call; configuring it through search_kwargs
# makes the limit part of the retriever itself instead of relying on per-call kwargs.
retriever = db_chroma.as_retriever(search_type="mmr", search_kwargs={"k": 10})

In [14]:
# Function to print contents of chunk_docs
def print_stored_documents(chunk_docs, limit=20):
    """Print a two-column table (source link, content preview) for the given chunks.

    Args:
        chunk_docs: Iterable of objects exposing a `metadata` mapping and a
            `page_content` string.
        limit: Maximum number of rows to print (default 20).

    Returns:
        None; the table is written to stdout.
    """
    row_template = "{:<100} {:<200}"

    # Title, column header and separator rule
    print("Contents of the Chroma Vector Database:")
    print(row_template.format("Link", "Generated from Web Scraping"))
    print("=" * 300)

    for position, chunk in enumerate(chunk_docs):
        # Stop as soon as `limit` rows have been emitted
        if position >= limit:
            break

        source_url = chunk.metadata.get("url", "No URL")
        text = chunk.page_content

        # Content longer than 200 characters is cut to 197 characters plus an ellipsis
        if len(text) > 200:
            preview = text[:197] + '...'
        else:
            preview = text

        print(row_template.format(source_url, preview))

# Preview the chunks produced by the text splitter (uses the default limit of 20 rows)
print_stored_documents(chunk_docs)

Contents of the Chroma Vector Database:
Link                                                                                                 Generated from Web Scraping                                                                                                                                                                             
https://pricai.org/2024/                                                                             The Pacific Rim International Conference on Artificial Intelligence (PRICAI) is an annual international event which concentrates on AI theories, technologies and their applications in the areas of ...


In [40]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI

# System prompt for the question-answering chain.
# `{context}` is the variable create_stuff_documents_chain fills with the retrieved
# documents. The user's question is deliberately NOT interpolated into this text:
# it reaches the model as a chat message through the MessagesPlaceholder below, so
# formatting it here as well would only inject the repr of the message list.
answer_prompt = """
You are an AI assistant here to answer questions about PRICAI 2024. Use only the provided context to answer the question accurately. If there is no relevant information in the context, respond with "I'm not sure about that; it's outside my available information."

Guidelines:
- Answer only if the context provides relevant information.
- Respond "I'm not sure about that; it's outside my available information" for unrelated or unsupported questions.
- Avoid answering questions about unrelated topics, brands, or products.

Steps:
1. Review the user’s question and the context provided.
2. Verify if the question relates to PRICAI 2024.
3. If relevant information is available, answer the question.
4. If helpful, provide links from the context in the response.

Context:
{context}

Give a clear, organized answer in bullet points if necessary. Make the response easy to read and focused only on relevant information.
"""

# Chat prompt template: system instructions (with the context) followed by the
# conversation messages supplied under the "messages" key at invoke time
question_answering_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", answer_prompt),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

# Chain that formats the retrieved documents into `{context}` and calls the LLM
document_chain = create_stuff_documents_chain(llm, question_answering_prompt)

In [41]:
from langchain_core.messages import HumanMessage

# Function to query the vector store and generate a response
def generate_response(query):
    """Answer `query` from documents retrieved out of the vector store.

    Args:
        query: The user's question as a string.

    Returns:
        The output of `document_chain.invoke` for the retrieved documents and
        the question (a string in the recorded runs of this notebook).
    """
    # Retrieve similar documents. `invoke` replaces the deprecated
    # `get_relevant_documents`; k=10 is forwarded as a per-call keyword as before.
    # NOTE(review): whether the retriever honours a per-call `k` depends on the
    # installed langchain-core version — confirm, or set it via search_kwargs.
    docs = retriever.invoke(query, k=10)

    # Use the documents as context for the LLM; the question travels as a chat message
    response = document_chain.invoke(
        {
            "context": docs,
            "messages": [
                HumanMessage(content=query)
            ],
        }
    )

    return response

# Inference

In [42]:
# Query about the conference venue (the misspelling "converence" is part of the recorded run)
query = "where is Pricai converence held?"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


- PRICAI 2024 will be held in Kyoto, Japan.
- It is an in-person event.


In [43]:
# Query about how to prepare for attending (the misspelling "attent" is part of the recorded run)
query = "what can I prepare to attent the Pricai conference?"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


To attend the PRICAI 2024 conference, you can prepare the following:

- **Research Papers**: If you are a researcher, consider submitting a technical paper on substantial, original, and unpublished research in all aspects of Artificial Intelligence.
- **Networking**: Be ready to engage with researchers, practitioners, educators, and users in AI for in-depth intellectual exchanges and research cooperation.
- **Travel Arrangements**: Since PRICAI 2024 will be held in person in Kyoto, Japan, prepare your travel plans, including flights, accommodation, and any necessary visas.
- **Conference Materials**: Gather any materials you might need for presentations or discussions, such as business cards and a notebook for taking notes.

Keep an eye on any updates or specific requirements from the conference organizers.


In [44]:
# Multi-part query: what PRICAI is, why it is held every year, and what is expected from it
query = "What is pricai, and why they make an event every year, what they expected from that event?"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


- **PRICAI**: The Pacific Rim International Conference on Artificial Intelligence (PRICAI) is an annual international event focused on AI theories, technologies, and applications, particularly in the Pacific Rim region.

- **Purpose of the Event**: The conference is organized every year to:
  - Facilitate discussions on AI research and development.
  - Encourage exchanges of ideas among researchers, practitioners, educators, and users in the AI field.
  - Foster research cooperation and professional development.

- **Expectations from the Event**:
  - To serve as a platform for presenting substantial, original, and unpublished research in AI.
  - To bring together AI communities for in-depth intellectual exchanges.
  - To promote advancements in areas of scientific, social, and economic importance.

PRICAI 2024 will be held in person in Kyoto, Japan, and will be co-located with PRIMA2024.


In [45]:
# Off-topic query (a recipe request); the recorded run returned the prompt's fallback answer
query = "I want to cook pisang goreng, please make the step and recipe"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.


The RAG can't answer the question 

In [46]:
# Query about a named person and their role in PRICAI 2024; the recorded run returned the fallback answer
query = "Who is Prof. Bo An, Nanyang Technological University, Singapore also what his contribution in Pricai 2024?"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.


In [47]:
# Query about proposal/paper submission deadlines -> the pipeline can't answer (fallback in the recorded run)
query = "when last call for the proposal and paper submited?"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.


In [48]:
# Query about the conference schedule -> the pipeline can't answer (fallback in the recorded run)
query = "give me the schedule for pricai 2024"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.


In [49]:
# Query asking for the list of topics presented at PRICAI 2024; the recorded run returned the fallback answer
query = "List down topic that will present in PRICAI 2024"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.


In [50]:
# Query about whether any topic covers synthetic data; the recorded run returned the fallback answer
query = "is there any topics that explain about synthetic data"
response = generate_response(query)

print(response)

Number of requested results 20 is greater than number of elements in index 1, updating n_results = 1


I'm not sure about that; it's outside my available information.
