# Stanford Encyclopedia of Philosophy (SEP) Chatbot

**This project uses the entire scraped content from SEP (Stanford Encyclopedia of Philosophy)** <br>
Located at this link: https://plato.stanford.edu/ <br>

**USAGE:** 
* Click on the `Run All` button and ask the question in the last cell in the variable named `query`

**IMPORTANT:** Please do not uncomment and run the commented out cells, if you do not wish to run the entire pipeline for some period of time. <br>
Many of the cells are commented out because they take a lot of compute, and I saw that they took a long time to load when I tested the shared link in incognito mode.

If you happened to like the project I’m pleased to inform you that this isn't the final iteration but merely a draft that will soon be deployable and web hostable and possibly scalable with Kubernetes.

I've put a lot of thought into this project, I hope you enjoy it. :)

## Scraping all entries from the main page and blacklisting 404 not found ones

**The entire scraping process can last up to 15 minutes** <br>
**Run only if you wish to know what the code is doing**

In [None]:
'''
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_stanford_entries(start_url="https://plato.stanford.edu/contents.html", timeout=5):
    """
    Collect the SEP entry URLs linked from ``start_url``.

    Every link under https://plato.stanford.edu/entries/ is probed once with a
    HEAD request. URLs answering 404 are blacklisted; every other status code
    counts as a found entry. A URL whose probe raises is reported and left
    unvisited, so a later duplicate link to it is probed again.

    Args:
        start_url (str): Page whose links are scanned.
        timeout (float): Seconds allowed for each HTTP request.

    Returns:
        tuple: (sorted entry URLs, sorted blacklisted 404 URLs).
    """
    entry_prefix = "https://plato.stanford.edu/entries/"
    entries = []
    visited = set()
    blacklisted_entries = []

    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(start_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for link_tag in soup.find_all('a', href=True):
            absolute_url = urljoin(start_url, link_tag['href'])
            if not absolute_url.startswith(entry_prefix) or absolute_url in visited:
                continue

            try:
                head_response = requests.head(absolute_url, allow_redirects=True, timeout=timeout)
            except requests.exceptions.RequestException as e:
                print(f"Error checking link {absolute_url}: {e}")
                continue

            visited.add(absolute_url)
            if head_response.status_code == 404:
                print(f"Blacklisted 404: {absolute_url}")
                blacklisted_entries.append(absolute_url)
            else:
                print(f"Found entry: {absolute_url}")
                entries.append(absolute_url)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {start_url}: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

    return sorted(set(entries)), sorted(set(blacklisted_entries))

entries, blacklisted = scrape_stanford_entries()
print("\nFound the following entries:")
for entry in entries:
    print(entry)
print(f"\nTotal number of unique, non-404 entries found: {len(entries)}")

print("\nBlacklisted (404) entries:")
for blacklisted_entry in blacklisted:
    print(blacklisted_entry)
print(f"\nTotal number of blacklisted (404) entries: {len(blacklisted)}")
'''

In [None]:
"""
for _ in entries:
    print(_)
"""

## Extracting text from html pages based on the url with tag cleaning, and saving it as a dictionary with key being the url and value being the processed text

In [None]:
"""
def extract_text_from_html(html_content):
    '''
    Uses BeautifulSoup to parse HTML and extract all visible text content.
    '''
    soup = BeautifulSoup(html_content, 'html.parser')
    all_text_elements = soup.find_all(string=True)
    visible_texts = [text.strip() for text in all_text_elements if text.strip()]
    full_text = ' '.join(visible_texts)
    return full_text

def scrape_text_from_urls(url_list):
    '''
    Scrapes the text content from a list of URLs and stores it in a dictionary.

    Args:
        url_list (list): A list of URLs to scrape.

    Returns:
        dict: A dictionary where keys are URLs and values are the extracted text.
    '''
    url_text_dict = {}
    for url in url_list:
        print(f"Scraping text from: {url}")
        try:
            response = requests.get(url)
            response.raise_for_status()
            html_content = response.content
            extracted_text = extract_text_from_html(html_content)
            url_text_dict[url] = extracted_text
            print(f"Successfully extracted text from: {url} (length: {len(extracted_text)} characters)")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")
    return url_text_dict

# Assuming you have the 'entries' list from the previous scraping steps
# For demonstration, let's use a subset of potential entries:
entries = entries

# Scrape the text content from the entries
url_to_text_mapping = scrape_text_from_urls(entries)

# Print the resulting dictionary (optional)
print("\nExtracted text dictionary:")
for url, text in url_to_text_mapping.items():
    print(f"URL: {url}")
    # You might want to print a snippet of the text instead of the whole thing
    # print(f"Text (snippet): {text[:200]}...")
    print("-" * 20)

"""

## Oh fun! It looks like not everything was cleaned. To make our lives easier, in one of the next cells we will remove everything up to and including "BEGIN ARTICLE HTML DO NOT MODIFY THIS LINE AND ABOVE".

In [None]:
"""
if url_to_text_mapping:
    first_key = next(iter(url_to_text_mapping.keys()))
    first_value = url_to_text_mapping[first_key]
    print("First inserted key:", first_key)
    print("First inserted value (snippet):", first_value + "...")
else:
    print("The dictionary is empty.")
"""

## Here we clean everything before "BEGIN ARTICLE HTML DO NOT MODIFY THIS LINE AND ABOVE" and including that phrase.

In [None]:
'''
def filter_text_dictionary(url_text_dict):
    """
    Strip the leading boilerplate from every scraped text.

    For each value, everything up to and including the first occurrence of
    'BEGIN ARTICLE HTML DO NOT MODIFY THIS LINE AND ABOVE' is removed and the
    remainder is stripped of surrounding whitespace.

    Args:
        url_text_dict (dict): A dictionary with URLs as keys and text strings as values.

    Returns:
        dict: A new dictionary with the same keys and the trimmed texts as values.
              A value is an empty string when the marker is missing or when
              processing that value raised an exception.
    """
    marker = "BEGIN ARTICLE HTML DO NOT MODIFY THIS LINE AND ABOVE"
    filtered_dict = {}
    for url, html_content in url_text_dict.items():
        cleaned = ""
        try:
            # partition() splits at the first occurrence of the marker.
            _, found_marker, remainder = html_content.partition(marker)
            if found_marker:
                cleaned = remainder.strip()
            else:
                print(f"Marker not found in HTML for URL: {url}")
        except Exception as e:
            print(f"Error processing HTML for URL {url}: {e}")
        filtered_dict[url] = cleaned
    return filtered_dict

# Trim the scraped texts held in url_to_text_mapping.
filtered_url_to_text_mapping = filter_text_dictionary(url_to_text_mapping)

# Show the start of the first trimmed text as a quick sanity check.
if not filtered_url_to_text_mapping:
    print("\nFiltered dictionary is empty.")
else:
    first_key_filtered, first_value_filtered = next(iter(filtered_url_to_text_mapping.items()))
    print("\nFirst filtered value (snippet):")
    print(first_value_filtered[:500] + "...")
'''

In [None]:
'''
# Preview: print the first 200 characters of every filtered text.
print("\nFiltered text for all URLs:")
separator = "-" * 20
for url, text in filtered_url_to_text_mapping.items():
    snippet = text[:200]
    print(f"URL: {url}")
    print(f"Filtered Text (snippet): {snippet}...")
    print(separator)
'''

## We will pickle the dictionary as we don't want to run all those scripts taking 10 minutes to finish

In [None]:
"""
import pickle

pickled_dictionary = filtered_url_to_text_mapping

output_path = 'pickled_dictionary.pkl'  # Choose a filename with .pkl extension

with open(output_path, 'wb') as f:
    pickle.dump(pickled_dictionary, f)

print(f"Dictionary saved to: {output_path}")
"""

## We will unpickle the binary object from the kaggle working directory

In [None]:
import pickle

# Location of the pickled url -> text dictionary inside the Kaggle working directory.
file_path = '/kaggle/working/pickled_dictionary.pkl'
variable_name = 'pickled_dictionary'

# Load the dictionary once and expose it under both names used in this notebook.
with open(file_path, mode='rb') as pickle_file:
    pickled_dictionary = loaded_data = pickle.load(pickle_file)

## Installing packages and removing conflicts

In [None]:
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3" 

## Importing google genai for LLM inference for RAG capability

In [None]:
from google import genai
from google.genai import types

from IPython.display import Markdown

genai.__version__

## Importing GOOGLE API KEY.

You need to import your own key. Don't worry it is free and you don't need to connect your credit card

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the API key from the Kaggle secret named "GOOGLE_API_KEY".
secrets_client = UserSecretsClient()
GOOGLE_API_KEY = secrets_client.get_secret("GOOGLE_API_KEY")

In [None]:
# Create the Gemini client with the API key loaded above.
client = genai.Client(api_key=GOOGLE_API_KEY)

# Print the name of every listed model that supports the "embedContent" action.
embedding_model_names = (
    model.name
    for model in client.models.list()
    if "embedContent" in model.supported_actions
)
for model_name in embedding_model_names:
    print(model_name)

## Testing to see if the unpickled dictionary has everything that was needed

In [None]:
# The article texts alone, in dictionary order (URLs dropped).
documents = list(pickled_dictionary.values())

In [None]:
#documents[0]

## Making a nicer list with the embedded url in each item

In case a user wants to get the URL of the explained page

In [None]:
# Prefix each article with its source URL so the link travels with the text, e.g.
# "URL is https://plato.stanford.edu/entries/abduction/ and the text is: ... (extracted text) ..."
concatenated_list = [
    f"URL is {page_url} and the text is: {page_text}"
    for page_url, page_text in pickled_dictionary.items()
]

In [None]:
#concatenated_list[0]

## Had some problems with database ingestion, so I calculated whether any list item exceeded 4MB

Spoiler: It didn't <br>
This was a real doozy; the fix is explained in the next few cells

In [None]:
def get_size_in_mb(document):
    """Return the size of ``document`` in MB (1 MB = 1024 * 1024 bytes) once UTF-8 encoded."""
    byte_count = len(document.encode('utf-8'))
    bytes_per_mb = 1024 * 1024
    return byte_count / bytes_per_mb

# Pair every document in concatenated_list with its size in MB.
document_sizes = [(doc, get_size_in_mb(doc)) for doc in concatenated_list]

# Largest documents first.
document_sizes_sorted = sorted(document_sizes, key=lambda pair: pair[1], reverse=True)

# Report each size together with a short preview of the content.
for position, (doc, size_in_mb) in enumerate(document_sizes_sorted, start=1):
    preview = doc[:100]  # Displaying only the first 100 chars of the content for brevity
    print(f"Document {position} size: {size_in_mb:.2f} MB\nContent: {preview}...")


## We first need to prepare an Embedding Function to give parameters for generating vector embeddings

In [None]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from google.api_core import retry

from google.genai import types


# Define a helper to retry when per-minute quota is reached.
def is_retriable(e):
    """Return True when ``e`` is a genai APIError whose code is 429 or 503."""
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}


class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that calls the ``models/text-embedding-004`` model.

    ``document_mode`` selects the task type sent with each request:
    ``retrieval_document`` when True, ``retrieval_query`` when False.
    """

    # True -> embed as documents; False -> embed as queries.
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` through the module-level ``client`` and return the embedding values."""
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"
        embed_config = types.EmbedContentConfig(task_type=embedding_task)

        response = client.models.embed_content(
            model="models/text-embedding-004",
            contents=input,
            config=embed_config,
        )
        return [embedding.values for embedding in response.embeddings]

## Adding items to a Chroma db vector database

Ah yes, the bane of my existence. So many iterations, so many official solutions that failed, and so much LLM massaging that always gave me something more, with cryptic error messages in the stack trace. <br>
In the end I settled on a hacky solution: slowly iterating and adding a timed pause between requests to a Google database.

It is slower, but it works. I guess that is better than not working at all. <br>
And the good news is it only needs to be run when you want to update the database. <br>
The SEP team rarely does, as they are super picky about what they let in and who they let edit their knowledge base. <br>

For example: it took quite a bit of time for Rudolf Carnap to get his article because he is such an important figure and it required a massive undertaking to collect his work into an encyclopedia article.

In [None]:
"""
DB_NAME = "googlecardb"
documents = concatenated_list

embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name=DB_NAME, embedding_function=embed_fn)

for i, doc in enumerate(documents):
    db.add(documents=[doc], ids=[str(i)])
    time.sleep(1) # Add a delay after each successful addition
    print(f"Added document with ID: {i}, Content (first 100 chars): {str(doc[:100])}")

print(f"\nFinished adding {len(documents)} documents to the '{DB_NAME}' collection.")
"""


## Querying the database

We now hope and pray everything is in order in the DB

In [None]:
# A notebook cell only echoes its last expression, so the count is printed explicitly.
print(db.count())
# You can peek at the data too.
db.peek(1)

## Testing the embeddings

In [None]:
# Embed the next texts as queries rather than as documents.
embed_fn.document_mode = False

# Look up the single closest document for a sample question.
query = "Explain Abelards Logic?"
result = db.query(n_results=1, query_texts=[query])

# One query was sent, so "documents" is unpacked as exactly one list of passages.
(all_passages,) = result["documents"]
best_passage = all_passages[0]

Markdown(best_passage)

## Giving background using prompting strategies with instructions how to behave and answer but using zero shot prompting

In [None]:
# Keep the question on a single line of the prompt.
query_oneline = " ".join(query.split("\n"))

# Instructions for the model: tone, scope, and how to treat the reference passage.
prompt = f"""You are a helpful and informative bot that answers questions using text from the reference passage included below. You answer with the knowledge of a philosophy postdoc that is trying to bridge complicated subjects to a non-philosophy knowing audience, so be sure to break down complicated concepts and strike a friendly and converstional tone.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
If the passage is irrelevant to the answer, you may ignore it.

QUESTION: {query_oneline}
"""

# Append each retrieved passage as its own single-line "PASSAGE:" entry.
passage_block = "".join(
    "PASSAGE: " + passage.replace("\n", " ") + "\n" for passage in all_passages
)
prompt = prompt + passage_block

print(prompt)

## Generating the answer

In [None]:
# Send the assembled prompt to Gemini and render the reply as Markdown.
generation_request = {"model": "gemini-2.0-flash", "contents": prompt}
answer = client.models.generate_content(**generation_request)

Markdown(answer.text)

## Everything in one place

In [None]:
query = "Explain Abelards Logic."  #< --- Type your question here

# This prompt is where you can specify any guidance on tone, or what topics the model should stick to, or avoid.
PROMPT_TEMPLATE = """You are a helpful and informative bot that answers questions using text from the reference passage included below. You answer with the knowledge of a philosophy postdoc that is trying to bridge complicated subjects to a non-philosophy knowing audience, so be sure to break down complicated concepts and strike a friendly and converstional tone.
Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. 
If the passage is irrelevant to the answer, you may ignore it.
Answer no less than 100 words.

QUESTION: {question}
"""


def build_prompt(question, passages):
    """Return the instructions, the question, and one "PASSAGE:" line per passage.

    Newlines inside the question and inside each passage are replaced by
    spaces so that each occupies a single line of the prompt.

    Args:
        question (str): The user's question.
        passages (list): Retrieved passage strings to include.

    Returns:
        str: The complete prompt text.
    """
    parts = [PROMPT_TEMPLATE.format(question=question.replace("\n", " "))]
    for passage in passages:
        passage_oneline = passage.replace("\n", " ")
        parts.append(f"PASSAGE: {passage_oneline}\n")
    return "".join(parts)


def answer_question(question, n_results=1):
    """Query the Chroma collection for ``question`` and ask Gemini to answer it.

    Args:
        question (str): The user's question.
        n_results (int): Number of passages to retrieve and put in the prompt.

    Returns:
        The response returned by ``client.models.generate_content``; its
        ``text`` attribute holds the answer.
    """
    # Switch to query mode when generating embeddings.
    embed_fn.document_mode = False

    # Search the Chroma DB using the specified query.
    result = db.query(query_texts=[question], n_results=n_results)
    [passages] = result["documents"]

    return client.models.generate_content(
        model="gemini-2.0-flash",
        contents=build_prompt(question, passages),
    )


answer = answer_question(query)

Markdown(answer.text)