2 changes: 2 additions & 0 deletions README.md
@@ -5,6 +5,8 @@ A collection of use cases using MariaDB.

* **mariadb-kb-rag** - An example of doing AI RAG with MariaDB Vector using as content over one thousand articles scraped from the MariaDB Knowledge Base [mariadb.com/kb/](http://mariadb.com/kb). As described in blog [Try RAG with MariaDB Vector on your own MariaDB data!](https://mariadb.org/rag-with-mariadb-vector/).

* **wiki-rag-navigator** - A tool that navigates an indexed area of Wikipedia and generates suggested additions and improvements to articles, using MariaDB Vector and OpenAI. The project was submitted for the [MariaDB RAG hackathon](https://mariadb.org/helsinki-python-meetup-with-mariadb/); see the [project readme](wiki-rag-navigator/readme.md) for more details. The original repository can be found [here](https://github.com/djstockma/wiki-rag-enhancer).


## How to add a demo?
1. Fork the repo.
7 changes: 7 additions & 0 deletions wiki-rag-navigator/.env.example
@@ -0,0 +1,7 @@
DB_HOST=localhost
DB_PORT=3306
DB_USER=raguser
DB_PASSWORD=ragpass
DB_NAME=ragdb

OPENAI_API_KEY=your_openai_api_key
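
For context, here is a minimal sketch of how these variables could be consumed on the Python side with python-dotenv and MariaDB Connector/Python. The real helper lives in utils/db.py, which is not shown in this diff, so the function below is illustrative only:

```python
# Illustrative only: reading the .env values above for a MariaDB connection.
import os

import mariadb  # assumes MariaDB Connector/Python is listed in requirements.txt
from dotenv import load_dotenv

load_dotenv()  # load variables from .env in the working directory

def get_connection():
    # Hypothetical stand-in for the helper in utils/db.py.
    return mariadb.connect(
        host=os.getenv("DB_HOST", "localhost"),
        port=int(os.getenv("DB_PORT", "3306")),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        database=os.getenv("DB_NAME"),
    )
```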
10 changes: 10 additions & 0 deletions wiki-rag-navigator/.gitignore
@@ -0,0 +1,10 @@
# Env
*.env

# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd
*.sqlite3
*.log
21 changes: 21 additions & 0 deletions wiki-rag-navigator/LICENSE.txt
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Jens Stockmann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
28 changes: 28 additions & 0 deletions wiki-rag-navigator/app/Dockerfile
@@ -0,0 +1,28 @@
FROM python:3.11-slim

# Install system dependencies (MariaDB Connector/C + build tools)
RUN apt-get update && apt-get install -y \
libmariadb-dev \
gcc \
g++ \
make \
python3-dev \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

RUN pip install --upgrade pip

# Install wheel (needed to build the wikipedia package)
RUN pip install --no-cache-dir wheel

# Install dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt

# Copy app code
COPY . .

CMD ["python", "main.py"]
123 changes: 123 additions & 0 deletions wiki-rag-navigator/app/app_ui.py
@@ -0,0 +1,123 @@
import streamlit as st
from load_db import load_db
from find_matches import find_matches, find_relevant_articles
from generate_suggestions import suggest_wikipedia_additions
from utils.logging_config import get_logger
from utils.generate_markdown_diff import generate_markdown_diff
from utils.parse_article import extract_article_text

logger = get_logger()

def main():
st.set_page_config(page_title="Wikipedia RAG enhancer", layout="wide")
st.title("Wikipedia RAG enhancer")

st.header("Paste Source Text or URL (http:// or https://) here")
source_text = st.text_area("Paste the source you want to use for improvement here:", height=400)

st.sidebar.header("Settings")
n_chunks_per_article = st.sidebar.slider("How many chunks to fetch", min_value=1, max_value=100, value=20)

st.sidebar.subheader("Wikipedia and embedding")
    if st.sidebar.button("Load data from Wikipedia and embed"):
with st.spinner("Embedding Wikipedia articles..."):
n_of_embedded_articles = load_db()
st.sidebar.success(f"Embedding complete! {n_of_embedded_articles} articles embedded")
if "selected_chunks" not in st.session_state:
st.session_state.selected_chunks = {}

if "grouped_matches" not in st.session_state:
st.session_state.grouped_matches = {}

if st.button("Check"):
if not source_text:
st.warning("Please paste some text first!")
return

if source_text.startswith("http://") or source_text.startswith("https://"):
try:
source_text = extract_article_text(source_text)
if source_text:
st.success("Article content successfully extracted.")
st.subheader("Extracted Article Content:")
st.write(f"{source_text[0:200]}...") # Display first 100 characters
else:
st.error(f"Failed to extract article: {e}")
source_text = ""
return
except Exception as e:
st.error(f"Failed to extract article: {e}")
source_text = ""
return

grouped_matches = {}
matches = find_matches(text=source_text, n_chunks=n_chunks_per_article)
for match in matches:
article_title = match[3]
if article_title not in grouped_matches:
grouped_matches[article_title] = []
grouped_matches[article_title].append(match) #FIXME: find a way to sort appearance of chunks!

st.session_state.grouped_matches = grouped_matches
st.session_state.selected_chunks = {}

if st.session_state.grouped_matches:
st.header("Top Matching Wikipedia Chunks")

for article_title, chunks in st.session_state.grouped_matches.items():
st.subheader(f"{article_title}")
for idx, chunk in enumerate(chunks, 1):
chunk_text = chunk[1]
chunk_certainty = chunk[5]
chunk_number = chunk[4]
st.markdown(f"#### Chunk {chunk_number} (Certainty: {chunk_certainty:.2f})")
key = f"{article_title}_{idx}"

# Default value depends on session_state
checkbox_val = st.checkbox("Select this chunk", key=key, value=st.session_state.get(key, False))

if checkbox_val:
st.session_state.selected_chunks[key] = True
else:
if key in st.session_state.selected_chunks:
del st.session_state.selected_chunks[key]

st.write(chunk_text)

if st.button("Proceed with Selected Chunks"):
selected_data = []
for article_title, chunks in st.session_state.grouped_matches.items():
for idx, chunk in enumerate(chunks, 1):
key = f"{article_title}_{idx}"
if st.session_state.selected_chunks.get(key):
selected_data.append({
"article_title": article_title,
"chunk_text": chunk[1],
"chunk_index": chunk[4],
"chunk_id": chunk[0],
"edit_url": chunk[6],
})
if not selected_data:
st.warning("Please select at least one chunk.")
else:
with st.spinner("Generating LLM suggestions..."):
suggestions = suggest_wikipedia_additions(
wiki_chunks=selected_data,
source_text=source_text,
)

st.success("Suggestions generated!")
st.subheader("Suggested Additions to Wikipedia")
for suggestion in suggestions:
diff_markdown = generate_markdown_diff(
suggestion["original_chunk"],
suggestion["improved_chunk"]
)
st.markdown(diff_markdown)
st.markdown(f"**Justification:** {suggestion['justification']} **Edit [here]({suggestion['edit_url']})**")

# Uncomment if you want to see the full suggestion
#st.write(suggestion)

if __name__ == "__main__":
main()
19 changes: 19 additions & 0 deletions wiki-rag-navigator/app/find_matches.py
@@ -0,0 +1,19 @@
from utils.db import find_best_matches, get_relevant_article_counts, get_connection
from utils.embedding import embed_text

def find_matches(text, n_chunks=1, article: str = None):
"""Fetches n matches for ONE article. Returns list[id, text, embedding, article_title, chunk_index, certainty]"""
conn = get_connection()
embedded = embed_text(text)
if article:
result = find_best_matches(conn, embedded, n_chunks, [article])
else:
result = find_best_matches(conn, embedded, n_chunks, [])
return result

def find_relevant_articles(text, n=1000):
    """Embed the text and return relevant-article counts over the top n matching chunks."""
conn = get_connection()
embedded = embed_text(text)
counts = get_relevant_article_counts(conn, embedded, n=n)
return counts
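
A minimal usage sketch of these helpers, assuming the database has already been populated (e.g. via load_db) and that the result tuples are laid out the way app_ui.py indexes them; the sample text and article title are illustrative:

```python
# Illustrative usage; not part of the diff.
from find_matches import find_matches, find_relevant_articles

source = "The 2024 eruption reshaped the island's northern coastline."

# Top 5 chunks across all indexed articles; each row is
# (id, text, embedding, article_title, chunk_index, certainty, edit_url).
for row in find_matches(source, n_chunks=5):
    print(row[3], row[4], f"certainty={row[5]:.2f}")

# Restrict matching to a single article title (assumed to exist in the index).
article_matches = find_matches(source, n_chunks=3, article="Volcanology")

# Count how many of the top-n chunks each article contributes.
counts = find_relevant_articles(source, n=100)
```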
119 changes: 119 additions & 0 deletions wiki-rag-navigator/app/generate_suggestions.py
@@ -0,0 +1,119 @@
import json
import os

from openai import OpenAI
from dotenv import load_dotenv
from utils.logging_config import get_logger

load_dotenv()
logger = get_logger()

def suggest_wikipedia_additions(wiki_chunks: list[dict], source_text: str, model="gpt-4o-mini") -> list[dict]:
"""
Uses GPT to compare Wikipedia content with a source and suggest new facts to add.

Args:
        wiki_chunks (list[dict]): Relevant chunks from Wikipedia with their metadata
            (keys: article_title, chunk_text, chunk_index, chunk_id, edit_url).
        source_text (str): The source article (e.g. a news article).
        model (str): OpenAI model to use, default is gpt-4o-mini.

    Returns:
        list[dict]: JSON-parsed suggestions (structured output from GPT), each with the
            original chunk text and edit URL attached.
"""

# Combining the Wikipedia chunks and metadata
joined_chunks = ""
for chunk in wiki_chunks:
# Extract the chunk text and metadata
chunk_text = chunk["chunk_text"]
chunk_number = chunk["chunk_id"]
chunk_title = chunk["article_title"]
# Format the chunk with its metadata
joined_chunks += f"#### Chunk {chunk_number} \n Title and subtitle: {chunk_title} \n Chunk Text: \n{chunk_text}\n\n"

# Construct the input prompt
user_prompt = f"""Your task is to:
1. Compare the source text to the Wikipedia content chunks.
2. Identify facts that are in the source but missing from the Wikipedia.
3. Output proposals for new text in structured JSON format like this:
{{
"proposed_additions": [
{{
"chunk_title": "(Sub)title of the chunk",
"chunk_id": "id of modified chunk, just the number (eg. "25")",
"improved_chunk": "Original chunk text modified with added improvement.",
"justification": "Why it's relevant for the article.",
"section_hint": "Optional: which section it fits into (if any)."
}},
...
]
}}

Please don't remove anything from the Wikipedia content, only make additions and return the improved chunk text (don't return the subtitle).

Wikipedia content:
<<<
{joined_chunks}
>>>

Source text:
<<<
{source_text}
>>>"""
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
# Send to OpenAI
response = client.responses.create(
model=model,
instructions="You are a factual assistant helping improve Wikipedia articles by comparing them "
"to reliable sources and identifying missing but relevant content. "
"You generate the answers in the same language as the input, and adher to linguistic conventions of wikipedia.",
input=user_prompt
)

# Extract and try to parse the structured response
reply = response.output_text

    # Try to safely parse JSON if it's well-formed
try:
start = reply.find("{")
end = reply.rfind("}")
trimmed = reply[start:end + 1]
parsed: dict = json.loads(trimmed)
        additions: list[dict] = parsed.get("proposed_additions", [])
final_additions = []
for addition in additions:
# Ensure all required fields are present
if "chunk_id" not in addition or "improved_chunk" not in addition or "justification" not in addition:
logger.warning("Warning: Missing required fields in JSON response.")
continue

# Add original chunk text for reference
chunk_id_raw = addition["chunk_id"]
try:
chunk_id = int(chunk_id_raw)
except ValueError:
logger.warning(f"Invalid chunk_id value: {chunk_id_raw}")
continue

            # Find the matching chunk in wiki_chunks by its chunk_id
            (original_chunk, edit_url) = next(
                ((chunk["chunk_text"], chunk["edit_url"]) for chunk in wiki_chunks if int(chunk["chunk_id"]) == chunk_id),
                (None, None)
            )
if original_chunk is None:
logger.warning(f"No matching original chunk found for chunk_id={chunk_id}")
continue

addition["original_chunk"] = original_chunk
addition["edit_url"] = edit_url
final_additions.append(addition)

return final_additions

    except ValueError as ve:
        logger.warning(f"Warning: Could not parse JSON response: {ve}")
        return []
    except Exception as e:
        logger.warning(f"Warning: Could not parse JSON response: {e}")
        return []
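
A minimal sketch of calling suggest_wikipedia_additions directly. The chunk dict mirrors the selected_data built in app_ui.py; the field values are made up for demonstration, and OPENAI_API_KEY must be set in the environment:

```python
# Illustrative call; field values below are hypothetical.
from generate_suggestions import suggest_wikipedia_additions

chunks = [{
    "article_title": "Volcanology",
    "chunk_text": "Volcanology is the study of volcanoes, lava, and magma...",
    "chunk_index": 2,
    "chunk_id": 25,
    "edit_url": "https://en.wikipedia.org/w/index.php?title=Volcanology&action=edit",
}]

suggestions = suggest_wikipedia_additions(
    wiki_chunks=chunks,
    source_text="A 2024 survey of active volcanoes found that ...",
)
for s in suggestions:
    print(s["chunk_id"], s["justification"], s["edit_url"])
```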