2 changes: 2 additions & 0 deletions README.md
@@ -5,6 +5,8 @@ A collection of use cases using MariaDB.

* **mariadb-kb-rag** - An example of doing AI RAG with MariaDB Vector using as content over one thousand articles scraped from the MariaDB Knowledge Base [mariadb.com/kb/](http://mariadb.com/kb). As described in blog [Try RAG with MariaDB Vector on your own MariaDB data!](https://mariadb.org/rag-with-mariadb-vector/).

* **wiki-rag-navigator** - A tool that navigates an indexed area of Wikipedia and generates suggested additions and improvements to articles, using MariaDB Vector and OpenAI. The project was submitted for the [MariaDB RAG hackathon](https://mariadb.org/helsinki-python-meetup-with-mariadb/); see the [project readme](wiki-rag-navigator/readme.md) for more details. The original repository can be found [here](https://github.com/djstockma/wiki-rag-enhancer).


## How to add a demo?
1. Fork the repo.
7 changes: 7 additions & 0 deletions wiki-rag-navigator/.env.example
@@ -0,0 +1,7 @@
DB_HOST=localhost
DB_PORT=3306
DB_USER=raguser
DB_PASSWORD=ragpass
DB_NAME=ragdb

OPENAI_API_KEY=your_openai_api_key
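
For context, here is a minimal sketch of how these variables could be consumed on the Python side with python-dotenv and MariaDB Connector/Python. The real helper lives in utils/db.py, which is not shown in this diff, so the function below is illustrative only:

```python
# Illustrative only: reading the .env values above for a MariaDB connection.
import os

import mariadb  # assumes MariaDB Connector/Python is listed in requirements.txt
from dotenv import load_dotenv

load_dotenv()  # load variables from .env in the working directory

def get_connection():
    # Hypothetical stand-in for the helper in utils/db.py.
    return mariadb.connect(
        host=os.getenv("DB_HOST", "localhost"),
        port=int(os.getenv("DB_PORT", "3306")),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        database=os.getenv("DB_NAME"),
    )
```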
10 changes: 10 additions & 0 deletions wiki-rag-navigator/.gitignore
@@ -0,0 +1,10 @@
# Env
*.env

# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd
*.sqlite3
*.log
21 changes: 21 additions & 0 deletions wiki-rag-navigator/LICENSE.txt
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Jens Stockmann

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
28 changes: 28 additions & 0 deletions wiki-rag-navigator/app/Dockerfile
@@ -0,0 +1,28 @@
FROM python:3.11-slim

# Install system dependencies (MariaDB Connector/C + build tools)
RUN apt-get update && apt-get install -y \
libmariadb-dev \
gcc \
g++ \
make \
python3-dev \
build-essential \
&& rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

RUN pip install --upgrade pip

# Install wheel (needed to build the wikipedia package)
RUN pip install --no-cache-dir wheel

# Install dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt

# Copy app code
COPY . .

CMD ["python", "main.py"]
123 changes: 123 additions & 0 deletions wiki-rag-navigator/app/app_ui.py
@@ -0,0 +1,123 @@
import streamlit as st
from load_db import load_db
from find_matches import find_matches, find_relevant_articles
from generate_suggestions import suggest_wikipedia_additions
from utils.logging_config import get_logger
from utils.generate_markdown_diff import generate_markdown_diff
from utils.parse_article import extract_article_text

logger = get_logger()

def main():
st.set_page_config(page_title="Wikipedia RAG enhancer", layout="wide")
st.title("Wikipedia RAG enhancer")

st.header("Paste Source Text or URL (http:// or https://) here")
source_text = st.text_area("Paste the source you want to use for improvement here:", height=400)

st.sidebar.header("Settings")
n_chunks_per_article = st.sidebar.slider("How many chunks to fetch", min_value=1, max_value=100, value=20)

st.sidebar.subheader("Wikipedia and embedding")
    if st.sidebar.button("Load data from Wikipedia and embed"):
with st.spinner("Embedding Wikipedia articles..."):
n_of_embedded_articles = load_db()
st.sidebar.success(f"Embedding complete! {n_of_embedded_articles} articles embedded")
if "selected_chunks" not in st.session_state:
st.session_state.selected_chunks = {}

if "grouped_matches" not in st.session_state:
st.session_state.grouped_matches = {}

if st.button("Check"):
if not source_text:
st.warning("Please paste some text first!")
return

if source_text.startswith("http://") or source_text.startswith("https://"):
try:
source_text = extract_article_text(source_text)
if source_text:
st.success("Article content successfully extracted.")
st.subheader("Extracted Article Content:")
st.write(f"{source_text[0:200]}...") # Display first 100 characters
else:
st.error(f"Failed to extract article: {e}")
source_text = ""
return
except Exception as e:
st.error(f"Failed to extract article: {e}")
source_text = ""
return

grouped_matches = {}
matches = find_matches(text=source_text, n_chunks=n_chunks_per_article)
for match in matches:
article_title = match[3]
if article_title not in grouped_matches:
grouped_matches[article_title] = []
grouped_matches[article_title].append(match) #FIXME: find a way to sort appearance of chunks!

st.session_state.grouped_matches = grouped_matches
st.session_state.selected_chunks = {}

if st.session_state.grouped_matches:
st.header("Top Matching Wikipedia Chunks")

for article_title, chunks in st.session_state.grouped_matches.items():
st.subheader(f"{article_title}")
for idx, chunk in enumerate(chunks, 1):
chunk_text = chunk[1]
chunk_certainty = chunk[5]
chunk_number = chunk[4]
st.markdown(f"#### Chunk {chunk_number} (Certainty: {chunk_certainty:.2f})")
key = f"{article_title}_{idx}"

# Default value depends on session_state
checkbox_val = st.checkbox("Select this chunk", key=key, value=st.session_state.get(key, False))

if checkbox_val:
st.session_state.selected_chunks[key] = True
else:
if key in st.session_state.selected_chunks:
del st.session_state.selected_chunks[key]

st.write(chunk_text)

if st.button("Proceed with Selected Chunks"):
selected_data = []
for article_title, chunks in st.session_state.grouped_matches.items():
for idx, chunk in enumerate(chunks, 1):
key = f"{article_title}_{idx}"
if st.session_state.selected_chunks.get(key):
selected_data.append({
"article_title": article_title,
"chunk_text": chunk[1],
"chunk_index": chunk[4],
"chunk_id": chunk[0],
"edit_url": chunk[6],
})
if not selected_data:
st.warning("Please select at least one chunk.")
else:
with st.spinner("Generating LLM suggestions..."):
suggestions = suggest_wikipedia_additions(
wiki_chunks=selected_data,
source_text=source_text,
)

st.success("Suggestions generated!")
st.subheader("Suggested Additions to Wikipedia")
for suggestion in suggestions:
diff_markdown = generate_markdown_diff(
suggestion["original_chunk"],
suggestion["improved_chunk"]
)
st.markdown(diff_markdown)
st.markdown(f"**Justification:** {suggestion['justification']} **Edit [here]({suggestion['edit_url']})**")

# Uncomment if you want to see the full suggestion
#st.write(suggestion)

if __name__ == "__main__":
main()
19 changes: 19 additions & 0 deletions wiki-rag-navigator/app/find_matches.py
@@ -0,0 +1,19 @@
from utils.db import find_best_matches, get_relevant_article_counts, get_connection
from utils.embedding import embed_text

def find_matches(text, n_chunks=1, article: str = None):
"""Fetches n matches for ONE article. Returns list[id, text, embedding, article_title, chunk_index, certainty]"""
conn = get_connection()
embedded = embed_text(text)
if article:
result = find_best_matches(conn, embedded, n_chunks, [article])
else:
result = find_best_matches(conn, embedded, n_chunks, [])
return result

def find_relevant_articles(text, n=1000):
    """Embed the text and return relevant-article counts over the top n matching chunks."""
conn = get_connection()
embedded = embed_text(text)
counts = get_relevant_article_counts(conn, embedded, n=n)
return counts
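
A minimal usage sketch of these helpers, assuming the database has already been populated (e.g. via load_db) and that the result tuples are laid out the way app_ui.py indexes them; the sample text and article title are illustrative:

```python
# Illustrative usage; not part of the diff.
from find_matches import find_matches, find_relevant_articles

source = "The 2024 eruption reshaped the island's northern coastline."

# Top 5 chunks across all indexed articles; each row is
# (id, text, embedding, article_title, chunk_index, certainty, edit_url).
for row in find_matches(source, n_chunks=5):
    print(row[3], row[4], f"certainty={row[5]:.2f}")

# Restrict matching to a single article title (assumed to exist in the index).
article_matches = find_matches(source, n_chunks=3, article="Volcanology")

# Count how many of the top-n chunks each article contributes.
counts = find_relevant_articles(source, n=100)
```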
119 changes: 119 additions & 0 deletions wiki-rag-navigator/app/generate_suggestions.py
@@ -0,0 +1,119 @@
import json
import os

from openai import OpenAI
from dotenv import load_dotenv
from utils.logging_config import get_logger

load_dotenv()
logger = get_logger()

def suggest_wikipedia_additions(wiki_chunks: list[dict], source_text: str, model="gpt-4o-mini") -> list[dict]:
"""
Uses GPT to compare Wikipedia content with a source and suggest new facts to add.

Args:
        wiki_chunks (list[dict]): Relevant chunks from Wikipedia with their metadata
            (keys: article_title, chunk_text, chunk_index, chunk_id, edit_url).
        source_text (str): The source article (e.g. a news article).
        model (str): OpenAI model to use, default is gpt-4o-mini.

    Returns:
        list[dict]: JSON-parsed suggestions (structured output from GPT), each with the
            original chunk text and edit URL attached.
"""

# Combining the Wikipedia chunks and metadata
joined_chunks = ""
for chunk in wiki_chunks:
# Extract the chunk text and metadata
chunk_text = chunk["chunk_text"]
chunk_number = chunk["chunk_id"]
chunk_title = chunk["article_title"]
# Format the chunk with its metadata
joined_chunks += f"#### Chunk {chunk_number} \n Title and subtitle: {chunk_title} \n Chunk Text: \n{chunk_text}\n\n"

# Construct the input prompt
user_prompt = f"""Your task is to:
1. Compare the source text to the Wikipedia content chunks.
2. Identify facts that are in the source but missing from the Wikipedia.
3. Output proposals for new text in structured JSON format like this:
{{
"proposed_additions": [
{{
"chunk_title": "(Sub)title of the chunk",
"chunk_id": "id of modified chunk, just the number (eg. "25")",
"improved_chunk": "Original chunk text modified with added improvement.",
"justification": "Why it's relevant for the article.",
"section_hint": "Optional: which section it fits into (if any)."
}},
...
]
}}

Please don't remove anything from the Wikipedia content, only make additions and return the improved chunk text (don't return the subtitle).

Wikipedia content:
<<<
{joined_chunks}
>>>

Source text:
<<<
{source_text}
>>>"""
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
# Send to OpenAI
response = client.responses.create(
model=model,
instructions="You are a factual assistant helping improve Wikipedia articles by comparing them "
"to reliable sources and identifying missing but relevant content. "
"You generate the answers in the same language as the input, and adher to linguistic conventions of wikipedia.",
input=user_prompt
)

# Extract and try to parse the structured response
reply = response.output_text

    # Try to safely parse JSON if it's well-formed
try:
start = reply.find("{")
end = reply.rfind("}")
trimmed = reply[start:end + 1]
parsed: dict = json.loads(trimmed)
        additions: list[dict] = parsed.get("proposed_additions", [])
final_additions = []
for addition in additions:
# Ensure all required fields are present
if "chunk_id" not in addition or "improved_chunk" not in addition or "justification" not in addition:
logger.warning("Warning: Missing required fields in JSON response.")
continue

# Add original chunk text for reference
chunk_id_raw = addition["chunk_id"]
try:
chunk_id = int(chunk_id_raw)
except ValueError:
logger.warning(f"Invalid chunk_id value: {chunk_id_raw}")
continue

            # Find the matching chunk in wiki_chunks by its chunk_id
            (original_chunk, edit_url) = next(
                ((chunk["chunk_text"], chunk["edit_url"]) for chunk in wiki_chunks if int(chunk["chunk_id"]) == chunk_id),
                (None, None)
            )
if original_chunk is None:
logger.warning(f"No matching original chunk found for chunk_id={chunk_id}")
continue

addition["original_chunk"] = original_chunk
addition["edit_url"] = edit_url
final_additions.append(addition)

return final_additions

    except ValueError as ve:
        logger.warning(f"Warning: Could not parse JSON response: {ve}")
        return []
    except Exception as e:
        logger.warning(f"Warning: Could not parse JSON response: {e}")
        return []
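
A minimal sketch of calling suggest_wikipedia_additions directly. The chunk dict mirrors the selected_data built in app_ui.py; the field values are made up for demonstration, and OPENAI_API_KEY must be set in the environment:

```python
# Illustrative call; field values below are hypothetical.
from generate_suggestions import suggest_wikipedia_additions

chunks = [{
    "article_title": "Volcanology",
    "chunk_text": "Volcanology is the study of volcanoes, lava, and magma...",
    "chunk_index": 2,
    "chunk_id": 25,
    "edit_url": "https://en.wikipedia.org/w/index.php?title=Volcanology&action=edit",
}]

suggestions = suggest_wikipedia_additions(
    wiki_chunks=chunks,
    source_text="A 2024 survey of active volcanoes found that ...",
)
for s in suggestions:
    print(s["chunk_id"], s["justification"], s["edit_url"])
```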