In [2]:
import sys
import os
# Put the parent of the current working directory at the front of the import
# path so the local `modules` package imported below can be found.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path[:0] = [parent_dir]
from modules.schema_analyzer import SchemaAnalyzer

In [3]:
# Run the schema analysis on the Chinook JSON file.
analyzer = SchemaAnalyzer("../knowledge_base/ChinookData.json")
result = analyzer.analyze()

# Also fetch raw excerpts of the file, sized by `number_of_lines`.
number_of_lines = 10
snippets = analyzer.get_file_snippets(n=number_of_lines)


In [4]:
import pprint

# Pretty-print the analysis result, keeping dict keys in insertion order
# (`pprint.pp` does not sort dictionaries by default).
pprint.pp(result)

{'Genre': ['array of', {'GenreId': 'integer', 'Name': 'string'}],
 'MediaType': ['array of', {'Name': 'string', 'MediaTypeId': 'integer'}],
 'Artist': ['array of', {'ArtistId': 'integer', 'Name': 'string'}],
 'Album': ['array of',
           {'AlbumId': 'integer', 'ArtistId': 'integer', 'Title': 'string'}],
 'Track': ['array of',
           {'GenreId': 'integer',
            'MediaTypeId': 'integer',
            'Name': 'string',
            'Composer': 'string',
            'UnitPrice': 'number',
            'Bytes': 'integer',
            'TrackId': 'integer',
            'Milliseconds': 'integer',
            'AlbumId': 'integer'}],
 'Employee': ['array of',
              {'LastName': 'string',
               'Country': 'string',
               'State': 'string',
               'PostalCode': 'string',
               'Title': 'string',
               'Address': 'string',
               'Fax': 'string',
               'EmployeeId': 'integer',
               'BirthDate': 'string',
    

In [5]:
# 2. Fetch the data excerpts (n=10 asks for 10 lines)
snippets = analyzer.get_file_snippets(n=10)

# Pull the three excerpts out of the returned mapping; a missing key yields None.
head_str, middle_str, tail_str = (
    snippets.get(section) for section in ('head', 'middle', 'tail')
)

In [6]:
import json
import tiktoken

# 1. Serialise the inferred schema into a JSON string
schema_str = json.dumps(result, indent=2)

# 2. Pick the tokenizer for the target model (or "gpt-3.5-turbo", etc.)
encoding = tiktoken.encoding_for_model("gpt-4")

# 3. Tokenize and report how many tokens the schema string takes
tokens = encoding.encode(schema_str)
token_count = len(tokens)
print("Token-Anzahl:", token_count)

Token-Anzahl: 671


In [7]:
import getpass
import os

# Prompt for the key only when the environment variable is missing or empty;
# getpass keeps the typed value out of the notebook output.
_key_name = "OPENAI_API_KEY"
if not os.environ.get(_key_name):
    os.environ[_key_name] = getpass.getpass("Enter your OpenAI API key: ")

In [8]:
from langchain_openai import ChatOpenAI

# Settings for the chat client: GPT-4, temperature 0, no explicit token or
# timeout limit, max_retries=2.
_chat_options = dict(
    model="gpt-4",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
llm = ChatOpenAI(**_chat_options)

In [9]:
# System instruction framing the task for the model.
system_prompt = "Your task is to analyze the data and understand its schema, then create a corresponding LinkML schema. This should serve as a transformation proposal for a relational database."

# User request: embeds the schema JSON plus the tail, head and middle excerpts.
human_prompt = f"Can you generate a LinkML schema that reflects the data structure, its format, and relationships? Please provide it in a code block. The schema is: result {schema_str}. The file is a JSON. \n Tail of the file: {tail_str} \n Head: {head_str} \n Middle: {middle_str}"

# (role, content) pairs in the order they are sent.
messages = [
    ("system", system_prompt),
    ("human", human_prompt),
]

# One blocking request; the reply object is kept in `ai_msg`.
ai_msg = llm.invoke(messages)

In [10]:
from IPython.display import display, Markdown
# Render the model's reply as Markdown so the fenced YAML block is formatted.
reply_markdown = Markdown(str(ai_msg.content))
display(reply_markdown)

Based on the provided data, the LinkML schema would look like this:

```yaml
id: http://example.org/schema/
name: music_store

types:
  id:
    base: int
    uri: xsd:integer
  string:
    base: str
    uri: xsd:string
  number:
    base: float
    uri: xsd:float

classes:
  Genre:
    slots:
      - id
      - name
    slot_usage:
      id:
        range: id
      name:
        range: string
  MediaType:
    slots:
      - id
      - name
    slot_usage:
      id:
        range: id
      name:
        range: string
  Artist:
    slots:
      - id
      - name
    slot_usage:
      id:
        range: id
      name:
        range: string
  Album:
    slots:
      - id
      - artist_id
      - title
    slot_usage:
      id:
        range: id
      artist_id:
        range: id
      title:
        range: string
  Track:
    slots:
      - id
      - genre_id
      - media_type_id
      - name
      - composer
      - unit_price
      - bytes
      - milliseconds
      - album_id
    slot_usage:
      id:
        range: id
      genre_id:
        range: id
      media_type_id:
        range: id
      name:
        range: string
      composer:
        range: string
      unit_price:
        range: number
      bytes:
        range: id
      milliseconds:
        range: id
      album_id:
        range: id
  Employee:
    slots:
      - id
      - last_name
      - country
      - state
      - postal_code
      - title
      - address
      - fax
      - birth_date
      - city
      - hire_date
      - email
      - reports_to
      - first_name
      - phone
    slot_usage:
      id:
        range: id
      last_name:
        range: string
      country:
        range: string
      state:
        range: string
      postal_code:
        range: string
      title:
        range: string
      address:
        range: string
      fax:
        range: string
      birth_date:
        range: string
      city:
        range: string
      hire_date:
        range: string
      email:
        range: string
      reports_to:
        range: id
      first_name:
        range: string
      phone:
        range: string
  Customer:
    slots:
      - id
      - last_name
      - country
      - state
      - postal_code
      - address
      - fax
      - support_rep_id
      - city
      - email
      - company
      - first_name
      - phone
    slot_usage:
      id:
        range: id
      last_name:
        range: string
      country:
        range: string
      state:
        range: string
      postal_code:
        range: string
      address:
        range: string
      fax:
        range: string
      support_rep_id:
        range: id
      city:
        range: string
      email:
        range: string
      company:
        range: string
      first_name:
        range: string
      phone:
        range: string
  Invoice:
    slots:
      - id
      - customer_id
      - billing_postal_code
      - billing_state
      - billing_city
      - total
      - invoice_date
      - billing_address
      - billing_country
    slot_usage:
      id:
        range: id
      customer_id:
        range: id
      billing_postal_code:
        range: string
      billing_state:
        range: string
      billing_city:
        range: string
      total:
        range: number
      invoice_date:
        range: string
      billing_address:
        range: string
      billing_country:
        range: string
  InvoiceLine:
    slots:
      - id
      - invoice_id
      - track_id
      - unit_price
      - quantity
    slot_usage:
      id:
        range: id
      invoice_id:
        range: id
      track_id:
        range: id
      unit_price:
        range: number
      quantity:
        range: id
  Playlist:
    slots:
      - id
      - name
    slot_usage:
      id:
        range: id
      name:
        range: string
  PlaylistTrack:
    slots:
      - playlist_id
      - track_id
    slot_usage:
      playlist_id:
        range: id
      track_id:
        range: id
```

This schema reflects the structure of the data, including the relationships between different entities (e.g., a track belongs to an album, an album belongs to an artist, etc.). It also specifies the data types for each attribute.

In [11]:
import json
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path

import gradio as gr
from gradio import ChatMessage
from langchain.prompts.chat import ChatPromptTemplate
from langchain.schema import HumanMessage, SystemMessage
from langchain.chat_models import init_chat_model
import getpass
import os

# Ask for the OpenAI key interactively unless the environment already has one.
_openai_key_var = "OPENAI_API_KEY"
if not os.environ.get(_openai_key_var):
    os.environ[_openai_key_var] = getpass.getpass("Enter API key for OpenAI: ")

# Chat model that `respond` streams from.
llm = init_chat_model("gpt-4o", model_provider="openai")

# Directory scanned for selectable files, and the suffixes the app accepts.
KB_DIR = Path("../knowledge_base")
SUPPORTED_EXTS = [".csv", ".json", ".xlsx", ".xml"]

def list_kb_files():
    """Return the names of the supported data files found in ``KB_DIR``.

    Only regular files directly inside ``KB_DIR`` whose suffix
    (compared case-insensitively) is listed in ``SUPPORTED_EXTS`` are
    returned. Directories that happen to carry a supported suffix are
    skipped. The names are sorted so the checkbox choices built from this
    list appear in a stable order regardless of how the filesystem
    enumerates the directory.

    Returns:
        list[str]: Bare file names (no directory part), sorted.
    """
    return sorted(
        entry.name
        for entry in KB_DIR.glob("*")
        if entry.is_file() and entry.suffix.lower() in SUPPORTED_EXTS
    )

def _build_file_block(path, n_lines=10):
    """Render the prompt section describing one data file.

    Args:
        path: ``pathlib.Path`` of the file to analyse.
        n_lines: Value passed as ``n`` to ``get_file_snippets``.

    Returns:
        str: A text block with the inferred schema (as indented JSON) and
        the head/middle/tail snippets, or a short error block if analysing
        the file raised.
    """
    try:
        analyzer = SchemaAnalyzer(str(path))
        schema_result = analyzer.analyze()
        snippets = analyzer.get_file_snippets(n=n_lines)

        schema_str = json.dumps(schema_result, indent=2)
        head_str = snippets.get("head", "")
        middle_str = snippets.get("middle", "")
        tail_str = snippets.get("tail", "")

        # The leading whitespace inside this literal is part of the prompt
        # text that is sent to the model; it is kept exactly as-is.
        return f"""\n=== File: {path.name} ===\n
            📊 Schema:
            {schema_str}

            📄 Head:
            {head_str}

            📄 Middle:
            {middle_str}

            📄 Tail:
            {tail_str}
            """
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole request, so the failure is reported inside the context.
        return f"=== File: {path.name} ===\n❌ Error while parsing: {str(e)}\n"


async def respond(message, chat_history, selected_files, uploaded_files):
    """Stream a LinkML schema proposal for the chosen files.

    Args:
        message: The user's chat message.
        chat_history: Previous turns supplied by the chat UI. Currently
            unused: every request is answered from ``message`` and the
            file context only.
        selected_files: File names relative to ``KB_DIR``; ``None`` or an
            empty list means nothing was selected.
        uploaded_files: Uploaded files, either as path strings or as
            objects exposing the path via ``.name``; ``None`` or an empty
            list means nothing was uploaded.

    Yields:
        str: The accumulated model answer so far, once per non-empty
        streamed chunk.
    """
    # Treat a missing selection like an empty one, as is done for uploads.
    selected_paths = [KB_DIR / name for name in selected_files or []]
    # Accept both plain path strings and objects that carry `.name`.
    uploaded_paths = [Path(getattr(f, "name", f)) for f in uploaded_files or []]
    all_paths = selected_paths + uploaded_paths

    full_data_context = "\n\n".join(_build_file_block(path) for path in all_paths)

    messages = [
        SystemMessage(
            content="Your task is to analyze the uploaded data and infer a relational structure from it. Then generate a LinkML schema proposal that captures the structure and relationships. Format the output as valid LinkML inside a code block."
        ),
        HumanMessage(
            content=f"User question: {message}\n\nHere is the data context:\n{full_data_context}"
        ),
    ]

    # Yield the growing text so the UI can re-render the full answer on
    # every chunk; chunks without content are skipped.
    buffer = ""
    async for chunk in llm.astream(messages):
        piece = getattr(chunk, "content", None)
        if piece:
            buffer += piece
            yield buffer




# Gradio UI: a page with two file inputs and a chat interface backed by `respond`.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 File-to-LinkML Schema Chatbot")

    # Panel (created with open=False) that holds both file inputs.
    with gr.Accordion("📂 Select or upload files", open=False) as inputs_accordion:
        # Choices come from list_kb_files(), evaluated once when this cell runs.
        kb_files = gr.CheckboxGroup(
            label="Select files from knowledge_base",
            choices=list_kb_files()
        )
        # Upload input limited to the suffixes in SUPPORTED_EXTS; several files allowed.
        uploads = gr.File(
            label="Upload your own files",
            file_types=SUPPORTED_EXTS,
            file_count="multiple"
        )

    # kb_files and uploads are registered as additional inputs; `respond`
    # declares the matching `selected_files` / `uploaded_files` parameters
    # after `message` and `chat_history`.
    chatbot = gr.ChatInterface(
        fn=respond,
        title="LinkML Schema Generator",
        type="messages",
        additional_inputs=[kb_files, uploads],
        additional_inputs_accordion=inputs_accordion,
        autofocus=True,
        autoscroll=True,
        fill_width=True,
        save_history=True,
        # Example prompt (German): "Can you design a LinkML schema for the attached data?"
        examples=[
        ["Kannst du mir eine Linkml Schema entwerfen fuer die angehangenen Daten?"]],
    )

# Launch the app defined above.
demo.launch()


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7869
* To create a public link, set `share=True` in `launch()`.


