In [1]:
import json
from pathlib import Path

# Locations of the two metadata exports (adjust if your folder is different).
dashboard_metadata_path = Path("../Data/dashboard_metadata.json")
supplier_delivery_metadata_path = Path("../Data/supplier_delivery_metadata.json")

# Load the JSONs. The encoding is pinned to UTF-8 because open() otherwise
# falls back to the platform's locale encoding, which can mis-decode
# non-ASCII text. NOTE(review): assumes both exports are UTF-8 encoded
# (the standard JSON interchange encoding) - confirm against the producer.
with open(dashboard_metadata_path, "r", encoding="utf-8") as f:
    dashboard = json.load(f)
with open(supplier_delivery_metadata_path, "r", encoding="utf-8") as f:
    deliveries = json.load(f)

# Accumulates the plain-text chunks built by the sections below.
chunks = []

# ----- Dashboard metadata -----

# --- ML Models (Financial + Supplier Score) ---
def _end_sentence(text):
    """Return *text* stripped and ending in exactly one sentence mark.

    Empty input is returned as an empty string. Text that already ends in
    ".", "!" or "?" is returned unchanged, so the chunk templates below do
    not produce a doubled period when the metadata is already punctuated.
    """
    text = str(text).strip()
    if not text or text.endswith((".", "!", "?")):
        return text
    return text + "."


for ml_key in ["ML_financial", "ML_Supplier_score"]:
    ml = dashboard.get(ml_key, {})
    if not ml:
        # Section missing or empty: nothing to describe for this model.
        continue

    desc = ml.get("description", "")
    # str() guards the joins against non-string list items.
    columns = ", ".join(str(col) for col in ml.get("columns", []))
    model_info = ml.get("ml_model", {})
    purpose = model_info.get("purpose", "")
    name = model_info.get("name", "")
    mtype = model_info.get("type", "")
    lib = model_info.get("library", "")
    features = ", ".join(str(feat) for feat in model_info.get("features_used", []))
    output = model_info.get("output", "")
    why = model_info.get("why_this_model", "")
    parameters = "; ".join(
        f"{k}={v}" for k, v in model_info.get("parameters", {}).items()
    )

    # Free-text fields (purpose, output, why) may or may not already end in
    # a period, so they go through _end_sentence instead of receiving a
    # hard-coded "." from the template.
    chunk = (
        f"ML Model '{name}' ({mtype}, library: {lib}) is used for: {_end_sentence(purpose)} "
        f"Features used: {features}. Output: {_end_sentence(output)} Columns: {columns}. "
        f"Parameters: {parameters}. Reason: {_end_sentence(why)}"
    )
    if desc:
        chunk = f"{desc} " + chunk
    chunks.append(chunk)

    # Add training details (if any) as a separate chunk.
    training = model_info.get("training", {})
    if training:
        chunks.append(
            f"Training details for model '{name}': train/test split: {training.get('train_test_split', '')}, "
            f"scaling: {training.get('scaling', '')}."
        )

# --- Supplier Master Data ---
# One chunk per supplier record. Each record must carry the keys
# "Supplier ID", "Supplier Name", "Region" and "Tier"; a missing key
# raises KeyError.
suppliers = dashboard.get("supplier_master_data", {})
for record in suppliers.values():
    supplier_id = record["Supplier ID"]
    supplier_name = record["Supplier Name"]
    region = record["Region"]
    tier = record["Tier"]
    chunks.append(
        f"Supplier {supplier_id}: {supplier_name}, Region: {region}, Tier: {tier}."
    )

# --- Average Supplier Scores ---
# One sentence per entry of the "average_supplier_score_2020_2023" mapping;
# the mapping key is used as the supplier identifier in the text.
average_scores = dashboard.get("average_supplier_score_2020_2023", {})
for supplier_id, average in average_scores.items():
    sentence = (
        f"Supplier {supplier_id} had an average performance score of "
        f"{average} from 2020 to 2023."
    )
    chunks.append(sentence)

# --- Flagged Suppliers ---
flagged = dashboard.get("Flagged_suppliers", {})
if flagged:
    # Summary chunk listing every flagged supplier.
    chunks.append(f"Flagged suppliers: {flagged.get('suppliers flagged', 'N/A')}")

    # Add per-supplier flag details (humanized). Every key of the form
    # "<supplier id>_Flagged_quarters" is picked up rather than a fixed
    # list of IDs, so newly flagged suppliers are not silently dropped.
    # Keys are sorted so the chunk order is deterministic (SUP003 still
    # precedes SUP005).
    flag_suffix = "_Flagged_quarters"
    for flag_key in sorted(flagged):
        if not flag_key.endswith(flag_suffix) or len(flag_key) == len(flag_suffix):
            continue
        flag_details = flagged[flag_key]
        if flag_details:  # skip empty/null entries, as before
            supplier_id = flag_key[: -len(flag_suffix)]
            chunks.append(f"Supplier {supplier_id}: {flag_details}")

# --- Charts ---
# Builds one human-readable description per chart entry. "title", "type"
# and "description" are required (KeyError if absent); axes, filters and
# source data fall back to empty values.
for chart_meta in dashboard.get("charts", []):
    title = chart_meta["title"]
    chart_kind = chart_meta["type"]
    summary = chart_meta["description"]
    x_axis = chart_meta.get("x_axis", "")
    y_axis = chart_meta.get("y_axis", "")
    filter_text = ", ".join(chart_meta.get("filters", []))
    source = chart_meta.get("source_data", "")

    pieces = [
        f"Chart '{title}' ({chart_kind}): {summary} ",
        f"X-axis: {x_axis}, Y-axis: {y_axis}. ",
        f"Filters: {filter_text}. Source data: {source}.",
    ]

    # Optional fields are appended only when the key is present.
    if "highlighted" in chart_meta:
        pieces.append(f" Highlighted: {chart_meta['highlighted']}.")
    if "highlighted_events" in chart_meta:
        events = ", ".join(chart_meta["highlighted_events"])
        pieces.append(f" Highlighted events: {events}.")
    if "purpose" in chart_meta:
        pieces.append(f" Purpose: {chart_meta['purpose']}.")

    chunks.append("".join(pieces))

# --- LLM Model Details (for reference) ---
# A single chunk summarizing the "llm_model" section; skipped entirely when
# that section is missing or empty.
llm = dashboard.get("llm_model", {})
if llm:
    model_name = llm.get("name", "")
    provider = llm.get("provider", "")
    goal = llm.get("purpose", "")
    caveats = ", ".join(llm.get("limitations", []))
    chunks.append(
        f"LLM Model: {model_name} by {provider}. "
        f"Purpose: {goal}. Limitations: {caveats}"
    )

# ----- Delivery metadata -----
# Closing sentence for each (lost, defected) combination.
# NOTE(review): the flags are tested by truthiness, so a string such as
# "No" or "False" would count as set - confirm the JSON stores real booleans.
shipment_status_notes = {
    (True, True): " This shipment was LOST and DEFECTED.",
    (True, False): " This shipment was LOST.",
    (False, True): " This shipment was DEFECTED.",
    (False, False): " This shipment was delivered without loss or defect.",
}

for delivery in deliveries.get("filtered_supplier_deliveries", []):
    supplier_id = delivery.get("Supplier ID", "Unknown")
    order_date = delivery.get("Order Date", "")
    expected_date = delivery.get("Expected Delivery Date", "")
    actual_date = delivery.get("Actual Delivery Date", "")
    was_lost = bool(delivery.get("Shipment Lost", False))
    was_defected = bool(delivery.get("Defected", False))
    shipment_volume = delivery.get("Shipment Volume", "")
    value_category = delivery.get("Value Category", "")

    # Delivery facts first, then the status sentence.
    delivery_facts = (
        f"Supplier {supplier_id} shipment ordered on {order_date}: "
        f"Expected delivery {expected_date}, Actual delivery {actual_date}. "
        f"Shipment volume: {shipment_volume}, Value category: {value_category}."
    )
    chunks.append(delivery_facts + shipment_status_notes[(was_lost, was_defected)])

# Print a few example chunks (at most the first 20) plus the total count.
for c in chunks[:20]:
    print("-", c)
print(f"\nTotal chunks: {len(chunks)}")

# Save the chunks to a text file, one chunk per line. The encoding is
# pinned to UTF-8 so non-ASCII text (e.g. supplier names) is written the
# same way on every platform instead of depending on the locale default.
# The path is defined once so the confirmation message cannot drift from
# the file actually written.
output_path = "../Data/all_metadata_chunks.txt"
with open(output_path, "w", encoding="utf-8") as out:
    out.writelines(chunk + "\n" for chunk in chunks)
print(f"Chunks saved to {output_path}")

- Quarterly financial metrics for each supplier, including credit score, revenue, and D&B rating. ML Model 'Isolation Forest' (Unsupervised anomaly detection, library: scikit-learn) is used for: Calculate financial risk scores for suppliers based on anomaly detection in their financial data. Features used: Credit Score, Revenue (USD), D&B Rating. Output: Financial Risk Score (0-100), higher means higher anomaly risk. Columns: Supplier ID, Quarter, Credit Score, Revenue (USD), D&B Rating. Parameters: contamination=0.1; random_state=42. Reason: Isolation Forest is well-suited for identifying anomalies in high-dimensional, unlabeled data such as supplier financial metrics. It efficiently detects outliers without requiring labeled examples of risky suppliers, making it ideal for unsupervised financial risk detection in this context..
- Calculates a comprehensive supplier performance score each quarter by combining operational performance data, financial risk, and supplier tier using a supe