In [10]:
pip install chromadb

Collecting chromadb
  Using cached chromadb-1.3.4-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp313-cp313-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp313-cp313-win_amd64.whl.metadata (5.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemet


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import os, json, textwrap, uuid, datetime as dt
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv
load_dotenv()  # uses OPENAI_API_KEY

# LLM
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Prompts (new location in LangChain 1.x)
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate

# Vectorstore
from langchain_community.vectorstores import Chroma

# Text splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Safe Python execution
import contextlib, io, traceback, builtins

### Import

In [12]:
# Working folders, all resolved relative to the directory the kernel runs in.
BASE = Path.cwd()
DATA_DIR, FIG_DIR, RPT_DIR, CHROMA_DIR = (
    BASE / folder_name for folder_name in ("Data", "figs", "reports", "chroma_db")
)

# Create whichever folders are missing; existing ones are left untouched.
for folder in (DATA_DIR, FIG_DIR, RPT_DIR, CHROMA_DIR):
    folder.mkdir(exist_ok=True)

print("Folders ready:", DATA_DIR, FIG_DIR, RPT_DIR, CHROMA_DIR, sep="\n - ")

Folders ready:
 - c:\Users\Kanta\OneDrive\เอกสาร\GitHub\AutoGPT\Data
 - c:\Users\Kanta\OneDrive\เอกสาร\GitHub\AutoGPT\figs
 - c:\Users\Kanta\OneDrive\เอกสาร\GitHub\AutoGPT\reports
 - c:\Users\Kanta\OneDrive\เอกสาร\GitHub\AutoGPT\chroma_db


### Load data

In [13]:
# Load the Titanic data from the local CSV when present; otherwise fall back to
# seaborn's example dataset and cache it under the same file name.
csv_path = DATA_DIR / "Titanic-Dataset.csv"
if csv_path.exists():
    df = pd.read_csv(csv_path)
else:
    try:
        # NOTE(review): seaborn's copy appears to use different column names
        # (lower-case, e.g. `survived`) than a Kaggle-style Titanic-Dataset.csv
        # (`Survived`); confirm downstream notes/prompts match the loaded schema.
        df = sns.load_dataset("titanic")
        df.to_csv(csv_path, index=False)
        print("Saved seaborn Titanic to", csv_path)
    except Exception as e:
        # Point the user at the file name and folder this cell actually checks.
        raise RuntimeError(
            f"Could not load the seaborn Titanic dataset; "
            f"please place {csv_path.name} in {DATA_DIR}"
        ) from e

df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### RAG

In [14]:
# Reference notes the agent can retrieve through semantic search.
eda_checklist = """
# Titanic EDA Checklist
1) Inspect shape, dtypes, missing values
2) Descriptive stats (numeric + categorical)
3) Target: Survived distribution
4) Plots:
   - Survival rate by sex
   - Survival rate by class (pclass)
   - One more (age bins or embark_town)
5) Interactions (e.g., sex x class)
6) Save ≥3 figures to ./figs, write report to ./reports
"""

# NOTE(review): this dictionary describes seaborn-style lower-case columns;
# confirm it matches the columns of the DataFrame that was actually loaded.
data_dictionary = """
# Titanic Data Dictionary (seaborn)
survived: 0/1 (target)
pclass: 1/2/3 class
sex: male/female
age: years
sibsp: siblings/spouses aboard
parch: parents/children aboard
fare: ticket price
embarked/embark_town: port
class: string for pclass
who/adult_male/alone: derived categoricals
"""

grading_rubric = """
# Report Rubric
- Overview + missingness + descriptive stats
- Survival comparisons: sex, class, and one more
- ≥3 plots saved in ./figs
- ≥5 bullet insights
- Clear headings + references to figures
"""

docs = [
    Document(page_content=eda_checklist,  metadata={"source":"checklist"}),
    Document(page_content=data_dictionary, metadata={"source":"dictionary"}),
    Document(page_content=grading_rubric, metadata={"source":"rubric"}),
]

emb = OpenAIEmbeddings()

# Use each document's "source" tag as a stable id so that re-running this cell
# against the persisted collection should overwrite the same three entries
# instead of appending duplicates with fresh random ids.
doc_ids = [d.metadata["source"] for d in docs]
vs = Chroma.from_documents(
    docs,
    embedding=emb,
    ids=doc_ids,
    persist_directory=str(CHROMA_DIR),
)
# No explicit vs.persist(): the LangChain wrapper marks it deprecated because
# Chroma 0.4+ writes to persist_directory automatically.
print("RAG docs count =", vs._collection.count())

RAG docs count = 3


  vs.persist()


### Safe python runner

In [15]:
# Names of the builtins copied explicitly into the namespace given to agent code.
ALLOWED_BUILTINS = {
    "abs", "min", "max", "sum", "len", "range", "enumerate", "map",
    "filter", "zip", "sorted", "any", "all", "print",
}

# Globals for executed snippets: the selected builtins plus the analysis
# libraries and the notebook's output folders.
SAFE_GLOBALS = dict(
    {name: getattr(builtins, name) for name in ALLOWED_BUILTINS},
    pd=pd, np=np, plt=plt, sns=sns,
    FIG_DIR=FIG_DIR, DATA_DIR=DATA_DIR, RPT_DIR=RPT_DIR,
)

# Locals for executed snippets: starts with the loaded DataFrame only.
SAFE_LOCALS = dict(df=df)

def run_python_safely(code: str) -> str:
    """Execute a snippet of agent-written Python and return what it printed.

    The snippet runs in one persistent namespace (``SAFE_LOCALS`` seeded with
    everything from ``SAFE_GLOBALS``), so variables defined in one call remain
    available in later calls.

    Args:
        code: Python source; common leading indentation is removed first.

    Returns:
        The captured stdout (stripped), ``"(no stdout)"`` when nothing was
        printed, or ``"[ERROR]\\n"`` followed by a short traceback when the
        snippet raised an ``Exception``.
    """
    code = textwrap.dedent(code)

    # exec() with *separate* globals and locals runs code as if it were a class
    # body: functions and lambdas defined by the snippet cannot see `df` or any
    # top-level variable and fail with NameError. Running in a single dict
    # avoids that while keeping state across calls.
    for name, value in SAFE_GLOBALS.items():
        SAFE_LOCALS.setdefault(name, value)

    # NOTE(review): this is not a sandbox. exec() inserts the full builtins
    # module under "__builtins__" when the namespace lacks that key, so
    # ALLOWED_BUILTINS does not restrict what the snippet can call (open,
    # __import__, ...). Only run trusted / reviewed model output here.
    buf = io.StringIO()
    try:
        with contextlib.redirect_stdout(buf):
            exec(code, SAFE_LOCALS)
    except Exception:
        return "[ERROR]\n" + traceback.format_exc(limit=2)
    return buf.getvalue().strip() or "(no stdout)"

# Smoke test: the runner should capture both print calls and see `df`.
print(run_python_safely("print('OK'); print('Shape:', df.shape)"))

OK
Shape: (891, 12)


### Tools

In [16]:
def retrieve_notes(query: str, k: int = 3) -> str:
    """Look up the ``k`` most similar notes in the vector store.

    Each hit is rendered as a ``## Doc <rank> (<source>)`` heading followed by
    its stripped text; hits are separated by a blank line.
    """
    hits = vs.similarity_search(query, k=k)
    return "\n\n".join(
        f"## Doc {rank} ({hit.metadata.get('source','?')})\n{hit.page_content.strip()}"
        for rank, hit in enumerate(hits, start=1)
    )

def write_report(markdown_text: str) -> str:
    """Save the final Markdown report under ``RPT_DIR``.

    Args:
        markdown_text: Complete report body.

    Returns:
        ``"Report written to <path>"`` on success, or a message starting with
        ``"[ERROR]"`` when the input is not a non-empty string (nothing is
        written in that case, so an empty file cannot become the latest report).
    """
    if not isinstance(markdown_text, str) or not markdown_text.strip():
        return (
            "[ERROR] WriteReport needs the full Markdown report as a non-empty "
            "string; nothing was written."
        )
    # Timestamped name, so lexicographic order of report files is chronological.
    fname = RPT_DIR / f"titanic_report_{dt.datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
    fname.write_text(markdown_text, encoding="utf-8")
    return f"Report written to {fname}"

# (tool name, callable, description shown to the model), in registry order.
_TOOL_SPECS = (
    (
        "PythonRunner",
        run_python_safely,
        "Run short pandas/seaborn/matplotlib EDA. df is available. Save plots to FIG_DIR via plt.savefig('...').",
    ),
    (
        "RetrieveNotes",
        retrieve_notes,
        "Retrieve EDA checklist, data dictionary, and grading rubric via semantic search.",
    ),
    (
        "WriteReport",
        write_report,
        "Write a Markdown report string to disk.",
    ),
)

# Registry consulted by the agent loop: name -> {"fn": callable, "desc": text}.
TOOLS = {
    tool_name: {"fn": tool_fn, "desc": tool_desc}
    for tool_name, tool_fn, tool_desc in _TOOL_SPECS
}
print("Tools:", list(TOOLS))

Tools: ['PythonRunner', 'RetrieveNotes', 'WriteReport']


### LLM + Prompt for a manual ReAct loop

In [21]:
# Chat model that plans each agent step (called via llm.invoke in llm_plan_step).
llm = ChatOpenAI(model="gpt-4.1", temperature=0.2)

# System instructions for the agent: the workflow rules (retrieve notes, analyse
# with PythonRunner, write the report last) and the JSON object the model must
# return on every step. Passed as `system` to llm_plan_step by the agent loop.
SYSTEM_PROMPT = """
You are an autonomous Data Analysis Agent for the Titanic dataset.
Goal: Produce a high-quality, fully written EDA report with key statistics and visualizations.

You can call EXACTLY one tool per step from: PythonRunner, RetrieveNotes, WriteReport.

====================
STRICT RULES
====================

1. Start with RetrieveNotes about "Titanic EDA".
2. Use PythonRunner to:
   - inspect data structure and missingness
   - compute descriptive statistics
   - generate at least 3 standard figures saved in ./figs
   - examine survival by sex, class, and one more feature

3. You MUST NOT call WriteReport until:
   - ALL analysis is complete,
   - all figures have been generated,
   - and the FULL Markdown report text has already been composed inside your "input".

4. The WRITE REPORT RULE (IMPORTANT):
   - The "input" to WriteReport MUST contain the final full Markdown report.
   - The final Markdown MUST include:
        • Overview  
        • Missing data analysis  
        • Descriptive statistics  
        • Survival comparisons by sex, class, and one more feature  
        • References to saved figures (./figs/...)  
        • At least **5 bullet-point insights** (using "-" or "•")  
   - DO NOT put instructions, placeholders, or descriptions of what should be written.
   - DO NOT say "Write a report…" or "This report should contain…".
   - The Markdown MUST be the **actual finished report content**.

5. Before calling WriteReport:
   - Your "thought" MUST explicitly confirm that the report is complete.
   - Your "thought" MUST summarize everything included in the report.
   - You must be 100% certain the Markdown is DONE.

6. When the report is written to disk:
   - Set final to "DONE".
   - stop = true.

7. Use a maximum of 12 steps.
8. If errors occur in PythonRunner, adapt and retry safely.

====================
RESPONSE FORMAT
====================
Always respond ONLY with a valid JSON object:

{
  "thought": "...",
  "action": "RetrieveNotes | PythonRunner | WriteReport | NONE",
  "input": "the tool input",
  "stop": false,
  "final": ""
}

When the WriteReport step successfully finishes, return:

{
  "thought": "Report completed.",
  "action": "NONE",
  "input": "",
  "stop": true,
  "final": "DONE"
}

"""

# Per-step prompt text. Single-brace fields are filled in by llm_plan_step;
# the doubled braces render as the literal JSON skeleton shown to the model.
_STEP_TEMPLATE = """{system}

History (last few steps):
{history}

Available tools:
- PythonRunner: {py_desc}
- RetrieveNotes: {rag_desc}
- WriteReport: {wr_desc}

State summary:
- Figures folder: {fig_dir}
- Reports folder: {rpt_dir}
- Steps used: {n_steps}/12

Respond as JSON only:
{{
  "thought": "...",
  "action": "RetrieveNotes|PythonRunner|WriteReport|NONE",
  "input": "...",
  "stop": false,
  "final": ""
}}
"""

STEP_PROMPT = PromptTemplate.from_template(_STEP_TEMPLATE)

### The manual loop

In [22]:
def _parse_json_object(text: str) -> dict:
    """Extract the JSON object from a model reply, tolerating code fences and stray prose."""
    candidate = text.strip()
    if candidate.startswith("```"):
        # Drop the surrounding backticks and an optional "json" language hint.
        candidate = candidate.strip("`").strip()
        if candidate.startswith("json"):
            candidate = candidate[4:].strip()
    try:
        data = json.loads(candidate)
    except ValueError:
        # Fallback: parse the span from the first "{" to the last "}".
        start = candidate.find("{")
        end = candidate.rfind("}")
        if start < 0 or end <= start:
            raise ValueError("LLM did not return valid JSON:\n" + text)
        try:
            data = json.loads(candidate[start:end + 1])
        except ValueError as exc:
            raise ValueError("LLM did not return valid JSON:\n" + text) from exc
    if not isinstance(data, dict):
        # A bare list/string/number cannot carry the thought/action/input fields.
        raise ValueError("LLM did not return a JSON object:\n" + text)
    return data


def llm_plan_step(history: list, system: str) -> dict:
    """Ask the LLM for the next agent step and return it as a dict.

    Args:
        history: Trace entries so far (``"STEP n THOUGHT: ..."``,
            ``"STEP n ACTION: ..."``, ``"STEP n OBS: ..."``); only the last six
            entries are shown to the model.
        system: System instructions placed at the top of the prompt.

    Returns:
        The parsed plan with the keys ``thought``, ``action``, ``input``,
        ``stop`` and ``final`` always present (missing ones get defaults).

    Raises:
        ValueError: If the reply does not contain a parseable JSON object.
    """
    # Every step logs three entries that start with "STEP" (THOUGHT, ACTION,
    # OBS); count only the THOUGHT entries so "Steps used" is the real step
    # count rather than three times it.
    steps_used = sum(
        1
        for entry in history
        if entry.startswith("STEP ") and entry.split(" ", 3)[2:3] == ["THOUGHT:"]
    )
    msg = STEP_PROMPT.format(
        system=system,
        history="\n".join(history[-6:]) if history else "(empty)",
        py_desc=TOOLS["PythonRunner"]["desc"],
        rag_desc=TOOLS["RetrieveNotes"]["desc"],
        wr_desc=TOOLS["WriteReport"]["desc"],
        fig_dir=str(FIG_DIR),
        rpt_dir=str(RPT_DIR),
        n_steps=steps_used,
    )
    resp = llm.invoke(msg)
    data = _parse_json_object(resp.content.strip())

    # Sanity defaults so callers can index the plan without checking keys.
    data.setdefault("thought", "")
    data.setdefault("action", "NONE")
    data.setdefault("input", "")
    data.setdefault("stop", False)
    data.setdefault("final", "")
    return data

# Manual ReAct loop: ask the model for one step, run the chosen tool, record the
# observation in `history`, and repeat until the model stops or the budget ends.
history = []
max_steps = 12
step = 0
done = False

print("Starting agent loop...")
while step < max_steps and not done:
    plan = llm_plan_step(history, SYSTEM_PROMPT)
    thought = plan["thought"]
    action = plan["action"]
    ainput = plan["input"]

    # The model sometimes emits booleans as strings; bool("false") is True and
    # would end the run after the first step, so interpret strings explicitly.
    raw_stop = plan["stop"]
    if isinstance(raw_stop, str):
        stop = raw_stop.strip().lower() in {"true", "1", "yes"}
    else:
        stop = bool(raw_stop)

    # Normalise `final` to text so the termination check below cannot fail on
    # None or other non-string values.
    final = plan.get("final", "")
    final = final if isinstance(final, str) else str(final or "")

    history.append(f"STEP {step+1} THOUGHT: {thought}")
    history.append(f"STEP {step+1} ACTION: {action}")

    if isinstance(action, str) and action in TOOLS:
        try:
            result = TOOLS[action]["fn"](ainput or "")
        except Exception as exc:
            # Report tool failures back to the model as an observation instead
            # of aborting the run, so it can adapt on the next step.
            result = f"[ERROR] {action} failed: {type(exc).__name__}: {exc}"
        # Keep observations short (truncate)
        obs = result if isinstance(result, str) else str(result)
        if len(obs) > 1200:
            obs = obs[:1200] + "\n...[truncated]..."
        history.append(f"STEP {step+1} OBS: {obs}")
    elif action == "NONE":
        history.append(f"STEP {step+1} OBS: (no-op)")
    else:
        history.append(f"STEP {step+1} OBS: [ERROR] Unknown tool '{action}'")

    step += 1
    # Termination: explicit stop flag, or a `final` message ending in DONE.
    if stop or (final and final.strip().upper().endswith("DONE")):
        done = True
        history.append(f"STOP: {final or 'DONE'}")

print("\n=== Trace (last 40 lines) ===")
print("\n".join(history[-40:]))

print("\n=== Done? ===", done, "| Steps used:", step)

Starting agent loop...

=== Trace (last 40 lines) ===
STEP 1 THOUGHT: To begin, I will retrieve notes about 'Titanic EDA' to get the data dictionary, EDA checklist, and grading rubric. This will help ensure the analysis is thorough and aligned with best practices.
STEP 1 ACTION: RetrieveNotes
STEP 1 OBS: ## Doc 1 (checklist)
# Titanic EDA Checklist
1) Inspect shape, dtypes, missing values
2) Descriptive stats (numeric + categorical)
3) Target: Survived distribution
4) Plots:
   - Survival rate by sex
   - Survival rate by class (pclass)
   - One more (age bins or embark_town)
5) Interactions (e.g., sex x class)
6) Save ≥3 figures to ./figs, write report to ./reports

## Doc 2 (dictionary)
# Titanic Data Dictionary (seaborn)
survived: 0/1 (target)
pclass: 1/2/3 class
sex: male/female
age: years
sibsp: siblings/spouses aboard
parch: parents/children aboard
fare: ticket price
embarked/embark_town: port
class: string for pclass
who/adult_male/alone: derived categoricals

## Doc 3 (rubric)


<Figure size 640x480 with 0 Axes>

### Check output

In [23]:
# List every file the agent left in the figures folder.
print("Figures in ./figs:")
for fig_path in sorted(FIG_DIR.glob("*")):
    print(f" - {fig_path.name}")

# List the Markdown reports; names carry a timestamp, so sorted order puts the
# most recent one last.
print("\nReports in ./reports:")
reports = sorted(RPT_DIR.glob("*.md"))
for report_path in reports:
    print(f" - {report_path.name}")

# Remember the newest report (or None) and show the start of it.
latest = None
if reports:
    latest = reports[-1]

if latest is None:
    print("No report found.")
else:
    print("\n--- Latest Report Preview (first 2000 chars) ---\n")
    preview = latest.read_text(encoding="utf-8")[:2000]
    print(preview, "\n...\n")

Figures in ./figs:
 - age_histogram.png
 - survival_by_agegroup.png
 - survival_by_embarked.png
 - survival_by_pclass.png
 - survival_by_sex.png

Reports in ./reports:
 - titanic_report_20251114_232053.md
 - titanic_report_20251114_233506.md

--- Latest Report Preview (first 2000 chars) ---

# Titanic Dataset Exploratory Data Analysis (EDA) Report

## Overview
This report presents an exploratory data analysis (EDA) of the Titanic dataset, which contains information about passengers aboard the Titanic, including whether they survived the disaster. The analysis aims to uncover patterns in survival rates based on key features such as sex, passenger class, and port of embarkation.

## Missing Data Analysis
A review of the dataset reveals missing values in several columns:
- The 'Age' column has a notable proportion of missing values, which may affect age-related analyses.
- The 'Cabin' column contains substantial missingness, limiting its utility for analysis.
- The 'Embarked' column has a

### Rubric check

In [24]:
# Heuristic rubric check on the newest report: keyword / bullet tests only, so a
# pass means "the topic is mentioned", not that the analysis is correct.
IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg"}
n_figures = sum(1 for f in FIG_DIR.iterdir() if f.suffix.lower() in IMAGE_SUFFIXES)

checks = {
    "has_overview": False,
    "has_missingness": False,
    "has_stats": False,
    "has_plots_3plus": n_figures >= 3,
    "has_survival_by_sex": False,
    "has_survival_by_class": False,
    "has_third_comparison": False,
    "has_5_insights": False
}

if latest and latest.exists():
    txt = latest.read_text(encoding="utf-8").lower()
    # Whole-word view of the report: short keywords such as "age", "who" or
    # "na" would otherwise match inside "percentage", "whole" or any word
    # ending in "na".
    words = set("".join(ch if ch.isalnum() else " " for ch in txt).split())
    mentions_survival = "surviv" in txt

    checks["has_overview"] = ("overview" in txt or "introduction" in txt)
    checks["has_missingness"] = (
        "missing" in txt or "null" in txt or bool(words & {"na", "nan"})
    )
    checks["has_stats"] = (
        "descriptive" in txt
        or "describe()" in txt
        or bool(words & {"mean", "std", "median"})
    )
    checks["has_survival_by_sex"] = ("sex" in txt and mentions_survival)
    # "pclass" contains "class", so one substring test covers both spellings.
    checks["has_survival_by_class"] = ("class" in txt and mentions_survival)
    checks["has_third_comparison"] = (
        any(w.startswith("embark") for w in words)
        or bool(words & {"age", "ages", "aged", "fare", "fares", "alone", "who"})
    )
    # Count lines that *start* with a bullet marker, not every "- " in prose.
    bullet_lines = sum(
        1
        for line in txt.splitlines()
        if line.lstrip().startswith(("- ", "* ", "•"))
    )
    checks["has_5_insights"] = bullet_lines >= 5

print(json.dumps(checks, indent=2))

{
  "has_overview": true,
  "has_missingness": true,
  "has_stats": true,
  "has_plots_3plus": true,
  "has_survival_by_sex": true,
  "has_survival_by_class": true,
  "has_third_comparison": true,
  "has_5_insights": true
}
