In [None]:
!pip install --upgrade langchain langchain-community
!pip install notion-client
!pip install fastembed
!pip install chromadb

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-n

In [None]:
!pip install -U langchain-openai langchain-chroma

Collecting langchain-openai
  Downloading langchain_openai-0.3.27-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.27-py3-none-any.whl (70 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.4/70.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-openai
Successfully installed langchain-openai-0.3.27


In [None]:
from notion_client import Client
from langchain.schema import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
import sys
import os

In [None]:
# Use the OpenAI key already present in the environment and prompt only when
# it is missing, so the secret is never written into (or saved with) the
# notebook and an existing key is not overwritten by a placeholder string.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
def fetch_notion_page(client: Client, page_id: str) -> list[Document]:
    """Collect the text of a Notion page and of every child page nested in it.

    Starting at ``page_id``, the children of each page are listed and the
    plain text of the supported block types is gathered. Every visited page
    yields one ``Document`` whose ``page_content`` is the block texts joined by
    blank lines and whose ``metadata["source"]`` is the id that was listed.

    Only paragraph, heading, list-item, quote and code blocks contribute text;
    other block types are ignored. A ``child_page`` block adds its title as a
    ``# ...`` line to the parent and is then fetched recursively.

    Args:
        client: Notion client used to call ``blocks.children.list``.
        page_id: Id of the root page (or block) to start from.

    Returns:
        One ``Document`` per visited page. A child page's document is appended
        before the document of the page that contains it.
    """
    text_block_types = (
        "paragraph", "heading_1", "heading_2", "heading_3",
        "bulleted_list_item", "numbered_list_item", "quote", "code",
    )
    docs = []

    def iter_children(block_id):
        """Yield all child blocks of ``block_id``, following pagination cursors."""
        cursor = None
        while True:
            # The endpoint returns results one page at a time; without following
            # `next_cursor`, everything past the first page would be dropped.
            request = {"block_id": block_id}
            if cursor:
                request["start_cursor"] = cursor
            res = client.blocks.children.list(**request)
            yield from res["results"]
            cursor = res.get("next_cursor")
            if not res.get("has_more") or not cursor:
                break

    def recurse(pid):
        texts = []
        for block in iter_children(pid):
            t = block["type"]
            if t in text_block_types:
                rich = block[t].get("rich_text", [])
                texts.append("".join(r["plain_text"] for r in rich))
            elif t == "child_page":
                texts.append(f"# {block['child_page']['title']}")
                recurse(block["id"])
        docs.append(Document(
            page_content="\n\n".join(texts),
            metadata={"source": pid}
        ))

    recurse(page_id)
    return docs

def ingest_notion_page(
    integration_token: str,
    page_id: str,
    persist_dir: str = "./ais_notion_chroma_db",
):
    """Fetch a Notion page tree, chunk it, embed it and persist it in Chroma.

    Args:
        integration_token: Notion integration token used to authenticate.
        page_id: Id of the root Notion page to ingest.
        persist_dir: Directory of the on-disk Chroma store to write to.

    Prints a one-line summary of how many pages and chunks were produced.
    Returns nothing; the result is the persisted vector store.
    """
    # 1) Fetch raw pages
    client = Client(auth=integration_token)
    pages = fetch_notion_page(client, page_id)

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100, length_function=len, add_start_index=True
    )
    docs = splitter.split_documents(pages)
    print(f"Split {len(pages)} pages into {len(docs)} chunks.")

    if not docs:
        # Nothing to embed: skip loading the embedding model and touching the store.
        return

    # Deterministic ids (source page + position in the chunk list) so that
    # re-running the ingestion overwrites the stored chunks instead of adding a
    # second copy of every chunk under fresh random ids.
    # NOTE(review): relies on langchain-chroma upserting by id. If the page
    # shrinks between runs, trailing chunks from the older run remain stored.
    chunk_ids = [
        f"{doc.metadata['source']}:{position}" for position, doc in enumerate(docs)
    ]

    # 3) Embed & persist
    embedding = FastEmbedEmbeddings()
    Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        ids=chunk_ids,
        persist_directory=persist_dir
    )

In [None]:
# Take the Notion integration token from the environment (or prompt for it)
# instead of embedding it in the notebook source.
notion_token = os.environ.get("NOTION_TOKEN")
if not notion_token:
    from getpass import getpass

    notion_token = getpass("Notion integration token: ")

ingest_notion_page(notion_token, "5b9ffa2c525f481ea08ed801d8ef7896")

Split 50 pages into 284 chunks.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [None]:
def rag_chain_openai(
    persist_dir: str = "./ais_notion_chroma_db",
    model_name: str = "gpt-3.5-turbo",
    k: int = 3,
):
    """Build a retrieval-augmented QA chain over the persisted Chroma store.

    Args:
        persist_dir: Directory of the Chroma store to read; the default matches
            the default used when ingesting.
        model_name: OpenAI chat model that answers the question.
        k: Number of most-similar chunks retrieved as context per question.

    Returns:
        The chain produced by ``create_retrieval_chain``. Invoke it with
        ``{"input": question}``; the result carries ``"answer"`` and the
        retrieved ``"context"`` documents.
    """
    # temperature=0 keeps answers as deterministic as the model allows.
    model = ChatOpenAI(
        model_name=model_name,
        temperature=0
    )

    prompt = PromptTemplate.from_template(
        """
        [Instructions] You are a helpful assistant. Answer from only the context.
        If you don’t know, reply “No context available for: {input}”.
        Question: {input}
        Context: {context}
        Answer:
        """
    )

    # Must be the same embedding model that was used at ingestion time,
    # otherwise query vectors are not comparable with the stored ones.
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding
    )

    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k}
    )
    doc_chain = create_stuff_documents_chain(model, prompt)
    return create_retrieval_chain(retriever, doc_chain)

In [None]:
# Build the retrieval chain once; the query cell below reuses `chain`.
chain = rag_chain_openai()

In [None]:
resp = chain.invoke({"input": "Whats a cost function ?"})

# Show which keys the response carries instead of printing the whole dict:
# the raw dump contains the full text of every retrieved document and buries
# the answer in the cell output.
print(resp.keys())

print("ANSWER:", resp["answer"])
for doc in resp["context"]:
    print("→ Source:", doc.metadata["source"])

{'input': 'Whats a cost function ?', 'context': [Document(id='0e938cb5-3763-4981-93a7-431c57ab8ba0', metadata={'start_index': 0, 'source': '1752599b-3926-42d3-bdde-e5c293a4f3bb'}, page_content="Cost functions\n\nFor the neural network to learn we define cost functions. This indicates to the system what is a good answer and what isn't. For the example in particular if the function correctly classifies the image we should get the activation of one of the neurons very high and all the others very low. In the case of getting high values for different numbers, we have a high cost and don't like the answer.\n\nThe cost is defined as the sum of the squared differences between what the system gave me and what I expected. By averaging the costs we can get an idea of how well is the network performing.\n\nWe can think of the cost function as a function that receives only one parameter and we want to minimize it. Starting from a random input and getting into a local minimum can be feasible but ge

In [None]:
from IPython.display import Markdown, display

def show_fancy(resp, snippet_len: int = 120):
    """Render a chain response as Markdown: the answer, then its sources.

    Args:
        resp: Mapping with an ``"answer"`` string and a ``"context"`` iterable
            of documents exposing ``page_content`` and ``metadata``.
        snippet_len: Maximum number of characters of each document shown.

    Each source is listed as its ``metadata["source"]`` (``unknown`` when the
    key is absent) followed by a one-line preview of the document text.
    """
    display(Markdown(f"## 📝 Answer\n\n{resp['answer']}"))

    lines = ["### 🔗 Sources"]
    for doc in resp["context"]:
        flat = doc.page_content.replace("\n", " ")
        snippet = flat[:snippet_len]
        if len(flat) > snippet_len:
            # Only mark the preview as truncated when text was actually cut off.
            snippet += "…"
        # A backtick inside the preview would terminate the inline code span.
        snippet = snippet.replace("`", "'")
        source = doc.metadata.get("source", "unknown")
        lines.append(f"- **{source}**: `{snippet}`")
    display(Markdown("\n".join(lines) + "\n"))

In [None]:
# Render the answer and source previews for the response of the query above.
show_fancy(resp)

## 📝 Answer

A cost function is a function that indicates to the system what is a good answer and what isn't by measuring the sum of squared differences between the system's output and the expected output. It is used to evaluate how well a neural network is performing and is minimized during the learning process.

### 🔗 Sources
- **1752599b-3926-42d3-bdde-e5c293a4f3bb**: `Cost functions  For the neural network to learn we define cost functions. This indicates to the system what is a good an…`
- **1752599b-3926-42d3-bdde-e5c293a4f3bb**: `If we think of the cost as a multivariable function we could think that adding a small step of the negative of the gradi…`
- **f9d04b9f-2951-430d-ab4d-14ff26ba366d**: `We also could want to only detect significant input if we have a certain degree of certainty that the image is what we w…`
