# Cluster Entity Benchmarking

This notebook benchmarks the performance of a **Monolithic Cluster** table versus a **Fragmented** approach that splits the same data across three tables (Core, Info, and Stats).

## Hypothesis
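Clusters have high read traffic for discovery (Name/Image) and high write traffic for stats (Member Count). Separating these should improve concurrency and cache locality. Note that the benchmarks below run single-threaded against a local SQLite file, so they measure per-query cost only; they do not exercise concurrent access.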

In [None]:
# Install dependencies (quietly)
# !pip install sqlmodel faker > /dev/null 2>&1

In [None]:
from sqlmodel import Field, SQLModel, create_engine, Session, select
from faker import Faker
import time
import random
import os
from uuid import UUID, uuid4
from datetime import datetime
from typing import Optional

# Faker instance used by the data-generation cell.
fake = Faker()

# SQLite file that holds both the monolithic and the fragmented tables.
DATABASE_URL = "sqlite:///temp/db/benchmarking_cluster.db"

# Make sure the directory referenced by DATABASE_URL exists before the
# engine is used; echo=False keeps SQL statements out of the cell output.
os.makedirs("temp/db", exist_ok=True)
engine = create_engine(DATABASE_URL, echo=False)

## 1. Monolithic Architecture

All cluster data (`name`, `description`, the `member_count`/`post_count` stats, `settings`, and so on) is stored in one table.

### Schema
```sql
CREATE TABLE clustermonolith (
    cid CHAR(36) PRIMARY KEY,
    name VARCHAR(255),   -- indexed
    topic VARCHAR(255),  -- indexed
    description TEXT,
    owner_id CHAR(36),
    created_at DATETIME,
    image_url VARCHAR(255),
    is_private BOOLEAN,
    member_count INTEGER,
    post_count INTEGER,
    rules TEXT,
    settings TEXT
);
```

In [None]:
class ClusterMonolith(SQLModel, table=True):
    """Single-table layout: every cluster attribute is a column of one row.

    Discovery fields, descriptive text, counters and settings all share the
    same table, keyed by ``cid``.
    """

    # Primary key, generated client-side when not supplied.
    cid: UUID = Field(default_factory=uuid4, primary_key=True)

    # Indexed lookup columns.
    name: str = Field(index=True)
    topic: str = Field(index=True)

    description: str
    owner_id: UUID
    # Filled with the current UTC time (naive datetime) when not supplied.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    image_url: Optional[str] = Field(default=None)
    is_private: bool = Field(default=False)

    # Counters; both start at zero.
    member_count: int = Field(default=0)
    post_count: int = Field(default=0)

    # Optional free-form text columns.
    rules: Optional[str] = Field(default=None)
    settings: Optional[str] = Field(default=None)

## 2. Fragmented Architecture

Split into:
*   `ClusterCore`: Lightweight, frequent reads (List/Discovery).
*   `ClusterInfo`: Heavy text, on-demand reads (Details).
*   `ClusterStats`: High frequency writes (Member/Post counts).

### Schema
```sql
CREATE TABLE clustercore (
    cid CHAR(36) PRIMARY KEY,
    name VARCHAR(255),   -- indexed
    topic VARCHAR(255),  -- indexed
    image_url VARCHAR(255),
    is_private BOOLEAN
);

CREATE TABLE clusterinfo (
    cid CHAR(36) PRIMARY KEY REFERENCES clustercore(cid),
    description TEXT,
    owner_id CHAR(36),
    created_at DATETIME,
    rules TEXT,
    settings TEXT
);

CREATE TABLE clusterstats (
    cid CHAR(36) PRIMARY KEY REFERENCES clustercore(cid),
    member_count INTEGER,
    post_count INTEGER
);
```

In [None]:
class ClusterCore(SQLModel, table=True):
    """Lightweight part of a cluster: the fields needed to list clusters.

    ``cid`` is the primary key that ``ClusterInfo`` and ``ClusterStats``
    reference through their foreign keys.
    """

    # Primary key, generated client-side when not supplied.
    cid: UUID = Field(default_factory=uuid4, primary_key=True)
    # Indexed lookup columns.
    name: str = Field(index=True)
    topic: str = Field(index=True)
    # Explicit None default, matching ClusterMonolith.image_url; without it
    # the field is optional in type only and still required for validation.
    image_url: Optional[str] = None
    is_private: bool = False

class ClusterInfo(SQLModel, table=True):
    """Descriptive part of a cluster, stored one-to-one with ``ClusterCore``.

    The primary key doubles as a foreign key to ``clustercore.cid``.
    """

    cid: UUID = Field(primary_key=True, foreign_key="clustercore.cid")
    description: str
    owner_id: UUID
    # Filled with the current UTC time (naive datetime) when not supplied.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    # Explicit None defaults, matching the same fields on ClusterMonolith;
    # without them the fields are optional in type only and still required
    # for validation.
    rules: Optional[str] = None
    settings: Optional[str] = None

class ClusterStats(SQLModel, table=True):
    """Counter part of a cluster, stored one-to-one with ``ClusterCore``.

    The primary key doubles as a foreign key to ``clustercore.cid``.
    """

    cid: UUID = Field(primary_key=True, foreign_key="clustercore.cid")

    # Counters; both start at zero.
    member_count: int = Field(default=0)
    post_count: int = Field(default=0)

## 3. Data Generation

In [None]:
ENTRY_COUNT = 5_000
SEED = 42  # fixed seed so the generated dataset is the same on every run

print(f"Generating {ENTRY_COUNT} clusters...")

# Seed both random sources used below. owner_id still comes from uuid4(),
# which is not affected by these seeds.
Faker.seed(SEED)
random.seed(SEED)

# Start from empty tables so re-running this cell does not accumulate rows.
SQLModel.metadata.drop_all(engine)
SQLModel.metadata.create_all(engine)

# One plain dict per cluster; the population cell maps these onto both the
# monolithic and the fragmented models so that both hold the same data.
clusters_data = [
    {
        "name": fake.company(),
        "topic": fake.word(),
        "description": fake.text(),
        "owner_id": uuid4(),
        "image_url": fake.image_url(),
        "is_private": fake.boolean(),
        "member_count": random.randint(0, 10000),
        "post_count": random.randint(0, 5000),
        "rules": fake.text(),
        "settings": "{}",
    }
    for _ in range(ENTRY_COUNT)
]

In [None]:
# Monolith: each generated record becomes one ClusterMonolith row.
with Session(engine) as mono_session:
    mono_session.add_all(ClusterMonolith(**record) for record in clusters_data)
    mono_session.commit()

# Fragmented: each generated record is split across Core, Info and Stats,
# all three sharing the cid generated for the Core row.
with Session(engine) as frag_session:
    for record in clusters_data:
        core_row = ClusterCore(
            name=record["name"],
            topic=record["topic"],
            image_url=record["image_url"],
            is_private=record["is_private"],
        )
        frag_session.add(core_row)
        # Send the pending Core INSERT before adding the rows that
        # reference its cid.
        frag_session.flush()

        info_row = ClusterInfo(
            cid=core_row.cid,
            description=record["description"],
            owner_id=record["owner_id"],
            rules=record["rules"],
            settings=record["settings"],
        )
        stats_row = ClusterStats(
            cid=core_row.cid,
            member_count=record["member_count"],
            post_count=record["post_count"],
        )
        frag_session.add_all([info_row, stats_row])
    frag_session.commit()

## 4. Benchmarks

In [None]:
print("Benchmarking Discovery (List Scenario)...")

LIST_LIMIT = 100    # rows fetched per listing query
LIST_REPEATS = 20   # timed repetitions per architecture


def _time_listing(model, limit=LIST_LIMIT, repeats=LIST_REPEATS):
    """Return the best wall-clock time, in seconds, to list `limit` rows of `model`.

    A single timed run is dominated by noise and by one-off costs (for
    example connection setup or first-time statement preparation), which
    would mostly be charged to whichever architecture is measured first.
    To avoid that, one untimed warm-up run is executed and the minimum over
    `repeats` timed runs is returned.

    Args:
        model: SQLModel table class to select from.
        limit: Maximum number of rows to fetch per query.
        repeats: Number of timed repetitions.

    Returns:
        The smallest elapsed time observed, in seconds.
    """
    statement = select(model).limit(limit)

    # Untimed warm-up.
    with Session(engine) as session:
        session.exec(statement).all()

    best = float("inf")
    for _ in range(repeats):
        started = time.perf_counter()
        with Session(engine) as session:
            session.exec(statement).all()
        best = min(best, time.perf_counter() - started)
    return best


# Benchmark Discovery (List Names & Images)
# Monolith fetches every column of each row.
mono_list_time = _time_listing(ClusterMonolith)
# Fragmented fetches only the Core table.
frag_list_time = _time_listing(ClusterCore)

print(f"Monolith List ({LIST_LIMIT} items): {mono_list_time:.6f}s")
print(f"Fragmented List ({LIST_LIMIT} items): {frag_list_time:.6f}s")

In [None]:
print("Benchmarking Stats Update...")

# Benchmark Stats Update (Join Cluster)

UPDATE_REPEATS = 20  # timed increments per architecture


def _time_member_increment(model, repeats=UPDATE_REPEATS):
    """Return the best wall-clock time, in seconds, to bump `member_count` on one row.

    The target cid is looked up once, outside the timed region, so the
    measurement covers only "load row by cid, increment, commit". One
    untimed warm-up increment is executed first and the minimum over
    `repeats` timed increments is returned. As a side effect the target
    row's member_count grows by `repeats + 1`.

    Args:
        model: SQLModel table class that has `cid` and `member_count`.
        repeats: Number of timed increments.

    Returns:
        The smallest elapsed time observed, in seconds.

    Raises:
        RuntimeError: If the table for `model` contains no rows.
    """
    with Session(engine) as session:
        cid_target = session.exec(select(model.cid)).first()
    if cid_target is None:
        raise RuntimeError(
            f"{model.__name__} table is empty; run the population cell first."
        )

    def bump_member_count():
        with Session(engine) as session:
            row = session.exec(select(model).where(model.cid == cid_target)).first()
            row.member_count += 1
            session.add(row)
            session.commit()

    bump_member_count()  # untimed warm-up

    best = float("inf")
    for _ in range(repeats):
        started = time.perf_counter()
        bump_member_count()
        best = min(best, time.perf_counter() - started)
    return best


mono_update_time = _time_member_increment(ClusterMonolith)
frag_update_time = _time_member_increment(ClusterStats)

print(f"Monolith Update: {mono_update_time:.6f}s")
print(f"Fragmented Update: {frag_update_time:.6f}s")

ratio = mono_update_time / frag_update_time if frag_update_time > 0 else 0
# Report the direction honestly: a ratio below 1 means Fragmented was slower.
if ratio >= 1:
    print(f"\nFragmented Update is {ratio:.2f}x faster than Monolith.")
elif ratio > 0:
    print(f"\nFragmented Update is {1 / ratio:.2f}x slower than Monolith.")
else:
    print("\nFragmented Update was too fast to measure; no ratio available.")