# Comment Entity Benchmarking

This notebook benchmarks the performance of a **Monolithic Comment** table versus a **Fragmented** approach (Core vs Content vs Stats).

## Hypothesis
Comments are often displayed in large trees. Loading the structure (Core) separate from the content (on demand or lazy loaded) or just keeping the Core lean allows faster tree traversals and sorting.

In [1]:
# Install dependencies (quietly)
!pip install sqlmodel faker > /dev/null 2>&1

The system cannot find the path specified.


In [1]:
from sqlmodel import Field, SQLModel, create_engine, Session, select
from faker import Faker
import time
import random
import os
from uuid import UUID, uuid4
from datetime import datetime
from typing import Optional

os.makedirs("temp/db", exist_ok=True)
DATABASE_URL = "sqlite:///temp/db/benchmarking_comment.db"
engine = create_engine(DATABASE_URL, echo=False)
fake = Faker()

## 1. Monolithic Architecture

Standard recursive comment structure in a single table.

### Schema
```sql
CREATE TABLE commentmonolith (
    mid CHAR(36) PRIMARY KEY,
    pid CHAR(36) INDEX,
    uid CHAR(36),
    parent_mid CHAR(36) INDEX,
    content TEXT,
    created_at DATETIME INDEX,
    likes INTEGER,
    dislikes INTEGER
);
```

In [2]:
class CommentMonolith(SQLModel, table=True):
    mid: UUID = Field(default_factory=uuid4, primary_key=True)
    pid: UUID = Field(index=True)
    uid: UUID
    parent_mid: Optional[UUID] = Field(default=None, index=True)
    content: str
    created_at: datetime = Field(default_factory=datetime.utcnow, index=True)
    likes: int = 0
    dislikes: int = 0

## 2. Fragmented Architecture

Split into:
*   `CommentCore`: Tree structure (`pid`, `parent_mid`, `created_at`).
*   `CommentContent`: The actual text.
*   `CommentStats`: Likes/Dislikes.

### Schema
```sql
CREATE TABLE commentcore (
    mid CHAR(36) PRIMARY KEY,
    pid CHAR(36) INDEX,
    uid CHAR(36),
    parent_mid CHAR(36) INDEX,
    created_at DATETIME INDEX
);

CREATE TABLE commentcontent (
    mid CHAR(36) PRIMARY KEY FOREIGN KEY(commentcore.mid),
    content TEXT
);

CREATE TABLE commentstats (
    mid CHAR(36) PRIMARY KEY FOREIGN KEY(commentcore.mid),
    likes INTEGER,
    dislikes INTEGER
);
```

In [3]:
class CommentCore(SQLModel, table=True):
    mid: UUID = Field(default_factory=uuid4, primary_key=True)
    pid: UUID = Field(index=True)
    uid: UUID
    parent_mid: Optional[UUID] = Field(default=None, index=True)
    created_at: datetime = Field(default_factory=datetime.utcnow, index=True)

class CommentContent(SQLModel, table=True):
    mid: UUID = Field(primary_key=True, foreign_key="commentcore.mid")
    content: str

class CommentStats(SQLModel, table=True):
    mid: UUID = Field(primary_key=True, foreign_key="commentcore.mid")
    likes: int = 0
    dislikes: int = 0

## 3. Data Generation
Generating 100,000 comments.

In [4]:
ENTRY_COUNT = 100_000
POST_IDS = [uuid4() for _ in range(1000)]
print(f"Generating {ENTRY_COUNT} comments...")

SQLModel.metadata.drop_all(engine)
SQLModel.metadata.create_all(engine)

comments_data = []
for _ in range(ENTRY_COUNT):
    comments_data.append({
        "pid": random.choice(POST_IDS),
        "uid": uuid4(),
        "content": fake.sentence(),
        "created_at": fake.date_time_this_year(),
        "likes": random.randint(0, 500)
    })

Generating 100000 comments...


In [5]:
# Populate Monolith
with Session(engine) as session:
    batch = []
    for data in comments_data:
        batch.append(CommentMonolith(**data))
        if len(batch) >= 1000:
            session.add_all(batch)
            session.commit()
            batch = []
    if batch:
        session.add_all(batch)
        session.commit()

# Populate Fragmented
with Session(engine) as session:
    # Simplifying bulk insert for fragmentation logic simulation
    for i, data in enumerate(comments_data):
        core = CommentCore(pid=data["pid"], uid=data["uid"], created_at=data["created_at"])
        session.add(core)
        session.flush()
        
        content = CommentContent(mid=core.mid, content=data["content"])
        stats = CommentStats(mid=core.mid, likes=data["likes"])
        session.add(content)
        session.add(stats)
        
        if i % 1000 == 0:
            session.commit()
    session.commit()

## 4. Benchmarks

In [11]:
print("Benchmarking Tree Retrieval (Load All Comments for a Post)...")

# Benchmark Tree Retrieval (Get all comments for a post)
target_pid = POST_IDS[0]

start_time = time.perf_counter()
with Session(engine) as session:
    # Monolith fetch
    comments = session.exec(
        select(CommentMonolith)
        .where(CommentMonolith.pid == target_pid)
        .order_by(CommentMonolith.created_at)
    ).all()
mono_tree_time = time.perf_counter() - start_time

start_time = time.perf_counter()
with Session(engine) as session:
    # Fragmented fetch (Core only, e.g. for skeleton loading)
    comments = session.exec(
        select(CommentCore)
        .where(CommentCore.pid == target_pid)
        .order_by(CommentCore.created_at)
    ).all()
frag_tree_time = time.perf_counter() - start_time

print(f"Monolith Tree Fetch: {mono_tree_time:.6f}s")
print(f"Fragmented Tree Core Fetch: {frag_tree_time:.6f}s")

ratio = mono_tree_time / frag_tree_time if frag_tree_time > 0 else 0
print(f"\nFragmented Tree Fetch is {ratio:.2f}x faster than Monolith.")

Benchmarking Tree Retrieval (Load All Comments for a Post)...
Monolith Tree Fetch: 0.004006s
Fragmented Tree Core Fetch: 0.002082s

Fragmented Tree Fetch is 1.92x faster than Monolith.
