# User Entity Benchmarking

This notebook benchmarks the performance of a **Monolithic User** table versus a **Fragmented** approach (Auth vs Profile).

## Trade-offs Tested
1.  **Write Penalty**: Fragmentation requires multiple writes (Auth + Profile), which should be significantly slower.
2.  **Read Optimization (Feed Author)**: Fetching just the `name` and `image` for a list of authors (common in feeds) should be faster on the thinner `UserProfile` table vs the fat `UserMonolith`.

In [None]:
# Install dependencies (quietly)
# !pip install sqlmodel faker > /dev/null 2>&1

In [1]:
import time
import random
import os
from typing import Optional
from sqlmodel import Field, SQLModel, create_engine, Session, select, func
from faker import Faker
from enum import Enum
from uuid import UUID, uuid4
from datetime import datetime

# Ensure temp directory exists
os.makedirs("temp/db", exist_ok=True)

DATABASE_URL = "sqlite:///temp/db/benchmarking_user.db"
engine = create_engine(DATABASE_URL, echo=False)
fake = Faker()

## 1. Monolithic Architecture

Single table with all columns.

### Schema
```sql
CREATE TABLE usermonolith (
    uid CHAR(36) PRIMARY KEY,
    email VARCHAR(255) UNIQUE INDEX,
    password_hash VARCHAR(255),  -- Heavy/Secret
    role VARCHAR(50),
    is_verified BOOLEAN,
    name VARCHAR(255),
    phone VARCHAR(255),
    location VARCHAR(255),
    bio TEXT,                    -- Heavy
    profile_image VARCHAR(255),
    created_at DATETIME,
    last_active DATETIME
);
```

In [2]:
class UserRole(str, Enum):
    GUEST = "GUEST"
    MEMBER = "MEMBER"
    ADMIN = "ADMIN"

class UserMonolith(SQLModel, table=True):
    uid: UUID = Field(default_factory=uuid4, primary_key=True)
    email: str = Field(index=True, unique=True)  # Auth
    password_hash: str  # Auth
    role: UserRole = Field(default=UserRole.MEMBER)  # Auth
    is_verified: bool = Field(default=False)  # Auth
    name: str  # Profile
    phone: Optional[str] = Field(default=None)  # Profile
    location: Optional[str] = Field(default=None)  # Profile
    bio: Optional[str] = Field(default=None)  # Profile
    profile_image: Optional[str] = Field(default=None)  # Profile
    created_at: datetime = Field(default_factory=datetime.utcnow)  # Meta
    last_active: datetime = Field(default_factory=datetime.utcnow)  # Meta

## 2. Fragmented Architecture

Split into `UserAuth` (Credentials) and `UserProfile` (Public Data). This isolates heavy/unused columns from specific access patterns.

### Schema
```sql
CREATE TABLE userauth (
    uid CHAR(36) PRIMARY KEY,
    email VARCHAR(255) UNIQUE INDEX,
    password_hash VARCHAR(255),
    role VARCHAR(50),
    is_verified BOOLEAN
);

CREATE TABLE userprofile (
    uid CHAR(36) PRIMARY KEY FOREIGN KEY(userauth.uid),
    name VARCHAR(255),           -- Needed for Feed/Core
    profile_image VARCHAR(255),  -- Needed for Feed/Core
    phone VARCHAR(255),
    location VARCHAR(255),
    bio TEXT,
    created_at DATETIME,
    last_active DATETIME
);
```

In [3]:
class UserAuth(SQLModel, table=True):
    uid: UUID = Field(default_factory=uuid4, primary_key=True)
    email: str = Field(index=True, unique=True)
    password_hash: str
    role: UserRole = Field(default=UserRole.MEMBER)
    is_verified: bool = Field(default=False)

class UserProfile(SQLModel, table=True):
    uid: UUID = Field(primary_key=True, foreign_key="userauth.uid")
    name: str
    profile_image: Optional[str]
    phone: Optional[str]
    location: Optional[str]
    bio: Optional[str]
    created_at: datetime = Field(default_factory=datetime.utcnow)
    last_active: datetime = Field(default_factory=datetime.utcnow)

## 3. Data Generation

In [4]:
ENTRY_COUNT = 10_000
print(f"Generating {ENTRY_COUNT} users...")

# Clear existing tables if needed
SQLModel.metadata.drop_all(engine)
SQLModel.metadata.create_all(engine)

users_data = []
for _ in range(ENTRY_COUNT):
    users_data.append({
        "email": fake.unique.email(),
        "password_hash": fake.sha256(),
        "role": random.choice(list(UserRole)),
        "is_verified": fake.boolean(),
        "name": fake.name(),
        "phone": fake.phone_number(),
        "location": fake.city(),
        "bio": fake.text(),
        "profile_image": fake.image_url(),
        "created_at": fake.date_time_this_year(),
        "last_active": fake.date_time_this_year()
    })

Generating 10000 users...


## 4. Benchmarks

In [5]:
print("Benchmarking Insertions (WRITE PENALTY)...")

# Benchmark Monolith Insert
start_time = time.perf_counter()
with Session(engine) as session:
    for data in users_data:
        user = UserMonolith(**data)
        session.add(user)
    session.commit()
mono_insert_time = time.perf_counter() - start_time

# Benchmark Fragmented Insert
start_time = time.perf_counter()
with Session(engine) as session:
    for data in users_data:
        # Split data
        auth_data = {k: v for k, v in data.items() if k in UserAuth.model_fields}
        profile_data = {k: v for k, v in data.items() if k in UserProfile.model_fields}
        
        auth = UserAuth(**auth_data)
        session.add(auth)
        session.flush() # Get UID
        
        profile = UserProfile(uid=auth.uid, **profile_data)
        session.add(profile)
    session.commit()
frag_insert_time = time.perf_counter() - start_time

print(f"Monolith Insert Time: {mono_insert_time:.4f}s")
print(f"Fragmented Insert Time: {frag_insert_time:.4f}s")
print(f"RESULT: Fragmentation is {frag_insert_time/mono_insert_time:.1f}x slower to write.")

Benchmarking Insertions (WRITE PENALTY)...
Monolith Insert Time: 1.0853s
Fragmented Insert Time: 4.2281s
RESULT: Fragmentation is 3.9x slower to write.


In [6]:
print("Benchmarking Feed Author Resolution (READ OPTIMIZATION)...")
# Scenario: Loading a feed of 20 posts, we need to resolve the Author Name & Image for 20 UIDs.
SAMPLE_SIZE = 1000  # Running 1000 author lookups to average the noise
target_uids = [u.uid for u in session.exec(select(UserAuth).limit(SAMPLE_SIZE)).all()]

# Monolith Lookup
start_time = time.perf_counter()
with Session(engine) as session:
    for uid in target_uids:
        # Must scan wider table
        session.exec(select(UserMonolith.name, UserMonolith.profile_image).where(UserMonolith.uid == uid)).first()
mono_read_time = time.perf_counter() - start_time

# Fragmented Lookup
start_time = time.perf_counter()
with Session(engine) as session:
    for uid in target_uids:
        # Scan thinner table (UserProfile)
        session.exec(select(UserProfile.name, UserProfile.profile_image).where(UserProfile.uid == uid)).first()
frag_read_time = time.perf_counter() - start_time

print(f"Monolith Author Lookup ({SAMPLE_SIZE} ops): {mono_read_time:.4f}s")
print(f"Fragmented Author Lookup ({SAMPLE_SIZE} ops): {frag_read_time:.4f}s")

Benchmarking Feed Author Resolution (READ OPTIMIZATION)...
Monolith Author Lookup (1000 ops): 0.2984s
Fragmented Author Lookup (1000 ops): 0.2097s


In [7]:
# Justification
ratio = mono_read_time / frag_read_time if frag_read_time > 0 else 0
print(f"\nFragmented Read is {ratio:.2f}x faster than Monolith for Feed Author Resolution.")


Fragmented Read is 1.42x faster than Monolith for Feed Author Resolution.
