# NoSQL DB With Python

## 1. Configuration Setup & Connection Initialization


In [None]:
import os
import yaml
import pprint
import logging
from dotenv import load_dotenv, find_dotenv
from typing import List, Optional, Dict, Any, Type, TypeVar
from datetime import datetime, timezone
from beanie import Document
from daolib.drivers.nosql.mongo_connector import MongoConnector
from daolib.drivers.nosql.config import NoSQLConnectionEntry

logger = logging.getLogger(__name__)

In [None]:
# Load the .env file located by find_dotenv() so that the os.getenv lookups
# later in this notebook can see the values it defines.
load_dotenv(find_dotenv())

In [None]:
# ---- YAML File Reader ----

class YamlFileOperator:
    """Minimal helper for reading YAML configuration files."""

    @staticmethod
    def read(file_path: str) -> Dict[str, Any]:
        """Parse the YAML file at ``file_path`` and return its content.

        Args:
            file_path: Path of the YAML file to read.

        Returns:
            The parsed document. An empty file (which ``yaml.safe_load``
            reports as ``None``) is returned as an empty dict so the result
            always matches the declared return type.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the content is not valid YAML.
        """
        try:
            # Explicit encoding so parsing does not depend on the platform default.
            with open(file_path, "r", encoding="utf-8") as file:
                content = yaml.safe_load(file)
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Config file not found: {file_path}") from e
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}") from e

        return {} if content is None else content

### 1.1 Configuration Constants


In [None]:
# ---- Configuration Constants ----

class Constants:
    """Namespace grouping the configuration constants used in this notebook."""

    class DBConstants:
        """Database-related constants"""

        # Keys looked up in the YAML configuration
        nosql_creds = "nosql_creds"
        mongo_inst = "mongo_inst"
        username = "username"
        host = "host"
        port = "port"
        db_name = "db_name"
        min_pool_size = "min_pool_size"
        max_pool_size = "max_pool_size"
        use_srv = "use_srv"

        # Passwords come from environment variables (not stored in YAML for
        # security). They are read once, when this class body is executed.
        mongo_password_dev = os.getenv("MONGO_PASSWORD_DEV", "password")
        # No well-known fallback for production: an unset variable yields an
        # empty password, the same default MongoHelper.read_and_load_config uses.
        mongo_password_prod = os.getenv("MONGO_PASSWORD_PROD", "")

### 1.2 Custom MongoConnector Helper


In [None]:
import os
import logging
from typing import Optional, List, Any

logger = logging.getLogger(__name__)

class MongoHelper(MongoConnector):
    """
    Concrete implementation of MongoConnector.

    Connection settings are read from a YAML file (an explicit path, or
    ``config/{env}.yml`` under the current working directory); the password
    is taken from environment variables.
    """

    def __init__(self, config_path: Optional[str] = None, document_models: Optional[List[Any]] = None):
        """
        Args:
            config_path: Optional path to YAML config. If None, defaults to
                ``<cwd>/config/{env}.yml`` where ``env`` is the ENVIRONMENT
                variable ("development" when unset).
            document_models: List of Beanie Document classes to register.
        """
        # Parent initialisation receives the document models.
        super().__init__(document_models=document_models)

        # Environment name (e.g. 'development', 'production'). Stored
        # independently of the path so read_and_load_config() can always
        # rely on it being present.
        if not hasattr(self, "_env"):
            self._env = os.getenv("ENVIRONMENT", "development")

        # Re-initialisation guard: if this object already carries a config
        # path (the original note describes the connector as a singleton),
        # keep the first one instead of overwriting it.
        if hasattr(self, "_config_path"):
            return

        if config_path:
            self._config_path = config_path
        else:
            # Default: <current working directory>/config/{env}.yml
            self._config_path = os.path.abspath(
                os.path.join(os.getcwd(), "config", f"{self._env}.yml")
            )

        logger.info("MongoHelper configured using path: %s", self._config_path)

    def read_and_load_config(self) -> NoSQLConnectionEntry:
        """
        Read the YAML file at ``self._config_path`` and combine it with the
        password taken from the environment.

        Missing, empty or null YAML sections/values fall back to the defaults
        listed below instead of raising.

        Returns:
            A NoSQLConnectionEntry populated from the YAML values.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
            ValueError: If the file is not valid YAML.
        """
        # 1. Read YAML
        try:
            if not os.path.exists(self._config_path):
                raise FileNotFoundError(f"Config file not found at: {self._config_path}")

            configs_data = YamlFileOperator.read(self._config_path)
        except Exception as e:
            logger.error(f"Failed to read Mongo config: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        # 2. Extract the config section. ``or {}`` also covers an empty file
        # and keys that are present but set to null.
        nosql_section = (configs_data or {}).get(Constants.DBConstants.nosql_creds) or {}
        mongo_data = nosql_section.get(Constants.DBConstants.mongo_inst) or {}

        def setting(key: str, default: Any) -> Any:
            """Return the YAML value for ``key``, or ``default`` when absent/null."""
            value = mongo_data.get(key)
            return default if value is None else value

        # 3. Pick the password variable that matches the environment.
        if self._env == "development":
            password = os.getenv("MONGO_PASSWORD_DEV", "password")
        else:
            password = os.getenv("MONGO_PASSWORD_PROD", "")
            if not password:
                logger.warning(
                    "MONGO_PASSWORD_PROD is not set; using an empty password"
                )

        # 4. Build the connection entry
        return NoSQLConnectionEntry(
            username=setting(Constants.DBConstants.username, ""),
            password=password,
            host=setting(Constants.DBConstants.host, "localhost"),
            port=int(setting(Constants.DBConstants.port, 27017)),
            database=setting(Constants.DBConstants.db_name, "development"),
            min_pool_size=int(setting(Constants.DBConstants.min_pool_size, 10)),
            max_pool_size=int(setting(Constants.DBConstants.max_pool_size, 50)),
            use_srv=setting(Constants.DBConstants.use_srv, False),
            use_ssl=setting("use_ssl", False),
        )

### 1.3 Document Models

We define Beanie `Document` models for our data entities. These models automatically integrate with MongoDB through the initialized connector.


In [None]:
# ---- Define Beanie Document Models ----

class Address(Document):
    """Postal address stored as its own document."""
    street: str
    number: int
    city: str
    country: str
    zip: str
    # Optional string identifier; presumably the id of the owning document — confirm with callers.
    owner_id: Optional[str] = None

    class Settings:
        # Beanie takes the collection name from ``name``. An attribute called
        # ``collection`` is not a recognised setting, so on its own it leaves
        # the documents in the default collection named after the class.
        name = "addresses"
        # Retained so code that reads ``Settings.collection`` keeps working.
        collection = name


class Person(Document):
    """Person document holding a first name, last name and age."""
    first_name: str
    last_name: str
    age: int

    class Settings:
        # Beanie takes the collection name from ``name``. An attribute called
        # ``collection`` is not a recognised setting, so on its own it leaves
        # the documents in the default collection named after the class.
        name = "person_collection"
        # Retained so code that reads ``Settings.collection`` keeps working.
        collection = name

### 1.4 Initialize Connection


In [None]:
# ---- Initialize Connector ----
D = TypeVar("D", bound=Document)

async def setup_async_connection(
    models: List[Type[D]],
    config_path: Optional[str] = None
) -> MongoConnector:
    """Create a MongoHelper for ``models``, initialise it and return it.

    Args:
        models: Beanie Document classes to register with the connector.
        config_path: Optional path to the YAML config. It is forwarded to
            MongoHelper, which falls back to its default path when None.

    Returns:
        The initialised connector.
    """
    # Forward config_path; previously it was accepted but silently dropped.
    connector = MongoHelper(config_path=config_path, document_models=models)
    await connector.init()
    print("✓ Async MongoDB connection established via daolib")
    return connector

In [None]:
printer = pprint.PrettyPrinter(indent=4)

# Initialize the async connection
connector = await setup_async_connection(models=[Address, Person])

# Only for local development
await Person.find({}).delete()
await Address.find({}).delete()
print("‚úì Database cleared for fresh start")

## 2. Async CRUD Operations with daolib

This section demonstrates CRUD (Create, Read, Update, Delete) operations using daolib's `MongoConnector` and Beanie for async operations with MongoDB.


### 2.1 Insert Operations

Create and insert sample data into the database.


In [None]:
# Create and insert persons using Beanie
async def create_persons_async(
    first_names: List[str],
    last_names: List[str],
    ages: List[int]
) -> List[str]:
    """Insert one Person per (first_name, last_name, age) triple.

    The three lists are combined positionally with ``zip``, so surplus
    entries in a longer list are ignored.

    Args:
        first_names: First names, one per person.
        last_names: Last names, aligned with ``first_names``.
        ages: Ages, aligned with ``first_names``.

    Returns:
        The inserted ids as strings, in insertion order. An empty list when
        there is nothing to insert.
    """
    persons = [
        Person(first_name=first_name, last_name=last_name, age=age)
        for first_name, last_name, age in zip(first_names, last_names, ages)
    ]

    if not persons:
        # Nothing to write: skip the bulk insert, which is not meant to be
        # called with an empty document list.
        return []

    results = await Person.insert_many(persons)
    # Convert to str so the result matches the declared List[str]; callers
    # that wrap an id in ObjectId(...) accept the string form as well.
    return [str(inserted_id) for inserted_id in results.inserted_ids]

In [None]:
# Insert sample data
first_names = ["Rahul", "Ananya", "Vikram", "Priya", "Arjun"]
last_names = ["Sharma", "Gupta", "Singh", "Mehta", "Verma"]
ages = [28, 24, 32, 27, 35]

inserted_ids_async = await create_persons_async(first_names, last_names, ages)
print(f"‚úì Inserted {len(inserted_ids_async)} persons")
print(f"IDs: {inserted_ids_async}")

### 2.3 Read Operations

Link: [Documentation](https://beanie-odm.dev/tutorial/finding-documents/)


In [None]:
# Get all (Generic)
T = TypeVar("T", bound=Document)

async def find_all(doc: Type[T]):
    """Retrieve all persons"""
    persons = await doc.find().to_list()
    for person in persons:
        # Retrieve FULL object, but exclude ID during dump
        # This works for ANY document without needing extra classes
        printer.pprint(person.model_dump(exclude={"id", "revision_id"}))

await find_all(Person)

In [None]:
# Get all with projection (Non generic)
from pydantic import BaseModel

class ViewModel(BaseModel):
    first_name: str
    last_name: str
    age: int

persons = await Person.find({}).project(ViewModel).to_list()
for person in persons:
    printer.pprint(person.model_dump())

In [None]:
# Get by ID (Generic)
T = TypeVar("T", bound=Document)

async def get_by_id(doc: Type[T], _id: str):
    """Retrieve a person by ID"""
    # Convert string ID back to ObjectId
    from bson import ObjectId
    result = await doc.get(ObjectId(_id))
    if result:
        printer.pprint(result.model_dump(exclude={"id", "revision_id"}))
    else:
        print(f"No person found with ID: {_id}")


# Retrieve the first person
print(f"Fetching person with ID: {inserted_ids_async[0]}")
await get_by_id(Person, inserted_ids_async[0])

In [None]:
# Search in range using Beanie query syntax
class ViewModel(BaseModel):
    first_name: str
    last_name: str
    age: int

persons = await Person.find(
    Person.age >= 25,
    Person.age <= 35
).sort("+age").project(ViewModel).to_list()

for person in persons:
    printer.pprint(person.model_dump())


### 2.4 Update Operations

Link: [Documentation](https://beanie-odm.dev/tutorial/updating-%26-deleting/)

Choosing between `.update()` and `.save()`/`.replace()` comes down to treating your database as a **storage bucket** (PyMongo style) versus treating it as a **collection of Python objects** (Beanie/ODM style).

#### 1. Data Integrity & Validation

When you use `.update()`, you are bypassing your application's validation logic. You could technically insert "bad" data that doesn't match your Pydantic model, and you wouldn't know until you tried to read it back and your app crashed.

* `.save()`/`.replace()`: Runs all Pydantic validators. If you try to save an invalid email or a negative age, Python throws an error *before* the bad data hits the database.


* `.update()`: Sends the command directly to MongoDB. MongoDB checks types roughly, but it doesn't know your Pydantic "business rules."

**Example:**

```python
# Bad update: MongoDB accepts this, but it breaks your Pydantic model later
await User.find_one(User.id == doc_id).update({"$set": {"email": "not-an-email"}}) 

# Good save: This raises a ValidationError immediately
user = await User.get(doc_id)
user.email = "not-an-email"
await user.save() # BOOM! Protected.

```

#### 2. Lifecycle Events (Hooks)

Beanie supports event-based actions (hooks) such as `before_save`, `after_save`, `before_replace`, etc.

* **`.save()`:** Triggers these hooks. For example, if you have logic to automatically update an `updated_at` timestamp or hash a password whenever a user is saved, `.save()` ensures this happens.
* **`.update()`:** Bypasses these hooks entirely.

#### 3. Complex Business Logic

Sometimes the "new value" isn't a simple increment (`$inc`) or set (`$set`). It might require complex Python calculations, external API calls, or logic that is painful to write in MongoDB Query Language (MQL).

**Scenario:** Calculate a "trust score" based on 10 different fields and an external credit check.

* **Efficient:** Fetch the doc → run the complex Python math → `.save()`.
* **Hard:** Trying to write a MongoDB aggregation pipeline update that does the same math inside the database.

#### 4. Semantic "PUT" Operations

In REST API design:

* **PATCH** (`.update()`): "Change just these specific fields."
* **PUT** (`.replace()`): "Here is the **new complete state** of the resource. Make the database match this exactly."

If you are building an API endpoint that receives a full user profile form and saves it, `.replace()` guarantees that the database matches exactly what the user submitted, removing any old fields they might have cleared out.

#### Summary: When to use which?

| Feature | Use `.save()` / `.replace()` | Use `.update()` |
| --- | --- | --- |
| **Data Safety** | **High.** Enforces Pydantic schema & validation. | **Low.** Bypasses app validation. |
| **Logic** | **Complex.** Good for heavy Python calculations. | **Simple.** Good for increments, flags, renames. |
| **Side Effects** | **Triggers hooks** (e.g., `updated_at`, password hashing). | **Silent.** No Python hooks triggered. |
| **Performance** | **Slower.** Full document read/write roundtrip. | **Faster.** Single DB operation. |

#### Recommendation

**Default to `.save()`** for most standard application logic to keep your data safe and your code clean. Optimize with **`.update()`** only when:

1. Performance is critical (high-frequency updates).
2. You need atomicity (preventing race conditions).
3. You are doing simple patch operations (toggling a boolean, incrementing a counter).

In [None]:
# Update (Non Generic)
# Update the second person (Ananya)
from bson import ObjectId
target_id = inserted_ids_async[1]

# Atomic update (Pydantic validation fails)
await Person.find_one(Person.id == ObjectId(target_id)).update({    # type: ignore
    "$inc": {"age": 1},    
    "$set": {"married": False},
})

# Verify the update
print("\nVerifying update:")
# Fails the pydantic validation (No married field)
await get_by_id(Person, target_id)


### 2.5 Delete Operations

Remove documents from the database.


In [None]:
# Delete by ID (Generic)
T = TypeVar("T", bound=Document)

async def delete_by_id(doc: Type[T], _id: str):

    from bson import ObjectId
    result = await doc.get(ObjectId(_id))

    if result:
        await result.delete()
        print(f"‚úì Deleted person {_id}")
    else:
        print(f"Person {_id} not found")


# Delete the third person (Vikram)
target_id = inserted_ids_async[2]
await delete_by_id(Person, target_id)

# Verify deletion
print("\nVerifying deletion:")
await get_by_id(Person, target_id)


## Beanie vs PyMongo: Aggregation Syntax Guide

**TL;DR:** For complex aggregations (joins, grouping, transformations), Beanie uses the **exact same MongoDB aggregation pipeline syntax as PyMongo**. There are no special Beanie wrapper methods for `$lookup`, `$group`, `$project`, etc.

### Three Ways to Query in Beanie

#### 1. Simple Queries (Beanie Syntax)
```python
# Find with filters
books = await Book.find(Book.type == "fiction", Book.copies > 10).to_list()

# With projection
class SimpleView(BaseModel):
    title: str
    copies: int

books = await Book.find(Book.type == "fiction").project(SimpleView).to_list()
```

#### 2. Aggregation Methods (Beanie Helpers)
```python
# Built-in aggregation on query results
avg_copies = await Book.find(Book.type == "fiction").avg(Book.copies)
total_copies = await Book.find().sum(Book.copies)
max_price = await Product.find(Product.category == "Electronics").max(Product.price)
```

Available: `.sum()`, `.avg()`, `.max()`, `.min()`

#### 3. Complex Aggregation Pipelines (Raw PyMongo Syntax)

**This is where confusion happens!** Beanie does NOT provide wrapper methods for pipeline stages.

```python
# ✅ CORRECT: Use raw MongoDB operators in a list of dictionaries
pipeline = [
    {"$lookup": {
        "from": "author",
        "localField": "author_id", 
        "foreignField": "_id",
        "as": "author_details"
    }},
    {"$unwind": "$author_details"},
    {"$group": {
        "_id": "$type",
        "total_books": {"$sum": 1},
        "avg_copies": {"$avg": "$copies"}
    }},
    {"$project": {
        "type": "$_id",
        "total_books": 1,
        "avg_copies": 1
    }}
]

results = await Book.aggregate(pipeline).to_list()
```

```python
# ❌ WRONG: There is no Beanie method like .lookup() or .group()
# This does NOT exist:
results = await Book.lookup("author").group("type").to_list()  # INVALID
```

### Key Differences Summary

| Feature | Beanie | PyMongo |
|---------|--------|---------|
| **Simple find** | `await Book.find(Book.price > 10).to_list()` | `await collection.find({"price": {"$gt": 10}}).to_list()` |
| **Aggregation helpers** | `await Book.find().avg(Book.price)` | Manual aggregation pipeline |
| **Complex pipelines** | `await Book.aggregate([{...}]).to_list()` | `await collection.aggregate([{...}]).to_list()` |
| **Pipeline syntax** | **Same raw dictionaries** | **Same raw dictionaries** |
| **Result parsing** | Can use `projection_model=MyModel` | Returns dicts |

### When to Use What

- **Use `.find()` queries**: Simple filtering, sorting, limiting
- **Use aggregation helpers**: Quick calculations (avg, sum, max, min)
- **Use `.aggregate()` with pipelines**: Joins, grouping, complex transformations, multi-stage operations

**References:**
- [Beanie Aggregation Tutorial](https://beanie-odm.dev/tutorial/aggregation/)
- [MongoDB Aggregation Pipeline](https://www.mongodb.com/docs/manual/core/aggregation-pipeline/)

---

## 3. Data Relationships

Explore different patterns for managing relationships between entities in MongoDB.

### 3.1 Embedding (One-to-Few)

Store related data directly within a parent document.

#### 3.1.1 Define Models for Authors and Books

In [None]:
class Author(Document):
    """Author document"""
    first_name: str
    last_name: str
    date_of_birth: datetime

    class Settings:
        # Beanie takes the collection name from ``name``. An attribute called
        # ``collection`` is not a recognised setting, so on its own it leaves
        # the documents in the default collection named after the class.
        name = "author"
        # Retained so code that reads ``Settings.collection`` keeps working.
        collection = name

class Book(Document):
    """Book document with embedded authors list"""
    title: str
    authors: List[Author]  # full Author objects stored inside the book
    publish_date: datetime
    type: str   # expected values: "fiction" or "non-fiction" (not enforced by the type)
    copies: int

    class Settings:
        # Beanie takes the collection name from ``name``. An attribute called
        # ``collection`` is not a recognised setting, so on its own it leaves
        # the documents in the default collection named after the class.
        name = "book"
        # Retained so code that reads ``Settings.collection`` keeps working.
        collection = name

In [None]:
# Register additional models dynamically (dev/test only)
await connector.register_models([Author, Book])

# Only for local development
await Author.find({}).delete()
await Book.find({}).delete()

#### 3.1.2 Insert Author Data


In [None]:
# Create and insert authors
authors_data = [
    {
        "first_name": "Haruki",
        "last_name": "Murakami",
        "date_of_birth": datetime(1949, 1, 12, tzinfo=timezone.utc),
    },
    {
        "first_name": "Chimamanda",
        "last_name": "Ngozi Adichie",
        "date_of_birth": datetime(1977, 9, 15, tzinfo=timezone.utc),
    },
    {
        "first_name": "Yuval",
        "last_name": "Noah Harari",
        "date_of_birth": datetime(1976, 2, 24, tzinfo=timezone.utc),
    },
]

# Insert authors
authors = [Author(**data) for data in authors_data]
results = await Author.insert_many(authors)
author_ids = [str(uid) for uid in results.inserted_ids]

print(f"‚úì Inserted {len(author_ids)} authors")
print(f"Author IDs: {author_ids}")

murakami, adichie, harari = authors

#### 3.1.3 Insert Book Data with Embedded Authors


In [None]:
# Create and insert books with author references
books_data = [
    {
        "title": "Kafka on the Shore",
        "authors": [murakami],
        "publish_date": datetime(2002, 9, 12, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 12,
    },
    {
        "title": "Norwegian Wood",
        "authors": [murakami],
        "publish_date": datetime(1987, 9, 4, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 9,
    },
    {
        "title": "Half of a Yellow Sun",
        "authors": [adichie],
        "publish_date": datetime(2006, 9, 12, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 7,
    },
    {
        "title": "We Should All Be Feminists",
        "authors": [adichie],
        "publish_date": datetime(2014, 1, 1, tzinfo=timezone.utc),
        "type": "non-fiction",
        "copies": 15,
    },
    {
        "title": "Sapiens: A Brief History of Humankind",
        "authors": [harari],
        "publish_date": datetime(2011, 1, 1, tzinfo=timezone.utc),
        "type": "non-fiction",
        "copies": 20,
    },
]

# Insert books
books = [Book(**data) for data in books_data]
results = await Book.insert_many(books)
book_ids = [str(uid) for uid in results.inserted_ids]

print(f"‚úì Inserted {len(book_ids)} books")
print(f"Book IDs: {book_ids}")


#### 3.1.4 Query Exercises: Embedded Pattern

**Learning Objectives:**
- Query nested documents using Beanie's dot notation
- Filter books by embedded author properties
- Project specific fields from embedded documents
- Understand when embedding is efficient vs. when it creates duplication

**Note on Aggregation Syntax:**

For complex transformations, we use **raw PyMongo aggregation pipeline syntax**. Beanie's `.aggregate()` method accepts a list of pipeline stage dictionaries:

```python
pipeline = [
    {"$project": {"field": 1}},
    {"$group": {"_id": "$category"}}
]
results = await Book.aggregate(pipeline).to_list()
```

This is the standard MongoDB approach - Beanie does not provide wrapper methods for pipeline stages.

**Version Compatibility:** This notebook requires **Beanie >=1.26.0** for proper Motor 3.7+ support. If using Beanie 2.0.1 with Motor 3.7+, aggregation calls will fail with cursor-related errors. See Exercise 3 for details.

**Exercise 1: Find Books by Embedded Author Name**

Write a query to find all books where any author's `first_name` is "Haruki".

**Expected Result:** List of books (Kafka on the Shore, Norwegian Wood)

**Hints:**
- Use `Book.find()` with query conditions
- Access nested fields with dot notation: `Book.authors.first_name`
- Use `.to_list()` to execute the query
- The `authors` field is a list, so MongoDB will match if ANY element matches

In [None]:
# Exercise 1: Find books by embedded author name
# TODO: Implement query using Beanie syntax

results = await Book.find(
    Book.authors.first_name == "Haruki" # type: ignore
).to_list()

printer.pprint([result.model_dump(exclude={"id"}) for result in results])

**Exercise 2: Query by Embedded Author Birth Year**

Find all books written by authors born after 1970, using the embedded `date_of_birth` field.

**Expected Result:** Books by Adichie and Harari

**Hints:**
- Use `Book.find()` with date comparison
- Access nested date field: `Book.authors.date_of_birth`
- Use comparison operators: `>` for after
- Create a datetime object for comparison: `datetime(1970, 1, 1, tzinfo=timezone.utc)`

In [None]:
# Exercise 2: Query by embedded author birth year
# TODO: Filter using date comparison on nested field

results = await Book.find(
    Book.authors.date_of_birth > datetime(1970, 1, 1, tzinfo=timezone.utc) # type: ignore
).to_list()

printer.pprint([result.model_dump(exclude={"id"}) for result in results])

**Exercise 3: Project Partial Embedded Data**

Retrieve all books but only include the book title and the author names (not full Author objects).

**Expected Output Structure:**
```json
{
  "title": "Kafka on the Shore",
  "author_names": ["Haruki Murakami"]
}
```

**Hints:**
- Use `Book.aggregate()` for complex transformations
- Pipeline stages needed: `$project` with `$map` to transform authors array
- `$map` syntax: `{"$map": {"input": "$authors", "in": "$$this.first_name"}}`
- Concatenate first and last name using `$concat`
- Beanie method: `await Book.aggregate([...]).to_list()`

---

**⚠️ Version Compatibility Note:**

If you encounter this error:
```
TypeError: object AsyncIOMotorLatentCommandCursor can't be used in 'await' expression
```

This indicates a **Beanie + Motor version incompatibility**:

| Beanie Version | Motor Version | Status |
|----------------|---------------|--------|
| **2.0.1** | **3.7+** | ❌ **Broken** - Double-await issue |
| **>=1.26.0** | **>=3.7.0** | ✅ **Works** - Recommended |
| **2.0.1** | **<3.6.0** | ✅ Works - But misses Motor improvements |

**Solution:** Upgrade Beanie to fix the issue:
```bash
pip install --upgrade "beanie>=1.26.0"
```

**Why this happens:** Motor 3.7+ changed cursor behavior. Beanie 2.0.1 tries to `await` the cursor twice, causing the error. Beanie 1.26.0+ has the fix.

**References:**
- [Beanie Changelog](https://github.com/roman-right/beanie/blob/main/CHANGELOG.md)
- [Motor 3.7 Release Notes](https://motor.readthedocs.io/en/stable/changelog.html)

In [None]:
# Exercise 3: Project partial embedded data
# TODO: Use aggregation to reshape output
class OutputModel(BaseModel):
    title: str
    author_names: List[str]

pipeline = [
    {
        "$project": {
            "title": 1,
            "author_names": {
                "$map": {
                    "input": "$authors",
                    "as": "author",
                    "in": {
                        "$concat": [
                            "$$author.first_name",
                            " ",
                            "$$author.last_name"
                        ]
                    }
                }
            }
        }
    }
]

# Use the correct Beanie API - get_pymongo_collection() is synchronous
# collection = Book.get_pymongo_collection()
# results_raw = []
# async for doc in collection.aggregate(pipeline):
#     results_raw.append(doc)

# # Parse into OutputModel
# results = [OutputModel(**doc) for doc in results_raw]

# printer.pprint([result.model_dump() for result in results])

results = await Book.aggregate(pipeline, projection_model=OutputModel).to_list()
printer.pprint([result.model_dump() for result in results])

**Exercise 4: Count Books by Type with Embedded Author Info**

Using Beanie aggregation, group books by `type` (fiction/non-fiction) and include:
- Count of books per type
- List of unique author names per type

**Challenge:** What happens if the same author appears in multiple books? How do you deduplicate?

**Hints:**
- Use `Book.aggregate()` with multiple stages
- `$unwind` the authors array first to handle embedded documents
- `$group` by type field: `{"$group": {"_id": "$type", ...}}`
- Use `$addToSet` to collect unique author names (auto-deduplicates)
- Use `$sum: 1` to count documents
- Final stages: `$project` to rename fields nicely

In [None]:
# Exercise 4: Count books by type with author deduplication
# TODO: Use aggregation with $group, $addToSet, or $size
pipeline = [
    # first stage: unwind authors array
    {
        "$unwind": {
            "path": "$authors",
            "preserveNullAndEmptyArrays": True
        }
    },
    
    # second stage: group by type and aggregate
    {
        "$group": {
            "_id": "$type",
            "book_count": {"$sum": 1},
            "total_copies": {"$sum": "$copies"},
            "unique_authors": {
                "$addToSet": {
                    "$concat": [
                        "$authors.first_name",
                        " ",
                        "$authors.last_name"
                    ]
                }
            }
        }
    },
    
    # third stage: project to rename fields nicely
    {
        "$project": {
            "_id": 0,
            "type": "$_id",
            "book_count": 1,
            "total_copies": 1,
            "unique_authors": 1
        }
    }
]

results = await Book.aggregate(pipeline).to_list()
printer.pprint(results)


### 3.2 Reference (One-to-Many)

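Store only the `ObjectId`s of the related documents in the parent document, instead of embedding the full objects; the referenced documents live in their own collection.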

#### 3.2.1 Define Models for Authors and Books

In [None]:
from bson import ObjectId
from beanie import PydanticObjectId

class Author2(Document):
    """Author document"""
    first_name: str
    last_name: str
    date_of_birth: datetime

    class Settings:
        # Beanie takes the collection name from ``name``. An attribute called
        # ``collection`` is not a recognised setting, so on its own it leaves
        # the documents in the default collection named after the class.
        name = "author2"
        # Retained so code that reads ``Settings.collection`` keeps working.
        collection = name

class Book2(Document):
    """Book document that references its authors by ObjectId."""
    title: str
    authors: List[PydanticObjectId]  # _id values of the referenced Author2 documents
    publish_date: datetime
    type: str   # Enum: fiction or non-fiction
    copies: int
    
    class Settings:
        # Beanie takes the collection name from `name`. The former
        # `collection = "book2"` attribute is not a Beanie setting, so the
        # collection was silently named after the class. That effective name
        # is declared explicitly here; the $lookup stages in this notebook
        # join on "Book2".
        name = "Book2"

In [None]:
# Add models to existing connector
await connector.register_models([Author2, Book2])

# Only for local development
await Author2.find({}).delete()
await Book2.find({}).delete()

#### 3.2.2 Insert Author Data


In [None]:
# Create and insert authors
authors_data = [
    {
        "first_name": "Haruki",
        "last_name": "Murakami",
        "date_of_birth": datetime(1949, 1, 12, tzinfo=timezone.utc),
    },
    {
        "first_name": "Chimamanda",
        "last_name": "Ngozi Adichie",
        "date_of_birth": datetime(1977, 9, 15, tzinfo=timezone.utc),
    },
    {
        "first_name": "Yuval",
        "last_name": "Noah Harari",
        "date_of_birth": datetime(1976, 2, 24, tzinfo=timezone.utc),
    },
]

# Insert authors
authors = [Author2(**data) for data in authors_data]
results = await Author2.insert_many(authors)
author_ids: List[PydanticObjectId] = list(results.inserted_ids)

print(f"‚úì Inserted {len(author_ids)} authors")
print(f"Author IDs: {author_ids}")

murakami_id, adichie_id, harari_id = author_ids

#### 3.2.3 Insert Book Data with Author References


In [None]:
# Create and insert books with author references
books_data = [
    {
        "title": "Kafka on the Shore",
        "authors": [murakami_id],
        "publish_date": datetime(2002, 9, 12, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 12,
    },
    {
        "title": "Norwegian Wood",
        "authors": [murakami_id],
        "publish_date": datetime(1987, 9, 4, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 9,
    },
    {
        "title": "Half of a Yellow Sun",
        "authors": [adichie_id],
        "publish_date": datetime(2006, 9, 12, tzinfo=timezone.utc),
        "type": "fiction",
        "copies": 7,
    },
    {
        "title": "We Should All Be Feminists",
        "authors": [adichie_id],
        "publish_date": datetime(2014, 1, 1, tzinfo=timezone.utc),
        "type": "non-fiction",
        "copies": 15,
    },
    {
        "title": "Sapiens: A Brief History of Humankind",
        "authors": [harari_id],
        "publish_date": datetime(2011, 1, 1, tzinfo=timezone.utc),
        "type": "non-fiction",
        "copies": 20,
    },
]

# Insert books
books = [Book2(**data) for data in books_data]
results = await Book2.insert_many(books)
book_ids = [str(uid) for uid in results.inserted_ids]

print(f"‚úì Inserted {len(book_ids)} books")
print(f"Book IDs: {book_ids}")


#### 3.2.4 Query Exercises: Reference Pattern (Beanie Aggregation Methods)

**Learning Objectives:**
- Use Beanie's `aggregate()` entry point with MongoDB (MQL) pipeline dictionaries
- Perform joins using structured aggregation methods
- Calculate derived fields (totals, counts, averages)
- Compare performance trade-offs between embedding and referencing

**Understanding Beanie Aggregation Methods**

Beanie provides **two types of aggregation interfaces**:

**1. Built-in Aggregation Methods** (Simple operations on query results)
```python
# Works on FindMany query results
avg_price = await Product.find(Product.price > 10).avg(Product.price)
sum_copies = await Book.find(Book.type == "fiction").sum(Book.copies)
max_age = await Author.find().max(Author.age)
```

Available methods: `.sum()`, `.avg()`, `.max()`, `.min()`

**2. Complex Aggregation Pipelines** (Using raw PyMongo syntax)

For joins (`$lookup`), grouping (`$group`), and complex transformations, Beanie uses **raw PyMongo aggregation pipeline dictionaries**:

```python
# Beanie's aggregate() method accepts PyMongo pipeline syntax
pipeline = [
    {"$lookup": {"from": "book", "localField": "_id", "foreignField": "authors", "as": "books"}},
    {"$project": {"name": 1, "book_count": {"$size": "$books"}}}
]

results = await Author.aggregate(pipeline).to_list()
```

**Key Points:**
- Beanie does **NOT** have special wrapper methods for `$lookup`, `$group`, `$project`, etc.
- You write the **same MongoDB aggregation operators** as you would in PyMongo
- The only difference: use `await Document.aggregate(pipeline).to_list()` instead of `collection.aggregate(pipeline)` (`pipeline` is already a list of stages, so do not wrap it in another list)
- Use `projection_model` parameter to automatically parse results into Pydantic models

**Reference:** [Beanie Aggregation Docs](https://beanie-odm.dev/tutorial/aggregation/)

**Exercise 1: Join Authors with Their Books (Basic $lookup)**

**Goal:** Retrieve all authors with their associated books using Beanie aggregation.

**Requirements:**
- Use Beanie's `aggregate()` method with a raw pipeline (a list of MongoDB stage dictionaries); Beanie has no dedicated `$lookup` wrapper
- Join `Author` collection with `Book` collection
- Include: `first_name`, `last_name`, and list of book titles

**Expected Output Structure:**
```json
{
  "first_name": "Haruki",
  "last_name": "Murakami",
  "books": [
    {"title": "Kafka on the Shore"},
    {"title": "Norwegian Wood"}
  ]
}
```

**Hints:**
- Use `Author.aggregate()` with aggregation pipeline
- `$lookup` stage connects collections: `{"$lookup": {"from": "Book2", "localField": "_id", "foreignField": "authors", "as": "books"}}` (`from` must be the target collection's actual, case-sensitive name)
- `$project` stage to select only needed fields
- Use `$map` inside project to transform books array to only show titles
- Beanie method: `await Author.aggregate([...]).to_list()`

In [None]:
pipeline = [
    # Direct ObjectId to ObjectId join - no conversion needed!
    {
        "$lookup": {
            "from": "Book2",  # Fixed: MongoDB collection names are case-sensitive!
            "localField": "_id",
            "foreignField": "authors",
            "as": "books"
        }
    },
    {
        "$project": {
            "_id": 0,
            "first_name": 1,
            "last_name": 1,
            "books": {
                "title": 1
            }
        }
    }
]

results = await Author2.aggregate(pipeline).to_list()
printer.pprint([result for result in results])


**Exercise 2: Calculate Total Books Per Author**

**Goal:** For each author, calculate:
- Total number of books
- Total copies across all their books
- Average copies per book

**Expected Output:**
```json
{
  "author": "Haruki Murakami",
  "total_books": 2,
  "total_copies": 21,
  "avg_copies_per_book": 10.5
}
```

**Hints:**
- Start with `$lookup` to join Author with Book (from Exercise 1)
- Use `$addFields` to add calculated fields
- `$size` operator counts array elements: `{"$size": "$books"}`
- `$sum` operator on array: `{"$sum": "$books.copies"}`
- `$avg` operator: `{"$avg": "$books.copies"}`
- Concatenate author name using `$concat`: `["$first_name", " ", "$last_name"]`

In [None]:
pipeline = [
    {
        "$lookup": {
            "from": "Book2",
            "localField": "_id",
            "foreignField": "authors",
            "as": "books"
        }
    },
    {
        "$addFields": {
            "total_books": {"$size": "$books"},
            "total_copies": {"$sum": "$books.copies"},
            "avg_copies_per_book": {"$avg": "$books.copies"},
            "author": {
                "$concat": [
                    "$first_name",
                    " ",
                    "$last_name"
                ]
            }
        }
    },
    {
        "$project": {
            "_id": 0,
            "author": 1,
            "total_books": 1,
            "total_copies": 1,
            "avg_copies_per_book": 1
        }
    }
]

results = await Author2.aggregate(pipeline).to_list()
printer.pprint([result for result in results])


**Exercise 3: Reverse Join - Books with Author Details**

**Goal:** Start from `Book` collection and populate author information.

**Requirements:**
- For each book, include full author details (name, birth year)
- Calculate author's age at publication time
- Sort by publication date

**Challenge:** Handle books with multiple authors.

**Hints:**
- Use `Book.aggregate()` starting point
- `$lookup` with pipeline for complex join
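- `$dateDiff` to calculate age: `{"$dateDiff": {"startDate": "$date_of_birth", "endDate": "$publish_date", "unit": "year"}}` — note that this counts calendar-year boundaries crossed, so it overstates the age by one whenever the book was published before the author's birthday in that year; subtract 1 in that case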
- `$sort` by publish_date: `{"$sort": {"publish_date": 1}}`
- Use `$unwind` if you want one doc per author instead of array

In [None]:
# Exercise 3: Reverse join with calculated fields
# Join from Book ‚Üí Author, calculate age at publication

pipeline = [

    
    # Stage 1: Lookup author details from Author2 collection
    {
        "$lookup": {
            "from": "Author2",
            "localField": "authors",
            "foreignField": "_id",
            "as": "author_info"
        }
    },
    
    # Stage 2: Unwind the author_info array (will have 1 element per book)
    {
        "$unwind": {
            "path": "$author_info",
            "preserveNullAndEmptyArrays": True
        }
    },
    
    # Stage 3: Add calculated fields
    {
        "$addFields": {
            "author_name": {
                "$concat": [
                    "$author_info.first_name",
                    " ",
                    "$author_info.last_name"
                ]
            },
            "author_birth_year": {"$year": "$author_info.date_of_birth"},
            "age_at_publication": {
                "$dateDiff": {
                    "startDate": "$author_info.date_of_birth",
                    "endDate": "$publish_date",
                    "unit": "year"
                }
            }
        }
    },
    
    # Stage 4: Project only needed fields
    {
        "$project": {
            "_id": 0,
            "title": 1,
            "publish_date": 1,
            "author_name": 1,
            "author_birth_year": 1,
            "age_at_publication": 1
        }
    },
    
    # Stage 6: Sort by publication date
    {
        "$sort": {
            "publish_date": 1
        }
    }
]

results = await Book2.aggregate(pipeline).to_list()
printer.pprint([result for result in results])

**Exercise 4: Filter Join - Books by Authors Born After 1970**

**Goal:** Combine filtering with joining.

**Requirements:**
1. Join Books with Authors
2. Filter to only include authors born after 1970
3. Project: book title, author name, author birth year
4. Sort by author birth year (newest first)

**Expected Behavior:** Should exclude Murakami's books, include Adichie and Harari.

**Hints:**
- Start with `Book.aggregate()`
- `$lookup` with `let` and pipeline for complex filtering
- Inside lookup pipeline, use `$match` with `$expr` to filter by birth year
- Extract year from date: `{"$year": "$date_of_birth"}`
- Compare: `{"$gt": [{"$year": "$date_of_birth"}, 1970]}`
- `$sort` by birth year descending: `{"$sort": {"author_birth_year": -1}}`

In [None]:
# Exercise 4: Filter join by birth year
# TODO: Combine $lookup with $match on nested fields

pipeline = [
    
    # Stage 1: Lookup author details from Author2 collection
    {
        "$lookup": {
            "from": "Author2",
            "localField": "authors",
            "foreignField": "_id",
            "as": "author_info"
        }
    },
    
    {
        "$unwind": {
            "path": "$author_info",
            "preserveNullAndEmptyArrays": True
        }
    },
    
    {
        "$match": {
            "$expr": {
                "$gt": [
                    {"$year": "$author_info.date_of_birth"},
                    1970
                ]
            }
        }
    },
    
    {
        "$sort": {
            "author_info.date_of_birth": -1
        }
    },
    
    {
        "$project": {
            "_id": 0,
            "title": 1,
            "author_name": {
                "$concat": [
                    "$author_info.first_name",
                    " ",
                    "$author_info.last_name"
                ]
            },
            "author_birth_year": {"$year": "$author_info.date_of_birth"},
        }
    }
]

results = await Book2.aggregate(pipeline).to_list()
printer.pprint([result for result in results])

**Exercise 5: Aggregated Statistics Across Relationship**

**Goal:** Generate a report of book statistics grouped by fiction/non-fiction.

**Requirements:**
- Group books by `type` field
- For each type, calculate:
  - Total books
  - Total copies in circulation
  - List of unique authors (names only, deduplicated)
  - Average publish year

**Expected Output:**
```json
{
  "type": "fiction",
  "total_books": 3,
  "total_copies": 28,
  "unique_authors": ["Haruki Murakami", "Chimamanda Ngozi Adichie"],
  "avg_publish_year": 1998.33
}
```

**Hints:**
- Start with `Book.aggregate()`
- First `$lookup` to get author details
- `$unwind` the joined author array (the field named by `as` in the `$lookup`), not the `authors` array of ObjectIds
- `$group` by `type`: `{"$group": {"_id": "$type", ...}}`
- Accumulators: `$sum: 1` for count, `$sum: "$copies"` for total copies
- `$addToSet` for unique author names, built from the joined author documents: `{"$addToSet": {"$concat": ["$author_info.first_name", " ", "$author_info.last_name"]}}`
- Extract year and average: `{"$avg": {"$year": "$publish_date"}}`

In [None]:
# Exercise 5: Grouped statistics with deduplication
# TODO: Use $group with $addToSet for unique authors

pipeline = [
    
    # Stage 1: Lookup author details from Author2 collection
    {
        "$lookup": {
            "from": "Author2",
            "localField": "authors",
            "foreignField": "_id",
            "as": "author_info"
        }
    },
    
    {
        "$unwind": {
            "path": "$author_info",
            "preserveNullAndEmptyArrays": True
        }
    },
    
    {
        "$group": {
            "_id": "$type",
            "book_count": {"$sum": 1},
            "total_copies": {"$sum": "$copies"},
            "unique_authors": {
                "$addToSet": {
                    "$concat": [
                        "$author_info.first_name",
                        " ",
                        "$author_info.last_name"
                    ]
                }          
            }
        }
    },
    
    {
        "$project": {
            "_id": 0,
            "type": "$_id",
            "book_count": 1,
            "total_copies": 1,
            "unique_authors": 1
        }
    }
]

results = await Book2.aggregate(pipeline).to_list()
printer.pprint(results)

---

### 📚 Mental Models: `$map`, `$addFields`, and `$set`

#### 1. **`$map` - The Array Transformer**

**Mental Model:** Think of `$map` like Python's `map()` function or JavaScript's `.map()`. It's a **for loop that transforms each element in an array**.

**Structure:**
```javascript
{
  "$map": {
    "input": "$arrayField",      // Which array to loop over
    "as": "variableName",         // What to call each element (like 'item' in a for loop)
    "in": <expression>            // What to do with each element (the transformation)
  }
}
```

**Concrete Example:**
```javascript
// You have: authors: [ObjectId('abc'), ObjectId('def'), ObjectId('ghi')]
// You want: author_ids_as_strings: ['abc', 'def', 'ghi']

{
  "$addFields": {
    "author_ids_as_strings": {
      "$map": {
        "input": "$authors",           // Loop over the authors array
        "as": "author_id",             // Call each ObjectId "author_id"
        "in": { "$toString": "$$author_id" }  // Convert each one to string
      }
    }
  }
}
```

**⚠️ Key Syntax Rules:**
- Inside `"in"`, use `$$variableName` (double `$$`) to reference the loop variable
- Use `$fieldName` (single `$`) for fields from the main document
- `$$this` is a special variable when you don't specify `"as"` (refers to current element)

**When to use `$map`:**
- ✅ Transforming arrays (extract a field, convert types, calculate values)
- ✅ Building new arrays with different structure
- ❌ NOT for filtering (use `$filter` instead)
- ❌ NOT for flat list operations (use `$unwind` first)

---

#### 2. **`$addFields` vs `$set` - The Great Confusion**

**Short Answer:** **They are IDENTICAL.** `$set` is just an alias for `$addFields` introduced in MongoDB 4.2.

| Feature | `$addFields` | `$set` |
|---------|--------------|--------|
| **Keeps existing fields** | ✅ Yes | ✅ Yes |
| **Adds new fields** | ✅ Yes | ✅ Yes |
| **Overwrites existing fields** | ✅ Yes | ✅ Yes |
| **Syntax** | Same | Same |
| **MongoDB Version** | 3.4+ | 4.2+ (newer) |

**Why two names?**
- `$addFields` - Original name (2016), describes what it does
- `$set` - Newer alias (2019), shorter and matches SQL UPDATE syntax

**Mental Model for Both:**
Think of them as **"Add or update these fields, keep everything else"**

**Example:**
```javascript
// Document BEFORE:
{ title: "1984", author: "Orwell", pages: 328 }

// Using $addFields or $set (IDENTICAL):
{
  "$addFields": {
    "category": "dystopian",     // NEW field (adds it)
    "pages": 350                 // EXISTING field (overwrites it)
  }
}

// Document AFTER:
{ title: "1984", author: "Orwell", pages: 350, category: "dystopian" }
// Notice: title and author are STILL THERE (not removed)
```

**Compare with `$project`:**
```javascript
// Using $project (DIFFERENT - removes unlisted fields):
{
  "$project": {
    "title": 1,
    "category": "dystopian",
    "pages": 350
  }
}

// Document AFTER:
{ title: "1984", pages: 350, category: "dystopian" }
// Notice: "author" field is GONE! (not listed in $project)
```

**Recommendation:**
- Use `$addFields` or `$set` (your choice, I prefer `$addFields` for clarity)
- Never mix them in the same pipeline (pick one for consistency)
- Use `$project` only when you want to **remove** fields

---

#### 3. **Common `$map` Patterns**

**Pattern 1: Extract a field from array of objects**
```javascript
// books: [{ title: "Book1", pages: 100 }, { title: "Book2", pages: 200 }]
// Want: book_titles: ["Book1", "Book2"]

{
  "$addFields": {
    "book_titles": {
      "$map": {
        "input": "$books",
        "as": "book",
        "in": "$$book.title"
      }
    }
  }
}
```

**Pattern 2: Transform entire objects**
```javascript
// authors: [{ first: "Jane", last: "Doe" }, { first: "John", last: "Smith" }]
// Want: author_names: [{ name: "Jane Doe" }, { name: "John Smith" }]

{
  "$addFields": {
    "author_names": {
      "$map": {
        "input": "$authors",
        "as": "author",
        "in": {
          "name": { "$concat": ["$$author.first", " ", "$$author.last"] }
        }
      }
    }
  }
}
```

**Pattern 3: Use document fields inside $map**
```javascript
// Calculate discount for each product based on customer type
// customer_type: "VIP", products: [{ price: 100 }, { price: 200 }]

{
  "$addFields": {
    "discounted_products": {
      "$map": {
        "input": "$products",
        "as": "product",
        "in": {
          "original": "$$product.price",
          "discounted": {
            "$cond": [
              { "$eq": ["$customer_type", "VIP"] },  // $ for document field
              { "$multiply": ["$$product.price", 0.8] },  // $$ for loop variable
              "$$product.price"
            ]
          }
        }
      }
    }
  }
}
```

---

#### 4. **Quick Decision Tree**

**Do you need to transform an array?**
- **YES** → Use `$map`
  - Each element → different value: `"in": <simple expression>`
  - Each element → object: `"in": { field1: ..., field2: ... }`

**Do you need to add/update fields but keep existing ones?**
- **YES** → Use `$addFields` (or `$set`)

**Do you need to remove fields or only show specific fields?**
- **YES** → Use `$project`

**Do you need to work with each array element separately?**
- **YES** → Use `$unwind` first, then `$addFields`

---

### ⚡ Performance: `$unwind` vs `$map` - The Critical Trade-off

#### **The Fundamental Difference**

**`$unwind` approach:**
```javascript
// 1 document with 3-element array → 3 separate documents
{ book: "Book1", authors: ["A", "B", "C"] }
     ↓ $unwind
{ book: "Book1", authors: "A" }
{ book: "Book1", authors: "B" }
{ book: "Book1", authors: "C" }
```

**`$map` approach:**
```javascript
// Array stays as array, transformed in-place
{ book: "Book1", authors: ["A", "B", "C"] }
     ↓ $map
{ book: "Book1", author_names: ["Name A", "Name B", "Name C"] }
```

---

#### **Performance Comparison Matrix**

| Factor | `$unwind` → Process → `$group` | `$map` (in-place) |
|--------|-------------------------------|-------------------|
| **Document Count** | 🔴 **Multiplies** (can be 10x-1000x) | 🟢 **Unchanged** |
| **Memory Usage** | 🔴 **High** (doc explosion) | 🟢 **Low** (single doc) |
| **Pipeline Complexity** | 🟡 **Multi-stage** (unwind → process → group) | 🟢 **Single stage** |
| **Index Usage** | 🟡 **Only before it** (a `$match`/`$sort` placed before `$unwind` can use indexes; stages after it work on in-memory documents) | 🟡 **Not applicable** (a pure expression; it neither uses nor prevents index use by earlier stages) |
| **Best For** | Filtering, joining, aggregating | Transforming, reshaping |
| **CPU Usage** | 🔴 **Higher** (more docs to process) | 🟢 **Lower** |

---

#### **Real Performance Impact: Example Scenario**

**Scenario:** 1000 books, each with 3 authors on average

**Approach 1: Unwind → Process → Group**
```javascript
[
  { "$unwind": "$authors" },           // 1,000 → 3,000 documents
  { "$group": { "_id": "$type", ... }} // Process 3,000 docs
]
```
- **Intermediate documents:** 3,000 (3x increase)
- **Memory impact:** ~3x more RAM needed
- **Processing time:** ~2-4x slower for large datasets

**Approach 2: Direct $map**
```javascript
[
  { 
    "$addFields": {
      "author_names": {
        "$map": { "input": "$authors", "in": "$$this.name" }
      }
    }
  }
]
```
- **Intermediate documents:** 1,000 (no explosion)
- **Memory impact:** Minimal
- **Processing time:** Baseline

---

#### **When `$unwind` is BETTER (Despite Overhead)**

✅ **1. When you need to filter by array elements**
```javascript
// Find books where AT LEAST ONE author is born after 1970
[
  { "$unwind": "$authors" },
  { "$match": { "authors.birth_year": { "$gt": 1970 } } },
  { "$group": { "_id": "$_id", "authors": { "$push": "$authors" } } }
]

// ❌ Can't do this efficiently with $map alone
```

✅ **2. When you need to join on array elements**
```javascript
// After $unwind, $lookup joins one element at a time and can use an index on the foreign field
[
  { "$unwind": "$author_ids" },
  { "$lookup": { "from": "authors", "localField": "author_ids", ... } }
]
```

✅ **3. When you need separate aggregations per array element**
```javascript
// Group by author birth decade
[
  { "$unwind": "$authors" },
  { "$group": { "_id": { "$subtract": [ "$authors.birth_year", { "$mod": ["$authors.birth_year", 10] } ] } } }
]
```

---

#### **When `$map` is BETTER**

✅ **1. Simple transformations (no filtering/grouping)**
```javascript
// Extract titles from books array
{ "$addFields": { "titles": { "$map": { "input": "$books", "in": "$$this.title" } } } }
```

✅ **2. Type conversions or formatting**
```javascript
// Convert all author IDs to strings
{ "$addFields": { "author_ids_str": { "$map": { "input": "$authors", "in": { "$toString": "$$this._id" } } } } }
```

✅ **3. Large arrays (performance critical)**
```javascript
// If array has 100+ elements, $unwind creates 100+ docs → slow
// $map keeps it as 1 doc → fast
```

---

#### **Hybrid Approach: Best of Both Worlds**

Sometimes you need both! Use `$map` for transformations, then `$unwind` only when necessary:

**Example: Count the fiction books per author**

❌ **Inefficient:**
```javascript
[
  { "$unwind": "$books" },                    // Explode ALL books
  { "$match": { "books.type": "fiction" } },  // Filter after explosion
  { "$group": { "_id": "$author", "count": { "$sum": 1 } } }
]
```

✅ **Efficient:**
```javascript
[
  // Step 1: Filter array BEFORE unwinding (using $filter)
  {
    "$addFields": {
      "fiction_books": {
        "$filter": {
          "input": "$books",
          "cond": { "$eq": ["$$this.type", "fiction"] }
        }
      }
    }
  },
  // Step 2: NOW unwind (only fiction books)
  { "$unwind": "$fiction_books" },
  // Step 3: Group
  { "$group": { "_id": "$author", "count": { "$sum": 1 } } }
]
```

**Why it's better:**
- Reduces documents to unwind (filters first)
- Only explodes what you need

---

#### **Benchmarking Guidelines**

**Small datasets (< 10,000 docs):**
- Performance difference is negligible
- Choose based on readability

**Medium datasets (10,000 - 1,000,000 docs):**
- `$unwind` overhead becomes noticeable (2-5x slower)
- Use `$map` when possible

**Large datasets (> 1,000,000 docs):**
- `$unwind` can cause memory issues
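- Blocking stages that follow `$unwind` (`$group`, `$sort`) can exceed the 100 MB per-stage memory limit unless `allowDiskUse` is enabled, and arrays rebuilt with `$push`/`$addToSet` are still subject to MongoDB's 16MB document limit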
- Strongly prefer `$map` or hybrid approaches

**MongoDB Atlas Performance Tip:**
- Use `$sample` to test pipelines on subset first
- Check aggregation execution stats with `explain: true`

---

#### **Decision Flowchart**

```
Do you need to filter/match on array elements?
├─ YES → Use $unwind
│         (Accept the performance cost for correctness)
└─ NO → Continue...

Do you need to join ($lookup) on array elements?
├─ YES → Use $unwind
└─ NO → Continue...

Do you need to group/aggregate by array element properties?
├─ YES → Use $unwind
└─ NO → Use $map
          (Faster, less memory, simpler)
```

---

#### **Real Example Comparison**

Let's solve the same problem both ways:

**Problem:** For each book type, get a count of total books

**Method 1: Plain `$group` (no `$unwind` required)**
```javascript
// Documents: 5 books
pipeline = [
  { "$group": { "_id": "$type", "count": { "$sum": 1 } } }
]
// Intermediate docs: 5 (no unwind needed here!)
// Result: [{ "_id": "fiction", "count": 3 }, { "_id": "non-fiction", "count": 2 }]
```

**Method 2: Group + Map (if you want more details)**
```javascript
// Want book counts AND titles
pipeline = [
  {
    "$group": {
      "_id": "$type",
      "books": { "$push": "$title" }  // Collect titles
    }
  },
  {
    "$addFields": {
      "count": { "$size": "$books" }  // Count in-place
    }
  }
]
// Intermediate docs: 5 (no explosion)
// Result: [{ "_id": "fiction", "count": 3, "books": ["Book1", "Book2", "Book3"] }]
```

**Winner:** Method 2 — neither method needs `$unwind`, and Method 2 also returns the titles alongside the count

---

#### **Key Takeaway**

> **`$unwind` is a necessary evil for certain operations (filtering, joining), but it has a real performance cost. Use `$map`, `$filter`, and `$reduce` whenever you can keep arrays intact. For large datasets, avoiding `$unwind` can mean the difference between a 2-second query and a 30-second query.**

---

### 3.3 Pattern Comparison & Best Practices

**Exercise 7: Performance Analysis**

**Goal:** Understand the trade-offs between embedding and referencing.

**Tasks:**
1. **Measure Query Performance:**
   - Time a query fetching books with embedded authors
   - Time the same query using references + join
   - Compare execution times

2. **Analyze Storage:**
   - Calculate total document size for embedded pattern
   - Calculate total document size for reference pattern
   - Identify duplication in embedded approach

3. **Update Scenarios:**
   - What happens when an author's name changes?
   - Embedded: How many documents need updating?
   - Referenced: How many documents need updating?

**Discussion Questions:**
- When would embedding be preferred over referencing?
- What are the warning signs that you've chosen the wrong pattern?
- How does the access pattern (read-heavy vs write-heavy) influence the choice?

---

**Exercise 8: Hybrid Pattern (Subset Pattern)**

**Challenge:** Implement a hybrid approach where:
- `Book` stores a **subset** of author data (name only)
- Full `Author` details remain in separate collection
- On book query, you get basic author info without a join
- On author profile page, you fetch full details

**Goal:** Balance between query performance and data consistency.

**Requirements:**
```python
class Book(Document):
    title: str
    author_summary: List[Dict[str, str]]  # {"name": "Haruki Murakami"}
    author_ids: List[str]  # For fetching full details when needed
    # ... other fields
```

**Discussion:** What are the synchronization challenges? When is this worth the complexity?

In [None]:
# Exercise 7: Performance comparison
# TODO: Implement timing measurements for both patterns



In [None]:
# Exercise 8: Hybrid subset pattern implementation
# TODO: Design and implement subset pattern with synchronization strategy



## 4. Cleanup

Close the database connection and clean up resources.


In [None]:
# Cleanup: release the MongoDB connection held by the shared connector
# NOTE(review): `close` is called synchronously here; if it is a coroutine on
# this connector it needs `await` — confirm against MongoConnector.
connector.close()
print("✓ MongoDB connection closed")