In [1]:
import os

# Move the working directory two levels up before importing `django_init`.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel moves the working directory up two more levels — run it once per kernel.
# NOTE(review): `django_init` presumably configures Django on import — confirm.
os.chdir("../..")
import django_init


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# For better printing

from rich import print
from rich.console import Console
from rich.markdown import Markdown
from rich.table import Table

console = Console()
from utils.sql import use_indexes


In [4]:
from utils.perf_display import format_duration, perf_counter
from rest_framework.test import APIClient

client = APIClient()


In [5]:
from books.models import Library, Review, Book
from django.db.models import Count

# Library with the highest id.
public = Library.objects.order_by("-id").first()
public_id = public.id

# Number of books per library, one row per `library_id`.
books_per_library = Book.objects.values("library_id").annotate(
    book_count=Count("id")
)

# Keep the library holding the most books, together with its book count.
alexandria_id, book_count = (
    books_per_library.order_by("-book_count")
    .values_list("library_id", "book_count")
    .first()
)

alexandria = Library.objects.get(id=alexandria_id)

---

### 3rd example: annotated query

Let's continue with another book endpoint:

This time we want to retrieve a paginated list of books, containing some additional information: the list of related tags, the count of reviews, and the author.
A minimal implementation is available in [this file](../../../../books/views/book/annotate_books_aggregate.py).


In [6]:
# Time a single request to the aggregate endpoint of the library with the most
# books, ordered by release date then id.
# NOTE(review): `time_sql=True` presumably also reports time spent in SQL — confirm.
with perf_counter(time_sql=True):
    client.get(f"/books/{alexandria_id}/aggregate?ordering=release_date,id")

In [7]:
from django.db.models import QuerySet
from django.contrib.postgres.aggregates import ArrayAgg
from books.views.book.list_books_aggregate import serialize_books


def get_queryset(library_id, page_size=20, orderings=("release_date", "id")):
    """
    Return the first page of books of a library, with the author joined in.

    The books of ``library_id`` are sorted by ``orderings`` and the queryset
    is sliced to at most ``page_size`` rows.
    """
    library_books = Book.objects.filter(library_id=library_id)
    sorted_books = library_books.order_by(*orderings)
    books_with_author = sorted_books.select_related("author")
    return books_with_author[:page_size]


def annotate_books_aggregate(book_qs: QuerySet[Book]):
    """
    Annotate every book of ``book_qs`` with two aggregates.

    ``review_count`` is a ``Count`` over ``reviews`` and ``tag_names`` is an
    ``ArrayAgg`` over ``tags__name``; neither aggregate is made distinct here.
    """
    aggregates = {
        "review_count": Count("reviews"),
        "tag_names": ArrayAgg("tags__name"),
    }
    return book_qs.annotate(**aggregates)

In [8]:
# Build the first page with the aggregate annotations and serialize it inside
# the timing context (the SQL is printed as well).
with perf_counter(time_sql=True, print_sql=True):
    qs = annotate_books_aggregate(get_queryset(alexandria_id))
    result = serialize_books(qs)
# Show one serialized book to inspect the annotated values.
print(result[0])


Do not forget about duplicates when doing multiple aggregations!

Luckily, both `Count` and `ArrayAgg` accept a `distinct` argument:


In [9]:
def annotate_books_aggregate(book_qs: QuerySet[Book]):
    """
    Annotate every book of ``book_qs`` with de-duplicated aggregates.

    ``review_count`` counts distinct ``reviews`` and ``tag_names`` collects
    distinct ``tags__name`` values, so joining both relations at once does not
    inflate either result.
    """
    aggregates = {
        "review_count": Count("reviews", distinct=True),
        "tag_names": ArrayAgg("tags__name", distinct=True),
    }
    return book_qs.annotate(**aggregates)

In [10]:
# Same measurement as before, now with the `distinct=True` aggregates.
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset(alexandria_id)
    qs = annotate_books_aggregate(qs)
    result = serialize_books(qs)
# Show one serialized book to inspect the annotated values.
print(result[0])


Now that we have a baseline for our endpoint, let's try to improve it.


In [11]:
# Ask the database for the execution plan of the aggregate query
# (default ordering: release_date, id).
qs = get_queryset(alexandria_id)
qs = annotate_books_aggregate(qs)
# `analyze=True` makes the database actually run the query and report real timings.
result = qs.explain(analyze=True)

print(result)

Right, this plan is starting to get really complex. Several tools are available online to help decipher PostgreSQL plans, such as [this one](https://explain.dalibo.com/plan/g5h9cad9a38869gc).

We can now see, as expected, that the slow operations are the scan on `books_review` and `books_booktag`.

Even though the planner uses an index, it retrieves almost every row on both those tables, before merging them.

The aggregation is applied only at the end, just before sorting by `release_date`.


In [12]:
# Same aggregate query, but ordered by `id` only instead of (release_date, id).
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset(alexandria_id, orderings=("id",))
    qs = annotate_books_aggregate(qs)
    result = serialize_books(qs)

# Show one serialized book to inspect the annotated values.
print(result[0])


In [13]:
# Execution plan of the aggregate query when ordering by `id` only.
qs = get_queryset(alexandria_id, orderings=("id",))
qs = annotate_books_aggregate(qs)
# `analyze=True` makes the database actually run the query and report real timings.
result = qs.explain(analyze=True)

print(result)

By comparison, we can look at [the plan](https://explain.dalibo.com/plan/31ae7aa178g5f1da) when ordering by `id`. It's completely different.
One key difference is the sort, which is now incremental:

> Compared to regular sorts, sorting incrementally allows returning tuples before the entire result set has been sorted, which particularly enables optimizations with LIMIT queries. It may also reduce memory usage and the likelihood of spilling sorts to disk, but it comes at the cost of the increased overhead of splitting the result set into multiple sorting batches.

<br>
<br>


---

While we could try to find clever indexes to improve our queries, we'll try to find other solutions to reach the same result.


<br>
<br>
Since the aggregates are expensive, and are only used for the output (not to filter book rows), we can try to split the logic into two parts:


In [14]:
def get_book_ids(library_id, page_size=20, orderings=("release_date", "id")):
    """
    Return the ids of the first ``page_size`` books of ``library_id``.

    The books are sorted by ``orderings`` before the page is cut, and the ids
    are returned as a plain list in that order.
    """
    first_page = Book.objects.filter(library_id=library_id).order_by(*orderings)[
        :page_size
    ]
    id_rows = first_page.values_list("id", flat=True)
    return list(id_rows)


def get_queryset_from_ids(book_ids, orderings=("release_date", "id")):
    """
    Return the books whose id is in ``book_ids``, with the author joined in,
    sorted by ``orderings``.
    """
    selected_books = Book.objects.filter(id__in=book_ids)
    return selected_books.select_related("author").order_by(*orderings)


def get_queryset_using_ids(library_id, page_size=20, orderings=("release_date", "id")):
    """
    Two-step page lookup: first fetch the ids of the page, then return a
    queryset restricted to those ids (same ``orderings`` in both steps).
    """
    return get_queryset_from_ids(
        get_book_ids(library_id, page_size=page_size, orderings=orderings),
        orderings=orderings,
    )

In [15]:
# Aggregate annotations applied to a queryset that was pre-filtered on the
# ids of the page, instead of on the whole library.
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset_using_ids(alexandria_id)
    qs = annotate_books_aggregate(qs)

    result = serialize_books(qs)
# Show one serialized book to inspect the annotated values.
print(result[0])

In [16]:
# Execution plan of the aggregate query once it is restricted to the page ids.
qs = get_queryset_using_ids(alexandria_id)
qs = annotate_books_aggregate(qs)
# `analyze=True` makes the database actually run the query and report real timings.
result = qs.explain(analyze=True)

print(result)

Another try: instead of using aggregation (with a `GROUP BY` clause), we can use subqueries.


In [17]:
from django.contrib.postgres.expressions import ArraySubquery
from django.db.models import Count, OuterRef, QuerySet
from django.db.models.functions import Coalesce

from books.models import Book, BookTag, Review


def annotate_books_subquery(book_qs: QuerySet[Book]):
    """
    Annotate every book of ``book_qs`` using correlated subqueries.

    ``review_count`` is the per-book count of reviews (0 when the subquery
    yields no row) and ``tag_names`` is an array built from the names of the
    book's ``BookTag`` rows.
    """
    # One row per book: the number of reviews pointing at the outer book.
    reviews_per_book = (
        Review.objects.filter(book_id=OuterRef("id"))
        .values("book_id")
        .annotate(count=Count("id", distinct=True))
        .values("count")
    )
    # Names of the tags attached to the outer book.
    tag_names_of_book = BookTag.objects.filter(book_id=OuterRef("id")).values("name")

    return book_qs.annotate(
        review_count=Coalesce(reviews_per_book, 0),
        tag_names=ArraySubquery(tag_names_of_book),
    )


In [18]:
# Time the subquery-based annotations on the id-filtered page.
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset_using_ids(alexandria_id)
    qs = annotate_books_subquery(qs)

    result = serialize_books(qs)

# Show one serialized book to inspect the annotated values.
print(result[0])

In [19]:
# Execution plan of the subquery-based version.
qs = get_queryset_using_ids(alexandria_id)
qs = annotate_books_subquery(qs)
# `analyze=True` makes the database actually run the query and report real timings.
result = qs.explain(analyze=True)

print(result)

This method is already really fast.
One thing to note though: the `loops=20` on the `books_review` index.

If the query becomes more complex, we'll have to execute this part once per item in the page.

Let's try to avoid this loop by using multiple queries and joining their results in Python.


In [20]:
def annotate_books_python(books):
    """
    Annotate books with ``review_count`` and ``tag_names`` by joining in Python.

    Two extra queries are issued (one grouped count of reviews, one grouped
    array of tag names), both restricted to the ids of ``books``; the results
    are then attached to each book as attributes.

    Args:
        books: iterable of ``Book`` instances (a list or a queryset).

    Returns:
        A list of the given books, each with ``review_count`` (0 when the book
        has no review) and ``tag_names`` (``[]`` when it has no tag) set.
    """
    # Materialise the argument itself, so the function never depends on a
    # notebook-global queryset and the annotated objects are the ones returned.
    books = list(books)
    if not books:
        # Nothing to annotate: skip the two lookups entirely.
        return books

    book_ids = [book.id for book in books]

    review_count_by_book_id = dict(
        Review.objects.filter(book_id__in=book_ids)
        .values("book_id")
        .annotate(count=Count("id"))
        .values_list("book_id", "count")
    )
    tag_names_by_book_id = dict(
        BookTag.objects.filter(book_id__in=book_ids)
        .values("book_id")
        .annotate(tags=ArrayAgg("name"))
        .values_list("book_id", "tags")
    )

    for book in books:
        # Books without reviews/tags have no row in the grouped results.
        book.review_count = review_count_by_book_id.get(book.id, 0)
        book.tag_names = tag_names_by_book_id.get(book.id, [])
    return books

In [21]:
# Time the Python-side join: the page is evaluated to a list first, then
# annotated with the results of the two extra queries.
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset_using_ids(alexandria_id, page_size=20)
    qs = annotate_books_python(list(qs))

    result = serialize_books(qs)
# Show one serialized book to inspect the annotated values.
print(result[0])

That's a bit faster.

---

We can try to reduce the number of queries by using Common Table Expressions (CTE), i.e. queries having `WITH` clauses.


In [22]:
# Single query computing both annotations for a set of book ids:
# - `review_count`: number of reviews per book,
# - `tag_names`: distinct tag names per book.
# The two CTEs are combined with a FULL OUTER JOIN, so a book present in only
# one of them still yields a row; the missing side is NULL and is therefore
# replaced by a default (0 reviews / empty array) in the final SELECT.
# `%(book_ids)s` is bound twice and expects a non-empty tuple of ids.
raw_query = """
WITH review_count AS (
    SELECT
        books_review.book_id as id,
        count(books_review.id) as review_count
    FROM
        books_review
    WHERE
        books_review.book_id IN %(book_ids)s
    GROUP BY
        book_id
),
tag_names AS (
    SELECT
        books_booktag.book_id as id,
        array_agg(DISTINCT books_booktag.name) as tags_list
    FROM
        books_booktag
    WHERE
        books_booktag.book_id IN %(book_ids)s
    GROUP BY
        book_id
)
SELECT
    id,
    COALESCE(review_count.review_count, 0) as review_count,
    COALESCE(tag_names.tags_list, '{}') as tags_list
FROM
    review_count FULL OUTER JOIN tag_names USING (id);

"""


def annotate_books_cte(books):
    """
    Annotate books with ``review_count`` and ``tag_names`` using one raw query.

    ``raw_query`` is executed once with the ids of ``books``; each returned row
    carries the review count and the tag list of one book, which are then set
    as attributes on the matching book.

    Args:
        books: iterable of ``Book`` instances that can be iterated twice
            (a list or a queryset).

    Returns:
        The ``books`` object that was passed in, its items annotated in place.
        Books absent from the query result (neither review nor tag) get
        ``review_count = 0`` and ``tag_names = []``.
    """
    book_ids = tuple(book.id for book in books)
    if not book_ids:
        # An empty tuple would render as `IN ()`, which is not valid SQL,
        # and there is nothing to annotate anyway.
        return books

    annotations_by_book_id = {
        raw_book.id: {
            # NULL (None) when the book has tags but no review.
            "review_count": raw_book.review_count or 0,
            "tag_names": raw_book.tags_list,
        }
        for raw_book in Book.objects.raw(raw_query, params={"book_ids": book_ids})
    }
    for book in books:
        annotations = annotations_by_book_id.get(book.id)
        if annotations is None:
            # Book with neither review nor tag: no row came back for it.
            # A fresh dict/list per book avoids sharing a mutable default.
            annotations = {"review_count": 0, "tag_names": []}
        for name, value in annotations.items():
            setattr(book, name, value)
    return books


In [23]:
# Time the CTE-based version: the page is evaluated to a list first, then
# annotated from the single raw query.
with perf_counter(time_sql=True, print_sql=True):
    qs = get_queryset_using_ids(alexandria_id, page_size=20)
    qs = annotate_books_cte(list(qs))

    result = serialize_books(qs)
# Show one serialized book to inspect the annotated values.
print(result[0])

In [29]:
from rich.table import Table
import time
from utils.perf_display import format_duration

# Result table for the comparison: one row per annotate strategy,
# with the strategy name and its measured duration.
table = Table(show_lines=True)
table.add_column("name", style="bold green")
table.add_column("duration")


def add_row(annotate_func):
    """
    Time one annotate strategy on a 100-book page and add the result to ``table``.

    The measured span covers building the id-filtered queryset, annotating it
    with ``annotate_func`` and serializing the books; the row is labelled with
    the function's ``__name__``.
    """
    started_at = time.perf_counter()
    page = get_queryset_using_ids(alexandria_id, page_size=100)
    serialize_books(annotate_func(page))
    elapsed = time.perf_counter() - started_at
    table.add_row(annotate_func.__name__, format_duration(elapsed))


# Strategies to compare, in the order they were introduced above.
benchmarked_funcs = (
    annotate_books_aggregate,
    annotate_books_subquery,
    annotate_books_python,
    annotate_books_cte,
)
for annotate_func in benchmarked_funcs:
    add_row(annotate_func)

console.print(table)

We could even merge this query with the one retrieving all the `Book` and `Person` fields.

This would require writing more raw SQL code, though, which is not always a good idea.
