In [1]:
import tqdm
from pathlib import Path

import firebase_admin
from firebase_admin import credentials, firestore

from core import Book

In [2]:
# Path to the Firebase service-account key file, relative to the notebook's
# working directory.
p_firebase_credentials = Path('../secrets/books-us-firebase-adminsdk-pkr1j-d0ea62b8bf.json')
assert p_firebase_credentials.is_file(), f'Firebase credentials not found: {p_firebase_credentials}'

cred = credentials.Certificate(p_firebase_credentials)
# initialize_app() raises ValueError when the default app already exists, which
# made this cell fail whenever it was re-run in a live kernel. Reuse the
# existing app if there is one and only initialize on the first run.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)
# Firestore client shared by the cells below.
db = firestore.client()

In [3]:
import re

class Utils:
    """Namespace for small text helpers used while importing books."""

    @staticmethod
    def html_overview_to_simple_text(html_text: str) -> str:
        """Convert the lightweight HTML of a book overview into plain text.

        Only the markup handled below is touched; any other tag is left as is.

        Args:
            html_text: Overview text that may contain ``<p>``, ``<br>`` and
                ``<b>`` tags.

        Returns:
            The text with paragraph separators turned into ``"\\n"`` and the
            ``<p>`` / ``<b>`` tags removed (the content of ``<b>`` is kept).
        """
        # A paragraph boundary written as "</p><br><p>" becomes one line break.
        # The self-closing spellings "<br/>" and "<br />" are accepted as well;
        # previously they were left in the output verbatim.
        text = re.sub(r"</p>\s*<br\s*/?>\s*<p>", "\n", html_text)

        # Remove remaining <p> and </p> tags
        text = re.sub(r"</?p>", "", text)

        # Remove <b> tags but keep their content
        text = re.sub(r"</?b>", "", text)

        return text

In [4]:
def list_books(limit: int | None = None) -> list[Book]:
    """Load the enabled books from the Firestore ``books`` collection.

    Every listed document reference is fetched individually and its English
    localization (``localization.en``) supplies the fields of the resulting
    ``Book``. Documents whose ``enabled`` field is falsy are skipped, and
    documents that cannot be converted are reported and skipped, so a single
    bad record does not abort the whole import.

    Args:
        limit: Maximum number of documents to inspect (not the number of books
            returned). ``None`` or a negative value means no limit.

    Returns:
        The successfully converted books, in the order the documents were
        listed.
    """
    LOCALIZATION_KEY = 'localization'
    EN_LOCALIZATION_KEY = 'en'

    res_books = []
    documents_gen = db.collection('books').list_documents()
    if limit is None or limit < 0:
        gen = enumerate(documents_gen)
    else:
        # range() comes first so that zip stops before pulling one more
        # document reference from the generator once the limit is reached.
        gen = zip(range(limit), documents_gen)
    for i, document_ref in tqdm.tqdm(gen):
        try:
            document = document_ref.get().to_dict()
            en_localization = document[LOCALIZATION_KEY][EN_LOCALIZATION_KEY]

            firebase_document_path = document_ref.path
            enabled = document.get('enabled')
            if not enabled:
                print(f'Skipping disabled book at {i}')
                continue
            title = en_localization.get('title')
            author = en_localization.get('author')
            raw_overview = en_localization.get('overview')
            overview = None if raw_overview is None else Utils.html_overview_to_simple_text(raw_overview)
            author_overview = en_localization.get('authorOverview')
            learning_items = en_localization.get('learningItems')

            book = Book(
                firebase_document_path=firebase_document_path,
                title=title,
                author=author,
                overview=overview,
                author_overview=author_overview,
                learning_items=learning_items,
            )

            res_books.append(book)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so that
            # KeyboardInterrupt / SystemExit can still stop this long-running
            # loop, and report the reason instead of only the index.
            print(f'Failed at {i}: {type(exc).__name__}: {exc}')

    return res_books

In [5]:
# Fetch all books (no limit). list_books() prints a line for every document it
# skips because it is disabled or fails to convert.
books = list_books()

17it [00:06,  2.74it/s]

Failed at 16


29it [00:11,  2.68it/s]

Failed at 28


33it [00:13,  2.70it/s]

Failed at 32


57it [00:23,  2.73it/s]

Failed at 56


58it [00:23,  2.30it/s]

Skipping disabled book at 57


63it [00:25,  2.68it/s]

Failed at 61


82it [00:33,  2.33it/s]

Failed at 81


89it [00:36,  2.73it/s]

Failed at 88


90it [00:36,  2.37it/s]

Failed at 89


96it [00:39,  2.31it/s]

Failed at 95


134it [00:47,  5.70it/s]

Failed at 132


138it [00:48,  5.95it/s]

Failed at 136
Failed at 137


149it [00:51,  4.38it/s]

Failed at 148


169it [00:55,  4.66it/s]

Failed at 167


191it [01:00,  5.03it/s]

Failed at 189


198it [01:01,  5.09it/s]

Failed at 196


203it [01:03,  3.49it/s]

Failed at 201


227it [01:08,  4.48it/s]

Failed at 226


238it [01:11,  5.41it/s]

Failed at 237


246it [01:13,  5.00it/s]

Skipping disabled book at 244


252it [01:14,  4.99it/s]

Failed at 251


263it [01:16,  5.50it/s]

Skipping disabled book at 261


272it [01:18,  4.93it/s]

Skipping disabled book at 270


281it [01:20,  4.42it/s]

Failed at 280


371it [01:41,  4.39it/s]

Failed at 369


389it [01:45,  4.96it/s]

Failed at 387


413it [01:51,  5.05it/s]

Failed at 412


419it [01:52,  5.66it/s]

Failed at 417


422it [01:53,  5.41it/s]

Failed at 421


427it [01:54,  5.18it/s]

Skipping disabled book at 425


449it [01:59,  4.70it/s]

Failed at 447
Failed at 448


451it [01:59,  4.66it/s]

Failed at 450


469it [02:04,  4.97it/s]

Failed at 467


474it [02:05,  4.70it/s]

Failed at 472


482it [02:07,  4.11it/s]

Skipping disabled book at 481


492it [02:09,  4.78it/s]

Failed at 490


499it [02:10,  5.65it/s]

Failed at 498


531it [02:19,  4.64it/s]

Failed at 529


534it [02:19,  5.36it/s]

Failed at 532
Failed at 533


545it [02:22,  4.42it/s]

Failed at 544


588it [02:32,  4.79it/s]

Failed at 587


592it [02:33,  4.35it/s]

Failed at 591


617it [02:39,  5.22it/s]

Skipping disabled book at 615


635it [02:43,  5.44it/s]

Failed at 634


657it [02:48,  5.18it/s]

Failed at 655


665it [02:50,  4.99it/s]

Failed at 663


729it [03:05,  5.01it/s]

Skipping disabled book at 727


731it [03:05,  5.15it/s]

Failed at 729
Failed at 730


748it [03:09,  4.39it/s]

Failed at 747


765it [03:13,  5.04it/s]

Failed at 764


776it [03:15,  4.88it/s]

Failed at 774


786it [03:17,  5.23it/s]

Failed at 785


793it [03:19,  4.77it/s]

Failed at 791


831it [03:28,  5.66it/s]

Failed at 829
Failed at 830


858it [03:35,  4.52it/s]

Skipping disabled book at 857


889it [03:42,  5.02it/s]

Skipping disabled book at 887


1004it [04:12,  3.87it/s]

Skipping disabled book at 1003


1016it [04:15,  4.65it/s]

Failed at 1015


1048it [04:22,  4.99it/s]

Failed at 1047


1072it [04:28,  4.66it/s]

Failed at 1071


1078it [04:29,  4.45it/s]

Failed at 1076


1108it [04:37,  4.23it/s]

Failed at 1107


1110it [04:37,  4.12it/s]

Skipping disabled book at 1109


1127it [04:41,  4.12it/s]

Failed at 1126


1136it [04:44,  4.59it/s]

Failed at 1134


1154it [04:48,  4.56it/s]

Failed at 1152


1184it [04:55,  4.35it/s]

Failed at 1183


1197it [04:58,  4.59it/s]

Failed at 1195


1208it [05:01,  4.49it/s]

Failed at 1207


1214it [05:02,  5.06it/s]

Failed at 1212


1223it [05:04,  4.89it/s]

Failed at 1221


1237it [05:07,  4.53it/s]

Failed at 1236


1261it [05:13,  3.97it/s]

Failed at 1260


1307it [05:24,  3.93it/s]

Skipping disabled book at 1306


1329it [05:29,  5.14it/s]

Failed at 1328


1330it [05:29,  4.71it/s]

Skipping disabled book at 1329


1342it [05:31,  5.65it/s]

Failed at 1341


1350it [05:34,  4.22it/s]

Failed at 1349


1372it [05:38,  5.80it/s]

Failed at 1370


1392it [05:43,  4.47it/s]

Failed at 1391


1400it [05:45,  5.06it/s]

Failed at 1399


1406it [05:47,  3.92it/s]

Failed at 1405


1417it [05:49,  4.67it/s]

Failed at 1415


1434it [05:53,  5.05it/s]

Failed at 1433


1449it [05:57,  4.54it/s]

Failed at 1447
Skipping disabled book at 1448


1452it [05:58,  4.29it/s]

Skipping disabled book at 1451


1482it [06:04,  4.58it/s]

Skipping disabled book at 1481


1488it [06:06,  5.23it/s]

Failed at 1487


1512it [06:12,  4.52it/s]

Failed at 1511


1628it [06:40,  4.54it/s]

Skipping disabled book at 1626
Failed at 1627


1633it [06:41,  4.63it/s]

Skipping disabled book at 1632


1713it [07:01,  4.85it/s]

Failed at 1711


1776it [07:15,  5.13it/s]

Skipping disabled book at 1774
Failed at 1775


1822it [07:27,  4.30it/s]

Failed at 1821


1867it [07:37,  4.19it/s]

Failed at 1866


1871it [07:38,  4.51it/s]

Failed at 1870


1884it [07:42,  4.23it/s]

Failed at 1883


1907it [07:47,  4.08it/s]


In [6]:
# Number of books actually loaded; disabled and failed documents are not included.
print('Total books:', len(books))

Total books: 1802


In [7]:
def set_book_categories(books: list[Book]) -> None:
    """Fill ``categories`` of the given books from the ``common/categories`` document.

    The English title of each category is appended to every book listed under
    that category's ``booksIds``. Categories without an English localization
    are ignored. Book ids that are not among ``books`` are reported with a
    printed message.

    Args:
        books: Books to update. Their ``categories`` lists are cleared and
            refilled in place.

    Raises:
        ValueError: If the ``common/categories`` document does not exist.
    """
    LOCALIZATION_KEY = 'localization'
    EN_LOCALIZATION_KEY = 'en'
    BOOK_IDS_KEY = 'booksIds'

    # Read categories first: if the read fails, the books keep the categories
    # they already have instead of being left empty.
    categories_document_ref = db.collection('common').document('categories')
    categories_document = categories_document_ref.get().to_dict()
    if categories_document is None:
        raise ValueError('Firestore document "common/categories" does not exist.')

    id2book = {
        b.book_id: b
        for b in books
    }
    # Clear existing categories so that calling this function again does not
    # append the same titles a second time.
    for b in books:
        b.categories.clear()

    for category_data in categories_document['categories'].values():
        localization = category_data.get(LOCALIZATION_KEY)
        if not localization:
            continue
        en_localization = localization.get(EN_LOCALIZATION_KEY)
        if not en_localization:
            continue
        category_title = en_localization['title']
        # A category without a book list contributes nothing; previously a
        # missing key raised KeyError and left the books partially populated.
        book_ids = category_data.get(BOOK_IDS_KEY) or []

        # Add this category to the respective books
        for book_id in book_ids:
            if book_id not in id2book:
                print(f'Category "{category_title}" refers to {book_id} which is not present in books collection.')
            else:
                id2book[book_id].categories.append(category_title)
    

In [8]:
# Attach category titles to the loaded books (updates the Book objects in place).
# Ids referenced by a category but missing from `books` are printed below.
set_book_categories(books)

Category "Home & Environment" refers to db317451-6a67-44b1-ab71-cf2ac414c2e9 which is not present in books collection.
Category "Happiness" refers to bace612a-a60c-478a-951d-21a23b0a50d8 which is not present in books collection.
Category "Happiness" refers to 337064e9-dbbe-4934-977c-1dfc8571ffd6 which is not present in books collection.
Category "Happiness" refers to 363b0c49-c2d8-4e90-82fb-a28299a4c06e which is not present in books collection.
Category "Happiness" refers to 0418eb4c-00b0-4700-838c-7e8da582e6db which is not present in books collection.
Category "Happiness" refers to 76bf1ef1-49a2-4e72-af22-f8d1a5274c31 which is not present in books collection.
Category "Happiness" refers to 7776082a-698d-430e-a217-a7e0f7eb7996 which is not present in books collection.
Category "Happiness" refers to 87130977-298d-4d4c-a39e-1f51ab4940b0 which is not present in books collection.
Category "Happiness" refers to 5a5bda74-ab30-4382-a383-24a4cb2e22d3 which is not present in books collection.
C

## Export books

In [10]:
import pandas as pd

In [11]:
# One row per book, built from each Book's dictionary form.
book_records = [book.to_dict() for book in books]
df_books = pd.DataFrame(book_records)

# Flatten the list-valued columns into single '|'-separated strings.
for list_column in ('categories', 'learning_items'):
    df_books[list_column] = df_books[list_column].apply('|'.join)

In [12]:
# Preview the first rows of the export table.
df_books.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,Family,Why your teenager is glued to their screens|Ho...
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,Which four core virtues pave the way to a succ...
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,Health,The power of positive dietary choices|How move...
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,The two kinds of intelligence |How to maximize...
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,Ways to pick one ambition to focus on|Strategi...


In [13]:
p_csv_dest = Path('../data/books.csv')
# to_csv() does not create missing directories, so make sure the target folder
# exists before writing.
p_csv_dest.parent.mkdir(parents=True, exist_ok=True)
df_books.to_csv(p_csv_dest)

# Read the file back to confirm that it parses; column 0 is the index that
# to_csv() wrote.
pd.read_csv(p_csv_dest, index_col=0).head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,Family,Why your teenager is glued to their screens|Ho...
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,Which four core virtues pave the way to a succ...
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,Health,The power of positive dietary choices|How move...
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,The two kinds of intelligence |How to maximize...
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,Ways to pick one ambition to focus on|Strategi...
