In [1]:
import tqdm
from pathlib import Path

import firebase_admin
from firebase_admin import credentials, firestore

from core import Book

In [2]:
# Service-account key used to authenticate against the Firebase project.
p_firebase_credentials = Path('../secrets/books-us-firebase-adminsdk-pkr1j-d0ea62b8bf.json')
assert p_firebase_credentials.is_file(), f'Firebase credentials not found: {p_firebase_credentials}'

cred = credentials.Certificate(p_firebase_credentials)
# initialize_app() raises ValueError if the default app already exists, which
# happens whenever this cell is re-run in a live kernel. Only initialize when
# get_app() reports that no default app has been created yet.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)
db = firestore.client()

In [3]:
import re

class Utils:
    """Helper functions for cleaning up book text fields."""

    @staticmethod
    def html_overview_to_simple_text(html_text: str) -> str:
        """Convert the lightweight HTML of a book overview into plain text.

        Paragraph boundaries and ``<br>`` tags become line breaks; ``<p>`` and
        ``<b>`` tags are dropped while their content is kept. Tag matching is
        case-insensitive and accepts the ``<br>``, ``<br/>`` and ``<br />``
        spellings. Any other markup is left untouched.

        Args:
            html_text: Overview text containing ``<p>``, ``<br>`` and ``<b>`` tags.

        Returns:
            The text with those tags removed or replaced by ``"\\n"``.
        """
        # A paragraph boundary becomes exactly one line break, regardless of how
        # many <br> tags (possibly none) sit between the two paragraphs.
        text = re.sub(r"</p>\s*(?:<br\s*/?>\s*)*<p>", "\n", html_text, flags=re.IGNORECASE)

        # Any <br> that is not between two paragraphs is a line break as well.
        text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)

        # Remove the remaining <p> and </p> tags.
        text = re.sub(r"</?p>", "", text, flags=re.IGNORECASE)

        # Remove <b> tags but keep their content.
        text = re.sub(r"</?b>", "", text, flags=re.IGNORECASE)

        return text

In [18]:
def list_books(limit: int | None = None) -> list[Book]:
    """Load books from the Firestore ``books`` collection.

    Every document reference is fetched individually and the English
    localization (``localization.en``) of the document is converted into a
    ``Book``. Documents that cannot be converted are skipped and reported on
    stdout with their index, path and the error that occurred.

    Args:
        limit: Maximum number of document references to visit. ``None`` or a
            negative value visits the whole collection.

    Returns:
        The successfully converted books, in the order their references were
        listed.
    """
    LOCALIZATION_KEY = 'localization'
    EN_LOCALIZATION_KEY = 'en'

    res_books = []
    documents_gen = db.collection('books').list_documents()
    if limit is None or limit < 0:
        gen = enumerate(documents_gen)
    else:
        # zip() stops after `limit` references without consuming the rest.
        gen = zip(range(limit), documents_gen)

    for i, document_ref in tqdm.tqdm(gen):
        try:
            document = document_ref.get().to_dict()
            if document is None:
                # Per the Firestore client docs, list_documents() may yield
                # references to "missing" documents, for which to_dict() is None.
                raise ValueError('document does not exist')
            en_localization = document[LOCALIZATION_KEY][EN_LOCALIZATION_KEY]

            raw_overview = en_localization.get('overview')
            overview = None if raw_overview is None else Utils.html_overview_to_simple_text(raw_overview)

            book = Book(
                firebase_document_path=document_ref.path,
                title=en_localization.get('title'),
                author=en_localization.get('author'),
                overview=overview,
                author_overview=en_localization.get('authorOverview'),
                learning_items=en_localization.get('learningItems'),
            )

            res_books.append(book)
        except Exception as exc:
            # Best effort: keep going, but say which document failed and why.
            # `Exception` (not a bare except) lets Ctrl-C interrupt the long run.
            print(f'Failed at {i} ({document_ref.path}): {exc!r}')

    return res_books

In [26]:
# Load every book from Firestore (no limit). Documents that cannot be converted
# are skipped and reported in the output below.
books = list_books()

17it [00:06,  2.80it/s]

Failed at 16


30it [00:10,  4.39it/s]

Failed at 28


34it [00:11,  4.89it/s]

Failed at 32


58it [00:16,  5.49it/s]

Failed at 56


62it [00:16,  5.19it/s]

Failed at 61


82it [00:20,  5.29it/s]

Failed at 81


90it [00:22,  5.24it/s]

Failed at 88
Failed at 89


96it [00:23,  4.98it/s]

Failed at 95


134it [00:32,  5.80it/s]

Failed at 132


138it [00:33,  5.37it/s]

Failed at 136
Failed at 137


149it [00:35,  4.47it/s]

Failed at 148


168it [00:39,  4.35it/s]

Failed at 167


190it [00:44,  5.32it/s]

Failed at 189


198it [00:46,  5.65it/s]

Failed at 196


202it [00:48,  2.72it/s]

Failed at 201


227it [00:53,  5.05it/s]

Failed at 226


239it [00:56,  4.78it/s]

Failed at 237


252it [00:59,  4.05it/s]

Failed at 251


281it [01:05,  5.10it/s]

Failed at 280


370it [01:26,  4.55it/s]

Failed at 369


389it [01:29,  5.28it/s]

Failed at 387


413it [01:35,  5.45it/s]

Failed at 412


419it [01:36,  5.67it/s]

Failed at 417


422it [01:37,  5.29it/s]

Failed at 421


449it [01:42,  5.78it/s]

Failed at 447
Failed at 448


452it [01:43,  5.03it/s]

Failed at 450


468it [01:47,  5.34it/s]

Failed at 467


474it [01:48,  5.39it/s]

Failed at 472


492it [01:52,  5.71it/s]

Failed at 490


500it [01:53,  5.47it/s]

Failed at 498


530it [02:00,  4.17it/s]

Failed at 529


534it [02:01,  5.18it/s]

Failed at 532
Failed at 533


545it [02:04,  4.46it/s]

Failed at 544


589it [02:14,  4.86it/s]

Failed at 587


593it [02:15,  4.81it/s]

Failed at 591


635it [02:25,  4.84it/s]

Failed at 634


657it [02:31,  5.40it/s]

Failed at 655


665it [02:32,  5.39it/s]

Failed at 663


730it [02:47,  6.13it/s]

Failed at 728
Failed at 729


747it [02:51,  5.65it/s]

Failed at 746


764it [02:54,  5.58it/s]

Failed at 763


774it [02:56,  4.82it/s]

Failed at 773


785it [02:59,  5.28it/s]

Failed at 784


792it [03:01,  4.89it/s]

Failed at 790


830it [03:10,  5.37it/s]

Failed at 828
Failed at 829


1015it [03:54,  4.84it/s]

Failed at 1014


1047it [04:01,  4.86it/s]

Failed at 1046


1071it [04:07,  4.34it/s]

Failed at 1070


1077it [04:08,  5.34it/s]

Failed at 1075


1107it [04:15,  4.57it/s]

Failed at 1106


1127it [04:20,  4.64it/s]

Failed at 1125


1135it [04:22,  5.00it/s]

Failed at 1133


1153it [04:25,  5.21it/s]

Failed at 1151


1183it [04:32,  4.41it/s]

Failed at 1182


1195it [04:35,  5.16it/s]

Failed at 1194


1208it [04:38,  4.68it/s]

Failed at 1206


1213it [04:39,  5.13it/s]

Failed at 1211


1222it [04:41,  4.78it/s]

Failed at 1220


1236it [04:44,  4.62it/s]

Failed at 1235


1260it [04:49,  4.30it/s]

Failed at 1259


1329it [05:04,  4.76it/s]

Failed at 1327


1341it [05:07,  4.47it/s]

Failed at 1340


1349it [05:09,  4.98it/s]

Failed at 1348


1370it [05:13,  5.68it/s]

Failed at 1369


1391it [05:18,  5.28it/s]

Failed at 1390


1399it [05:19,  5.67it/s]

Failed at 1398


1405it [05:21,  4.89it/s]

Failed at 1404


1416it [05:24,  4.17it/s]

Failed at 1414


1433it [05:28,  5.09it/s]

Failed at 1432


1448it [05:32,  4.98it/s]

Failed at 1446


1487it [05:40,  4.80it/s]

Failed at 1486


1512it [05:46,  5.02it/s]

Failed at 1510


1628it [06:14,  4.81it/s]

Failed at 1626


1712it [06:34,  4.55it/s]

Failed at 1710


1775it [06:49,  4.92it/s]

Failed at 1774


1821it [06:59,  5.21it/s]

Failed at 1820


1866it [07:09,  4.85it/s]

Failed at 1865


1870it [07:10,  4.94it/s]

Failed at 1869


1883it [07:13,  4.83it/s]

Failed at 1882


1906it [07:19,  4.34it/s]


In [34]:
def set_book_categories(books: list[Book]) -> None:
    """Assign category titles to the given books, in place.

    The existing ``categories`` of every book are cleared first. The
    ``common/categories`` Firestore document is then read, and the English
    title of each category is appended to every book listed under that
    category's ``booksIds``. Categories without an English localization are
    skipped. Book ids that are not among ``books`` are reported on stdout.

    Args:
        books: Books to update; each is looked up by its ``book_id``.
    """
    LOCALIZATION_KEY = 'localization'
    EN_LOCALIZATION_KEY = 'en'
    BOOK_IDS_KEY = 'booksIds'

    # Index the books by id for constant-time lookup.
    books_by_id = {}
    for book in books:
        books_by_id[book.book_id] = book

    # Start from a clean slate so repeated calls do not accumulate duplicates.
    for book in books:
        book.categories.clear()

    # Read the categories.
    categories_snapshot = db.collection('common').document('categories').get()
    all_categories = categories_snapshot.to_dict()['categories']

    for category_data in all_categories.values():
        localization = category_data.get(LOCALIZATION_KEY)
        en_localization = localization.get(EN_LOCALIZATION_KEY) if localization else None
        if not en_localization:
            continue

        category_title = en_localization['title']

        # Attach this category to every book it references.
        for book_id in category_data[BOOK_IDS_KEY]:
            if book_id in books_by_id:
                books_by_id[book_id].categories.append(category_title)
            else:
                print(f'Category "{category_title}" refers to {book_id} which is not present in books collection.')
    

In [35]:
# Fill in each book's categories in place. Category entries that refer to a book
# id not present in `books` are printed below.
set_book_categories(books)

Category "Home & Environment" refers to db317451-6a67-44b1-ab71-cf2ac414c2e9 which is not present in books collection.
Category "Happiness" refers to bace612a-a60c-478a-951d-21a23b0a50d8 which is not present in books collection.
Category "Happiness" refers to 337064e9-dbbe-4934-977c-1dfc8571ffd6 which is not present in books collection.
Category "Happiness" refers to 363b0c49-c2d8-4e90-82fb-a28299a4c06e which is not present in books collection.
Category "Happiness" refers to 0418eb4c-00b0-4700-838c-7e8da582e6db which is not present in books collection.
Category "Happiness" refers to 76bf1ef1-49a2-4e72-af22-f8d1a5274c31 which is not present in books collection.
Category "Happiness" refers to 7776082a-698d-430e-a217-a7e0f7eb7996 which is not present in books collection.
Category "Happiness" refers to 87130977-298d-4d4c-a39e-1f51ab4940b0 which is not present in books collection.
Category "Happiness" refers to 5a5bda74-ab30-4382-a383-24a4cb2e22d3 which is not present in books collection.
C

In [36]:
# Preview only the first few books; printing all of them floods the notebook
# output with one long repr per book.
N_PREVIEW_BOOKS = 5
for book in books[:N_PREVIEW_BOOKS]:
    print(book)

Book(id="0059543c-a8e5-4615-a78c-69f8e73c19c1", title="Parenting the New Teen in the Age of Anxiety: A Complete Guide to Your Child's Stressed, Depressed, Expanded, Amazing Adolescence", author="John Duffy", overview="What’s inside\nYou're looking at a modern parentin...", author_overview="John Duffy is a successful clinical psychologist w...", categories=['Family'], learning_items=['Why your teenager is glued to their screens', 'How self-care equals childcare', "The secret to those beloved aunts' and uncles' popularity", "What's inside your child's mind"])
Book(id="00bb4531-5150-44af-9eb1-c160c7d26ed6", title="Right Thing, Right Now: Good Values, Good Character, Good Deeds", author="Ryan Holiday", overview="What’s inside\nThe modern world has many fascinati...", author_overview="Ryan Holiday is known for bringing philosophy into...", categories=['Self-Growth'], learning_items=['Which four core virtues pave the way to a successful life', 'How to forge stronger relationships', 'Why life

## Export books

In [37]:
import pandas as pd

In [38]:
# One row per book. The list-valued columns are flattened into single
# '|'-separated strings so that they fit into a flat table.
df_books = pd.DataFrame([b.to_dict() for b in books]).assign(
    categories=lambda df: df['categories'].map('|'.join),
    learning_items=lambda df: df['learning_items'].map('|'.join),
)

In [39]:
# Preview the first rows of the export table.
df_books.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,Family,Why your teenager is glued to their screens|Ho...
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,Which four core virtues pave the way to a succ...
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,Health,The power of positive dietary choices|How move...
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,The two kinds of intelligence |How to maximize...
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,Ways to pick one ambition to focus on|Strategi...


In [40]:
p_csv_dest = Path('../data/books.csv')
# Create the target directory on first use so the export does not fail on a fresh checkout.
p_csv_dest.parent.mkdir(parents=True, exist_ok=True)
df_books.to_csv(p_csv_dest)

# Read the file back to make sure it is valid: the round trip must preserve
# the number of rows and columns.
df_books_from_csv = pd.read_csv(p_csv_dest, index_col=0)
assert df_books_from_csv.shape == df_books.shape, (
    f'CSV round trip changed shape: {df_books.shape} -> {df_books_from_csv.shape}'
)
df_books_from_csv.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,Family,Why your teenager is glued to their screens|Ho...
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,Which four core virtues pave the way to a succ...
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,Health,The power of positive dietary choices|How move...
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,The two kinds of intelligence |How to maximize...
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,Ways to pick one ambition to focus on|Strategi...
