In [1]:
import dotenv
from pathlib import Path

import pandas as pd

from core import Book

In [2]:
dotenv.load_dotenv('../secrets/.env')

True

In [3]:
p_books = Path('../data/books.csv')
df_books = pd.read_csv(p_books, index_col=0)
df_books['categories'] = df_books['categories'].apply(lambda s: s.split('|') if not pd.isna(s) else [])
df_books['learning_items'] = df_books['learning_items'].apply(lambda s: s.split('|') if not pd.isna(s) else [])

In [4]:
df_books.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,[Family],"[Why your teenager is glued to their screens, ..."
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,[Self-Growth],[Which four core virtues pave the way to a suc...
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,[Health],"[The power of positive dietary choices, How mo..."
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,"[Happiness, Self-Growth]","[The two kinds of intelligence , How to maximi..."
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",[Self-Growth],"[Ways to pick one ambition to focus on, Strate..."


In [5]:
books = []
for i, row in df_books.iterrows():
    books.append(Book(
        firebase_document_path=row.firebase_document_path,
        title=row.title,
        author=row.author,
        overview=row.overview,
        author_overview=row.author_overview,
        categories=row.categories,
        learning_items=row.learning_items
    ))

In [6]:
print('\n'.join([str(b) for b in books[:10]]))

Book(id="0059543c-a8e5-4615-a78c-69f8e73c19c1", title="Parenting the New Teen in the Age of Anxiety: A Complete Guide to Your Child's Stressed, Depressed, Expanded, Amazing Adolescence", author="John Duffy", overview="What’s inside\nYou're looking at a modern parentin...", author_overview="John Duffy is a successful clinical psychologist w...", categories=['Family'], learning_items=['Why your teenager is glued to their screens', 'How self-care equals childcare', "The secret to those beloved aunts' and uncles' popularity", "What's inside your child's mind"])
Book(id="00bb4531-5150-44af-9eb1-c160c7d26ed6", title="Right Thing, Right Now: Good Values, Good Character, Good Deeds", author="Ryan Holiday", overview="What’s inside\nThe modern world has many fascinati...", author_overview="Ryan Holiday is known for bringing philosophy into...", categories=['Self-Growth'], learning_items=['Which four core virtues pave the way to a successful life', 'How to forge stronger relationships', 'Why life

In [7]:
categories = set()
for i, row in df_books.iterrows():
    categories.update(set(row.categories))
categories = list(categories)
print('\n'.join(categories))


Fiction
Home & Environment
Spirituality
Sports & Fitness
Business & Career
Happiness
Self-Growth
Money & Investments
Personalities
Productivity
Society & Tech
Leadership
Family
Love & Sex
Health
Negotiation


In [8]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel, Field


In [9]:
class OpenAIBookCategorizer:
    class BookCategoriesModel(BaseModel):
        reasoning_steps: list[str] = Field(description="Reasoning steps aiming to determine what categories are relevant to the book.")
        categories: list[str] = Field(description='A subset of predifined categories that fit the book best.')

    class StatisticsLogger:
        def __init__(self) -> None:
            self._prompt_tokens = []
            self._completion_tokens = []
            self._costs = []
        
        def log(self, prompt_tokens: int, completion_tokens: int, cost: float) -> None:
            self._prompt_tokens.append(prompt_tokens)
            self._completion_tokens.append(completion_tokens)
            self._costs.append(cost)
        
        @property
        def total_prompt_tokens(self) -> int:
            return sum(self._prompt_tokens)

        @property
        def average_prompt_tokens(self) -> float:
            return sum(self._prompt_tokens) / len(self._prompt_tokens)
        
        @property
        def total_completion_tokens(self) -> int:
            return sum(self._completion_tokens)
        
        @property
        def average_completion_tokens(self) -> float:
            return sum(self._completion_tokens) / len(self._completion_tokens)
        
        @property
        def total_cost(self) -> float:
            return sum(self._costs)
        
        @property
        def average_cost(self) -> float:
            return sum(self._costs) / len(self._costs)
        
        @property
        def total_inferences(self) -> int:
            return len(self._costs)
        
        def __str__(self) -> str:
            return (
                "Statistics:\n"
                f"\tNum inferences: {self.total_inferences}\n"
                f"\tAverage Prompt Tokens: {self.average_prompt_tokens}\n"
                f"\tAverage Completion Tokens: {self.average_completion_tokens}\n"
                f"\tAverage Cost: {self.average_cost}\n"
                f"\tTotal Prompt Tokens: {self.total_prompt_tokens}\n"
                f"\tTotal Completion Tokens: {self.total_completion_tokens}\n"
                f"\tTotal Cost: {self.total_cost}"
            )

    
    def __init__(self, categories: list[str]) -> None:      
        self._categories = categories

        # logging
        self._statistics_logger = self.StatisticsLogger()

        llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)
        llm_with_structured_output = llm.with_structured_output(self.BookCategoriesModel)
        prompt_template = PromptTemplate.from_template(
            "You are an expert in book categorization. Your task is to assign the most fitting categories to the given book, selecting only from a predefined set. \n"
            "\n"
            "### Book Information: \n"
            "**Title**: {title} \n"
            "**Author**: {author} \n"
            "**Overview**: \n"
            "{overview} \n"
            "\n"
            "**Author Overview**: \n"
            "{author_overview} \n"
            "\n"
            "**Key Learning Items**: \n"
            "{learning_items} \n"
            "\n"
            "### Instructions: \n"
            "1. Carefully analyze the book's overview, author information, and key learning points. \n"
            "2. Reason step by step, format your reasoning as a collection of thoughts."
            f"3. Assign categories **only** from this predefined list: {categories}. \n"
            "4. Select **relevant categories** that accurately describe the book based on your analysis. \n"
        )

        self._chain = prompt_template | llm_with_structured_output

    @property
    def statistics_logger(self) -> 'OpenAIBookCategorizer.StatisticsLogger':
        return self._statistics_logger

    def categorize(self, book: Book) -> BookCategoriesModel:
        with get_openai_callback() as cb:
            # predict
            predictions = self._chain.invoke({
                'title': book.title,
                'author': book.author,
                'overview': book.overview,
                'author_overview': book.author_overview,
                'learning_items': '\n'.join(f' - {i}' for i in book.learning_items),
            })

            # log
            self._statistics_logger.log(
                prompt_tokens=cb.prompt_tokens,
                completion_tokens=cb.completion_tokens,
                cost=cb.total_cost
            )

        # process predictions
        predicted_categories = predictions.categories

        valid_predicted_categories = []
        invalid_predicted_categories = []
        standartized_valid_categories = [c.lower().strip() for c in self._categories]
        for c in predicted_categories:
            standartized_category = c.lower().strip()
            if standartized_category in standartized_valid_categories:
                valid_predicted_categories.append(c)
            else:
                invalid_predicted_categories.append(c)
        if len(invalid_predicted_categories) > 0:
            print(f'Warning: Invalid categories: {invalid_predicted_categories}')
        
        predictions.categories = valid_predicted_categories

        return predictions
        
        

## Process Books

In [10]:
categorizer = OpenAIBookCategorizer(categories)

In [11]:
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

books_to_process = books[:] # List of books
l_books_n_predictions = []  # List to store (book, prediction) pairs

bar = tqdm.tqdm(total=len(books_to_process))

def process_book(book):
    """Categorize a single book and return the (book, prediction) pair."""
    prediction = categorizer.categorize(book)
    bar.update()
    return (book, prediction)

# Define the number of workers (adjust based on your system & API rate limits)
num_workers = 50  # Set this to control parallelism

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit all book categorization tasks
    futures = [executor.submit(process_book, book) for book in books_to_process]

    for future in futures:
        book, prediction = future.result()
        l_books_n_predictions.append((book, prediction))

# Print summary
print(f"Processed {len(l_books_n_predictions)} books successfully.")


100%|██████████| 1802/1802 [01:37<00:00,  1.96s/it]

Processed 1802 books successfully.


In [12]:
print(str(categorizer.statistics_logger))

Statistics:
	Num inferences: 1802
	Average Prompt Tokens: 512.3102108768036
	Average Completion Tokens: 136.10710321864596
	Average Cost: 0.00015851079356270838
	Total Prompt Tokens: 923183
	Total Completion Tokens: 245265
	Total Cost: 0.2856364500000005


In [13]:
row_items = [
    {
        **book.to_dict(),
        'pred_reasoning_steps': pred.reasoning_steps,
        'pred_categories': pred.categories
    }
    for book, pred in l_books_n_predictions
]

df_predictions = pd.DataFrame(row_items)
for col in ['categories', 'learning_items', 'pred_reasoning_steps', 'pred_categories']:
    df_predictions[col] = df_predictions[col].apply(lambda l: '|'.join(l))

df_predictions.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,learning_items,pred_reasoning_steps,pred_categories
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,What’s inside\nYou're looking at a modern pare...,John Duffy is a successful clinical psychologi...,Family,Why your teenager is glued to their screens|Ho...,"The book is a guide for parents, specifically ...",Family|Self-Growth|Society & Tech
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,What’s inside\nThe modern world has many fasci...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,Which four core virtues pave the way to a succ...,"The book discusses moral codes and virtues, wh...",Self-Growth|Happiness|Health
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"What’s inside\nDiscover how inner strength, wi...",Tom Rath is a renowned author of best-selling ...,Health,The power of positive dietary choices|How move...,The book focuses on well-being and how small c...,Health|Self-Growth|Sports & Fitness
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,What’s inside\nAchieving your dreams is fulfil...,Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,The two kinds of intelligence |How to maximize...,The book discusses finding purpose and fulfill...,Happiness|Self-Growth
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,What’s inside\nBreak free from the cycle of un...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,Ways to pick one ambition to focus on|Strategi...,The book focuses on achieving goals and ambiti...,Self-Growth|Productivity|Happiness


In [14]:
p_predictions = Path('../data/books_predictions.csv')
df_predictions.to_csv(p_predictions)