In [77]:
import os
os.chdir('/Users/maksym.konevych/Documents/Projects/HWPersonalization/Categorization/Code/ai_categories_experiment')

In [78]:
import dotenv
from pathlib import Path

import pandas as pd

from src.core import Book

In [79]:
dotenv.load_dotenv('secrets/.env')

True

In [80]:
p_books = Path('data/v0.1.0.books.csv')
df_books = pd.read_csv(p_books, index_col=0)
df_books['categories'] = df_books['categories'].apply(lambda s: s.split('|') if not pd.isna(s) else [])

In [81]:
df_books.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,You're looking at a modern parenting guide. Te...,John Duffy is a successful clinical psychologi...,[Family]
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,The modern world has many fascinating inventio...,Ryan Holiday is known for bringing philosophy ...,[Self-Growth]
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"Discover how inner strength, willpower, and ri...",Tom Rath is a renowned author of best-selling ...,[Health]
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,"Achieving your dreams is fulfilling, but what ...",Arthur C. Brooks is a scientist whose research...,"[Happiness, Self-Growth]"
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,Break free from the cycle of unfinished projec...,"Robert Kendall Goff is a speaker, lawyer, auth...",[Self-Growth]


In [82]:
books = []
for i, row in df_books.iterrows():
    books.append(Book(
        firebase_document_path=row.firebase_document_path,
        title=row.title,
        author=row.author,
        overview=row.overview,
        author_overview=row.author_overview,
        categories=row.categories,
    ))

In [83]:
print('\n'.join([str(b) for b in books[:10]]))

Book(id="0059543c-a8e5-4615-a78c-69f8e73c19c1", title="Parenting the New Teen in the Age of Anxiety: A Complete Guide to Your Child's Stressed, Depressed, Expanded, Amazing Adolescence", author="John Duffy", overview="You're looking at a modern parenting guide. Teenag...", author_overview="John Duffy is a successful clinical psychologist w...", categories=['Family'], learning_items=[])
Book(id="00bb4531-5150-44af-9eb1-c160c7d26ed6", title="Right Thing, Right Now: Good Values, Good Character, Good Deeds", author="Ryan Holiday", overview="The modern world has many fascinating inventions, ...", author_overview="Ryan Holiday is known for bringing philosophy into...", categories=['Self-Growth'], learning_items=[])
Book(id="012399fc36f74c489e4e", title="Eat Move Sleep: How Small Choices Lead to Big Changes", author="Tom Rath", overview="Discover how inner strength, willpower, and right ...", author_overview="Tom Rath is a renowned author of best-selling book...", categories=['Health'], learn

In [84]:
categories = set()
for i, row in df_books.iterrows():
    categories.update(set(row.categories))
categories = list(categories)
print('\n'.join(categories))


Happiness
Home & Environment
Spirituality
Health
Money & Investments
Society & Tech
Personalities
Productivity
Negotiation
Leadership
Business & Career
Family
Self-Growth
Fiction
Love & Sex
Sports & Fitness


In [85]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel, Field


In [86]:
class OpenAIBookCategorizer:
    class ReasoningModel(BaseModel):
        reasonings: list[str] = Field(description="Reasoning steps about topics of the book and what categories might be relevant")


    class BookCategoriesModel(BaseModel):
        categories: list[str] = Field(description='Relevant categories for the book')


    class StatisticsLogger:
        def __init__(self) -> None:
            self._prompt_tokens = []
            self._completion_tokens = []
            self._costs = []
        
        def log(self, prompt_tokens: int, completion_tokens: int, cost: float) -> None:
            self._prompt_tokens.append(prompt_tokens)
            self._completion_tokens.append(completion_tokens)
            self._costs.append(cost)
        
        @property
        def total_prompt_tokens(self) -> int:
            return sum(self._prompt_tokens)

        @property
        def average_prompt_tokens(self) -> float:
            return sum(self._prompt_tokens) / len(self._prompt_tokens)
        
        @property
        def total_completion_tokens(self) -> int:
            return sum(self._completion_tokens)
        
        @property
        def average_completion_tokens(self) -> float:
            return sum(self._completion_tokens) / len(self._completion_tokens)
        
        @property
        def total_cost(self) -> float:
            return sum(self._costs)
        
        @property
        def average_cost(self) -> float:
            return sum(self._costs) / len(self._costs)
        
        @property
        def total_inferences(self) -> int:
            return len(self._costs)
        
        def __str__(self) -> str:
            return (
                "Statistics:\n"
                f"\tNum inferences: {self.total_inferences}\n"
                f"\tAverage Prompt Tokens: {self.average_prompt_tokens}\n"
                f"\tAverage Completion Tokens: {self.average_completion_tokens}\n"
                f"\tAverage Cost: {self.average_cost}\n"
                f"\tTotal Prompt Tokens: {self.total_prompt_tokens}\n"
                f"\tTotal Completion Tokens: {self.total_completion_tokens}\n"
                f"\tTotal Cost: {self.total_cost}"
            )

    
    def __init__(self, categories: list[str]) -> None:      
        self._categories = categories

        # logging
        self._statistics_logger = self.StatisticsLogger()

        # Stage 1: Reasoning
        llm_reasoning = ChatOpenAI(model='gpt-4o-mini', temperature=0)
        llm_reasoning = llm_reasoning.with_structured_output(self.ReasoningModel)
        prompt_template_reasoning = PromptTemplate.from_template(
            "You are an expert in book categorization. Analyze the book information step-by-step. \n"
            "\n"
            "### Book Data: \n"
            "**Title**: {title} \n"
            "**Author**: {author} \n"
            "**Overview**: \n"
            "{overview} \n"
            "\n"
            "### Instructions: \n"
            "1. Carefully analyze the book's overview, author information, etc. to determine what categories from {categories} are most relevant to the book. Your goal is to ensure that users browsing given category will only see relevant books."
            "2. Focus on assigning categories that uniquely fit the book. Determine what categories might seem relevant but should be rejected because they are secondary. **You are better-off rejecting a category that is secondary to the theme of the book that including it.**"
            "3. Reason step by step, format your reasoning as a collection of thoughts.\n"
            "4. **Do not assign categories yet**."
        )

        self._reasoning_chain = prompt_template_reasoning | llm_reasoning

        # Stage 2: Categorization
        llm_categorization = ChatOpenAI(model='gpt-4o-mini', temperature=0)
        llm_categorization = llm_categorization.with_structured_output(self.BookCategoriesModel)
        
        prompt_template_categorization = PromptTemplate.from_template(
            "You are a book categorization expert. Your task is to assign most appropriate categories for the book based on information about it and your prior reasonings.\n"
            "Your goal is to only assign most relevant categories to provide best user experience at discovering books through categories. \n"
            "\n"
            "### Book Data: \n"
            "**Title**: {title} \n"
            "**Author**: {author} \n"
            "**Overview**: \n"
            "{overview} \n"
            "\n"
            "{reasonings} \n"
            "\n"
            "### Instructions: \n"
            "1. Select **the most relevant** categories **only** from {categories} based on book information and prior analysis. \n"
            "2. **Do not invent new categories**."
            "3. Focus on assigning categories that uniquely fit the book. **Do not select categories that might be irrelevant or seem secondary.**"
        )
        
        self._categorization_chain = prompt_template_categorization | llm_categorization

    @property
    def statistics_logger(self) -> 'OpenAIBookCategorizer.StatisticsLogger':
        return self._statistics_logger
    
    def _format_reasonings(self, reasoning: list[str]) -> str:
        return (
            "### Prior reasonings:\n" + "\n".join(f"- {step}" for step in reasoning)
        )

    def categorize(self, book: Book) -> tuple[list[str], list[str]]:
        '''
        :return: reasoning steps and categories
        '''

        with get_openai_callback() as cb:
            # Stage 1: Reasoning
            reasoning_response = self._reasoning_chain.invoke({
                'title': book.title,
                'author': book.author,
                'overview': book.overview,
                'author_overview': book.author_overview,
                'categories': self._categories
            })
            
            # Stage 2: Categorization
            formatted_reasonings = self._format_reasonings(
                reasoning_response.reasonings,
            )
            
            categorization_response = self._categorization_chain.invoke({
                'title': book.title,
                'author': book.author,
                'overview': book.overview,
                'author_overview': book.author_overview,
                'reasonings': formatted_reasonings,
                'categories': self._categories
            })

            # Log
            self._statistics_logger.log(
                prompt_tokens=cb.prompt_tokens,
                completion_tokens=cb.completion_tokens,
                cost=cb.total_cost
            )
            
        # Validate categories
        valid_categories = []
        invalid_categories = []
        standardized_valid = [c.lower().strip() for c in self._categories]
        
        for cat in categorization_response.categories:
            standardized = cat.lower().strip()
            if standardized in standardized_valid:
                valid_categories.append(cat)
            else:
                invalid_categories.append(cat)
        
        if invalid_categories:
            print(f"Warning: Invalid categories detected: {invalid_categories}")
        
        return reasoning_response.reasonings, valid_categories
            

## Process Books

In [87]:
categorizer = OpenAIBookCategorizer(categories)

In [88]:
print(books[1369])

Book(id="e48230b2839f4898bc8f", title="Stock Investing for Dummies", author="Paul Mladjenovic", overview="Open the door to a happy and fulfilling life by le...", author_overview="Paul Mladjenovic is an American speaker and educat...", categories=['Money & Investments'], learning_items=[])


In [89]:
categorizer.categorize(books[1369])

(["The title 'Stock Investing for Dummies' clearly indicates that the book is focused on stock investing, which falls under the category of 'Money & Investments'.",
  'The author, Paul Mladjenovic, is known for writing about investing, which reinforces the idea that this book is primarily about financial education and investment strategies.',
  "The overview mentions 'successful investing' and 'searching for companies and brokers', which further emphasizes the financial aspect of the book, aligning it with investment education.",
  "While the overview mentions 'a happy and fulfilling life', this is likely a secondary theme that relates to the benefits of successful investing rather than the primary focus of the book.",
  "Categories like 'Happiness' or 'Self-Growth' could be considered, but they are not the main focus of the book. The primary aim is to educate readers on stock investing rather than personal development or happiness.",
  "The book does not address topics related to 'Hom

In [90]:
print(str(categorizer.statistics_logger))

Statistics:
	Num inferences: 1
	Average Prompt Tokens: 948.0
	Average Completion Tokens: 313.0
	Average Cost: 0.00032999999999999994
	Total Prompt Tokens: 948
	Total Completion Tokens: 313
	Total Cost: 0.00032999999999999994


In [91]:
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

books_to_process = books[:] # List of books
l_books_n_predictions = []  # List to store (book, prediction) pairs

bar = tqdm.tqdm(total=len(books_to_process))

def process_book(book):
    """Categorize a single book and return the (book, prediction) pair."""
    prediction = categorizer.categorize(book)
    bar.update()
    return (book, prediction)

# Define the number of workers (adjust based on your system & API rate limits)
num_workers = 50  # Set this to control parallelism

# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Submit all book categorization tasks
    futures = [executor.submit(process_book, book) for book in books_to_process]

    for future in futures:
        book, prediction = future.result()
        l_books_n_predictions.append((book, prediction))

# Print summary
print(f"Processed {len(l_books_n_predictions)} books successfully.")


100%|██████████| 100/100 [02:12<00:00,  1.33s/it]






Processed 1816 books successfully.


In [92]:
print(str(categorizer.statistics_logger))

Statistics:
	Num inferences: 1817
	Average Prompt Tokens: 956.9212988442488
	Average Completion Tokens: 298.6472206934507
	Average Cost: 0.00032272652724270795
	Total Prompt Tokens: 1738726
	Total Completion Tokens: 542642
	Total Cost: 0.5863941000000004


In [93]:
row_items = [
    {
        **book.to_dict(),
        'pred_reasoning_steps': pred[0],
        'pred_categories': pred[1]
    }
    for book, pred in l_books_n_predictions
]

df_predictions = pd.DataFrame(row_items)
for col in ['categories', 'pred_reasoning_steps', 'pred_categories']:
    df_predictions[col] = df_predictions[col].apply(lambda l: '|'.join(l))
for col in ['learning_items']:
    del df_predictions[col]

df_predictions.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories,pred_reasoning_steps,pred_categories
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,You're looking at a modern parenting guide. Te...,John Duffy is a successful clinical psychologi...,Family,"The book is a parenting guide, which suggests ...",Family|Health|Society & Tech
1,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,The modern world has many fascinating inventio...,Ryan Holiday is known for bringing philosophy ...,Self-Growth,The book discusses the importance of moral cod...,Happiness|Spirituality|Self-Growth
2,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"Discover how inner strength, willpower, and ri...",Tom Rath is a renowned author of best-selling ...,Health,The book focuses on well-being and making choi...,Health|Self-Growth
3,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,"Achieving your dreams is fulfilling, but what ...",Arthur C. Brooks is a scientist whose research...,Happiness|Self-Growth,"The book's title, 'From Strength to Strength',...",Happiness|Self-Growth
4,books/017dab0f35d34af59653,017dab0f35d34af59653,"Dream Big: Know What You Want, Why You Want It...",Bob Goff,Break free from the cycle of unfinished projec...,"Robert Kendall Goff is a speaker, lawyer, auth...",Self-Growth,The title 'Dream Big' suggests a focus on aspi...,Self-Growth|Productivity


In [94]:
p_predictions = Path('data/v0.1.0.books_predictions.csv')
df_predictions.to_csv(p_predictions)