In [2]:
import os
os.chdir('/Users/maksym.konevych/Documents/Projects/HWPersonalization/Categorization/Code/ai_categories_experiment')

In [3]:
import dotenv
from pathlib import Path

import pandas as pd

from src.core import Book

In [4]:
dotenv.load_dotenv('secrets/.env')

True

In [5]:
p_books = Path('data/v0.1.0.books.csv')
df_books = pd.read_csv(p_books, index_col=0)
df_books['categories'] = df_books['categories'].apply(lambda s: s.split('|') if not pd.isna(s) else [])

In [6]:
df_books.head()

Unnamed: 0,firebase_document_path,id,title,author,overview,author_overview,categories
0,books/0059543c-a8e5-4615-a78c-69f8e73c19c1,0059543c-a8e5-4615-a78c-69f8e73c19c1,Parenting the New Teen in the Age of Anxiety: ...,John Duffy,You're looking at a modern parenting guide. Te...,John Duffy is a successful clinical psychologi...,[Family]
1,books/008df3c8-12d8-4386-9b8b-0b1b2cc3771f,008df3c8-12d8-4386-9b8b-0b1b2cc3771f,Make Today Count: The Secret of Your Success I...,John C. Maxwell,Make your days less chaotic and more impactful...,"Known as an international leadership expert, J...",[]
2,books/00bb4531-5150-44af-9eb1-c160c7d26ed6,00bb4531-5150-44af-9eb1-c160c7d26ed6,"Right Thing, Right Now: Good Values, Good Char...",Ryan Holiday,The modern world has many fascinating inventio...,Ryan Holiday is known for bringing philosophy ...,[Self-Growth]
3,books/012399fc36f74c489e4e,012399fc36f74c489e4e,Eat Move Sleep: How Small Choices Lead to Big ...,Tom Rath,"Discover how inner strength, willpower, and ri...",Tom Rath is a renowned author of best-selling ...,[Health]
4,books/012d6fde-51e2-4289-822a-1cbb57394bcd,012d6fde-51e2-4289-822a-1cbb57394bcd,"From Strength to Strength: Finding Success, Ha...",Arthur C. Brooks,"Achieving your dreams is fulfilling, but what ...",Arthur C. Brooks is a scientist whose research...,"[Happiness, Self-Growth]"


In [7]:
books = []
for i, row in df_books.iterrows():
    books.append(Book(
        firebase_document_path=row.firebase_document_path,
        title=row.title,
        author=row.author,
        overview=row.overview,
        author_overview=row.author_overview,
        categories=row.categories,
    ))

## Idea

For every book, ask an LLM to generate categories. Traverse through all books, collect data about generated categories, and analyze it to figure out possible final sets of categories.

In [8]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_community.callbacks import get_openai_callback
from pydantic import BaseModel, Field

from src.statistics_logging import StatisticsLogger


In [9]:
class OpenAICategoriesSynthesizer:
    class BookCategoriesResponseModel(BaseModel):
        categories: list[str] = Field(description="List of categories")
    
    def __init__(self) -> None:
        self._logger = StatisticsLogger()

        llm_categories_gen = ChatOpenAI(model='gpt-4o-mini', temperature=0).with_structured_output(self.BookCategoriesResponseModel)
        prompt_template_categories_gen = PromptTemplate.from_template(
            "You are an expert in book categorization. Create 1-3 mutually exclusive categories for this book.\n"
            "**Requirements**:\n"
            "1. Make categories broad and generic - users should easily find books without being overwhelmed\n"
            "2. Avoid too specific categories. Imagine that the total number of categories is 15-30 and you have to guess from existing categories."
            "3. Avoid hierarchical relationships - never include both parent and child categories\n"
            "4. Prefer short category names over long ones\n"
            "5. Ensure full independence - categories shouldn't overlap or subsume each other\n"
            "\n"
            "**Example of BAD output**: \n"
            "['Health', 'Physical Health', 'Mental Wellness'] (contains hierarchy)\n"
            "['Social Philosophy', 'Brand Management', 'Consumer Awareness'] (too specific)\n"
            "**Example of GOOD output**: \n"
            "['Nutrition', 'Fitness', 'Self-Growth']\n"
            "\n"
            "### Book Data:\n"
            "**Title**: {title}\n"
            "**Author**: {author}\n"
            "**Overview**:\n"
            "{overview}\n"
            "**Author Overview**:\n"
            "{author_overview}"
        )

        self._categories_gen_chain = prompt_template_categories_gen | llm_categories_gen

    @property
    def logger(self) -> StatisticsLogger:
        return self._logger
    
    def generate_categories(self, book: Book) -> list[str]:
        with get_openai_callback() as cb:
            res = self._categories_gen_chain.invoke({
                'title': book.title,
                'author': book.author,
                'overview': book.overview,
                'author_overview': book.author_overview,
            })

            self._logger.log(
                prompt_tokens=cb.prompt_tokens,
                completion_tokens=cb.completion_tokens,
                cost=cb.total_cost
            )
        return res.categories



In [10]:
cs = OpenAICategoriesSynthesizer()

In [11]:
i = 754
print(books[i])
print(cs.generate_categories(books[i]))

Book(id="DnOXwhBHwadUfg3vz7xJ", title="The Universe Has Your Back: Transform Fear to Faith", author="Gabrielle Bernstein", overview="Curious if there's more to life than fear? Venture...", author_overview="​​Gabrielle Bernstein is a renowned motivational s...", categories=['Spirituality', 'Self-Growth'], learning_items=[])
['Self-Help', 'Spirituality', 'Personal Development']


In [12]:
print(cs._logger)

Statistics:
	Num inferences: 1
	Average Prompt Tokens: 326.0
	Average Completion Tokens: 16.0
	Average Cost: 5.85e-05
	Total Prompt Tokens: 326
	Total Completion Tokens: 16
	Total Cost: 5.85e-05


In [13]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def process_book(book):
    return book, cs.generate_categories(book)

with ThreadPoolExecutor(max_workers=100) as executor:
    results = list(tqdm(executor.map(process_book, books), total=len(books), desc="Processing Books"))


Processing Books: 100%|██████████| 1817/1817 [00:39<00:00, 46.46it/s]


In [14]:
print(cs.logger)

Statistics:
	Num inferences: 1818
	Average Prompt Tokens: 335.8916391639164
	Average Completion Tokens: 14.00935093509351
	Average Cost: 5.878935643564354e-05
	Total Prompt Tokens: 610651
	Total Completion Tokens: 25469
	Total Cost: 0.10687904999999996


In [15]:
all_categories = [cat.lower() for _, cats in results for cat in cats]

In [16]:
len(all_categories), len(set(all_categories))

(5451, 396)

In [17]:
from collections import Counter

counter = Counter(all_categories)

In [18]:
import pandas as pd

In [19]:
df_categories = pd.DataFrame(sorted(counter.items(), key=lambda i: i[1], reverse=True), columns=['category', 'count'])

In [20]:
df_categories.head()

Unnamed: 0,category,count
0,self-help,932
1,personal development,587
2,business,328
3,psychology,317
4,health,138


In [21]:
p_data_dest = Path('data/cat_gen')
p_data_dest.mkdir(exist_ok=True)
df_categories.to_csv(p_data_dest / 'v0.0.1.categories.csv')

In [22]:
results

[(<src.core.Book at 0x113ddcc70>,
  ['Parenting', 'Mental Health', 'Adolescence']),
 (<src.core.Book at 0x113ddc310>,
  ['Self-Help', 'Personal Development', 'Motivation']),
 (<src.core.Book at 0x113ddfc70>,
  ['Philosophy', 'Self-Help', 'Personal Development']),
 (<src.core.Book at 0x113dddc00>, ['Health', 'Wellness', 'Lifestyle']),
 (<src.core.Book at 0x113ddcd90>,
  ['Self-Help', 'Personal Development', 'Life Transitions']),
 (<src.core.Book at 0x113ddff70>,
  ['Self-Help', 'Motivation', 'Personal Development']),
 (<src.core.Book at 0x113ddfc40>,
  ['Self-Help', 'Motivation', 'Personal Development']),
 (<src.core.Book at 0x113ddcd30>,
  ['Sustainability', 'Food & Cooking', 'Lifestyle']),
 (<src.core.Book at 0x113ddffa0>,
  ['Self-Help', 'Personal Development', 'Business']),
 (<src.core.Book at 0x113ddce80>, ['Finance', 'Self-Help', 'Business']),
 (<src.core.Book at 0x113ddcca0>,
  ['Self-Help', 'Personal Development', 'Mental Health']),
 (<src.core.Book at 0x117656cb0>,
  ['Communic

In [79]:
class FormalizedCategory(BaseModel):
        name: str = Field(description='Category name')
        description: str = Field(description='Category description')


class FormalizedBookCategoriesResponseModel(BaseModel):
    categories: list[FormalizedCategory] = Field(description="Final list of categories")


class OpenAICategoriesConsolidator:
    
    def __init__(self) -> None:
        self._logger = StatisticsLogger()

        llm_categories_consolidator = ChatOpenAI(model='gpt-4o-mini', temperature=0).with_structured_output(FormalizedBookCategoriesResponseModel)
        prompt_template_categories_consolidator = PromptTemplate.from_template(
            'You are an expert in taxonomy and classification. I have a list of raw book categories that contains: \n'
            ' - Near-duplicates (e.g., "Sexuality" and "Sexual Health") \n'
            ' - Broad and specific categories (e.g., "Fitness" and "Martial Arts") \n'
            ' - Overlapping topics that could be consolidated \n'
            'Your task is to synthesize these into {min}-{max} well-balanced final categories following these instructions: \n'
            ' 1. **Avoid long category names. E.g. "Education" is better than "Education & Learning", "Cooking" is better than "Culinary Arts & Food"**.\n'
            ' 2. Minimize redundancy. \n'
            ' 4. Cover all major themes while avoiding excessive granularity. \n'
            ' 5. Use clear, intuitive, concise names that make sense to a broad audience. \n'
            ' 6. Consider the fact that these categories will be used primarily for non-fiction books that help adults to grow and learn new stuff. \n'
            'Raw book categories:\n'
            '{categories} \n'
            'Please return the final list of categories along with a description (specification) to each of them. \n'
        )

        self._categories_consolidator_chain = prompt_template_categories_consolidator | llm_categories_consolidator

    @property
    def logger(self) -> StatisticsLogger:
        return self._logger
    
    def consolidate_categories(self, categories: list[str], desired_min: int, desired_max: int) -> list[FormalizedCategory]:
        with get_openai_callback() as cb:
            res = self._categories_consolidator_chain.invoke({
                'categories': categories,
                'min': desired_min,
                'max': desired_max
            })

            self._logger.log(
                prompt_tokens=cb.prompt_tokens,
                completion_tokens=cb.completion_tokens,
                cost=cb.total_cost
            )

        return res.categories



In [80]:
consolidator = OpenAICategoriesConsolidator()

In [96]:
min_cat = 10
max_cat = 15
final_categories = consolidator.consolidate_categories(list(set(all_categories)), min_cat, max_cat)

In [97]:
print(consolidator.logger)

Statistics:
	Num inferences: 4
	Average Prompt Tokens: 2013.0
	Average Completion Tokens: 586.5
	Average Cost: 0.0006538499999999999
	Total Prompt Tokens: 8052
	Total Completion Tokens: 2346
	Total Cost: 0.0026153999999999995


In [98]:
df_final_categories = pd.DataFrame([v.__dict__ for v in final_categories])

In [99]:
df_final_categories

Unnamed: 0,name,description
0,Personal Development,"Books focused on self-improvement, emotional i..."
1,Health & Wellness,"Resources on physical health, mental well-bein..."
2,Social Sciences,"Explorations of human behavior, society, cultu..."
3,Business & Economics,"Insights into business management, finance, ma..."
4,Education & Learning,Guides on personal and professional developmen...
5,Politics & Society,"Analysis of political theories, civil rights, ..."
6,Creativity & Arts,"Books on creative processes, art, literature, ..."
7,Science & Technology,"Explorations of scientific concepts, technolog..."
8,History & Culture,"Insights into historical events, cultural stud..."
9,Family & Relationships,"Guides on family dynamics, parenting, and inte..."


In [100]:
df_final_categories.to_csv(p_data_dest / f'v0.0.1.final_categories.{min_cat}-{max_cat}.csv')