In [1]:
import os
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import langchain
langchain.debug = False

from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.callbacks import get_openai_callback

from openai.error import InvalidRequestError

In [2]:
from typing import List, Dict, Tuple, Optional, Union, Callable
from os import PathLike
from langchain.callbacks.openai_info import OpenAICallbackHandler

In [3]:
with open('openai_api_key.txt', 'r') as f:
    api_key = f.read()
os.environ["OPENAI_API_KEY"] = api_key

<h1>Critcizing books</h1>

In [4]:
# TODO: LLM is biased to always criticize something even if it makes little sense. 
# It's also biased to always produce 5 points of critique. Try to resolve it with better prompting.

template_chapter_critique = '''
Try to criticize a webnovel book chapter below.
Focus on poor writing and defects that can be seen in the text rather than on simple grammatical or spelling errors.
Your critique should lead to the author being able to improve the chapter and his writing skills ultimately leading to the book being better and more popular.
Also consider which chapter it is defined by chapter number, the first chapters might need a little different approach than the latter ones as they mush hook the reader.
Write anywhere from 0 to 5 points of critique. Do not write critique that is not at least partially objective. It's okay to not write 5 points or to even write 0 points if chapter is already good enough.
If there is nothing to criticize, write "0. The chapter is good.". 
Start each of your points with a newline and a number, e.g. 
1. The dialogs are not well structured - it's hard to understand who is speaking at the moment.
2. The author uses too many adjectives - it's hard to understand what is important and what is not.
Support your argument with a quote from the chapter whenever that is possible.
Given chapter number {chapter_number} of the book {book_name}: 
{chapter}
'''

template_critique_summary = '''
You will be given a critique of book chapters 1-{n_chapters}.
Your task is to write a summary of the critique. Try to connect specific points of critique into a more general critique.
Disregard critique that is supported only by one point and focus on flaws that are shown in multiple points.
The aim of this summary is to help the author improve his writing skills and ultimately make the book better and more popular.
Start each of your points with a newline and a number, e.g. 
1. The dialogs are not well structured - it's hard to understand who is speaking at the moment.
2. The author uses too many adjectives - it's hard to understand what is important and what is not.
There is {n_points} points of critique in total. Your summary should be much shorter than that with at most {half_n_points} points but it's okay to write even less than that
if the critique points are not consistent enough to form a more general points and therefore should be disregarded.
The critique is as follows:
{critique}
'''

template_chapter_scoring = '''
Try to score a webnovel book chapter given below.
Score it from 0 to 10 and be based on predicted popularity of the novel.
You can use decimal places for the score to be more specific but do not get too specific as 0.1 resolution is more than enough.
0 means that quality is really low from the get go and noone will read more than a few paragraphs.
5 means that it's average so it can be read but it's not good enough to be popular (can have few readers but not more than few hundreds).
10 means that it's really great and should be popular (can have thousands of readers and should be in the top of the ranking on any webnovel website).

Try to explain the scoring a little bit but do not write more than 2-3 sentences.
Given chapter number {chapter_number} of the book {book_name}: 
{chapter}
'''

template_scoring_summary = '''
Try to summarize a webnovel book popularity scores into one number. 
Score it from 0 to 10. 
You can use decimal places for the score to be more specific but do not get too specific as 0.1 resolution is more than enough.
0 means that quality is really low from the get go and noone will read more than a few paragraphs.
5 means that it's average so it can be read but it's not good enough to be popular (can have few readers but not more than few hundreds).
10 means that it's really great and should be popular (can have thousands of readers and should be in the top of the ranking on any webnovel website).
Keep in mind that scoring of the first chapters might be more important to overall popularity scoring as they are the ones that hook the reader.
The exception for this might be scoring for some kind of glossary or wiki which sometimes shows as the first chapter but is usually skipped by the readers and should't contribute to the overall score.
Do not explain the score, just give a single number from 0 to 10 and no more words in your reply. Also do not reply with more than 1 number.
The scoring of the first {n_chapters} is as follows: 
{scoring}
'''

llm = ChatOpenAI(temperature=0, request_timeout=20)
prompt_chapter_critique = ChatPromptTemplate.from_template(template_chapter_critique)
chain_chapter_critique = LLMChain(llm=llm, prompt=prompt_chapter_critique)
prompt_critique_summary = ChatPromptTemplate.from_template(template_critique_summary)
chain_critique_summary = LLMChain(llm=llm, prompt=prompt_critique_summary)

prompt_chapter_scoring = ChatPromptTemplate.from_template(template_chapter_scoring)
chain_chapter_scorer = LLMChain(llm=llm, prompt=prompt_chapter_scoring)
prompt_scoring_summary = ChatPromptTemplate.from_template(template_scoring_summary)
chain_scoring_summary = LLMChain(llm=llm, prompt=prompt_scoring_summary)

In [5]:
def load_html(filepath):
    with open(filepath, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'lxml')
    content = soup.body.find('div', attrs={'id':'chr-content'})
    title = content.find('h4')

    content = content.find_all('p')
    content = [c.text for c in content]
    content = [re.sub('\s+', ' ', re.sub('^\s+|\s+$', '', paragraph)) for paragraph in content]
    content = [paragraph for paragraph in content if paragraph != '']
    content = '\n'.join(content)
    if title is not None:
        content = title.text + '\n' + content
    return content

In [6]:
def criticize_book(
        dir_path: PathLike,
        book_name: str,
        n_chapters: int,
        chain_critiqe: LLMChain,
        chain_summarizer: LLMChain
        ) -> Tuple[str, str]:
    '''
    Criticize a book chapter by chapter and then summarize the critique to criticize the whole book more generally.
    '''
    all_results = []
    n_crit_points = 0

    for chapter_number in tqdm(range(1, n_chapters+1)):
        file = os.path.join(dir_path, book_name, f'{chapter_number-1}.html')
        content = load_html(file)
        # TODO: # Below line can fail if the chapter is too long. Resolve it by doing some kind of chunking.
        result = chain_critiqe.run(chapter_number=chapter_number, book_name=book_name, chapter=content) 
        result = f'Chapter {chapter_number} critique:\n' + result
        all_results.append(result)

        n_points = len([point for point in result.split('\n') if re.search('^[0-9]+\.', point) is not None])
        n_crit_points += n_points

    whole_critique = '\n'.join(all_results)
    # TODO: Below line can fail if the critique is too long. Resolve it by doing some kind of chunking.
    summarized_critique = chain_summarizer.run(
        n_chapters=n_chapters, n_points=n_crit_points,
        half_n_points=n_crit_points//2, critique=whole_critique
        )
    return summarized_critique, whole_critique


def score_book(
        dir_path: PathLike,
        book_name: str,
        n_chapters: int,
        chain_score_detail: LLMChain,
        chain_score_summarizer: LLMChain
        ) -> Tuple[str, str]:
    '''
    Score a book chapter by chapter and then summarize the scoring into one number.
    '''
    all_results = []

    for chapter_number in tqdm(range(1, n_chapters+1)):
        file = os.path.join(dir_path, book_name, f'{chapter_number-1}.html')
        content = load_html(file)
        # TODO: # Below line can fail if the chapter is too long. Resolve it by doing some kind of chunking.
        result = chain_score_detail.run(chapter_number=chapter_number, book_name=book_name, chapter=content) 
        result = f'Chapter {chapter_number} score:\n' + result
        all_results.append(result)

    whole_scoring = '\n'.join(all_results)
    # TODO: Below line can fail if the scoring is too long. Resolve it by doing some kind of chunking.
    summarized_scoring = chain_score_summarizer.run(
        n_chapters=n_chapters, scoring=whole_scoring
        )
    return summarized_scoring, whole_scoring

In [7]:
class BookFeedbackPipeline:
    def __init__(
            self,
            pipeline_func: Callable,
            chain_level_1: LLMChain,
            chain_level_2: LLMChain,
            data_directory: PathLike
            ):
        self.pipeline_func = pipeline_func
        self.chain_level_1 = chain_level_1
        self.chain_level_2 = chain_level_2
        self.data_directory = data_directory

    def run(self, book_name: str, n_chapters: int) -> Tuple[str, str]:
        '''
        Runs the pipeline for a single book.
        '''
        level_2_results, level_1_results = self.pipeline_func(
            self.data_directory,
            book_name,
            n_chapters,
            self.chain_level_1,
            self.chain_level_2
        )
        return level_2_results, level_1_results
    
    def run_all(self, n_chapters: int) -> Tuple[Dict[str, str], Dict[str, str], OpenAICallbackHandler]:
        '''
        Runs the pipeline for all books in the data directory.
        '''
        book_names = os.listdir(self.data_directory)

        level_1_storage = {}
        level_2_storage = {}
        with get_openai_callback() as cb:
            for book_name in tqdm(book_names):
                try:
                    level_2_results, level_1_results = self.run(book_name, n_chapters)
                except InvalidRequestError as e:
                    print(f'Failed to run pipeline for book: {book_name}.\nError: {e}')
                level_2_storage[book_name] = level_2_results
                level_1_storage[book_name] = level_1_results
        return level_2_storage, level_1_storage, cb


In [8]:
data_directory = 'chosen_books'
critique_pipeline = BookFeedbackPipeline(criticize_book, chain_chapter_critique, chain_critique_summary, data_directory)
scoring_pipeline = BookFeedbackPipeline(score_book, chain_chapter_scorer, chain_scoring_summary, data_directory)

In [17]:
crit_summaries, crit_details, crit_api_usage = critique_pipeline.run_all(n_chapters=4)
scoring_summaries, scoring_details, scoring_api_usage = scoring_pipeline.run_all(n_chapters=4)

  0%|          | 0/10 [00:00<?, ?it/s]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-O58BdWGkFEHb36qtingdLANv on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-O58BdWGkFEHb36qtingdLANv on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account

In [28]:
import json
with open('crit_summaries.json', 'w') as f:
    json.dump(crit_summaries, f)
with open('crit_details.json', 'w') as f:
    json.dump(crit_details, f)
with open('crit_api_usage.txt', 'w') as f:
    f.write(str(crit_api_usage))


with open('scoring_summaries.json', 'w') as f:
    json.dump(scoring_summaries, f)
with open('scoring_details.json', 'w') as f:
    json.dump(scoring_details, f)
with open('scoring_api_usage.txt', 'w') as f:
    f.write(str(scoring_api_usage))

In [40]:
with open('scoring_summaries.json', 'r') as f:
    temp = json.load(f)

In [41]:
for k, val in temp.items():
    print('='*20)
    print(k)
    print(val)

TheBeginningAfterTheEnd
7.5
SeizedByTheSystem
6.5
TheBookEatingMagician
Score: 8.0
TheExtraSOdyssey
7.0
SupremeMagus
Score: 5.5
HolisticFantasy
5.5
MemoryLost
7.0
ShadowSlave
Score: 8.0
MesmerizingGhostDoctor
Score: 8.0
TheBlackNecromancer
Overall score: 6.0
