In [None]:
import json
import os

from bs4 import *
from bs4 import BeautifulSoup, Tag

def tag_to_dict(tag):
    """Serialise a BeautifulSoup node into plain, JSON-friendly data.

    A ``Tag`` becomes a dict holding its ``name``, a shallow copy of its
    ``attrs`` and its ``contents`` converted recursively. Anything that is
    not a ``Tag`` is returned as ``str(tag)``.
    """
    if isinstance(tag, Tag):
        node = {'name': tag.name, 'attrs': dict(tag.attrs)}
        node['contents'] = list(map(tag_to_dict, tag.contents))
        return node

    # Leaf node (or any non-Tag value): keep only its string form.
    return str(tag)

def dict_to_tag(d):
    """Rebuild a BeautifulSoup ``Tag`` from a dictionary with ``name``,
    ``attrs`` and ``contents`` keys.

    Values that are not dictionaries are handed back unchanged, which is
    also how non-dict children end up appended as-is.
    """
    if isinstance(d, dict):
        # An empty document is only needed as a factory for new tags.
        factory = BeautifulSoup("", 'html.parser')
        element = factory.new_tag(d['name'])

        for attr_name, attr_value in d['attrs'].items():
            element[attr_name] = attr_value

        # Recursion returns non-dict children untouched, so one call covers
        # both nested tags and plain children.
        for child in d['contents']:
            element.append(dict_to_tag(child))

        return element

    return d

def load_chapters_from_json(filename):
    """Read the chapter dump at ``filename`` and rebuild its Tag payloads.

    The file is expected to map each chapter title to a list of two-item
    entries ``(type_info, payload)``. Dictionary payloads are converted with
    ``dict_to_tag``; every other payload is kept as loaded. Returns a dict
    mapping title to a list of ``(type_info, payload)`` tuples.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        raw_chapters = json.load(source)

    chapters = {}
    for chapter_title, entries in raw_chapters.items():
        rebuilt = []
        for kind, payload in entries:
            if isinstance(payload, dict):
                payload = dict_to_tag(payload)
            rebuilt.append((kind, payload))
        chapters[chapter_title] = rebuilt

    return chapters

    
# Load the serialised book: a dict mapping each chapter title to a list of
# (type_info, payload) tuples, with dict payloads rebuilt into Tags.
restored_book = load_chapters_from_json('Leadership_Kissinger.json')

In [None]:
# Replace every entry's payload with its plain text.
# Payloads that are already strings are kept as they are: the loader passes
# non-dict payloads through unchanged, and on a second run of this cell every
# payload is already text. Calling .get_text() on those raised AttributeError.
for chapter_title, entries in restored_book.items():
    for index, entry in enumerate(entries):
        payload = entry[1]
        if hasattr(payload, "get_text"):
            payload = payload.get_text()
        # Keep any trailing fields (e.g. a token count added by a later cell).
        entries[index] = (entry[0], payload, *entry[2:])

In [None]:
import tiktoken

# NOTE(review): counts use the "gpt-4" tokenizer while the summaries are
# requested from "gpt-4o"; the two may tokenise differently - confirm the
# counts are only meant as an estimate.
encoding = tiktoken.encoding_for_model("gpt-4")

# Print, for every chapter, the token and paragraph totals of each section
# (a section is the run of paragraphs following a 'title' entry), and attach
# the token count to each paragraph entry as a third tuple element.
for k in restored_book.keys():
    print(f"{k} -> ")
    group = 1
    # Reset per chapter: previously this list was created once before the
    # loop, so the last section of one chapter leaked into the next chapter.
    total = []

    previous_title = None
    for i, entry in enumerate(restored_book[k]):
        if entry[0] == 'title':
            if total:
                # A new title closes the section that was being accumulated.
                print(f"Section {group} {previous_title}: {int(sum(total))} tokens for {len(total)} paragraphs")
                total = []
                group += 1
            previous_title = entry[1]

        if entry[0] == 'paragraph':
            cnt = len(encoding.encode(entry[1]))
            total.append(cnt)
            restored_book[k][i] = (entry[0], entry[1], cnt)

    # Report the trailing section in the same format as the others.
    print(f"Section {group} {previous_title}: {int(sum(total))} tokens for {len(total)} paragraphs")
    print("---------------------------------------------")
    print()

In [None]:
# Read the OpenAI API key from the environment instead of embedding it in the
# notebook. SECURITY: the literal key that used to be stored on this line has
# been exposed and must be revoked/rotated.
api = os.environ.get("OPENAI_API_KEY")


In [None]:
from openai import OpenAI
from tqdm import tqdm

# The API key is taken from the environment rather than hardcoded.
# SECURITY: the literal key previously stored here has been exposed and must
# be revoked/rotated.
api = os.environ.get("OPENAI_API_KEY")

# If `api` is None the OpenAI client looks up OPENAI_API_KEY itself and raises
# when no key is configured.
client = OpenAI(api_key=api)
SYSTEM_INSTRUCTION = "You are a helpful assistant that summarizes paragraphs from books. Return markdown formatting without any tags around it."
# Function to summarize text using ChatGPT
def summarize_text(text, messages=None):
    """Ask the chat model for a summary and return it with the prompt used.

    Args:
        text: The text to summarise. Only used to build the default prompt;
            it is ignored when ``messages`` is supplied.
        messages: Optional chat messages to send verbatim. When omitted, a
            system message (``SYSTEM_INSTRUCTION``) and a user message asking
            to summarise ``text`` are created for this call.

    Returns:
        A ``(summary, messages)`` tuple: the stripped reply text and the
        exact message list that was sent.
    """
    # Build the default prompt at call time. As a default argument it was
    # evaluated once, when the function was defined, using whatever global
    # `text` existed then - so the `text` argument never reached the model.
    if messages is None:
        messages = [
            {"role": "system", "content": SYSTEM_INSTRUCTION},
            {"role": "user", "content": f"Summarize the following text: {text}"},
        ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
    )
    # The reply content can be absent (None); treat that as an empty summary
    # instead of failing on .strip().
    summary = (response.choices[0].message.content or "").strip()
    return summary, messages

In [None]:
# One per subsection

In [None]:
def _summarize_section(title, paragraphs):
    """Summarise one subsection and bundle the result with its source text.

    Returns a dict with the section ``title``, the ``original_text`` as a
    ``(paragraphs, token_counts)`` tuple and a ``summary`` dict holding the
    ``strategy`` description and the summary ``text``.
    """
    # Put the heading on its own line; it used to be concatenated directly
    # onto the first paragraph.
    text = f"## {title}\n" + "\n".join(paragraphs)

    # Pass the prompt explicitly so it always carries this section's text,
    # regardless of how summarize_text builds its default messages.
    messages = [
        {"role": "system", "content": SYSTEM_INSTRUCTION},
        {"role": "user", "content": f"Summarize the following text: {text}"},
    ]
    summary_text, messages = summarize_text(text, messages=messages)

    strategy = "Collect paragraphs within subsection (between titles) and summarize those. " + str(messages)
    token_counts = [len(encoding.encode(pg)) for pg in paragraphs]

    return {
        "title": title,
        "original_text": (paragraphs, token_counts),
        "summary": {
            "strategy": strategy,
            "text": summary_text
        }
    }


# chapter title -> list of section dicts produced by _summarize_section
summarized = {}

for k in restored_book:
    print(k)
    chapter = restored_book[k]
    groups = []
    aux = []
    previous_title = ""
    for i, entry in enumerate(chapter):
        if (i+1)%10 == 0:
            print(f"\t{i+1}/{len(chapter)}")
        if entry[0] == 'title':
            # A new title closes the section collected so far. Paragraphs seen
            # before the first title stay in `aux` and therefore become part
            # of the first titled section.
            if previous_title:
                groups.append(_summarize_section(previous_title, aux))
                aux = []
            previous_title = entry[1]

        if entry[0] == 'paragraph':
            aux.append(entry[1])

    # Flush the last section of the chapter.
    if aux:
        groups.append(_summarize_section(previous_title, aux))

    summarized[k] = groups

In [128]:
# chapter title -> {"summary": {...}, "content": [section dicts]}
summarized_w_chapters = {}

for chapter_title in summarized.keys():
    print(chapter_title)

    # One block per subsection, each headed by its title. Blocks are joined
    # with blank lines; previously each "##" heading was appended directly to
    # the end of the preceding summary with no separator.
    blocks = [f"# Chapter Title: {chapter_title}"]
    for section in summarized[chapter_title]:
        section_title = section['title']
        summary_text = section['summary']['text']
        blocks.append(f"## {section_title}\n{summary_text}")
    internal_text = "\n\n".join(blocks)

    messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes paragraphs from books. Use paragraphs to break logically and to enable better structure. Return markdown formatting without any tags around it."},
            {"role": "user", "content": f"Summarize the following text composed of summaries of each subsection of the chapter '{chapter_title}': {internal_text}"}
    ]

    chapter_summary_text, messages = summarize_text(internal_text, messages=messages)

    # Keep the exact prompt next to the result so the summary can be traced.
    strategy = "Take the collected summaries for subsections and then summarize using: " + str(messages)

    chapter = {"summary":
                   {"strategy": strategy,
                    "text": chapter_summary_text},
                "content": summarized[chapter_title]}

    summarized_w_chapters[chapter_title] = chapter
    

Introduction
Konrad Adenauer: The Strategy of Humility
Charles de Gaulle: The Strategy of Will
Richard Nixon: The Strategy of Equilibrium
Anwar Sadat: The Strategy of Transcendence
Lee Kuan Yew: The Strategy of Excellence
Margaret Thatcher: The Strategy of Conviction
Conclusion: The Evolution of Leadership


In [139]:
# Display the chapter titles that now have a chapter-level summary.
summarized_w_chapters.keys()

dict_keys(['Introduction', 'Konrad Adenauer: The Strategy of Humility', 'Charles de Gaulle: The Strategy of Will', 'Richard Nixon: The Strategy of Equilibrium', 'Anwar Sadat: The Strategy of Transcendence', 'Lee Kuan Yew: The Strategy of Excellence', 'Margaret Thatcher: The Strategy of Conviction', 'Conclusion: The Evolution of Leadership'])

In [130]:
# Top-level record for the whole book; the summary and content are added later.
simple_book_summary = {'title': "Leadership - Henry Kissinger"}

# Lay out every chapter summary under its own heading, each followed by a
# blank line, as the input for the book-level summary.
chapter_blocks = []
for title, chapter in summarized_w_chapters.items():
    heading = f"## Chapter '{title}'\n"
    body = f"{chapter['summary']['text']}\n\n"
    chapter_blocks.append(heading + body)
chapter_summaries = "".join(chapter_blocks)

In [131]:
# Prompt asking for a closing summary of the whole book, built from the
# concatenated chapter summaries.
book_prompt = f"Create a concluding summary (in the number of appropriate paragraphs) of the book {simple_book_summary['title']} from the summaries of its chapters: {chapter_summaries}"
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that summarizes paragraphs from books. Use paragraphs to break logically and to enable better structure. Return markdown formatting without any tags around it.",
    },
    {"role": "user", "content": book_prompt},
]

book_summary_text, messages = summarize_text(chapter_summaries, messages=messages)

# Record the exact prompt alongside the generated text.
strategy = "Take the collected summaries for chapters and then summarize using: " + str(messages)

simple_book_summary.update(
    summary={"strategy": strategy, "text": book_summary_text},
    content=summarized_w_chapters,
)

In [132]:
# Show the generated book-level summary text.
print(simple_book_summary['summary']['text'])

The book "Leadership" by Henry Kissinger delves into the enduring significance of genuine leadership amidst contemporary challenges, reflecting on historical and philosophical insights. By examining six pivotal 20th-century leaders—Konrad Adenauer, Charles de Gaulle, Richard Nixon, Anwar Sadat, Lee Kuan Yew, and Margaret Thatcher—the text explores how transformative leadership tends to surface during times of adversity, as opposed to tranquility, which can cultivate complacency. Philosophers like Machiavelli and Max Weber highlight the critical qualities of resilience, strategic vision, and societal focus, which effective leaders have historically exhibited, emphasizing the role of human agency in leadership despite predictions of impersonal governance.

The narrative demonstrates that successful leaders arose from understanding the social and political contexts of their time, crafting strategies, and inspiring collective progress by elevating societal goals above personal interests. T

In [133]:
# Persist the full summary tree (book -> chapters -> sections) as readable,
# non-ASCII-escaped JSON.
serialized_summary = json.dumps(simple_book_summary, ensure_ascii=False, indent=2)
with open("simple_book_summary.json", 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_summary)

In [142]:
def generate_html(book_dict):
    """Render the book-summary dictionary as a standalone, print-oriented HTML page.

    Args:
        book_dict: Mapping with
            ``'title'`` (str),
            ``'summary'`` -> ``{'text': str}`` and
            ``'content'`` -> ``{chapter_title: {'summary': {'text': str},
            'content': [{'title': str, 'summary': {'text': str}}, ...]}}``.

    Returns:
        The complete HTML document as a string. Titles and summary text are
        HTML-escaped before insertion, so characters such as ``&``, ``<`` and
        ``>`` in the book or model output cannot break the markup.
    """
    # Imported locally so the function does not depend on a module-level name.
    from html import escape

    # Document head with the embedded stylesheet.
    page_head = """<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        @page {
            size: A4;
            margin: 2.5cm;
        }
        body {
            font-family: "Times New Roman", Times, serif;
            font-size: 12pt;
            line-height: 1.5;
            max-width: 21cm;
            margin: 0 auto;
            padding: 20px;
        }
        h1 {
            font-size: 24pt;
            text-align: center;
            margin-top: 40px;
            margin-bottom: 20px;
        }
        h2 {
            font-size: 18pt;
            margin-top: 30px;
            page-break-before: always;
        }
        .section-container {
            page-break-inside: avoid;
            margin-bottom: 20px;
        }
        h3 {
            font-size: 14pt;
            margin-top: 20px;
            margin-bottom: 10px;
        }
        p {
            margin-bottom: 12pt;
            text-align: justify;
            orphans: 3;
            widows: 3;
        }
        .summary {
            margin: 20px 0;
            font-style: italic;
        }
        .chapter-summary {
            margin: 20px 0;
            font-style: italic;
            padding: 15px;
            background-color: #f8f8f8;
            border-left: 3px solid #333;
        }
        .page-break {
            page-break-after: always;
        }
        .chapter-container {
            margin-bottom: 40px;
        }
        .title-container {
            page-break-inside: avoid;
            margin-bottom: 30px;
        }
        hr {
            margin: 20px 0;
            border: none;
            border-top: 1px solid #000;
        }
    </style>
</head>
<body>
"""

    def text_to_paragraphs(text):
        """Turn each non-blank line of ``text`` into an escaped ``<p>`` element."""
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(f"<p>{escape(line)}</p>" for line in stripped_lines if line)

    # Collect fragments and join once at the end.
    parts = [page_head]

    # Title and book-level summary share a container that avoids page breaks.
    parts.append("<div class='title-container'>\n")
    parts.append(f"<h1>{escape(book_dict['title'])}</h1>\n")
    parts.append(f"<div class='summary'>{text_to_paragraphs(book_dict['summary']['text'])}</div>\n")
    parts.append("</div>\n")
    parts.append("<hr>\n")

    for chapter_title, chapter in book_dict['content'].items():
        parts.append("<div class='chapter-container'>\n")
        parts.append(f"<h2>{escape(chapter_title)}</h2>\n")
        parts.append(f"<div class='chapter-summary'>{text_to_paragraphs(chapter['summary']['text'])}</div>\n")
        parts.append("<hr>\n")

        # Each section keeps its heading and summary together on one page.
        for section in chapter['content']:
            parts.append("<div class='section-container'>\n")
            parts.append(f"<h3>{escape(section['title'])}</h3>\n")
            parts.append(text_to_paragraphs(section['summary']['text']) + "\n")
            parts.append("</div>\n")

        parts.append("</div>\n")

    parts.append("</body></html>")

    return "".join(parts)

def save_html(html_content, filename="book_summary.html"):
    """Write ``html_content`` to ``filename`` as UTF-8, replacing any existing file."""
    out_file = open(filename, 'w', encoding='utf-8')
    try:
        out_file.write(html_content)
    finally:
        # Always release the handle, even if the write fails.
        out_file.close()

In [143]:


# Render the complete summary tree to HTML and write it to the default
# output file ("book_summary.html").
html_content = generate_html(simple_book_summary)
save_html(html_content)


In [137]:
# Multiple per subsection

In [None]:

# Split every section into chunks of roughly equal size, using the token
# total of the smallest section as the target chunk size.
#
# Each entry of `groups` is a section dict as built by the summarisation cell:
# {"title": ..., "original_text": (paragraphs, token_counts), "summary": ...}.
# The previous tuple-style indexing (section[0], section[1], section[2])
# raised KeyError on these dicts.
# NOTE(review): `groups` is whatever the summarisation loop left behind, i.e.
# the sections of the last chapter it processed - confirm that is the chapter
# intended here.
min_section_tokens = min(
    (sum(section["original_text"][1]) for section in groups),
    default=0,  # no sections: nothing to chunk, and min() must not raise
)

by_count = {}

for section in groups:
    title = section["title"]
    paragraphs, paragraph_tokens = section["original_text"]

    grouped = {
        "count":[],
        "text": []
    }
    aux = {
        "count":[],
        "text": []
    }
    for text, tokens in zip(paragraphs, paragraph_tokens):
        aux['count'].append(tokens)
        aux['text'].append(text)

        # Close the chunk as soon as it reaches the target size.
        if sum(aux['count']) >= min_section_tokens:
            grouped['count'].append(aux['count'])
            grouped['text'].append(aux['text'])
            aux = {"count":[],"text": []}

    # Keep the undersized remainder as a final chunk.
    if aux['count']:
        grouped['count'].append(aux['count'])
        grouped['text'].append(aux['text'])

    # Collapse the per-chunk token lists into one total for the section.
    grouped['count'] = sum(sum(x) for x in grouped['count'])

    by_count[title] = grouped


In [None]:
# For every section, print the token length of each chunk (its paragraphs
# joined by newlines), followed by a blank line.
for section_chunks in by_count.values():
    for chunk in section_chunks['text']:
        joined_chunk = "\n".join(chunk)
        print(len(encoding.encode(joined_chunk)))

    print()

In [None]:
# Summarise every chunk of every section and store the texts under
# 'summaries'. The list is rebuilt on each run, so re-executing the cell does
# not append duplicates.
for k, v in by_count.items():
    chunk_summaries = []
    for chunk in v['text']:
        text = "\n".join(chunk)
        # Pass the prompt explicitly so it always carries this chunk's text.
        messages = [
            {"role": "system", "content": SYSTEM_INSTRUCTION},
            {"role": "user", "content": f"Summarize the following text: {text}"},
        ]
        # summarize_text returns (summary, messages); keep only the summary
        # text instead of storing the whole tuple.
        summary, _ = summarize_text(text, messages=messages)
        chunk_summaries.append(summary)
    v['summaries'] = chunk_summaries

In [None]:
# Print the subsection summaries of the Introduction.
# Sections are dicts, so they are read by key; integer indexing (g[0], g[1])
# raised KeyError. The sections are taken from `summarized` so that they
# match the heading, instead of from `groups`, which holds the last chapter
# processed.
# NOTE(review): the second field printed here is assumed to be the summary
# text - confirm that is what this cell is meant to show.
print("## Introduction")
for g in summarized["Introduction"]:
    print(f"### {g['title']}:")
    print(f"{g['summary']['text']}")
    print()