In [14]:
import pandas as pd
import math
import re
from collections import Counter

# Lift the row limit so displayed DataFrames are never truncated
pd.set_option("display.max_rows", None)

In [15]:
regex_first_word = re.compile(r"^(\w+)")  # extract the first word


def parse_book(
    file_part: str, book_name: str, book_order: int, data_dir: str = "data"
) -> pd.DataFrame:
    """
    Parse one book's tab-separated chapter listing into one row per POV section.

    The file ``{data_dir}/{file_part}.tsv`` must provide the columns
    ``Chapter``, ``Character``, ``Word Count`` and ``Percentage``. Three kinds
    of rows are distinguished:

    * Part rows (``Character`` is missing): start a new part. The part name is
      taken from the ``Chapter`` column and the row itself is removed.
    * Chapter rows (``Percentage`` is present): start a new chapter.
    * POV-switch rows (``Percentage`` is missing): belong to the current
      chapter. Their values sit one column too far to the left and are moved
      to the right; the chapter name of the current chapter is reused.

    Args:
        file_part: File name without the ``.tsv`` extension.
        book_name: Value written to the ``BookName`` column.
        book_order: Value written to the ``BookOrder`` column.
        data_dir: Directory that contains the file (defaults to ``"data"``).

    Returns:
        The parsed frame with the additional columns ``BookName``,
        ``BookOrder``, ``ChapterNumber``, ``PartNumber``, ``PartName`` and
        ``CharacterBase``. Row labels of the file are kept, so labels of
        removed part rows are missing from the index.
    """
    # Read Tab-separated file and rename columns
    df = pd.read_csv(f"{data_dir}/{file_part}.tsv", sep="\t")
    df = df.rename(columns={"Word Count": "Words", "Chapter": "ChapterName"})

    # Treat numeric chapter names as strings
    df["ChapterName"] = df["ChapterName"].astype("str")
    df["BookName"] = book_name
    df["BookOrder"] = book_order

    # add columns to later be filled
    df["ChapterNumber"] = 0
    df["PartNumber"] = 0.0
    df["PartName"] = ""
    df["CharacterBase"] = ""

    # Start in Part 0.5 named "Prologue"
    current_part_number = 0.5
    current_part_name = "Prologue"
    current_chapter_name = None
    current_chapter_number = 0
    part_rows = []  # labels of part rows; dropped in one go after the loop
    for index, row in df.iterrows():
        if isinstance(row.Character, float):
            # Row denoting a new part (missing Character is read as float NaN)
            current_part_number = current_part_number + 0.5
            current_part_name = row.ChapterName
            part_rows.append(index)
            continue

        if math.isnan(row.Percentage):
            # POV-switch; still the same chapter
            # Reuse the chapter name and move all values to the right
            df.at[index, "Percentage"] = row["Words"]
            df.at[index, "Words"] = int(row.Character)
            df.at[index, "Character"] = row.ChapterName
            df.at[index, "ChapterName"] = current_chapter_name
        else:
            # New Chapter
            current_chapter_number = current_chapter_number + 1
            current_chapter_name = row.ChapterName

            # Every epilogue chapter is moved into its own part "Epilogue"
            if row.ChapterName.startswith("Epilogue"):
                current_part_number = current_part_number + 0.5
                current_part_name = "Epilogue"

        df.at[index, "ChapterNumber"] = current_chapter_number
        df.at[index, "PartNumber"] = current_part_number
        df.at[index, "PartName"] = current_part_name

        # The base character is the first word of the (possibly moved) label.
        # Labels that do not start with a word character have no match; keep
        # the full label for them instead of failing on ``None.group``.
        character = df.at[index, "Character"]
        first_word = regex_first_word.match(character)
        df.at[index, "CharacterBase"] = (
            first_word.group(1) if first_word else character
        )

    # Remove the part rows; they carry no word count
    df = df.drop(index=part_rows)

    # Words is now always an int
    df["Words"] = df["Words"].astype("int")

    return df


def get_all() -> pd.DataFrame:
    """
    Parse every book and stack the results into a single frame.

    The books are concatenated in the order of their ``book_order`` argument
    (1 to 8) and the combined frame gets a fresh 0..n-1 index.
    """
    # Way of Kings: the row labelled 0 is put into part 0 named "Prelude"
    way_of_kings = parse_book("data_wok", "Way of Kings", 2)
    way_of_kings.at[0, "PartName"] = "Prelude"
    way_of_kings.at[0, "PartNumber"] = 0

    warbreaker = parse_book("data_warbreaker", "Warbreaker", 1)
    later_books = [
        parse_book(file_part, title, order)
        for file_part, title, order in (
            ("data_wor", "Words of Radiance", 3),
            ("data_edgedancer", "Edgedancer", 4),
            ("data_oathbringer", "Oathbringer", 5),
            ("data_dawnshard", "Dawnshard", 6),
            ("data_row", "Rhythm of War", 7),
            ("data_sunlit", "Sunlit Man", 8),
        )
    ]

    return pd.concat([warbreaker, way_of_kings, *later_books], ignore_index=True)


# Load all books and print a one-line summary
df = get_all()
total_words = df.Words.sum()
# A chapter is identified by its book together with its chapter name
chapter_count = df.groupby(["BookName", "ChapterName"]).ngroups
book_count = len(df.BookName.unique())
summary = (
    f"Total {total_words:_} words"
    f" in {chapter_count} chapters"
    f" in {book_count} books"
)
print(summary)

Total 2_098_713 words in 615 chapters in 8 books


In [16]:
# Sum over pov-switches inside chapters ~> only one row per chapter
chapter_keys = [
    "BookName",
    "BookOrder",
    "ChapterNumber",
    "ChapterName",
    "PartNumber",
    "PartName",
]
chapter_totals = df.groupby(chapter_keys).agg(
    Words=("Words", "sum"),
    # Count frequencies
    # Characters=("Character", lambda x: Counter(x)),
    BaseCharacters=("CharacterBase", lambda x: Counter(x)),
)
# Order by book and chapter number, then turn the group keys back into columns
chapter_totals = chapter_totals.sort_values(["BookOrder", "ChapterNumber"])
df_grouped = chapter_totals.reset_index()

# Count and number chapters per part:
# PartChapterCount is the size of the part, PartChapterNumber the 1-based
# position of the chapter inside its part
gr = df_grouped.groupby(["BookName", "PartNumber"])
df_grouped["PartChapterCount"] = gr["ChapterName"].transform("count")
df_grouped["PartChapterNumber"] = 1 + gr.cumcount()

In [19]:
def week_partition(df: pd.DataFrame, n: int, start_week: int = 1) -> pd.DataFrame:
    """
    Greedily split the chapters of ``df`` into consecutive reading weeks.

    The target size of a week is the total word count divided by ``n``.
    Chapters are visited in row order. A chapter opens a new week when the
    words collected so far plus half of the chapter's words exceed the target;
    otherwise it joins the current week. The code does not enforce that
    exactly ``n`` weeks are produced.

    Side effect: the column ``Week`` is written into ``df`` itself.

    Args:
        df: One row per chapter with the columns ``Words``, ``BookName``,
            ``PartName``, ``ChapterName``, ``PartChapterNumber``,
            ``PartChapterCount`` and ``BaseCharacters`` (``Counter`` values).
        n: Number of weeks used to compute the target words per week.
        start_week: Number given to the first week.

    Returns:
        One row per week with its word and chapter count, the first and last
        book/part/chapter, the number of chapters remaining in the part of the
        last chapter, and the summed character counters.
    """
    words_per_week = df["Words"].sum() / n
    print(f"Target Words per Week: {words_per_week}")
    df["Week"] = 0  # Add column

    # Compute Week column
    current_week = start_week
    current_week_words = 0
    for index, row in df.iterrows():
        # Count words per week until we have too many
        too_many = current_week_words + row["Words"] * 0.5 > words_per_week
        # A week without any words yet always takes the chapter. Otherwise an
        # oversized first chapter would be pushed out of start_week and leave
        # that week empty.
        if too_many and current_week_words > 0:
            # The current chapter shall be read in the next week
            current_week = current_week + 1
            current_week_words = row["Words"]
        else:
            current_week_words = current_week_words + row["Words"]

        df.at[index, "Week"] = current_week

    # Group by week
    df_res = df.groupby("Week").agg(
        WordCount=("Words", "sum"),
        # PartCount=("PartName", lambda x: len(set(x))),
        ChapterCount=("ChapterName", "count"),
        StartBook=("BookName", "first"),
        StartPart=("PartName", "first"),
        StartChapter=("ChapterName", "first"),
        EndBook=("BookName", "last"),
        EndPart=("PartName", "last"),
        EndChapter=("ChapterName", "last"),
        EndPartChapterNumber=("PartChapterNumber", "last"),
        EndPartChapterCount=("PartChapterCount", "last"),
        BaseCharacters=("BaseCharacters", lambda x: sum(x, Counter())),
    )

    # Compute how many chapters remain in the part; the new column is placed
    # directly in front of BaseCharacters
    df_res.insert(
        loc=df_res.columns.get_loc("BaseCharacters"),
        column="ChaptersRemaining",
        value=df_res.EndPartChapterCount - df_res.EndPartChapterNumber,
    )
    df_res = df_res.drop(columns=["EndPartChapterNumber", "EndPartChapterCount"])

    return df_res.reset_index()


def render(df: pd.DataFrame) -> pd.DataFrame:
    df_render = df.assign(
        BaseCharacters=df.BaseCharacters.apply(
            lambda x: ", ".join(
                f"{name} ({occ})" if occ > 1 else name for name, occ in x.most_common()
            )
        )
    )

    df_render.to_clipboard(index=False)

    return df_render.style.hide(axis="index")


# Partition the chapters with a target of total words / 46 per week; week
# numbering starts at 2.
# NOTE: week_partition also writes the "Week" column into df_grouped.
df_weekly = week_partition(df_grouped, 46, 2)
render(df_weekly)

Target Words per Week: 45624.19565217391


Week,WordCount,ChapterCount,StartBook,StartPart,StartChapter,EndBook,EndPart,EndChapter,ChaptersRemaining,BaseCharacters
2,44013,14,Warbreaker,Prologue,Prologue,Warbreaker,Main,13,45,"Siri (10), Vivenna (5), Lightsong (4), Vasher (2), Dedelin"
3,47869,13,Warbreaker,Main,14,Warbreaker,Main,26,32,"Lightsong (8), Siri (8), Vivenna (6), Vasher"
4,46801,14,Warbreaker,Main,27,Warbreaker,Main,40,18,"Vivenna (7), Siri (6), Lightsong (3), Vasher"
5,46066,15,Warbreaker,Main,41,Warbreaker,Main,55,3,"Vivenna (10), Lightsong (10), Siri (8), Vasher (4), Chapps"
6,48773,12,Warbreaker,Main,56,Way of Kings,Part One: Above Silence,6: Bridge Four,5,"Siri (5), Vivenna (4), Vasher (4), Kaladin (3), Lightsong (2), Shallan (2), Kalak, Szeth, Cenn"
7,41071,11,Way of Kings,Part One: Above Silence,7: Anything Reasonable,Way of Kings,Part Two: The Illuminating Storms,14: Payday,14,"Kaladin (4), Adolin (3), Shallan (2), Dalinar (2), Ishikk, Balat, Szeth"
8,46863,8,Way of Kings,Part Two: The Illuminating Storms,15: The Decoy,Way of Kings,Part Two: The Illuminating Storms,"22: Eyes, Hands or Spheres",6,"Dalinar (6), Adolin (4), Kaladin (4)"
9,45771,9,Way of Kings,Part Two: The Illuminating Storms,23: Many Uses,Way of Kings,Interludes,Interlude: A Work of Art,0,"Kaladin (3), Dalinar (3), Adolin (2), Rysn, Axies, Szeth"
10,43323,11,Way of Kings,Part Three: Dying,29: Errorgance,Way of Kings,Part Three: Dying,39: Burned into Her,12,"Kaladin (8), Shallan (4), Gaz (2), Teft (2)"
11,45065,9,Way of Kings,Part Three: Dying,40: Eyes of Red and Blue,Way of Kings,Part Three: Dying,48: Strawberry,3,"Kaladin (6), Shallan (3)"


In [18]:
# Show the full per-chapter table (render also copies it to the clipboard).
# The "Week" column only exists once week_partition(df_grouped, ...) has run,
# so execute the partition cell before this one.
render(df_grouped)

BookName,BookOrder,ChapterNumber,ChapterName,PartNumber,PartName,Words,BaseCharacters,PartChapterCount,PartChapterNumber,Week
Warbreaker,1,1,Prologue,0.5,Prologue,3248,Vasher,1,1,2
Warbreaker,1,2,1,1.0,Main,3721,"Siri, Dedelin",58,1,2
Warbreaker,1,3,2,1.0,Main,2643,"Siri (2), Vivenna (2)",58,2,2
Warbreaker,1,4,3,1.0,Main,3558,Lightsong,58,3,2
Warbreaker,1,5,4,1.0,Main,1848,Siri,58,4,2
Warbreaker,1,6,5,1.0,Main,3405,"Vasher, Lightsong",58,5,2
Warbreaker,1,7,6,1.0,Main,4280,Siri,58,6,2
Warbreaker,1,8,7,1.0,Main,3660,"Siri, Lightsong",58,7,2
Warbreaker,1,9,8,1.0,Main,2948,Siri,58,8,2
Warbreaker,1,10,9,1.0,Main,3386,Vivenna,58,9,2
