In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Everything: load and clean the full reading log

In [None]:
# Exports previously loaded by this cell:
#   data/reading/20240101-2024120406.csv
#   data/reading/20240101-2024120415.csv
raw_csv_path = 'data/reading/temp.csv'
df = pd.read_csv(raw_csv_path)

# Transposed preview so every column of the first rows is visible
df.head().transpose()

In [None]:
# Maps each export column that is kept to its short snake_case name.
# Columns of `df` that are not listed here are dropped.
columns_keep_rename = {
    'Description': 'title',
    'Tags': 'tags',
    'Start Date': 'start_date',
    'Start Time': 'start_time',
    'End Date': 'end_date',
    'End Time': 'end_time',
    'Duration (h)': 'duration',
    'Duration (decimal)': 'duration_frac',
}

# Rename first, then keep only the renamed columns (in mapping order)
kept_columns = list(columns_keep_rename.values())
df_rename = df.rename(columns=columns_keep_rename).loc[:, kept_columns]

df_rename.T

In [None]:
df_clean = df_rename.copy()

# Clean up the titles
df_clean['title'] = (
    df_clean['title']
    # Strip the "Read " prefix only. The pattern is anchored so a title that
    # merely contains "Read " further in is left untouched.
    .str.replace(r'^Read ', '', regex=True)
    # Collapse doubled single quotes ('') into one (')
    .str.replace("''", "'", regex=False)
    # .str[1:-1]  # Don't keep single quotes surrounding title
)

# Clean up the dates: merge each date/time column pair into one timestamp
# column and drop the two source columns.
for edge in ('start', 'end'):
    date_col, time_col = f'{edge}_date', f'{edge}_time'
    df_clean[edge] = pd.to_datetime(
        df_clean[date_col] + ' ' + df_clean[time_col]
    )
    df_clean = df_clean.drop(columns=[date_col, time_col])

# Duration as timedelta
df_clean['duration'] = pd.to_timedelta(df_clean['duration'])

# Make chapters sortable by number: zero-pad single-digit chapters
# (chp1 -> chp01). The word-boundary regex also pads chapters inside
# multi-tag cells such as 'chp1, part2' and leaves chp10+ alone.
# NOTE(review): assumes `tags` holds strings (NaN is passed through).
df_clean['tags'] = df_clean['tags'].str.replace(
    r'\bchp(\d)\b', r'chp0\1', regex=True
)

df_clean

In [5]:
# Persist the cleaned frame without the positional index column
# (`index=False` is the documented way to omit it).
df_clean.to_csv('data/reading/temp-clean.csv', index=False)

# Individuals: per-title summaries

In [None]:
# Reload a cleaned export and re-parse the duration and timestamp columns
df = (
    pd.read_csv('data/reading/20240425-2024120506-clean.csv')
    .assign(
        duration=lambda frame: pd.to_timedelta(frame.duration),
        start=lambda frame: pd.to_datetime(frame.start),
        end=lambda frame: pd.to_datetime(frame.end),
    )
)
df

In [None]:
def get_one_title_json(data: pd.DataFrame, title: str) -> dict:
    """Summarise all reading sessions of one title as a JSON-serialisable dict.

    Parameters
    ----------
    data
        Frame with at least the columns ``title``, ``duration`` (timedelta),
        ``start`` and ``end`` (datetime).
    title
        Exact title to filter on.

    Returns
    -------
    dict
        ``title``; ``start``/``end`` (earliest start and latest end formatted
        as ``%Y%m%d-%H%M%S``); ``total_seconds`` (whole summed duration in
        seconds, fractional part dropped); and the summed duration broken
        into ``duration_days/hours/minutes/seconds`` components.

    Raises
    ------
    IndexError
        If no row of ``data`` has the requested title.
    """
    # Filter for specific title
    one_title = data[data.title == title]
    if one_title.empty:
        raise IndexError(f'no rows found for title {title!r}')

    td_sum = one_title.duration.sum()
    td_components = td_sum.components

    # Build info json
    one_title_info = dict(
        title=one_title.title.iloc[0],
        start=one_title.start.min().strftime('%Y%m%d-%H%M%S'),
        end=one_title.end.max().strftime('%Y%m%d-%H%M%S'),
        # `.seconds` is only the sub-day component; `total_seconds()` also
        # counts whole days, which matters once a book exceeds 24h.
        total_seconds=int(td_sum.total_seconds()),
        duration_days=td_components.days,
        duration_hours=td_components.hours,
        duration_minutes=td_components.minutes,
        duration_seconds=td_components.seconds,
    )

    return one_title_info

In [None]:
import json



In [None]:
# Write one summary JSON per distinct title into data/reading/
for t in df.title.unique():
    print(t)
    one_title = get_one_title_json(
        data=df,
        title=t,
    )
    # File stem: apostrophes and spaces become underscores. Path separators
    # are replaced as well so a title can never point at another directory.
    t_ = (
        t
        .replace('\'',' ')
        .replace(' ','_')
        .replace('/','_')
        .replace('\\','_')
    )
    with open(f'data/reading/{t_}.json', 'w') as f:
        json.dump(one_title, f)
    print('\t done')

In [None]:
# Summarise one specific title. Other titles this cell has been pointed at:
#   'Through the Language Glass', 'How Google Works',
#   'Disability Visibility', 'In the Land of Invented Languages'
one_title = get_one_title_json(data=df, title="Death's End")

one_title

## Plot

In [None]:
df = pd.read_csv('data/reading/20240101-2024120406-clean.csv')

# Re-parse the duration and timestamp columns after loading
df['duration'] = pd.to_timedelta(df['duration'])
for stamp_col in ('start', 'end'):
    df[stamp_col] = pd.to_datetime(df[stamp_col])

# Keep only the sessions of one book
one_title = df.loc[df['title'] == "Death's End"]
one_title

In [None]:

def one_title_data(data: pd.DataFrame, book_name: str) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``title`` equals ``book_name``.

    The result is a new frame (``data`` is not modified) in which
    ``duration`` is converted with ``pd.to_timedelta`` and ``start``/``end``
    with ``pd.to_datetime``.
    """
    is_book = data.title == book_name
    return data.loc[is_book].assign(
        duration=lambda rows: pd.to_timedelta(rows.duration),
        start=lambda rows: pd.to_datetime(rows.start),
        end=lambda rows: pd.to_datetime(rows.end),
    )


def get_summary_by_day(
        data: pd.DataFrame,
        col: str = 'duration_frac'
) -> pd.DataFrame:
    """Sum ``col`` per calendar day, including days with no sessions.

    Rows are grouped by the day of their ``start`` timestamp. The result
    covers every day from the earliest start day to the latest end day;
    days without any row get ``0``.

    Returns a two-column frame: ``index`` (the day) and ``col`` (the sum).
    """
    start_days = data.start.dt.floor('d')
    per_day = data.groupby(by=start_days)[col].sum()

    # Continuous daily calendar spanning the whole reading period
    every_day = pd.date_range(
        start=start_days.min(),
        end=data.end.dt.floor('d').max(),
    )

    # The calendar index is unnamed, so reset_index() yields an 'index' column
    return per_day.reindex(every_day, fill_value=0).to_frame().reset_index()

##### PLOTTING

def generate_plot(
        data: pd.DataFrame,
        x: str = 'index',
        y: str = 'duration_frac',
        title: 'str | None' = None,
):
    """Draw a bar chart of ``y`` per date in ``x`` on a new 10x6 figure.

    Parameters
    ----------
    data
        Frame holding the columns named by ``x`` and ``y``. ``data[x]`` must
        be datetime-like (its ``.dt`` accessor is used for the tick labels).
    x, y
        Column names for the date axis and the bar heights.
    title
        Name shown in the plot title. When omitted, the notebook-level
        variable ``book_name`` is used (raises ``NameError`` if undefined).

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    if title is None:
        # Backward compatible: earlier callers relied on a global `book_name`
        title = book_name

    # Chronological row order so the bars line up with the tick labels below
    data = data.sort_values(x)

    f, ax = plt.subplots(
        figsize=(10,6)
    )

    sns.barplot(
        data=data,
        x=x,
        y=y,
        color='green',
        ax=ax,
    )

    # Axis formatting: label each distinct date. Dates are sorted before
    # being formatted because '%D' (mm/dd/yy) strings do not sort
    # chronologically across a year boundary.
    x_dates = (
        data[x]
        .drop_duplicates()
        .dt.strftime('%D')
        .to_list()
    )
    # Pin the tick positions before assigning their labels
    ax.set_xticks(range(len(x_dates)))
    ax.set_xticklabels(
        labels=x_dates,
    )

    ax.set_title(
        f'Reading `{title}`'
        f'\n{data[y].sum():.2f}hrs'
        f' over {x_dates[0]} - {x_dates[-1]}'
    )
    ax.set_ylabel('Hours Read')
    ax.set_xlabel('Date')

    f.tight_layout()
    f.autofmt_xdate()

In [None]:
# Daily reading-time chart for a single book
book_name = 'Practical Fairness'

df = pd.read_csv('data/reading/20240101-2024120406-clean.csv')
one_title = one_title_data(data=df, book_name=book_name)
one_title_summary = get_summary_by_day(data=one_title)
generate_plot(data=one_title_summary)

In [None]:
# Summed hours per distinct session start timestamp, without error bars.
# (Options tried earlier and left off: order=plot_order, hue='part',
# color='green'.)
f, ax = plt.subplots(figsize=(10, 6))

barplot_options = dict(
    x='start',
    y='duration_frac',
    estimator='sum',
    errorbar=None,
)
sns.barplot(data=one_title, ax=ax, **barplot_options)

f.tight_layout()
f.autofmt_xdate()

In [None]:
# Split the comma-separated tags: first piece -> `chp`, last piece -> `part`
tag_pieces = one_title.tags.str.split(', ')
one_title = one_title.assign(
    part=tag_pieces.str[-1],
    chp=tag_pieces.str[0],
)

one_title

In [None]:
# Build 'p<part digit>-<chapter>' labels, sorted, framed by the fixed
# front/back-matter entries.
rows_with_part = one_title.tags.str.contains('part')
rows_with_chp = one_title.chp.str.contains('chp')

part_digit = one_title.part[rows_with_part].str[-1]
chapter = one_title.chp[rows_with_chp]

# The two Series are added by index alignment
part_chapter_labels = ('p' + part_digit + '-' + chapter).sort_values().to_list()

plot_order = ['prologue', 'forward', 'intro'] + part_chapter_labels + ['epilogue']
plot_order

In [None]:
# One panel per part, each showing `duration_frac` per chapter.
# NaN parts are dropped so the panel count always matches the loop below.
parts = one_title.part.dropna().unique()

f, axs = plt.subplots(
    nrows=len(parts),
    # sharex=True,
    squeeze=False,  # always return an array of Axes, even for a single part
    figsize=(10,12)
)

for part, ax in zip(parts, axs.ravel()):
    ax.set_title(part)
    sns.barplot(
        data=one_title[one_title.part == part],
        x='chp',
        y='duration_frac',
        # order=plot_order,
        # hue='part',
        # color='green',
        # estimator='sum',
        # errorbar=None,
        ax=ax,
    )
    # Rotate labels per panel. Figure.autofmt_xdate() would hide the tick
    # labels of every panel except the bottom one, but each panel has its
    # own chapter categories.
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

f.tight_layout()


In [None]:

# Bar order: prologue, then each distinct chapter tag in sorted order, then
# epilogue. Tags are de-duplicated because a chapter read over several
# sessions appears on several rows; NaN tags are ignored.
chapter_tags = one_title.tags[
    one_title.tags.str.contains('chp', na=False)
]
plot_order = ['prologue'] + sorted(chapter_tags.unique()) + ['epilogue']


f, ax = plt.subplots(
    figsize=(8,6)
)

sns.barplot(
    data=one_title,
    x='tags',
    y='duration_frac',
    order=plot_order,
    color='green',
    estimator='sum',
    errorbar=None,
    ax=ax
)

In [None]:



# Bar order: prologue, then each distinct chapter tag in sorted order, then
# epilogue. Tags are de-duplicated because a chapter read over several
# sessions appears on several rows; NaN tags are ignored.
chapter_tags = one_title.tags[
    one_title.tags.str.contains('chp', na=False)
]
plot_order = ['prologue'] + sorted(chapter_tags.unique()) + ['epilogue']


f, ax = plt.subplots(
    figsize=(16,8)
)

sns.barplot(
    data=one_title,
    x='tags',
    y='duration_frac',
    order=plot_order,
    color='green',
    estimator='sum',
    errorbar=None,  # replaces the deprecated `ci=None`; matches the other cells
    ax=ax
)

f.tight_layout()
f.autofmt_xdate()




# Summary

In [None]:
# Total time per title (in hour fractions), shortest first
hours_by_title = (
    df_clean
    .groupby('title')
    .duration_frac
    .sum()
    .reset_index()
    .sort_values(by='duration_frac')
)

# Number of tagged entries per title (each entry is assumed to be a chapter)
entries_by_title = df_clean.groupby('title').tags.count()

# Attach the counts: `title` column on the left, `title` index on the right
title_summaries = hours_by_title.merge(
    entries_by_title,
    left_on='title',
    right_index=True,
)
title_summaries