In [None]:
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from ipywidgets import interact, interactive_output, HBox
import ipywidgets as widgets


# NOTE(review): called without a category, this silences every warning for
# the rest of the notebook, including deprecation notices raised by the
# plotting calls below.
warnings.filterwarnings('ignore')

In [None]:
# Count columns that are cleaned of ',' separators and cast to integers below.
numeric_columns = ['Number of Ratings','Number of Reviews']
df = (
    pl.scan_csv('datasets/Good_Reads_Book_Awards_Crawl_2023_12_27_11_14.csv')
    .drop(['source_URL', 'Book Description', 'About the Author'])
    # Strip *every* comma before casting. `str.replace` substitutes only the
    # first match, so a value with two separators (e.g. "1,234,567") would
    # keep one comma and fail the integer cast.
    .with_columns([
        pl.col(column).str.replace_all(',', '', literal=True).cast(pl.Int32)
        for column in numeric_columns
    ])
    # Take the first run of letters/spaces from 'Kindle Version and Price'
    # and store it, trimmed, as the categorical 'Kindle Version' column.
    .with_columns(
        pl.col('Kindle Version and Price')
        .str.extract('([a-zA-Z ]+)')
        .str.strip_chars()
        .cast(pl.Categorical)
        .alias('Kindle Version'))
).collect()

In [None]:
# Fixed seed so the preview shows the same five rows on every fresh run.
df.sample(5, seed=42)

In [None]:
# Summary statistics for the cleaned frame.
df.describe()

In [None]:
# One row per book category: totals for votes/ratings/reviews, the mean
# rating, and medians for page count and Kindle price. Categories with the
# most votes come first.
result_df = (
    df.group_by('Readers Choice Category')
    .agg(
        pl.col('Readers Choice Votes').sum(),
        pl.col('Total Avg Rating').mean().round(2),
        pl.col('Number of Ratings').sum(),
        pl.col('Number of Reviews').sum(),
        pl.col('Number of Pages').median(),
        pl.col('Kindle Price').median().round(2),
    )
    .sort('Readers Choice Votes', descending=True)
)

result_df

In [None]:
# Metric name -> (row, col) of its panel in the 2x2 bar-chart grid,
# filled left-to-right, top-to-bottom.
layout = {
    metric: divmod(position, 2)
    for position, metric in enumerate([
        'Number of Ratings',
        'Total Avg Rating',
        'Number of Pages',
        'Kindle Price',
    ])
}

# Which aggregated column decides the category order.
sort_column = widgets.Dropdown(
    description='Sort by:',
    options=tuple(layout),
    value='Number of Ratings',
)

# Whether that order runs low-to-high or high-to-low.
sort_direction = widgets.Dropdown(
    description='Direction:',
    options=['ascending', 'descending'],
    value='ascending',
)

# Both controls side by side.
dropdowns_row = HBox([sort_column, sort_direction])


# Snapshot the panel grid at definition time. A later cell rebinds the global
# `layout` to a 3x2 arrangement; reading the global at call time would then
# index outside this function's 2x2 figure when the dropdowns are used again.
_BAR_PANEL_LAYOUT = dict(layout)


def plot_columns_for_each_category(sort_column=sort_column, sort_direction=sort_direction):
    """Draw one bar chart per aggregated metric with categories in a chosen order.

    Parameters
    ----------
    sort_column : str or widget
        Column of ``result_df`` used to order the categories. A widget (the
        default) is read through its ``.value``.
    sort_direction : str or widget
        ``'descending'`` sorts high-to-low; any other value sorts ascending.
        A widget (the default) is read through its ``.value``.
    """
    # Accept either plain values (as passed by interactive_output) or the
    # widgets themselves (the defaults), so a direct call also works.
    sort_column = getattr(sort_column, 'value', sort_column)
    sort_direction = getattr(sort_direction, 'value', sort_direction)

    # polars uses `sort(..., descending=...)` rather than pandas'
    # `sort_values(..., ascending=...)`.
    result_df_sorted = result_df.sort(by=sort_column, descending=sort_direction == 'descending')
    category_labels = result_df_sorted['Readers Choice Category'].to_list()

    # figsize is (width, height) in inches
    fig, axes = plt.subplots(2, 2, figsize=(16, 10), sharey=False)

    for column, (i, j) in _BAR_PANEL_LAYOUT.items():
        ax = axes[i, j]
        sns.barplot(x='Readers Choice Category', y=column, data=result_df_sorted, palette='Blues_d', ax=ax)
        ax.set_title(f'{column} for Each Category')
        ax.set_xticklabels(labels=category_labels, rotation=30, ha='right')

    plt.tight_layout()
    plt.show()

# Redraw the bar charts from the current dropdown values, then display the
# controls followed by the plot output.
out = interactive_output(
    plot_columns_for_each_category,
    dict(sort_column=sort_column, sort_direction=sort_direction),
)
display(dropdowns_row, out)


In [None]:
# Metric name -> (row, col) of its panel in the 3x2 box-plot grid,
# filled left-to-right, top-to-bottom.
layout = {
    metric: divmod(position, 2)
    for position, metric in enumerate([
        'Readers Choice Votes',
        'Total Avg Rating',
        'Number of Ratings',
        'Number of Reviews',
        'Number of Pages',
        'Kindle Price',
    ])
}


# Which aggregated column decides the category order.
sort_column = widgets.Dropdown(
    description='Sort by:',
    options=tuple(layout),
    value='Number of Ratings',
)

# Whether that order runs low-to-high or high-to-low.
sort_direction = widgets.Dropdown(
    description='Direction:',
    options=['ascending', 'descending'],
    value='ascending',
)

# Both controls side by side.
dropdowns_row = HBox([sort_column, sort_direction])


# Snapshot the panel grid at definition time so that rebinding the global
# `layout` in another cell cannot change which panels this function draws.
_BOX_PANEL_LAYOUT = dict(layout)


def plot_boxplot_for_each_category(sort_column=sort_column, sort_direction=sort_direction):
    """Draw one box plot per metric from ``df``, with categories in a chosen order.

    The category order is taken from the aggregated ``result_df`` sorted by
    ``sort_column``; the boxes themselves are drawn from the per-book ``df``.

    Parameters
    ----------
    sort_column : str or widget
        Column of ``result_df`` used to order the categories. A widget (the
        default) is read through its ``.value``.
    sort_direction : str or widget
        ``'descending'`` sorts high-to-low; any other value sorts ascending.
        A widget (the default) is read through its ``.value``.
    """
    # Accept either plain values (as passed by interactive_output) or the
    # widgets themselves (the defaults), so a direct call also works.
    sort_column = getattr(sort_column, 'value', sort_column)
    sort_direction = getattr(sort_direction, 'value', sort_direction)

    # The order is the same for every panel, so compute it once up front
    # instead of re-sorting inside the loop. A plain list keeps seaborn from
    # having to interpret a polars Series.
    sorted_summary = result_df.sort(by=sort_column, descending=sort_direction == 'descending')
    category_order = sorted_summary['Readers Choice Category'].to_list()

    fig, axes = plt.subplots(3, 2, figsize=(16, 18))

    for column, (i, j) in _BOX_PANEL_LAYOUT.items():
        ax = axes[i, j]
        sns.boxplot(data=df, x='Readers Choice Category', y=column, palette='Set3', ax=ax, order=category_order)
        ax.set_title(f'{column} vs Category')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')

    fig.tight_layout()
    plt.show()

# Redraw the box plots from the current dropdown values, then display the
# controls followed by the plot output.
out = interactive_output(
    plot_boxplot_for_each_category,
    dict(sort_column=sort_column, sort_direction=sort_direction),
)
display(dropdowns_row, out)

In [None]:
# Show the books ordered by 'Number of Ratings' (no `descending` flag is
# passed, so polars' default sort direction applies).
df.sort('Number of Ratings')
# df['Readers Choice Category']

In [None]:
# Column of `df` whose distribution is plotted.
column = widgets.Dropdown(
    options=[
        'Readers Choice Votes',
        'Total Avg Rating',
        'Number of Ratings',
        'Number of Reviews',
        'Number of Pages',
        'Kindle Price'],
    value='Kindle Price',
    description='Column:',
    disabled=False,
)
# Multiplier applied to the KDE bandwidth.
bw_adjust = widgets.FloatSlider(
    value=0.5,
    # A multiplier of 0 collapses the kernel to zero width, which the density
    # estimate cannot evaluate, so the range starts one step above zero.
    min=0.05,
    max=1.0,
    step=0.05,
    description='BW Adjust:',
    disabled=False,
    continuous_update=True,
    orientation='horizontal',
    readout=True,
    readout_format='.2f',
)

def do_kdeplot(column=column, bw_adjust=bw_adjust):
    """Show a KDE curve, then a histogram with a KDE overlay, for one column of ``df``.

    Parameters
    ----------
    column : str or widget
        Name of the column to plot. A widget (the default) is read through
        its ``.value``.
    bw_adjust : float or widget
        Multiplier for the kernel bandwidth. Smaller values follow the data
        more closely but are noisier, so use small values only when the
        distribution is expected to be complex. A widget (the default) is
        read through its ``.value``.
    """
    # Accept either plain values (as passed by interact) or the widgets
    # themselves (the defaults), so a direct call also works.
    column = getattr(column, 'value', column)
    bw_adjust = getattr(bw_adjust, 'value', bw_adjust)

    # `cut` controls how far the curve extends beyond the extreme datapoints;
    # 0 stops it at the data limits so no density is drawn for negative values.
    # No `palette` is passed: without a `hue` variable seaborn ignores it.
    sns.displot(data=df, x=column, kind='kde', bw_adjust=bw_adjust, cut=0)

    # Histogram of the same column with a KDE curve drawn on top.
    sns.displot(data=df, x=column, kde=True)

    plt.show()
# Trailing ';' keeps the cell from echoing the function object that
# interact() returns.
interact(do_kdeplot, column=column, bw_adjust=bw_adjust);

In [None]:
# Distinct categories in order of first appearance. Without `maintain_order`
# polars does not guarantee the order of `unique()`, which would shuffle the
# widget entries and the hue order between runs.
options = df['Readers Choice Category'].unique(maintain_order=True).to_list()
cols_to_show = widgets.SelectMultiple(
    options=options,
    value=options,  # start with every category selected
    #rows=10,
    description='Categories',
    disabled=False,
    layout=widgets.Layout(height='300px')
)

# Shared title for the KDE figures built in this cell.
title = 'KDE of Kindle Price for Various Book Categories'

def show_kde_with_cols(cols_to_show):
    """Plot Kindle-price KDE curves for the selected categories only.

    ``cols_to_show`` is the collection of 'Readers Choice Category' values to
    keep. ``hue_order`` is always the full ``options`` list, regardless of
    which categories are selected.
    """
    selected_books = df.filter(
        pl.col('Readers Choice Category').is_in(cols_to_show)
    )
    kde_plot = sns.displot(
        data=selected_books,
        x='Kindle Price',
        hue='Readers Choice Category',
        hue_order=options,
        kind='kde',
        bw_adjust=0.5,
    )
    kde_plot.set(title=title)
    plt.show()


import plotly.figure_factory as ff

def show_plotly_kde():
    """Render a Plotly KDE of Kindle price with one curve per category.

    Rows with a missing category or price are dropped, and categories left
    with fewer than two prices are skipped, since a density cannot be
    estimated from them. Categories appear in order of first appearance in
    ``df``.
    """
    # Collect the prices of each category in one pass instead of appending
    # row by row in Python.
    prices_by_category = (
        df.select(['Readers Choice Category', 'Kindle Price'])
        .drop_nulls()
        .group_by('Readers Choice Category', maintain_order=True)
        .agg(pl.col('Kindle Price'))
    )
    category_price = {
        category: prices
        for category, prices in prices_by_category.iter_rows()
        if len(prices) >= 2
    }
    if not category_price:
        # Nothing to unpack or plot; avoid failing on an empty zip().
        print('No category has enough Kindle prices to estimate a density.')
        return

    categories, price_data = zip(*category_price.items())
    fig = ff.create_distplot(price_data, group_labels=categories, show_hist=False, show_rug=False)
    fig.update_layout(title=title, width=1100, height=800, plot_bgcolor='white')
    fig.update_yaxes(gridcolor='lightgrey', linecolor='black', title='Density')
    fig.update_xaxes(gridcolor='lightgrey', linecolor='black', title='Kindle Price')
    fig.show()

# Render the Plotly version of the per-category KDE.
show_plotly_kde()

# Disabled alternative: the seaborn KDE driven by the `cols_to_show`
# multi-select, shown side by side with the selector.
# out = interactive_output(show_kde_with_cols, {'cols_to_show': cols_to_show})
# display(HBox([out, cols_to_show]))

In [None]:
# Columns whose pairwise correlations are computed.
columns_of_interest = [
    'Number of Reviews',
    'Number of Ratings',
    'Number of Pages',
    'Total Avg Rating',
    'Readers Choice Votes',
    'Kindle Price',
]
# Colormap names offered by the slider below.
diverging_cmaps = [
    'coolwarm', 'BrBG', 'PiYG', 'PRGn', 'PuOr', 'RdBu',
    'RdGy', 'RdYlBu', 'RdYlGn', 'Spectral', 'bwr', 'seismic',
]
# Slider that picks one of the colormap names; starts on the first entry.
cmap_widget = widgets.SelectionSlider(
    description='Color Map:',
    options=diverging_cmaps,
    value=diverging_cmaps[0],
    orientation='horizontal',
    continuous_update=True,
    readout=True,
)
# Checkbox that requests the reversed variant of the chosen colormap.
rev_widget = widgets.Checkbox(
    description='Reverse: ',
    value=False,
    indent=False,
)
# Correlation matrix of the selected columns, computed once up front.
# NOTE(review): rows containing nulls are not filtered here — confirm that
# `.corr()` treats them as intended.
correlation_matrix = df.select(columns_of_interest).corr()
def show_corr(cmap, reversed):
    """Draw the annotated correlation heatmap with the chosen colormap.

    Parameters
    ----------
    cmap : str
        Matplotlib colormap name.
    reversed : bool
        When true, the ``'_r'`` suffix is appended to ``cmap`` to select the
        reversed colormap.
    """
    if reversed:
        cmap += '_r'

    # seaborn converts a non-pandas frame to a bare array, which drops the
    # column names and leaves integer ticks; pass the names explicitly so both
    # axes show which columns are being compared.
    labels = list(correlation_matrix.columns)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap=cmap,
        fmt=".2f",
        linewidths=.5,
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_title('Correlation Matrix')
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
    plt.show()

# Trailing ';' keeps the cell from echoing the function object that
# interact() returns.
interact(show_corr, cmap=cmap_widget, reversed=rev_widget);