In [1]:
# Install (or upgrade to the latest) plotly into the kernel's environment.
%pip install --upgrade plotly
# Pinned kaleido build; kaleido is the engine passed to pio.write_image when figures are saved later in this notebook.
%pip install kaleido==0.1.0post1


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objs as go

import matplotlib.pyplot as plt
import os
import plotly.io as pio
pio.renderers.default = 'notebook'

In [21]:
def _load_experiment(csv_path):
    """Read one OCR experiment CSV and normalise it for comparison.

    Two clean-ups are applied:

    * rows whose ``font_name`` is exactly ``'peignotbold'`` are relabelled
      ``'peignot'``;
    * ``accuracy`` is read as text (it carries a ``%`` sign), the sign is
      stripped, and the value is scaled from percent to a fraction.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read. It must contain ``font_name`` and
        ``accuracy`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with ``accuracy`` as a numeric fraction.
    """
    df = pd.read_csv(csv_path)
    df['font_name'] = df['font_name'].replace('peignotbold', 'peignot')
    # regex=False: '%' is meant literally, and being explicit avoids the
    # version-dependent default of Series.str.replace.
    percent_text = df['accuracy'].str.replace('%', '', regex=False)
    df['accuracy'] = pd.to_numeric(percent_text) / 100
    return df


# Same cleaning for both engines, so the frames can be merged afterwards.
df_tes = _load_experiment('tesseract_experiment.csv')
df_gv = _load_experiment('google_vision_experiment.csv')


In [22]:
# Give each engine's accuracy column its own name so both survive the join.
df_tes.rename(columns={'accuracy': 'accuracy_ts'}, inplace=True)
df_gv.rename(columns={'accuracy': 'accuracy_gv'}, inplace=True)

# Join on every Tesseract column except its accuracy, i.e. on the columns
# that describe the sample rather than the engine's result.
join_keys = [name for name in df_tes.columns if name != 'accuracy_ts']
merged_df = df_tes.merge(df_gv, on=join_keys)

In [5]:
# Keep only the rows with vox_atypl == 'scripts' that are underlined.
row_mask = merged_df['vox_atypl'].eq('scripts') & merged_df['underlined'].eq(True)
scripts_df = merged_df[row_mask]

# Long format: one row per (font_size, engine) accuracy value.
df_melted = scripts_df.melt(
    id_vars=['font_size'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# One box per font size and engine.
fig = px.box(
    df_melted,
    x='font_size',
    y='accuracy_value',
    color='accuracy_type',
    color_discrete_map={'accuracy_ts': '#1f77b4', 'accuracy_gv': '#2ca02c'},
    title='Underlined Scripts',
)

# One tick per distinct font size, labelled with the size itself.
custom_labels = {label: label for label in map(str, df_melted['font_size'].unique())}
fig.update_xaxes(
    tickvals=list(custom_labels),
    ticktext=list(custom_labels.values()),
)

# Replace the raw column names in the legend with the engine names.
for column_name, engine_label in {'accuracy_ts': 'Tesseract', 'accuracy_gv': 'Google Vision OCR'}.items():
    fig.update_traces(name=engine_label, selector=dict(name=column_name))

# Axis titles, legend title, font size and background colours.
fig.update_layout(
    xaxis_title='Font Size',
    yaxis_title='Accuracy',
    legend_title_text='OCR',
    font=dict(size=16),
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()

In [7]:
# Keep only the rows with vox_atypl == 'scripts' that are not underlined.
row_mask = merged_df['vox_atypl'].eq('scripts') & merged_df['underlined'].eq(False)
scripts_df = merged_df[row_mask]

# Long format: one row per (font_size, engine) accuracy value.
df_melted = scripts_df.melt(
    id_vars=['font_size'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# One box per font size and engine.
fig = px.box(
    df_melted,
    x='font_size',
    y='accuracy_value',
    color='accuracy_type',
    color_discrete_map={'accuracy_ts': '#1f77b4', 'accuracy_gv': '#2ca02c'},
    title='Scripts',
)

# One tick per distinct font size, labelled with the size itself.
custom_labels = {label: label for label in map(str, df_melted['font_size'].unique())}
fig.update_xaxes(
    tickvals=list(custom_labels),
    ticktext=list(custom_labels.values()),
)

# Replace the raw column names in the legend with the engine names.
for column_name, engine_label in {'accuracy_ts': 'Tesseract', 'accuracy_gv': 'Google Vision OCR'}.items():
    fig.update_traces(name=engine_label, selector=dict(name=column_name))

# Axis titles, legend title, font size and background colours.
fig.update_layout(
    xaxis_title='Font Size',
    yaxis_title='Accuracy',
    legend_title_text='OCR',
    font=dict(size=16),
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()

In [8]:
balzac_df = merged_df[merged_df['font_name'] == 'balzac']

# Long format: one row per (font_size, bold, underlined, engine) accuracy value.
df_melted = balzac_df.melt(
    id_vars=['font_size', 'bold', 'underlined'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# Mean accuracy for every engine / size / style combination.
grouped_df = (
    df_melted
    .groupby(['accuracy_type', 'font_size', 'underlined', 'bold'])['accuracy_value']
    .mean()
    .reset_index()
)

fig = go.Figure()

# One bar series per (engine column, underlined flag); all use non-bold rows.
trace_specs = [
    ('accuracy_gv', False, 'Google Vision OCR - Regular', 'chartreuse'),
    ('accuracy_gv', True, 'Google Vision OCR - Underlined', 'cyan'),
    ('accuracy_ts', False, 'Tesseract - Regular', 'firebrick'),
    ('accuracy_ts', True, 'Tesseract - Underlined', 'pink'),
]
for engine_column, underlined_flag, series_label, bar_colour in trace_specs:
    series_rows = grouped_df[
        (grouped_df['underlined'] == underlined_flag)
        & (grouped_df['bold'] == False)
        & (grouped_df['accuracy_type'] == engine_column)
    ]
    fig.add_trace(go.Bar(
        x=series_rows['font_size'],
        y=series_rows['accuracy_value'],
        text=series_rows['accuracy_value'].round(2),  # value label drawn on each bar
        name=series_label,
        textfont=dict(size=10),
        marker_color=bar_colour,
        textposition='outside',
        textangle=270,
    ))

# Horizontal legend placed above the plot area.
legend_style = dict(orientation="h", yanchor="top", y=1.2, xanchor="center", x=0.5)

fig.update_layout(
    title='Balzac',
    xaxis=dict(title='Font Size', tickvals=grouped_df['font_size'], ticktext=grouped_df['font_size']),
    yaxis=dict(title='Mean Accuracy', range=[0, 1.05]),  # headroom above 1.0 for the outside labels
    font=dict(size=16),
    legend=legend_style,
    bargap=0.2,
    height=500,
    width=1000,
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()


In [9]:
# Keep only the rows with vox_atypl == 'lineal' that are not underlined.
row_mask = merged_df['vox_atypl'].eq('lineal') & merged_df['underlined'].eq(False)
lineal_df = merged_df[row_mask]

# Long format: one row per (font_size, engine) accuracy value.
df_melted = lineal_df.melt(
    id_vars=['font_size'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# One box per font size and engine.
fig = px.box(
    df_melted,
    x='font_size',
    y='accuracy_value',
    color='accuracy_type',
    color_discrete_map={'accuracy_ts': '#1f77b4', 'accuracy_gv': '#2ca02c'},
    title='Lineal',
)

# One tick per distinct font size, labelled with the size itself.
custom_labels = {label: label for label in map(str, df_melted['font_size'].unique())}
fig.update_xaxes(
    tickvals=list(custom_labels),
    ticktext=list(custom_labels.values()),
)

# Replace the raw column names in the legend with the engine names.
for column_name, engine_label in {'accuracy_ts': 'Tesseract', 'accuracy_gv': 'Google Vision OCR'}.items():
    fig.update_traces(name=engine_label, selector=dict(name=column_name))

# Axis titles, legend title, font size and background colours.
fig.update_layout(
    xaxis_title='Font Size',
    yaxis_title='Accuracy',
    legend_title_text='OCR',
    font=dict(size=16),
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()

In [10]:
# Keep only the rows with vox_atypl == 'lineal' that are underlined.
row_mask = merged_df['vox_atypl'].eq('lineal') & merged_df['underlined'].eq(True)
lineal_df = merged_df[row_mask]

# Long format: one row per (font_size, engine) accuracy value.
df_melted = lineal_df.melt(
    id_vars=['font_size'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# One box per font size and engine.
fig = px.box(
    df_melted,
    x='font_size',
    y='accuracy_value',
    color='accuracy_type',
    color_discrete_map={'accuracy_ts': '#1f77b4', 'accuracy_gv': '#2ca02c'},
    title='Underlined Lineal',
)

# One tick per distinct font size, labelled with the size itself.
custom_labels = {label: label for label in map(str, df_melted['font_size'].unique())}
fig.update_xaxes(
    tickvals=list(custom_labels),
    ticktext=list(custom_labels.values()),
)

# Replace the raw column names in the legend with the engine names.
for column_name, engine_label in {'accuracy_ts': 'Tesseract', 'accuracy_gv': 'Google Vision OCR'}.items():
    fig.update_traces(name=engine_label, selector=dict(name=column_name))

# Axis titles, legend title, font size and background colours.
fig.update_layout(
    xaxis_title='Font Size',
    yaxis_title='Accuracy',
    legend_title_text='OCR',
    font=dict(size=16),
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()

In [11]:
peignot_df = merged_df[merged_df['font_name'] == 'peignot']

# Long format: one row per (font_size, bold, underlined, engine) accuracy value.
df_melted = peignot_df.melt(
    id_vars=['font_size', 'bold', 'underlined'],
    value_vars=['accuracy_ts', 'accuracy_gv'],
    var_name='accuracy_type',
    value_name='accuracy_value',
)

# Mean accuracy for every engine / size / style combination.
grouped_df = (
    df_melted
    .groupby(['accuracy_type', 'font_size', 'underlined', 'bold'])['accuracy_value']
    .mean()
    .reset_index()
)

fig = go.Figure()

# One bar series per (engine column, underlined flag); all use non-bold rows.
trace_specs = [
    ('accuracy_gv', False, 'Google Vision OCR - Regular', 'chartreuse'),
    ('accuracy_gv', True, 'Google Vision OCR - Underlined', 'cyan'),
    ('accuracy_ts', False, 'Tesseract - Regular', 'firebrick'),
    ('accuracy_ts', True, 'Tesseract - Underlined', 'pink'),
]
for engine_column, underlined_flag, series_label, bar_colour in trace_specs:
    series_rows = grouped_df[
        (grouped_df['underlined'] == underlined_flag)
        & (grouped_df['bold'] == False)
        & (grouped_df['accuracy_type'] == engine_column)
    ]
    fig.add_trace(go.Bar(
        x=series_rows['font_size'],
        y=series_rows['accuracy_value'],
        text=series_rows['accuracy_value'].round(2),  # value label drawn on each bar
        name=series_label,
        textfont=dict(size=16),
        marker_color=bar_colour,
        textposition='outside',
        textangle=270,
    ))

# Horizontal legend placed above the plot area.
legend_style = dict(orientation="h", yanchor="top", y=1.2, xanchor="center", x=0.5)

fig.update_layout(
    title='Peignot',
    xaxis=dict(title='Font Size', tickvals=grouped_df['font_size'], ticktext=grouped_df['font_size']),
    yaxis=dict(title='Mean Accuracy', range=[0, 1.05]),  # headroom above 1.0 for the outside labels
    font=dict(size=16),
    legend=legend_style,
    bargap=0.2,
    height=500,
    width=1000,
    plot_bgcolor='lightgray',  # plot area
    paper_bgcolor='white',  # whole figure
)

fig.show()

# FONTS

In [43]:
def plotAndSaveByFontName(fontName, data=None, output_dir='plots'):
    """Plot mean OCR accuracy per font size for one font's bold rows and save it.

    The rows for ``fontName`` are melted to long format and averaged per
    engine / font size / underlined / bold combination. Only ``bold == True``
    rows are drawn, as four bar series: Google Vision and Tesseract, each
    regular and underlined. The figure is written to
    ``<output_dir>/<fontName>.png`` and then displayed.

    Parameters
    ----------
    fontName : str
        Value of the ``font_name`` column to plot; also used as the figure
        title and the PNG file name.
    data : pandas.DataFrame, optional
        Frame with ``font_name``, ``font_size``, ``bold``, ``underlined``,
        ``accuracy_ts`` and ``accuracy_gv`` columns. Defaults to the
        notebook-level ``merged_df``.
    output_dir : str, optional
        Directory for the PNG; created if it does not exist.

    Raises
    ------
    ValueError
        If no row has ``font_name == fontName``. Without this check an empty
        chart would be drawn and written to disk.
    """
    source_df = merged_df if data is None else data
    font_df = source_df[source_df['font_name'] == fontName]
    if font_df.empty:
        raise ValueError(f"No rows with font_name == {fontName!r}; nothing to plot.")

    # Long format: one row per (font_size, bold, underlined, engine) accuracy value.
    df_melted = font_df.melt(
        id_vars=['font_size', 'bold', 'underlined'],
        value_vars=['accuracy_ts', 'accuracy_gv'],
        var_name='accuracy_type',
        value_name='accuracy_value',
    )

    grouped_df = (
        df_melted
        .groupby(['accuracy_type', 'font_size', 'underlined', 'bold'])['accuracy_value']
        .mean()
        .reset_index()
    )

    fig = go.Figure()

    # (engine column, underlined flag, legend label, bar colour)
    trace_specs = [
        ('accuracy_gv', False, 'Google Vision OCR - Bold Regular', 'chartreuse'),
        ('accuracy_gv', True, 'Google Vision OCR - Bold Underlined', 'cyan'),
        ('accuracy_ts', False, 'Tesseract - Bold Regular', 'firebrick'),
        ('accuracy_ts', True, 'Tesseract - Bold Underlined', 'pink'),
    ]
    for engine_column, underlined_flag, series_label, bar_colour in trace_specs:
        # Filter once per series and reuse the result for x, y and the labels.
        series_rows = grouped_df[
            (grouped_df['underlined'] == underlined_flag)
            & (grouped_df['bold'] == True)
            & (grouped_df['accuracy_type'] == engine_column)
        ]
        fig.add_trace(go.Bar(
            x=series_rows['font_size'],
            y=series_rows['accuracy_value'],
            text=series_rows['accuracy_value'].round(2),
            name=series_label,
            textfont=dict(size=10),
            marker_color=bar_colour,
            textposition='outside',
            textangle=270,
        ))

    # grouped_df repeats each font size once per series; one tick per size is enough.
    tick_sizes = grouped_df['font_size'].drop_duplicates().tolist()

    fig.update_layout(
        title=fontName,
        xaxis=dict(title='Font Size', tickvals=tick_sizes, ticktext=tick_sizes),
        yaxis=dict(title='Mean Accuracy'),
        font=dict(size=16),
        legend=dict(
            orientation="h",
            yanchor="top",
            y=1.2,
            xanchor="center",
            x=0.6
        ),
        bargap=0.2,
        height=500,
        width=1000,
        plot_bgcolor='lightgray',  # Background color of the plot area
        paper_bgcolor='white'  # Background color of the entire figure
    )

    fig.update_yaxes(range=[0, 1.05])  # headroom above 1.0 for the outside labels

    # write_image does not create missing directories, so do it here.
    os.makedirs(output_dir, exist_ok=True)
    pio.write_image(fig, os.path.join(output_dir, f'{fontName}.png'), scale=6, format='png', engine='kaleido')

    # Show the plot
    fig.show()

# Peignot Bold

In [44]:
# Tesseract results: strip the '%' from the accuracy text and scale to a fraction.
df_tes = pd.read_csv('tesseract_peignot.csv')
df_tes['accuracy'] = pd.to_numeric(df_tes['accuracy'].str.replace('%', '')) / 100
df_tes.rename(columns={'accuracy': 'accuracy_ts'}, inplace=True)

# Google Vision results, cleaned the same way.
df_gv = pd.read_csv('google_peignot.csv')
df_gv['accuracy'] = pd.to_numeric(df_gv['accuracy'].str.replace('%', '')) / 100
df_gv.rename(columns={'accuracy': 'accuracy_gv'}, inplace=True)

# Join on every Tesseract column except its accuracy.
shared_columns = [name for name in df_tes.columns if name != 'accuracy_ts']
merged_df = df_tes.merge(df_gv, on=shared_columns)

# Display name used for the plot title and the saved file name.
merged_df['font_name'] = merged_df['font_name'].replace('peignotbold', 'Peignot Bold')

plotAndSaveByFontName('Peignot Bold')

# Perpetua Bold

In [None]:
# Tesseract results: strip the '%' from the accuracy text and scale to a fraction.
df_tes = pd.read_csv('tesseract_perpetua.csv')
df_tes['accuracy'] = df_tes['accuracy'].str.replace('%', '')
df_tes['accuracy'] = pd.to_numeric(df_tes['accuracy'])
df_tes['accuracy'] = df_tes['accuracy'] / 100

# Google Vision results, cleaned the same way.
df_gv = pd.read_csv('google_perpetua.csv')
df_gv['accuracy'] = df_gv['accuracy'].str.replace('%', '')
df_gv['accuracy'] = pd.to_numeric(df_gv['accuracy'])
df_gv['accuracy'] = df_gv['accuracy'] / 100

# Distinct accuracy column names so both survive the join.
df_tes.rename(columns={'accuracy': 'accuracy_ts'}, inplace=True)
df_gv.rename(columns={'accuracy': 'accuracy_gv'}, inplace=True)

# Join on every Tesseract column except its accuracy.
merged_df = pd.merge(df_tes, df_gv, on=[col for col in df_tes.columns if col != 'accuracy_ts'])
merged_df['font_name'] = merged_df['font_name'].replace('perpetua-bold', 'Perpetua Bold')

# Plot the font relabelled just above. The previous argument 'Peignot Bold'
# matched no rows here, so it drew an empty chart and overwrote the Peignot PNG.
plotAndSaveByFontName('Perpetua Bold')