# Data Journalism Lesson 23: Tables

Learn how to make a table that is visually interesting.

In [None]:
import warnings
from IPython.core.interactiveshell import InteractiveShell

# Save a reference to the unpatched method so the wrapper below can delegate to it
_orig_should_run = InteractiveShell.should_run_async


def should_run_async(self, code, *args, **kwargs):
    """Call the saved ``should_run_async`` with DeprecationWarning muted.

    All arguments are forwarded unchanged and the saved method's return value
    is handed back as-is. The warning filter is only active for the duration
    of the call because it is installed inside ``warnings.catch_warnings()``.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        outcome = _orig_should_run(self, code, *args, **kwargs)
    return outcome


# Install the wrapper on the class in place of the original method
setattr(InteractiveShell, "should_run_async", should_run_async)

In [None]:
# Install Jinja2 at runtime before any table-styling code runs.
# NOTE(review): presumably this notebook targets a kernel where micropip is
# available and Jinja2 is needed by the pandas Styler used later — confirm.
import micropip
# Top-level await: relies on the notebook kernel allowing await at cell level.
await micropip.install('Jinja2')

In [None]:
from IPython.display import display, HTML
import pandas as pd

# --- Simple Grading/Checking Functions ---
def display_feedback(correct, message_correct, message_incorrect):
    """Show a green "Correct!" or red "Not quite!" HTML banner.

    Only one of the two messages is ever rendered: ``message_correct`` when
    ``correct`` is truthy, ``message_incorrect`` otherwise.
    """
    if not correct:
        banner = f'<div style="background-color: #f2dede; padding: 10px; border-radius: 5px;"><strong>Not quite!</strong> {message_incorrect}</div>'
    else:
        banner = f'<div style="background-color: #dff0d8; padding: 10px; border-radius: 5px;"><strong>Correct!</strong> {message_correct}</div>'
    display(HTML(banner))

def check_df_creation(df, df_name, expected_shape=None, expected_cols=None, check_col_content=None):
    """Validate a DataFrame against optional expectations and show feedback.

    Args:
        df: The object to validate; anything that is not a ``pd.DataFrame``
            fails immediately.
        df_name: Name used for the object in the feedback messages.
        expected_shape: Optional ``(rows, columns)`` pair compared with
            ``df.shape``. A list is accepted as well as a tuple.
        expected_cols: Optional iterable of column names; compared with the
            actual columns after sorting both, so order does not matter.
        check_col_content: Optional ``(column, value)`` pair; passes when
            ``value`` occurs somewhere in ``df[column]``.

    Returns:
        True when every requested check passes, otherwise False. A feedback
        banner is displayed in both cases.
    """
    if not isinstance(df, pd.DataFrame):
        # display_feedback only renders the *incorrect* message when the first
        # argument is False, so the explanation has to go in that slot.
        display_feedback(False, '', f'{df_name} is not a DataFrame. Check your code.')
        return False

    problems = []

    # df.shape is a tuple; normalise so a list such as [10, 4] compares equal.
    if expected_shape is not None and df.shape != tuple(expected_shape):
        problems.append(f'Expected shape {expected_shape}, got {df.shape}.')

    if expected_cols is not None:
        actual_cols = sorted(df.columns)
        expected_cols_sorted = sorted(expected_cols)
        if actual_cols != expected_cols_sorted:
            problems.append(f'Expected columns {expected_cols_sorted}, got {actual_cols}.')

    if check_col_content is not None:
        col, expected_val = check_col_content  # e.g. ('col_name', some_value)
        if col not in df.columns:
            problems.append(f'Missing column {col}.')
        elif expected_val not in df[col].values:
            problems.append(f'Expected value {expected_val} not found in {col}.')

    if problems:
        display_feedback(False, '', ' '.join(problems))
        return False

    display_feedback(True, f'{df_name} DataFrame created successfully.', '')
    return True

def check_styler_property(styler, check_type, expected_value, property_path=None):
    """Check one property of a Styler and show a feedback banner.

    Args:
        styler: The object to inspect (its ``caption`` or ``formatters``
            attribute is read, depending on ``check_type``).
        check_type: ``'caption'`` or ``'formatter'``. Any other value is
            reported as uncheckable instead of being compared against None.
        expected_value: For ``'caption'``, the exact caption expected. For
            ``'formatter'``, a format string that must appear in the string
            form of ``styler.formatters`` (a crude presence check).
        property_path: Unused; kept so existing calls keep working.

    Returns:
        True when the property matches ``expected_value``, otherwise False
        (including when the property could not be checked).
    """
    if check_type not in ('caption', 'formatter'):
        display_feedback(False, '', f'Could not check {check_type}.')
        return False

    try:
        if check_type == 'caption':
            actual_value = styler.caption
        else:
            # NOTE(review): confirm that the Styler in use exposes a
            # `formatters` attribute; if it does not, the AttributeError is
            # reported through the except branch below.
            formatter_repr = str(styler.formatters)
            # Crude check: the format string only has to appear in the repr.
            actual_value = expected_value if expected_value in formatter_repr else None
    except Exception as e:
        # Deliberately broad: a checker must never crash the learner's cell.
        print(f"Error checking styler property: {e}")
        display_feedback(False, '', f'Could not check {check_type}.')
        return False

    if actual_value == expected_value:
        display_feedback(True, f'{check_type.capitalize()} seems correct.', '')
        return True

    display_feedback(False, '', f'{check_type.capitalize()} seems incorrect. Expected {expected_value}, Got {actual_value}.')
    return False


In [None]:
# --- State Setup and Data Loading ---
# State this lesson is built around.
# NOTE(review): default_state_abbr is not referenced in the cells visible here — confirm it is used elsewhere.
default_state_abbr = 'MN'
state_full_name = 'Minnesota'

# The CSV file name is the state name lowercased with spaces turned into hyphens.
uninsured_data_url = f"../_static/uninsured/{state_full_name.lower().replace(' ', '-')}.csv"
uninsured_df_initial = pd.read_csv(uninsured_data_url)
# Row count of the raw file; passed to glue() in a later cell.
uninsuredrows_expected = len(uninsured_df_initial)

# Pre-calculate the expected top-10 table so checks have an answer to compare against
top10_expected_df = pd.DataFrame() # Placeholder; reassigned further down
temp_df = uninsured_df_initial.copy()
# Convert both count columns to numeric (errors='coerce' is passed for values that do not parse)
temp_df['nui_pt'] = pd.to_numeric(temp_df['nui_pt'], errors='coerce')
temp_df['nic_pt'] = pd.to_numeric(temp_df['nic_pt'], errors='coerce')
temp_df = temp_df.dropna(subset=['nui_pt', 'nic_pt']) # Drop rows where either count is missing
# Denominator is uninsured + insured, as the lesson text describes
denominator = temp_df['nui_pt'] + temp_df['nic_pt']
# Uninsured share; any missing result of the division is replaced with 0
temp_df['percent_uninsured'] = temp_df['nui_pt'].divide(denominator).fillna(0)
# Force the share to 0 on rows whose denominator is exactly 0
temp_df.loc[denominator == 0, 'percent_uninsured'] = 0 
top10_expected_df = temp_df.nlargest(10, 'percent_uninsured')
# Keep only the four columns the later exercises display, in display order
top10_expected_df = top10_expected_df[['ctyname', 'nic_pt', 'nui_pt', 'percent_uninsured']]
# Sort descending by share and renumber the index from 0
top10_expected_df = top10_expected_df.sort_values(by='percent_uninsured', ascending=False).reset_index(drop=True)

In [None]:
from myst_nb import glue

# Register three values with glue() under string keys: the state name, the CSV
# file name (same lowercase/hyphen slug as the data path) and the raw row count.
# display=False is passed on every call.
# NOTE(review): presumably these keys are referenced from the lesson's markdown — confirm.
glue("state_full_name", state_full_name, display=False)
glue("uninsured_csv_name", f"{state_full_name.lower().replace(' ', '-')}.csv", display=False)
glue("uninsuredrows_expected", uninsuredrows_expected, display=False)

## The Goal

In this lesson, you'll learn how to create informative and visually appealing tables using the pandas Styler API. By the end of this tutorial, you'll understand how to structure data for table creation, customize table elements such as headers and labels using method chaining, and apply styling (like formatting, alignment, and conditional coloring) to enhance readability. You'll practice these skills using real-world data on estimates of people who have health insurance, gaining practical experience in presenting complex information in a clear, tabular format. This ability to create professional-looking tables directly from your DataFrames will be invaluable for effectively communicating data findings in your journalism projects.

## Why Visualize Data?

There are few graphical forms of data more criticized than the data table. For a long time, they were all there was. And they were used heavily – and not particularly creatively. And people knew it wasn’t good.

“Information, that is imperfectly acquired, is generally as imperfectly retained; and a man who has carefully investigated a printed table, finds, when done, that he has only a very faint and partial idea of what he has read; and that like a figure imprinted on sand, is soon totally erased and defaced,” Playfair wrote in 1786.

Fast forward nearly 200 years, Tufte gives a little more room and grace to the table, but only under limited circumstances. And he has no love for pie charts.

“Tables are clearly the best way to show exact numerical values, although the entries can also be arranged in semi-graphical form,” Tufte wrote. “Tables are preferable to graphics for many small data sets. A table is nearly always better than a dumb pie chart; the only worse design than a pie chart is several of them, for then the viewer is asked to compare quantities located in spatial disarray both within and between pies, as in this heavily encoded example from an atlas. Given their low data-density and failure to order numbers along a visual dimension, pie charts should never be used.”

So a table is better than a pie chart, which should never be used. Got it.

But now more than 40 years after Tufte first published The Visual Display of Quantitative Information, tables have evolved a bit further. Tables are still excellent at showing exact numerical values, but now we can incorporate more. Color, gradients, even graphics in each row.

Let’s explore.

## The Basics

A table is the simplest form of data presentation. Tables have been around for millennia. We've improved on them since, so we're not just making a basic table. We're making a table with features.

What features, you ask? Because the tool we're using today (the pandas `Styler`, reached through `DataFrame.style`) is different from plotting libraries like Plotly or Seaborn, we're going to incorporate some best practices for presenting data, including necessary contextual elements. There are some exceptions to this rule, but not many, and they are used only by people who really, really know what they’re doing. The rest of us should make sure every chart has these parts:

1. A headline.
2. Explanatory text under the headline, called chatter in the business.
3. A source line, where you say where you got the data from.
4. A credit line, where you put your name on the chart. All the glory and all the blame belong to you.

There are more parts that your chart might need – annotations, arrows, shapes to indicate regions – and we’ve talked about using color to draw a reader’s eye, but every chart needs those four. So we’re going to build a table with those four things … and some color to draw the eye.

Why a table? Sometimes, the best way to show your data is with a table – simple rows and columns. It allows a reader to compare whatever they want to compare a little easier than a graph where you’ve chosen what to highlight.

We will use the pandas library, specifically the `Styler` object available via `DataFrame.style`, which allows method chaining to apply formatting and styles.

In [None]:
import pandas as pd

This is being written in the middle of a presidential election that seems like it never ended from the last one, and I have a terrible feeling that the next one will start as soon as this one is over. A permanent campaign issue seems to be health care -- who should get it, how it should be paid for, etc. It's such an issue that the Census Bureau has a program called the Small Area Health Insurance Estimates (SAHIE) program, which ... you guessed it ... estimates the number of people who have health insurance in an area. It's an estimate, based on a model, but it's the best we've got if you've got a question about differences in your state when it comes to health insurance coverage. 

Let's take a look.

In [None]:
# File name is the lowercased state name, matching the path the setup cell
# loads and the file name glued into the lesson text; a capitalised name
# would not resolve on a case-sensitive filesystem.
uninsured_df = pd.read_csv("../_static/uninsured/minnesota.csv")

Tables, like bar charts, beeswarms, waffles and a few others, can only fit so many rows of data. Generally speaking, tables are great at top 10 lists. Even top 20 can work if you’ve got the space and your readers have the time. So let’s first take a quick peek at our data and then we’re going to make a list of the 10 counties with the largest percentage of uninsured people.

In [None]:
# Quick peek at the data: show what DataFrame.head() returns for the raw table.
display(uninsured_df.head())

The column `nic_pt` represents the number of insured people, and the column `nui_pt` represents the number of uninsured people. One thing to realize about this data: If you add up the insured people (`nic_pt`) and the uninsured people (`nui_pt`), you will **not** necessarily get the total population of the county. Why? The answer is pretty simple, once you think about it – in the United States, Medicare kicks in for people 65 and over. If you dig into the SAHIE docs, they cut their data off at 64. What this all means is that the numbers we’re looking at are the estimates of non-Medicare aged people. So something to keep in mind when wording certain things later.

### Exercise 1: Making a top 10 list

To make a top 10 list, we have to convert our numbers into percentages first, then use `nlargest` to get that list. We’re going to calculate the percentage of uninsured people.

```{admonition} Key Concept
Calculating a percentage is taking the small thing – the component – and dividing it by the total things. So if five students in a class of 20 are left-handed, the percentage is 5/20.
```

We don’t have the total number of insurable people, so we’ll have to add our two insurance estimate numbers together to get it. Then we’ll name our new dataframe `top10_df`.

In [None]:
# Work on a copy so uninsured_df itself is left as loaded.
temp_df = uninsured_df.copy()
# Fill in the two columns that are added together to form the denominator.
denominator = temp_df['_____'] + temp_df['_____']
# Fill in what the uninsured count is divided by.
temp_df['percent_uninsured'] = temp_df['nui_pt'].divide(_____)

# Get top 10: fill in how many rows to keep and which column to rank by.
top10_df = temp_df.nlargest(n=_____, columns=____)

display(top10_df)

Now, you can display a basic table in a Jupyter notebook just by having the DataFrame as the last item in a cell. Pandas renders it as HTML.

In [None]:
# A bare DataFrame as the last item in a cell is rendered as an HTML table.
top10_df 

So there you have it. You made a table. You're done, right? Not hardly. That default table is functional but not very presentable.

The pandas Styler API (`df.style`) provides many customization options using method chaining.

### Exercise 2: Limiting columns, fixing column names

We’ll start with limiting the number of columns. For this table, let’s select the county name, the insured people, the uninsured people, and the percent uninsured in that order. While we’re at it, let’s add a `sort_values` so we get them in order by percent uninsured.

Then, the header names for those columns are terrible. No one can read them beyond some very nerdy people (like us). We need to fix them. The way this works is we need to tell `rename` what columns we want to change and what we want to change them into. They go in a dictionary in that order – column name from the data: “New Name I Want To Show People”.

In [None]:
# Columns to show, in display order.
cols_to_keep = ['ctyname', 'nic_pt', 'nui_pt', 'percent_uninsured']
# Raw column name -> reader-facing header.
rename_map = {
    'ctyname': 'County',
    'nic_pt': 'Insured people',
    'nui_pt': 'Uninsured people', 
    'percent_uninsured': 'Percent uninsured'
}

# Fill in: the columns to keep, the column to sort by, and the sort direction.
top10_formatted_df = top10_df[_____].sort_values(by=____, ascending=____)
# Fill in: the mapping that turns raw column names into readable headers.
top10_formatted_df = top10_formatted_df.rename(columns=____)

display(top10_formatted_df)

# Compare the row count with the answer pre-computed in the setup cell
# (top10_expected_df). Measuring the frame against its own length would
# make the row check impossible to fail.
check_df_creation(top10_formatted_df, 'top10_formatted_df', 
                  expected_shape=(len(top10_expected_df), 4), 
                  expected_cols=list(rename_map.values()))

Better. The columns are selected, sorted, and renamed. Now we can start working with the Styler object to add presentation elements.

### Exercise 3: Styling, part 1 - Headers

The truth is most of your code in tables is going to be dedicated to styling specific things. First we need: A headline and some chatter. They’re required parts of a graphic so they’re a good place to start. The Styler API uses `.set_caption()` for this. We can use HTML within the caption string for basic formatting like line breaks (`<br>`).

In [None]:
# Start with the formatted DataFrame and apply .style
styler_ex3 = top10_formatted_df.style

# Headline and chatter, joined into one HTML string: bold title, <br>, subtitle.
title_text = "Where are the most uninsured people?"
subtitle_text = "Here are the top counties in the state by uninsured percentage, according to the Census Bureau."
caption_html = f"<b>{title_text}</b><br>{subtitle_text}"

# Fill in the blank: the caption to attach to the table.
styler_ex3.set_caption(____)

display(styler_ex3)

We have a headline and some chatter, but … gross. Centered? The extra lines? No real difference in font weight? We can do better.

### Exercise 4: Changing typography

We can style table elements like the caption using CSS via `.set_table_styles()`. This method takes a list of dictionaries, where each dictionary specifies a CSS selector and the properties to apply.

Let's left-align the caption (which contains our title and subtitle). The selector for the caption element is `'caption'`.

In [None]:
# Start with the Styler object from Ex 3
# styler_ex3 = top10_formatted_df.style.set_caption(caption_html)
styler_ex4 = top10_formatted_df.style.set_caption(caption_html) # Recreate for this step

styler_ex4.set_table_styles([
    {'selector': 'caption', 
     'props': [('text-align', 'left'), 
               ('font-size', '1.1em'), # Slightly larger caption
               ('font-weight', 'bold'), # Make entire caption bold initially
               ('margin-bottom', '10px')]}
], overwrite=False)

display(styler_ex4)

### Exercise 5: Adding source and credit lines

The next items on the required elements list are Source and Credit lines. The common ways to add this are:
1.  Include it in the caption (often at the end).
2.  Add it as a separate markdown cell below the table display.

Let's add it to the caption for simplicity in this exercise. We'll use HTML `<em>` tags for italics and line breaks.

In [None]:
# Define components
title_text = "Where are the most uninsured people?"
subtitle_text = "Top counties in the state by uninsured percentage, according to the Census Bureau."
# Source and credit share a single italic (<em>) line.
source_credit_text = "<em>Source: US Census Bureau | By: Your Name</em>"

# Combine into caption with HTML breaks; the double <br> puts a blank line before the source/credit line
full_caption_html = f"<b>{title_text}</b><br>{subtitle_text}<br><br>{source_credit_text}"

# Apply to styler (fill in the blank: the caption to attach)
styler_ex5 = top10_formatted_df.style.set_caption(____)

# Apply previous alignment style
# overwrite=False appends this rule instead of replacing styles already set
styler_ex5.set_table_styles([
    {'selector': 'caption', 
     'props': [('text-align', 'left'), 
               ('font-size', '1.0em'), # Reset font size slightly
               ('margin-bottom', '10px')]}
], overwrite=False)

display(styler_ex5)

## More styling

We can do a lot with table styling. For instance, we can make the headers bold and reduce the size a bit to reduce font congestion in the area.

In [None]:
# Rebuild the table with the full caption, then chain on the caption and header CSS rules
styler_step7 = (
    top10_formatted_df.style
    .set_caption(full_caption_html)
    .set_table_styles(
        [
            # Caption: left-aligned with a little space below
            {
                'selector': 'caption',
                'props': [('text-align', 'left'), ('font-size', '1.0em'), ('margin-bottom', '10px')],
            },
            # Column headers: bold, slightly smaller, underlined
            {
                'selector': 'th',
                'props': [
                    ('text-align', 'left'),
                    ('font-weight', 'bold'),
                    ('font-size', '0.9em'),
                    ('border-bottom', '2px solid black'),
                ],
            },
        ],
        overwrite=True,  # replace earlier table styles; False would append instead
    )
)

display(styler_step7)

Next up: There are a lot of lines in this that don’t need to be there. We can get rid of them easily and add in some other readability improvements.

We can also format percentages without having to go back to the original data.

In [None]:
# One chain: caption, number formats, CSS rules, then hide the index
styler_step8 = (
    top10_formatted_df.style
    .set_caption(full_caption_html)
    # Number formats: one-decimal percentage, comma-separated whole numbers
    .format(
        {
            'Percent uninsured': "{:.1%}",
            'Insured people': "{:,.0f}",
            'Uninsured people': "{:,.0f}",
        }
    )
    .set_table_styles(
        [
            # Caption style
            {
                'selector': 'caption',
                'props': [('text-align', 'left'), ('font-size', '1.0em'), ('margin-bottom', '10px')],
            },
            # Header style
            {
                'selector': 'th',
                'props': [
                    ('text-align', 'left'),
                    ('font-weight', 'bold'),
                    ('font-size', '0.9em'),
                    ('border-bottom', '2px solid black'),
                ],
            },
            # Remove cell borders
            {'selector': 'td, th', 'props': [('border-style', 'none')]},
            # Row striping on even rows of the table body
            {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', '#f2f2f2')]},
        ],
        overwrite=True,
    )
    # Hide the index
    .hide(axis='index')
)

display(styler_step8)

**Conditional Formatting:** Throughout this series, we've been using color and other signals to highlight things. With the Styler's `.apply()` method, we can apply CSS styles conditionally based on data values. 

Let's highlight the row for the county with the largest *number* of uninsured people (`Uninsured people` column, originally `nui_pt`). We'll define a function that checks if a row's value matches the maximum and returns the CSS string if it does.

In [None]:
# Rebuild the fully styled table from the previous step as a single chain
# (formats, caption, CSS rules, hidden index) before adding the highlight.
styler_step9 = (
    top10_formatted_df.style
    .format(
        {
            'Percent uninsured': "{:.1%}",
            'Insured people': "{:,.0f}",
            'Uninsured people': "{:,.0f}",
        }
    )
    .set_caption(full_caption_html)
    .set_table_styles(
        [
            {
                'selector': 'caption',
                'props': [('text-align', 'left'), ('font-size', '1.0em'), ('margin-bottom', '10px')],
            },
            {
                'selector': 'th',
                'props': [
                    ('text-align', 'left'),
                    ('font-weight', 'bold'),
                    ('font-size', '0.9em'),
                    ('border-bottom', '2px solid black'),
                ],
            },
            {'selector': 'td, th', 'props': [('border-style', 'none')]},
            {'selector': 'tbody tr:nth-child(even)', 'props': [('background-color', '#f2f2f2')]},
        ],
        overwrite=True,
    )
    .hide(axis='index')
)
# --- End of reapplying previous styles ---

# --- Add conditional formatting ---
# .format() only changes how values are displayed; the function given to
# .apply() still receives the underlying numbers, so the maximum can be taken
# from, and compared against, the table's own 'Uninsured people' column.
max_uninsured_value = top10_formatted_df['Uninsured people'].max()


def highlight_max_uninsured(row):
    """Return one CSS string per cell, highlighting the largest uninsured count.

    Args:
        row: One table row, indexed by the renamed column headers.

    Returns:
        A list of ``len(row)`` CSS strings: the red highlight for every cell
        when the row's 'Uninsured people' value equals the table maximum,
        otherwise empty strings (no styling).
    """
    if row['Uninsured people'] == max_uninsured_value:
        return ['background-color: red; color: white'] * len(row)
    return [''] * len(row)  # No style for other rows


# axis=1 applies the function to each row in turn
styler_step9.apply(highlight_max_uninsured, axis=1)

display(styler_step9)

We’ve arrived where we want to be: We’ve created a clear table that allows a reader to compare counties at will while also using color to draw attention to the thing we want to draw attention to. **We’ve kept it simple so the color has impact.**

Note for tables: Copy and paste is your friend. This looks like a lot of code, but 90 percent of it is copy, paste, edit slightly and move on. Once you develop a style on tables, you’ll copy and paste that over and over again and adjust for your new column names. Getting your first one done is a lot of work. Getting your second one done takes minutes.

## The Recap

Throughout this lesson, you’ve mastered the art of creating sophisticated tables using the pandas Styler. You’ve learned to transform raw data into structured, easy-to-read tables by customizing headers, applying styles, and highlighting key information. Remember, effective tables strike a balance between informativeness and visual appeal. You’ll find that well-designed tables can be powerful tools for presenting complex data in a format that’s accessible to your audience. Keep experimenting with different styling options to find the perfect balance for your specific data and story needs.

## Terms to Know

- **pandas Styler (`DataFrame.style`)**: An object returned by `df.style` that allows applying conditional formatting and other styles to DataFrames for display.
- **Method Chaining**: Applying multiple methods sequentially to an object (like the Styler object), e.g., `df.style.format(...).set_caption(...).hide(axis='index')`.
- **`.format()`**: A Styler method to apply string formatting (like percentages, commas, decimal places) to specific columns.
- **`.set_caption()`**: A Styler method to add a table caption (can include title, subtitle, etc., often using HTML).
- **`.set_table_styles()`**: A Styler method to apply CSS styles to table elements (like `caption`, `th`, `td`, `tr`) using selectors.
- **`.apply()`**: A Styler method to apply a function row-wise, column-wise, or table-wise to determine CSS styles based on data values (for conditional formatting).
- **`.hide(axis='index')`**: A Styler method to hide the DataFrame index when displaying the table.
- **CSS Selectors**: Used within `set_table_styles` to target specific HTML elements of the table for styling (e.g., `'th'` for headers, `'tbody tr:nth-child(even)'` for even rows).