# Data Journalism Lesson 25: Color

Learn how to add color to draw attention.

In [None]:
import warnings
from IPython.core.interactiveshell import InteractiveShell

# Keep hold of the real method. If this cell is re-run, the class attribute is
# already our wrapper; in that case reuse the original it remembered instead of
# wrapping the wrapper again (which would stack one extra layer per re-run).
_orig_should_run = getattr(
    InteractiveShell.should_run_async,
    "_unpatched",
    InteractiveShell.should_run_async,
)


def should_run_async(self, code, *args, **kwargs):
    """Call the original ``should_run_async`` with DeprecationWarnings silenced.

    All arguments are forwarded unchanged and the original return value is
    returned as-is; the warning filter only applies for the duration of the
    call because ``catch_warnings`` restores the previous filters on exit.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        return _orig_should_run(self, code, *args, **kwargs)


# Remember the unwrapped method on the wrapper so a later re-run can find it.
should_run_async._unpatched = _orig_should_run

# Apply the monkey-patch
InteractiveShell.should_run_async = should_run_async

In [None]:
import micropip
# Install the plotting dependencies at runtime. The top-level `await` only
# works in a kernel that allows it (presumably a Pyodide/JupyterLite kernel,
# since micropip is used -- confirm against the deployment target).
await micropip.install('plotly')
# nbformat is requested with a minimum version; presumably needed for
# fig.show() to render inside the notebook -- TODO confirm.
await micropip.install('nbformat>=4.2.0') 

In [None]:
from IPython.display import display, HTML
import pandas as pd

# --- Simple Grading/Checking Functions ---
def display_feedback(correct, message_correct, message_incorrect):
    """Show a coloured HTML feedback box in the notebook output.

    Args:
        correct: Truthy to show the green "Correct!" box, falsy to show the
            red "Not quite." box.
        message_correct: Text appended after the heading in the success box.
        message_incorrect: Text appended after the heading in the failure box.

    The messages are inserted into the HTML as-is (no escaping), so callers
    may pass markup such as ``<ul>`` lists.
    """
    if correct:
        background, foreground, border = "#dff0d8", "#3c763d", "#d6e9c6"
        heading, body = "Correct!", message_correct
    else:
        background, foreground, border = "#f2dede", "#a94442", "#ebccd1"
        heading, body = "Not quite.", message_incorrect

    box_style = (
        f"background-color: {background}; color: {foreground}; "
        f"border: 1px solid {border}; padding: 10px; border-radius: 5px;"
    )
    display(HTML(f'<div style="{box_style}"><strong>{heading}</strong> {body}</div>'))

def check_df_rows(inputted_df, expected_rows, df_name="DataFrame"):
    """Report whether ``inputted_df`` is a DataFrame with ``expected_rows`` rows.

    Args:
        inputted_df: Object to check; anything that is not a pandas DataFrame
            fails with a "not a DataFrame" message.
        expected_rows: Required value of ``len(inputted_df)``.
        df_name: Label used for the object in the feedback text.

    Returns:
        True when the row count matches, False otherwise. Feedback is shown
        through ``display_feedback`` in every case.
    """
    # Guard first: a non-DataFrame cannot be meaningfully row-counted.
    if not isinstance(inputted_df, pd.DataFrame):
        display_feedback(False, "", f"The variable '{df_name}' is not a DataFrame.")
        return False

    actual_rows = len(inputted_df)
    if actual_rows != expected_rows:
        display_feedback(False, "", f"{df_name} has {actual_rows} rows, but expected {expected_rows} rows.")
        return False

    display_feedback(True, f"{df_name} has the correct number of rows ({expected_rows}).", "")
    return True

def check_column_exists_and_type(df, column_name, expected_type_str, df_name="DataFrame"):
    """Check that a column exists in ``df`` and has the expected broad type.

    Only ``expected_type_str == "numeric"`` is supported. Any other value
    returns False with a message saying the requested type is unsupported,
    rather than claiming the column "is not numeric".

    Args:
        df: DataFrame whose ``columns`` are inspected.
        column_name: Name of the column that must be present.
        expected_type_str: Broad type name to check for; currently "numeric".
        df_name: Label used for the DataFrame in the feedback text.

    Returns:
        True when the column exists and is numeric, False otherwise. Feedback
        is shown through ``display_feedback`` in every case.
    """
    if column_name not in df.columns:
        display_feedback(False, "", f"Column '{column_name}' is missing from {df_name}.")
        return False

    if expected_type_str != "numeric":
        # This is a problem with the check's configuration, not with the
        # student's data, so say so explicitly.
        display_feedback(
            False,
            "",
            f"Cannot check column '{column_name}' in {df_name}: unsupported expected type "
            f"'{expected_type_str}' (only 'numeric' is supported).",
        )
        return False

    # Basic type check (e.g., numeric for 'percent_uninsured')
    if pd.api.types.is_numeric_dtype(df[column_name]):
        display_feedback(True, f"Column '{column_name}' exists in {df_name} and is numeric.", "")
        return True

    display_feedback(False, "", f"Column '{column_name}' in {df_name} is not of the expected type (numeric). It is {df[column_name].dtype}.")
    return False

def check_plot_bar_params(fig, expected_params):
    """Compare selected properties of a Plotly bar figure with expected values.

    Args:
        fig: Figure-like object exposing ``data`` (a sequence of traces) and
            ``layout``.
        expected_params: Dict selecting which checks to run. Recognised keys:
            ``"num_traces"`` (int), ``"bar_colors"`` (list with the expected
            ``marker.color`` of each trace, by position),
            ``"y_axis_categoryorder"`` (str), ``"template"`` (str) and
            ``"orientation"`` (``'h'`` for horizontal, anything else for
            vertical). Keys that are absent are not checked.

    Returns:
        True when every requested check passes, False otherwise. Feedback is
        shown through ``display_feedback``; on failure it lists the result of
        every check that produced a message.
    """
    messages = []
    all_correct = True
    num_traces = len(fig.data)

    if "num_traces" in expected_params and num_traces != expected_params["num_traces"]:
        messages.append(f"Incorrect number of traces. Expected {expected_params['num_traces']}, got {num_traces}.")
        all_correct = False

    if "bar_colors" in expected_params:  # List of expected colors for traces
        for i, color in enumerate(expected_params["bar_colors"]):
            if i >= num_traces:
                messages.append(f"Expected color for trace {i} but trace does not exist.")
                all_correct = False
                continue
            actual_color = fig.data[i].marker.color
            if actual_color == color:
                messages.append(f"Trace {i} color ('{actual_color}') is correct.")
            else:
                messages.append(f"Trace {i} color is incorrect. Expected '{color}', got '{actual_color}'.")
                all_correct = False

    if "y_axis_categoryorder" in expected_params:
        actual_order = fig.layout.yaxis.categoryorder
        if actual_order == expected_params["y_axis_categoryorder"]:
            messages.append("Y-axis category order is correct.")
        else:
            messages.append(f"Y-axis category order is incorrect. Expected '{expected_params['y_axis_categoryorder']}', got '{actual_order}'.")
            all_correct = False

    if "template" in expected_params:
        # Read the name defensively so a template object without a `name`
        # attribute yields a normal "incorrect" message instead of an
        # AttributeError that aborts the whole check.
        # NOTE(review): confirm that the template object attached to a figure
        # actually carries a `name`; if it never does, this check cannot pass.
        actual_template = getattr(fig.layout.template, "name", None)
        if actual_template == expected_params["template"]:
            messages.append(f"Plot template ('{actual_template}') is correct.")
        else:
            messages.append(f"Plot template is incorrect. Expected '{expected_params['template']}', got '{actual_template}'.")
            all_correct = False

    if "orientation" in expected_params:
        expects_horizontal = expected_params["orientation"] == 'h'
        orientations = [trace.orientation for trace in fig.data if hasattr(trace, 'orientation')]
        if not orientations:
            # all() over an empty sequence is True, which used to make an
            # empty figure count as "horizontal". Treat it as a failure.
            messages.append("No bar traces were found, so bar orientation could not be checked.")
            all_correct = False
        else:
            is_horizontal = all(orientation == 'h' for orientation in orientations)
            if expects_horizontal and is_horizontal:
                messages.append("Bar orientation is correctly horizontal.")
            elif not expects_horizontal and not is_horizontal:
                messages.append("Bar orientation is correctly vertical (or not specified as horizontal).")
            else:
                messages.append(f"Bar orientation is incorrect. Expected horizontal: {expects_horizontal}. Actual: {is_horizontal}")
                all_correct = False

    if all_correct:
        display_feedback(True, "All checked plot elements are correct!", "")
    else:
        feedback_msg = "Some plot elements are incorrect or missing:<ul>" + "".join(f"<li>{m}</li>" for m in messages) + "</ul>"
        display_feedback(False, "", feedback_msg)
    return all_correct

# --- Data Loading and Initial Preparation ---
default_state_abbr = 'MN' # Default from RMD
state_full_name_glue = "Minnesota"

# The CSV is named after the state: lower-cased, spaces replaced by hyphens.
data_url = f"../_static/uninsured/{state_full_name_glue.lower().replace(' ', '-')}.csv"

uninsured_df = pd.read_csv(data_url)
uninsured_rows_expected = len(uninsured_df)

# Share of uninsured people: nui_pt / (nui_pt + nic_pt).
# NOTE(review): nui_pt is treated as the uninsured count below; nic_pt is
# presumably the insured count -- confirm against the data dictionary.
uninsured_df['percent_uninsured'] = uninsured_df['nui_pt'] / (uninsured_df['nui_pt'] + uninsured_df['nic_pt'])
# Ensure percent_uninsured is not NaN for sorting, fill with 0 for top_n if necessary
top20_df = uninsured_df.fillna({'percent_uninsured': 0}).nlargest(20, 'percent_uninsured').copy()

# Find the county with the max number of uninsured people within the top 20 by percentage
if top20_df.empty:
    # No rows to pick from. Keep the column layout (a bare pd.DataFrame() has
    # no 'ctyname' column, so indexing it raised KeyError) and fall back to a
    # placeholder name so the remaining cells can still run.
    top_county_df = top20_df.iloc[0:0].copy()
    top_county_name_glue = "N/A"
else:
    # Double brackets keep the result a one-row DataFrame rather than a Series.
    top_county_df = top20_df.loc[[top20_df['nui_pt'].idxmax()]].copy()
    top_county_name_glue = top_county_df['ctyname'].iloc[0]

In [None]:
from myst_nb import glue

# Register the state and county names under the keys that the lesson prose
# references with {glue:text}`state_full_name` / {glue:text}`top_county_name`.
# display=False presumably keeps the values out of this cell's own output --
# confirm against the myst_nb glue documentation.
glue("state_full_name", state_full_name_glue, display=False)
glue("top_county_name", top_county_name_glue, display=False)

## The Goal

In this lesson, you'll learn how to effectively use color in data visualizations to enhance storytelling and guide reader attention. By the end of this tutorial, you'll understand key principles of color usage in charts, including limiting your color palette, using contrast for emphasis, and avoiding overuse of color. You'll practice applying these concepts by creating a bar chart of uninsured rates, using color strategically to highlight specific data points. These skills will enable you to create more impactful and clear visualizations in your data journalism projects.

## Why Visualize Data?

The main focus of this whole class -- indeed the whole journalism major -- is to tell a story. If the chart is not telling a story, then it doesn't serve a purpose and we've failed. And if our chart does tell a story, but the reader can't find it, then that means we've still failed the main purpose. 

Some charts do a lot of work showing the reader what the story is before we even do anything to it. Some need more help. One way we can offer that help is to use color. 

Color can be quite powerful. It can also ruin a graphic. And the right use of color isn't science -- it's art. That means color has been argued about for centuries, even in the world of graphics. 

The basic rules of color we're going to use are:

1. Limit the number of colors. The fewer the better. If everything is a color, nothing is a color. 
2. Use contrasting colors to draw attention. A color in a sea of grey stands out. 
3. Don't abuse color. Choose appropriate colors, avoid the defaults.

Where do these rules come from? Experience of people who have made graphics before. Looking at what has succeeded and what has failed. 

**Rule 1:** Alberto Cairo, a professor in the University of Miami School of Communication and expert in data visualization, wrote in his book The Functional Art that limiting color was "not just a minimalist aesthetic choice, but a practical one. Limiting the amount of colors and different fonts in your graphics will help you create a sense of unity in the composition."

He went on: 

"The best way to disorient your readers is to fill your graphic with objects colored in pure accent tones. Pure colors are uncommon in nature, so limit them to highlight whatever is important in your graphics, and use subdued hues — grays, light blues, and greens — for everything else. In other words: If you know that the brain prioritizes what it pays attention to, prioritize beforehand."

**Rule 2:** Swiss cartographer Eduard Imhof wrote in 1982 his first rule of color composition: "Pure, bright or very strong colors have loud unbearable effects when they stand unrelieved over large areas adjacent to each other, but extraordinary effects can be achieved when they are used sparingly on or between dull background tones."

**Rule 3:** Edward Tufte, in Envisioning Information, wrote that adding color was the easy part; adding good color in the right place is hard. "The often scant benefits derived from coloring data indicate that putting a good color in a good place is a complex matter. Indeed, so difficult and subtle that avoiding catastrophe becomes the first principle in bringing color to information: Above all, do no harm."

## The Basics

We're going to build a simple bar chart and layer in color as an example. We're going to look at data on the uninsured population by county.

First, ensure your core libraries like pandas and Plotly are ready.

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

And the data. We’re going to do three things here: load the uninsured data, calculate the percent uninsured, and build a top 20 list. First, we’ll load in the uninsured data.

In [None]:
# Fill in the blank with the pandas function that loads a CSV file into a DataFrame.
uninsured_df = pd._____("../_static/uninsured/minnesota.csv")

Now we need a subset of this data.

### Exercise 1: Make a top 20 list

What we need is a top 20 list of counties. We did this exact same thing in the tables tutorial, except there we used the top 10. We’re going to create a column called `percent_uninsured` and use that same column in the `nlargest`.

In [None]:
# Work on a copy so the freshly loaded `uninsured_df` is left untouched.
uninsured_df_calc = uninsured_df.copy()
# Fill in the three column names: one column divided by the sum of two columns.
uninsured_df_calc['percent_uninsured'] = uninsured_df_calc['____'] / (uninsured_df_calc['____'] + uninsured_df_calc['____'])

# Fill in the column to rank by; nlargest(20, ...) keeps the 20 rows with the largest values.
top20_df = uninsured_df_calc.nlargest(20, '____')

We can see the `top20_df` data with `head()`.

In [None]:
# Fill in the name of the DataFrame to preview; head() shows its first rows.
_____.head()

### Exercise 2: Making the first bar chart

Let's start by making a simple horizontal bar chart of `percent_uninsured` for these top 20 counties. We’re going to re-order it and put the labels on the y-axis like we’ve done before.

In [None]:
# Start with an empty figure; it is replaced by the px.bar result below.
fig_ex2 = go.Figure()

# Fill in the column to sort by. Ascending order is used here, presumably so
# the largest bar ends up at the top of the horizontal chart -- confirm
# against the rendered output.
top20_sorted_ex2 = top20_df.sort_values(by='____', ascending=True)

# Build the horizontal bar chart; fill in the x (bar length) and y (label) columns.
fig_ex2 = px.bar(
    data_frame=top20_sorted_ex2,
    x='____', # The value for bar length
    y='____', # The category for each bar
    orientation='h',
    title=f"Top 20 Counties by Percent Uninsured - {state_full_name_glue}"
)

# Axis titles and overall figure height.
fig_ex2.update_layout(
    xaxis_title="Percent Uninsured",
    yaxis_title="County",
    height=600 # Adjust height as needed for 20 bars
)
fig_ex2.update_xaxes(tickformat='.0%') # Format x-axis as percentage
fig_ex2.show()

Now we've got a base. 

### Exercise 3: Using color to reduce attention

Let's make that base fade into the background by changing the color to light grey. With Plotly Express, you set a single color for all the bars by passing a one-item list to `color_discrete_sequence` in `px.bar`; with `go.Bar` traces (or `fig.update_traces`), you set `marker_color` instead.

Recreate the chart from Exercise 2, but set the color of the bars to `"lightgrey"` using `color_discrete_sequence`.

In [None]:
# Start with an empty figure; it is replaced by the px.bar result below.
fig_ex3 = go.Figure()
# Same ordering as the Exercise 2 chart: sort by percent_uninsured, ascending.
top20_sorted_ex3 = top20_df.sort_values(by='percent_uninsured', ascending=True)

# Same chart as Exercise 2, but with one explicit bar color.
# Fill in the blank with the color name for the bars.
fig_ex3 = px.bar(
    data_frame=top20_sorted_ex3,
    x='percent_uninsured',
    y='ctyname',
    orientation='h',
    title=f"Top 20 Counties by Percent Uninsured (Faded) - {state_full_name_glue}",
    color_discrete_sequence=["____"]
)

# Axis titles and overall figure height.
fig_ex3.update_layout(
    xaxis_title="Percent Uninsured",
    yaxis_title="County",
    height=600
)
# Show the x-axis ticks as whole-number percentages.
fig_ex3.update_xaxes(tickformat='.0%')
fig_ex3.show()

Now we can add layers.

### Exercise 4: More layers, more colors

Now we're going to add a second bar trace to our chart. We want this new trace to stand out from the rest, so we'll make its bars dark green. We’re going to use `top20_df`, but we’re going to add a filter, like we’ve done in previous exercises, that finds the county in `top20_df` that has the most uninsured people.

In [None]:
# Start from an empty figure so both layers are added explicitly as traces
fig_ex4 = go.Figure()
top20_sorted_ex4 = top20_df.sort_values(by='percent_uninsured', ascending=True)
# Fill in the column holding the number of uninsured people; idxmax() returns
# the index label of the row with the largest value in that column.
max_uninsured_county = top20_df['_____'].idxmax()
# Fill in the variable holding that index label to keep only the matching row.
top_county_df = top20_df[top20_df.index == _____].copy()

# Base trace: all top 20 counties in lightgrey
fig_ex4.add_trace(go.Bar(
    x=top20_sorted_ex4['percent_uninsured'],
    y=top20_sorted_ex4['ctyname'],
    orientation='h',
    marker_color='____',
    name='Top 20 Counties by % Uninsured'
))

# Highlight trace: the single county selected above, in the contrasting color.
# The legend label says "Top 20" (a count of counties), not "Top 20%".
fig_ex4.add_trace(go.Bar(
    x=top_county_df['percent_uninsured'],
    y=top_county_df['ctyname'],
    orientation='h',
    marker_color='____',
    name=f"{top_county_name_glue} (Max Uninsured Count in Top 20)"
))

fig_ex4.update_layout(
    title_text=f"Uninsured Rate: Highlighting {top_county_name_glue} - {state_full_name_glue}",
    xaxis_title="Percent Uninsured",
    yaxis_title="County",
    height=600,
    barmode='overlay', # Ensures the highlight bar is drawn over the grey one if y-values match
    legend=dict(x=0.6, y=0.1) # Adjust legend position
)
fig_ex4.update_xaxes(tickformat='.0%')
# Ensure y-axis is sorted by percent_uninsured from the base layer for consistency
fig_ex4.update_yaxes(categoryorder='array', categoryarray=top20_sorted_ex4['ctyname'])
fig_ex4.show()

In {glue:text}`state_full_name`, the county with the most uninsured people (numerically) among the top 20 by percentage is {glue:text}`top_county_name`. The method we used just picked a county to highlight based on this criterion. You could filter differently. For example, counties where there were more than X uninsured people, or you could filter for a specific county by name: `top20_df[top20_df['ctyname'] == "Some County Name"]`. The point here is to use color to draw the reader's eye, guiding them to a focal point in your story. Ideally, your narrative would mention {glue:text}`top_county_name` to make this highlighting relevant.

We've got one last color-based task -- get rid of the grey background.

### Exercise 5: Getting out of the way of our colors

Plotly's default theme (`plotly`) has a light grey background. I don’t know why, but it’s there. That grey takes away from our contrast and makes the reader’s eye wander more. We want to draw attention to our shapes, and use color to draw the eye to the shape we want them to see. Something that impacts that is bad, so we want to get rid of it.

The fastest way is to use pre-made themes. `plotly_white` is a common choice for a minimal theme with a white background.

Let's use `plotly_white` here.

In [None]:
# Start from an empty figure and rebuild both layers from Exercise 4.
fig_ex5 = go.Figure()
top20_sorted_ex5 = top20_df.sort_values(by='percent_uninsured', ascending=True)

# Base trace: all top 20 counties in lightgrey
fig_ex5.add_trace(go.Bar(
    x=top20_sorted_ex5['percent_uninsured'],
    y=top20_sorted_ex5['ctyname'],
    orientation='h',
    marker_color='lightgrey',
    name='Top 20 Counties by % Uninsured'
))

# Highlight trace: reuses `top_county_df` from the Exercise 4 cell.
# The legend label says "Top 20" (a count of counties), not "Top 20%".
fig_ex5.add_trace(go.Bar(
    x=top_county_df['percent_uninsured'],
    y=top_county_df['ctyname'],
    orientation='h',
    marker_color='darkgreen',
    name=f"{top_county_name_glue} (Max Uninsured Count in Top 20)"
))

# Fill in the template blank with the name of the theme to apply.
fig_ex5.update_layout(
    title_text=f"Uninsured Rate (Minimal Theme) - {state_full_name_glue}",
    xaxis_title="Percent Uninsured",
    yaxis_title="County",
    height=600,
    barmode='overlay',
    legend=dict(x=0.6, y=0.1),
    template="____"
)
fig_ex5.update_xaxes(tickformat='.0%')
# Keep the y-axis in the same order as the sorted base layer.
fig_ex5.update_yaxes(categoryorder='array', categoryarray=top20_sorted_ex5['ctyname'])
fig_ex5.show()

This chart has some work left -- headlines, some text to explain the dark green bars, the axis labels could be improved -- but this chart tells a story and our reader can find it. 

## Using color wrong

The best way to know you're using color wrong is to watch what is -- without argument -- the best Pixar movie: The Incredibles. In it, the bad guy tries to not only destroy superheroes but the very idea of them by making everyone a superhero. This is the money quote: 

<iframe width="560" height="315" src="https://www.youtube.com/embed/VYLdI6n2-yQ" title="Syndrome - When Everyone's Super... No One Will Be" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>

So how do you do that here? By mapping a categorical variable to the `color` argument in `px.bar` without a specific, limited color sequence. This makes everything a different color by default.

### Exercise 6: Doing it wrong

Recreate our bar chart, but this time, map the `color` aesthetic to `ctyname`.

In [None]:
# Start with an empty figure; it is only replaced (and shown) when top20_df has rows.
fig_ex6 = go.Figure()
if not top20_df.empty:
    top20_sorted_ex6 = top20_df.sort_values(by='percent_uninsured', ascending=True)
    
    # Fill in the column to map to color; each distinct value gets its own color.
    fig_ex6 = px.bar(
        data_frame=top20_sorted_ex6,
        x='percent_uninsured',
        y='ctyname',
        orientation='h',
        color='____',
        title="When Every County is Highlighted..."
    )
    
    fig_ex6.update_layout(
        xaxis_title="Percent Uninsured",
        yaxis_title="County",
        height=700, # Taller to accommodate legend if it appears
        showlegend=False # Usually too many items for a legend to be useful here
    )
    # Show the x-axis ticks as whole-number percentages.
    fig_ex6.update_xaxes(tickformat='.0%')
    fig_ex6.show()

Good luck reading that. Looks like a pack of crayons from when you were a little kid. Because everything is a color, good luck finding something to focus on. If you give your reader too much to focus on, there's nothing they can focus on. 

## The Recap

Throughout this lesson, you've explored the power of color in data visualization and learned how to use it judiciously to enhance your charts. You've practiced creating a base chart, using color to fade less important elements into the background, and highlighting key data points with contrasting colors. Remember, effective use of color is about balance - drawing attention to the most important aspects of your data while avoiding visual clutter. Consider how thoughtful color choices can make your visualizations more engaging and easier to understand, ultimately helping you tell stronger data-driven stories.

## Terms to Know

- **Color Contrast**: The use of different colors to create visual distinction between elements in a chart. High contrast (e.g., a bright color on a muted background) draws attention.
- **`marker_color`**: In Plotly `go.Bar` or `go.Scatter`, this property sets the fill color of markers or bars. In `px` functions, often controlled by `color` argument or `color_discrete_sequence`.
- **`color` (in `px` functions)**: When mapped to a data column in Plotly Express (e.g., `px.bar(..., color='category_column')`), it assigns different colors based on the unique values in that column.
- **`color_discrete_sequence`**: An argument in Plotly Express functions to provide a list of specific colors to use for categorical data when `color` is mapped.
- **`fig.add_trace(go.Bar(...))`**: Method to add a new bar trace to an existing Plotly figure, allowing for layered charts with different colors or properties per trace.
- **Plotly Templates (e.g., `plotly_white`)**: Predefined styles for Plotly charts that control background, fonts, gridlines, etc. Set via `fig.update_layout(template=...)`.
- **`barmode='overlay'`**: In `fig.update_layout()`, when multiple bar traces share the same categories, `'overlay'` draws them on top of each other. Useful for highlighting.