In [None]:
import pandas as pd

# For reference, see chapter 9 of Python for Data Analysis, 3rd Edition
# https://wesmckinney.com/book/plotting-and-visualization.html

# Plotting directly from pandas

pandas has built-in capabilities that allow you to make some simple kinds of graphs directly from DataFrame indices and columns.

## Line graphs

In [None]:
# Address of the us.csv file in the nytimes/covid-19-data GitHub repository
url = 'https://github.com/nytimes/covid-19-data/raw/master/us.csv'

# pandas can read a CSV straight from a URL
covid = pd.read_csv(filepath_or_buffer=url)

# Display the last five rows
covid.tail(n=5)

In [None]:
# With no arguments, .plot() draws a line graph with the row index on the X axis.
covid.plot()
# covid.plot(kind='line') # .plot() defaults to this specific kind of plot
# covid.plot.line() # alternate method of specifying the kind of plot

Convert date strings to datetime objects, then set the date column as the row index.

In [None]:
# See https://stackoverflow.com/questions/25416955/plot-pandas-dates-in-matplotlib for setting X axis as dates

# Parse the ISO-style date strings (year-month-day) into datetime objects
parsed_dates = pd.to_datetime(covid['date'], format='%Y-%m-%d')
covid['date'] = parsed_dates

# Use the parsed dates as the row index so they become the X axis when plotting
covid = covid.set_index('date')
covid.head()

In [None]:
# Same call as before; the X axis now comes from the datetime row index set in the previous cell.
covid.plot()

## Scatterplot

In [None]:
# Load the falling cats data. The scatter plot in the next cell needs
# both an X column and a Y column from this table.
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/falling_cats.csv'
cats = pd.read_csv(filepath_or_buffer=url)
cats.head(n=5)


In [None]:
# Name the X and Y columns explicitly by keyword
cats.plot.scatter(x='stories_fallen', y='injury_rate')

## Pie chart

In [None]:
# Read the Excel workbook from GitHub and label the rows by state name in one step
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(url).set_index('State')
state_co2_sector.head()

In [None]:
# Label-based slices include both endpoints, so this covers every column
# from Commercial through Transportation.
sector_span = slice('Commercial', 'Transportation')

# Selecting a single row label with a column slice yields a pandas Series
az_sector_components = state_co2_sector.loc['Arizona', sector_span]
az_sector_components

In [None]:
# The Series index labels become the wedge labels on the chart
az_sector_components.plot.pie()

## Bar chart

In [None]:
# Rows Alabama through Wyoming (both endpoints included) for the single
# Total column, which gives a pandas Series
state_rows = slice('Alabama', 'Wyoming')
totals_by_state = state_co2_sector.loc[state_rows, 'Total']
totals_by_state.head()

In [None]:
# Sort the index in descending order before drawing the horizontal bars
reverse_alphabetical = totals_by_state.sort_index(ascending=False)

# The Series index labels become the bar labels
reverse_alphabetical.plot.barh(figsize=(10,10))

# pyplot from matplotlib

## Controlling display with figures and subplots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Start with an empty figure
fig = plt.figure()

# Lay the subplots out on a grid of 1 row by 2 columns;
# the third argument of add_subplot picks the grid position (counted from 1)
n_rows, n_cols = 1, 2
axes1 = fig.add_subplot(n_rows, n_cols, 1)
axes2 = fig.add_subplot(n_rows, n_cols, 2)

In [None]:
# Start with an empty figure
fig = plt.figure()

# This time the grid is 2 rows by 1 column, so the subplots are stacked vertically
n_rows, n_cols = 2, 1
axes1 = fig.add_subplot(n_rows, n_cols, 1)
axes2 = fig.add_subplot(n_rows, n_cols, 2)
plt.show() # displays the graph if you aren't using Jupyter notebooks

In [None]:
# Reload the COVID 19 data so the rows are numbered 0, 1, 2, ... again
url = 'https://github.com/nytimes/covid-19-data/raw/master/us.csv'
covid = pd.read_csv(filepath_or_buffer=url)

# Keep only the first 50 rows (the first 50 days of data)
first_cases = covid.head(50)
first_cases.tail()

In Jupyter notebooks, a figure is drawn and then closed at the end of each cell, so all of the code that sets up and draws a figure must be included in a single cell.

In [None]:
# One tall figure holding two subplots stacked vertically (2 rows, 1 column)
fig = plt.figure(figsize=(10,10))
axes1 = fig.add_subplot(2, 1, 1)
axes2 = fig.add_subplot(2, 1, 2)

# Line styling for each panel, collected as keyword arguments
cases_style = dict(color='k', linestyle='dashed', marker='o')
deaths_style = dict(color='r', linestyle='dashed', marker='x')

# .plot() draws a line plot (points connected by a line).
# Upper panel: the column is selected with square brackets.
axes1.plot(first_cases.index, first_cases['cases'], **cases_style)
axes1.set_title('cases')

# Lower panel: the column is selected with the shorter "dot" notation,
# which is an equivalent way to reach the same data.
axes2.plot(first_cases.index, first_cases.deaths, **deaths_style)
axes2.set_title('deaths')

## Plot in a single subplot

In [None]:
# A figure containing one subplot
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1, 1, 1)

# Draw both columns on the same axes: cases in black with circle markers,
# deaths in red with x markers
for column, line_color, point_marker in (('cases', 'k', 'o'), ('deaths', 'r', 'x')):
    ax.plot(first_cases.index, first_cases[column],
            color=line_color, linestyle='dashed', marker=point_marker)

# Axis labels and title for the subplot
ax.set_xlabel('days since first case')
ax.set_ylabel('number of persons')
ax.set_title('start of the COVID 19 pandemic in the U.S.')

Display as a bar graph (unstacked)

In [None]:
# A figure containing one subplot
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1, 1, 1)

# Axis labels and title for the subplot
ax.set_xlabel('days since first case')
ax.set_ylabel('number of persons')
ax.set_title('start of the COVID 19 pandemic in the U.S.')

# .bar() draws a bar plot. Both series share the same X positions, so the
# deaths bars are drawn over the cases bars rather than stacked on top of them.
# Giving each set of bars a label is what allows a legend to be built.
x_positions = first_cases.index
for column, bar_color in (('cases', 'k'), ('deaths', 'r')):
    ax.bar(x_positions, first_cases[column], color=bar_color, label=column)

ax.legend() # Display the legend

## Scatterplot with best fit curve

In [None]:
# Read the falling cats CSV again so this section can run on its own
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/falling_cats.csv'
cats = pd.read_csv(filepath_or_buffer=url)
cats.head(n=5)

In [None]:
# Pull the two columns out as pandas Series; the cells below reuse them.
stories_fallen = cats['stories_fallen']
injury_rate = cats['injury_rate']

# Show both Series with a blank line between them
print(stories_fallen, injury_rate, sep='\n\n')

Recreate the cat scatterplot, this time using `pyplot`.

In [None]:
# A small square figure with a single subplot
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(1, 1, 1)

# .scatter() draws an x,y scatterplot; the two Series supply the coordinates
ax.scatter(x=stories_fallen, y=injury_rate, color='r')

# Axis labels for the subplot
ax.set_xlabel('stories fallen')
ax.set_ylabel('average injury per cat')

We can create a best-fit polynomial using numpy

In [None]:
import numpy as np

# Least-squares fit of a second-order polynomial (a parabola rather than a
# straight line). polyfit returns the coefficients, highest power first.
z = np.polyfit(x=stories_fallen, y=injury_rate, deg=2)

# Wrap the coefficients in a callable polynomial: p(x) evaluates the fit at x
p = np.poly1d(z)


Recreate the plot, but now add the best fit line (as a line plot) to the subplot.

In [None]:
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('stories fallen')
ax.set_ylabel('average injury per cat')

# Create the scatterplot
ax.scatter(stories_fallen, injury_rate)

# Add the polynomial best fit curve as a dashed line.
# Evaluate the polynomial on a dense, evenly spaced, ascending grid that spans
# the data. Evaluating it only at the data points would draw straight segments
# between them, and the line would double back on itself if the rows were not
# sorted by X.
fit_x = np.linspace(stories_fallen.min(), stories_fallen.max(), 200)

# plot type .plot() is a line plot rather than scatterplot
ax.plot(fit_x, p(fit_x), "r--") # r for red, -- for dashed line


In [None]:
# The deviations are made-up numbers for illustration; in real work they would
# be calculated from the data beforehand. Any one-dimensional data object
# (list, Series, or NumPy array) is accepted -- a list and a Series are used here.
# NOTE(review): seven values are hard-coded, which presumably matches the number
# of rows in the cats data -- confirm if the dataset changes.
upper_deviation = [0.1, 0.2, 0.3, 0.1, 0.5, 0.2, 0.6]
lower_deviation = pd.Series([0.1, 0.4, 0.1, 0.2, 0.5, 0.6, 0.3])

# Unequal lower and upper errors are passed as a pair: lower values first, then upper
asymmetric_error = [lower_deviation, upper_deviation]

fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('stories fallen')
ax.set_ylabel('average injury per cat')

# .errorbar() draws the points (fmt='o' gives circle markers) with error bars
ax.errorbar(stories_fallen, injury_rate, yerr=asymmetric_error, fmt='o')


# Creating multiple graphs programmatically

## Multiple pie charts

We will use some of our pandas DataFrame wrangling skills to prepare data for creating pie charts. We will create multiple charts by looping rather than by manually creating them one at a time.

In [None]:
# Read the workbook again and label the rows by state name in one step
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(url).set_index('State')

# Look at the end of the table this time
state_co2_sector.tail()

Drop the last row (totals by sector)

In [None]:
# Remove the row whose index label is 'Total'
state_co2_sector = state_co2_sector.drop(index='Total')
state_co2_sector.tail()

Sort descending by the state totals

In [None]:
# Largest state total first
decreasing = state_co2_sector.sort_values('Total', ascending=False)
decreasing.head()

Delete the total column (no longer needed after sorting)

In [None]:
# Only the sector columns are needed for the pie charts
decreasing = decreasing.drop(columns=['Total'])
decreasing.head()

Since the number of states to plot isn't hard-coded in the script and since we are generating multiple plots programmatically, the user can choose any number of states to plot.

The `.subplots()` method generates a sequence of subplots that can be accessed by their index number. This allows us to generate many subplots with a little bit of code in a loop.

In [None]:
requested_states = int(input('How many states to plot? '))

# Keep the count between 1 and the number of rows available. Zero or a negative
# number cannot form a subplot grid, and a number larger than the table would
# run past the last row.
number_of_states = max(1, min(requested_states, len(decreasing)))
if number_of_states != requested_states:
    print('Plotting', number_of_states, 'states instead of', requested_states)

fig = plt.figure(figsize=(5, 4*number_of_states))

# squeeze=False makes .subplots() return a 2-D array even for a single subplot
# (by default a 1 x 1 grid returns a bare Axes object, which cannot be indexed).
# Taking column 0 gives a 1-D sequence of subplots indexed by position.
ax = fig.subplots(number_of_states, 1, squeeze=False)[:, 0]

# One pie chart per state: the row supplies the wedge sizes, the column names
# supply the wedge labels, and the row label (state name) is the title.
for subplot in range(number_of_states):
    ax[subplot].pie(decreasing.iloc[subplot], labels=decreasing.columns)
    ax[subplot].set_title(decreasing.index[subplot])


# Optional

This last example is a lot more complicated, but shows more possibilities for plotting.

## Creating a stacked bar chart 

One of the deficiencies of pie charts is that they only show the relative fraction of the sectors. A stacked bar chart shows not only the relative fraction of the sectors, but also the overall magnitude. So it's a better visualization type. Unfortunately, it's not easy to create stacked bar charts in Matplotlib.

In [None]:
# Read the workbook again so this section can run on its own.
# State stays an ordinary column here; the next cell sets it as the index.
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(io=url)
state_co2_sector.tail(n=5)

In [None]:
# Sector data for the states with the largest totals
number_of_states = 4
top_state_sectors = (
    state_co2_sector
    .set_index('State')                        # label rows by state name
    .drop(index='Total')                       # remove the totals-by-sector row
    .sort_values(by='Total', ascending=False)  # largest state total first
    .drop(columns=['Total'])                   # keep only the sector columns
    .iloc[:number_of_states]                   # first few rows after sorting
)
top_state_sectors

In [None]:
# Based on example at https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781849513265/1/ch01lvl1sec17/plotting-stacked-bar-charts
# See also https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/bar_stacked.html

import numpy as np

# A wide figure containing one subplot
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(1, 1, 1)

# Row (state) and column (sector) labels as numpy arrays
row_labels = top_state_sectors.index.values
column_labels = top_state_sectors.columns.values

# Integer X positions for the bars, one per row
ind = np.arange(len(top_state_sectors))

# Draw the sectors one layer at a time. Each layer starts at the sum of all the
# sector columns to its left, which stacks it on top of the earlier layers.
# (For the first sector there are no columns to the left, so the sum is zero.)
sector_count = len(top_state_sectors.columns)
for sector_number in range(sector_count):
    heights = top_state_sectors.iloc[:, sector_number]
    sector_sums = top_state_sectors.iloc[:, :sector_number].sum(axis='columns')
    ax.bar(ind, heights, bottom=sector_sums)

# These pyplot functions act on the most recently active subplot; there is only one here.
# The legend entries are matched to the layers in the order they were drawn.
plt.xticks(ind, row_labels)
plt.legend(column_labels)


There are many, many types of plots and options. See the [matplotlib gallery](https://matplotlib.org/3.1.1/gallery/index.html) for examples. For more complex plot types, it may be easier to use seaborn, a high-level statistical graphics library built on matplotlib. See https://seaborn.pydata.org/ for more information.

# Practice

See [this page](https://github.com/HeardLibrary/digital-scholarship/tree/master/data/codegraf) for information about the dataset.

In [None]:
# Path to the flight data CSV, relative to the notebook's working directory
url = 'data/flight_data_set.csv'
flights = pd.read_csv(filepath_or_buffer=url)
flights.head(n=5)

Calculate the average values for the carriers and slice out the Minutes of Delay per flight. Create a bar chart of the resulting series.

In [None]:
# Average the numeric columns for each carrier. numeric_only=True is needed
# because the table also holds text columns (such as Date at this point);
# pandas 2.x raises a TypeError when asked for the mean of text.
grouped = flights.groupby(['Carrier Name']).mean(numeric_only=True)

# Keep only the per-flight delay. drop() returns a new table, so the result is
# assigned back rather than modifying in place.
grouped = grouped.drop(columns=['Minutes of Delay', 'Number of Flights'])

# Horizontal bars, sorted in ascending order of minutes of delay per flight
grouped.sort_values(by='Minutes of Delay per Flight', ascending=True).plot(kind='barh', figsize=(20,10))

Recreate the plot, but this time replace `NaN` values with zeros.

In [None]:
# Replace missing per-flight delays with zero. Passing a dictionary limits the
# replacement to the named column.
flights = flights.fillna({'Minutes of Delay per Flight': 0})

# Average the numeric columns for each carrier. numeric_only=True is needed
# because the table also holds text columns (such as Date at this point);
# pandas 2.x raises a TypeError when asked for the mean of text.
grouped = flights.groupby(['Carrier Name']).mean(numeric_only=True)

# Keep only the per-flight delay
grouped = grouped.drop(columns=['Minutes of Delay', 'Number of Flights'])

# Horizontal bars, sorted in ascending order of minutes of delay per flight
grouped.sort_values(by='Minutes of Delay per Flight', ascending=True).plot(kind='barh', figsize=(20,10))

Convert date column to a datetime object and group by Carrier Name.

In [None]:
# Work on a copy so the original flights table keeps its text dates
date_flights = flights.copy()

# Dates in the file are written month/day/year
date_flights['Date'] = pd.to_datetime(date_flights['Date'], format = '%m/%d/%Y')

# Group by the column name itself rather than a one-item list. With a one-item
# list, newer versions of pandas expect group keys to be one-item tuples, so
# looking a group up by a plain string such as 'Delta' is deprecated.
grouped = date_flights.groupby('Carrier Name')

# .head() on a grouped table returns the first five rows of every group
grouped.head()

Slice only the Delta data and sum by date. Plot only the Minutes of Delay by date.

In [None]:
# Pull out the rows for a single carrier
delta = grouped.get_group('Delta')

# Total the numeric columns for each date. numeric_only=True leaves out text
# columns such as Carrier Name; without it pandas 2.x "sums" text by joining
# the strings together.
time_series = delta.groupby('Date').sum(numeric_only=True)

# Keep only the Minutes of Delay
time_series = time_series.drop(columns=['Minutes of Delay per Flight', 'Number of Flights'])

# print() is used because only the last expression in a cell is displayed automatically
print(time_series.head())
time_series.plot(kind='line', figsize=(20,10))


Let's see if this pattern holds across airlines. Group by both Carrier Name and Date rather than selecting only one airline. Limit output to Minutes of Delay data.

In [None]:
# Work on a copy and parse the month/day/year date strings into datetime objects
date_flights = flights.copy()
date_flights['Date'] = pd.to_datetime(date_flights['Date'], format = '%m/%d/%Y')

# Total the columns for every carrier/date combination; the two grouping
# columns become a two-level row index
carrier_date_groups = date_flights.groupby(['Carrier Name', 'Date'])
grouped = carrier_date_groups.sum()

# Limit the output to Minutes of Delay
grouped = grouped.drop(columns=['Minutes of Delay per Flight', 'Number of Flights'])
grouped.head()

Unstack Carrier Name so that we get a column for each carrier. Now if we redo the plot, Pandas will plot a line for each carrier.

In [None]:
# Move the Carrier Name level of the row index into the columns, leaving Date as the row index
columned = grouped.unstack(level='Carrier Name')

# print() is used because only the last expression in a cell is displayed automatically
print(columned.head())

# One line is drawn per column
columned.plot.line(figsize=(20,10))

Let's compare the number of flights across airlines using a bar chart.

In [None]:
# Total the numeric columns for each carrier. numeric_only=True leaves out text
# columns such as Date (still text in the flights table); without it pandas 2.x
# "sums" text by joining the strings together.
grouped = flights.groupby(['Carrier Name']).sum(numeric_only=True)

# Keep only the Number of Flights
grouped = grouped.drop(columns=['Minutes of Delay per Flight', 'Minutes of Delay'])

# Vertical bars, one per carrier
grouped.plot(kind='bar', figsize=(20,10))