In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For reference, see chapter 9 of Python for Data Analysis, 2nd Edition

# Line graphs

In [None]:
# Read us.csv from the nytimes/covid-19-data repository on GitHub into a DataFrame
url = 'https://github.com/nytimes/covid-19-data/raw/master/us.csv'
covid = pd.read_csv(filepath_or_buffer=url)
# Show the last five rows
covid.tail(n=5)

In [None]:
# kind='line' is written out here; it is what a bare .plot() call uses by default
covid.plot(kind='line')
# covid.plot.line() is the accessor-style spelling of the same request

In [None]:
# Plot with the date column as the index; set_index returns a new frame, so covid is unchanged
(
    covid
    .set_index(keys=['date'])
    .plot()
)

Fix the dates on the X axis

In [None]:
# See https://stackoverflow.com/questions/25416955/plot-pandas-dates-in-matplotlib for setting X axis as dates
# Convert the 'date' strings to datetime objects and make them the index.
# The guard keeps this cell re-runnable: once 'date' has become the index it is no longer
# a column, so an unguarded second execution would raise KeyError.
if 'date' in covid.columns:
    covid['date'] = pd.to_datetime(covid['date'], format='%Y-%m-%d')
    covid = covid.set_index('date')
covid.head()

In [None]:
# Line plot of covid as it now stands
covid.plot.line()

## Subplots

In [None]:
# subplots=True gives each column its own panel
covid.plot.line(
    title='Covid 19 cases in the U.S.',
    subplots=True,
)

In [None]:
# Same panels as before, with an explicit 10 x 10 figure size
covid.plot.line(
    title='Covid 19 cases in the U.S.',
    subplots=True,
    figsize=(10, 10),
)

In [None]:
# logy=True switches the Y axis of each panel to a logarithmic scale
covid.plot.line(
    title='Covid 19 cases in the U.S.',
    subplots=True,
    figsize=(10, 10),
    logy=True,
)

## Scatterplot

In [None]:
# A scatter plot has to be told which column supplies X and which supplies Y
covid.plot.scatter(x='cases', y='deaths')

## Pie chart

In [None]:
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
# Read the spreadsheet and make the 'State' column the row index in one expression
state_co2_sector = pd.read_excel(url).set_index('State')
state_co2_sector.head()

In [None]:
# Select the single row labelled 'Arizona'
az_sector = state_co2_sector.loc['Arizona', :]
az_sector

In [None]:
# Label-based slice: keeps the entries from 'Commercial' through 'Transportation', both ends included
az_sector_components = az_sector.loc['Commercial':'Transportation']
az_sector_components


In [None]:
# One wedge per entry of the series
az_sector_components.plot.pie()

In [None]:
# Remove the row labelled 'Total'; drop() returns a new frame, so state_co2_sector is untouched
no_totals = state_co2_sector.drop(index='Total')
no_totals.tail()

In [None]:
# Order the rows from the largest 'Total' to the smallest
decreasing = no_totals.sort_values('Total', ascending=False)
decreasing.head(5)

In [None]:
# Remove the 'Total' column so only the remaining columns are left.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell re-runnable:
# an in-place drop raises KeyError on a second execution because 'Total' is already gone.
decreasing = decreasing.drop(columns=['Total'], errors='ignore')
decreasing.head()

In [None]:
# First three rows, selected by position
decreasing.iloc[:3]

In [None]:
# After transposing, each of the first three rows becomes its own pie (one subplot per
# column of the transposed frame) and the original columns become the wedges
decreasing.iloc[:3].transpose().plot.pie(subplots=True, legend=False, figsize=(20, 10))

In [None]:
# Same transposed pie layout, for four rows picked by label
selected_states = ['Texas', 'Alaska', 'Ohio', 'District of Columbia']
decreasing.loc[selected_states].transpose().plot.pie(subplots=True, legend=False, figsize=(20, 10))

## Bar chart

In [None]:
# 'Total' column for the rows labelled 'Alabama' through 'Wyoming' (label slice, both ends included)
totals_by_state = state_co2_sector.loc['Alabama':'Wyoming', 'Total']
totals_by_state.head()

In [None]:
# Horizontal bars, one per entry
totals_by_state.plot.barh()

In [None]:
# Sort the index in descending order before drawing, and enlarge the figure to 10 x 10
totals_by_state.sort_index(ascending=False).plot.barh(figsize=(10, 10))

# pyplot from matplotlib

In [None]:
# Keep the first 50 rows of covid (positional slice)
first_cases = covid.iloc[:50]
first_cases.head()

## Using the plt.plot function

In [None]:
# cases on X, deaths on Y, joined by a line
plt.plot(first_cases['cases'], first_cases['deaths'])

In [None]:
# Same two columns drawn as unconnected points
plt.scatter(x=first_cases['cases'], y=first_cases['deaths'])

In [None]:
# Black dashed line with a circle marker at every data point
plt.plot(
    first_cases['cases'],
    first_cases['deaths'],
    color='k',
    linestyle='dashed',
    marker='o',
)

In [None]:
# Series computed on the fly are accepted by pyplot just like columns taken from a DataFrame
thousands_cases = covid['cases'].div(1000)
thousands_deaths = covid['deaths'].div(1000)
plt.scatter(x=thousands_cases, y=thousands_deaths)

## Controlling display with figures and subplots

In [None]:
# Start from an empty figure
fig = plt.figure()

# Lay out a grid of 1 row by 2 columns and create both of its cells
axes1, axes2 = (fig.add_subplot(1, 2, position) for position in (1, 2))

In [None]:
# Start from an empty figure
fig = plt.figure()

# Lay out a grid of 2 rows by 1 column and create both of its cells
axes1, axes2 = (fig.add_subplot(2, 1, position) for position in (1, 2))

# Needed to display the graph when running outside Jupyter notebooks
plt.show()

In Jupyter notebooks, plots are reset after every cell, so all of the code that sets up and draws a figure must be included in a single cell.

In [None]:
# 10 x 10 figure holding two panels stacked vertically (2 rows, 1 column)
fig = plt.figure(figsize=(10, 10))
axes1, axes2 = (fig.add_subplot(2, 1, position) for position in (1, 2))

# Upper panel: cases as a black dashed line with circle markers
axes1.plot(first_cases.index, first_cases['cases'], color='k', linestyle='dashed', marker='o')
# Lower panel: deaths as a red dashed line with x markers
axes2.plot(first_cases.index, first_cases['deaths'], color='r', linestyle='dashed', marker='x')

# Title each panel after the column it shows
axes1.set_title('cases')
axes2.set_title('deaths')

## Plot in a single subplot

In [None]:
# 10 x 10 figure with one subplot filling it (111 = 1 row, 1 column, first cell)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)

# Both series are drawn on the same axes
ax.plot(first_cases.index, first_cases['cases'], color='k', linestyle='dashed', marker='o')
ax.plot(first_cases.index, first_cases['deaths'], color='r', linestyle='dashed', marker='x')
ax.set_title('start of the COVID 19 pandemic in the U.S.')

Display as a bar graph (unstacked)

In [None]:
# 10 x 10 figure with one subplot filling it (111 = 1 row, 1 column, first cell)
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)

# Both bar series use the same X positions, so the red bars are drawn over the black ones
ax.bar(first_cases.index, first_cases['cases'], color='k')
ax.bar(first_cases.index, first_cases['deaths'], color='r')
ax.set_title('start of the COVID 19 pandemic in the U.S.')

## Creating a plot programmatically
Stacked bar graph 

In [None]:
# Fresh copy of the spreadsheet; here 'State' stays an ordinary column (no index is set)
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(io=url)
state_co2_sector.tail(5)

In [None]:
# Build the table of the number_of_states rows with the largest 'Total'
number_of_states = 4
top_state_sectors = (
    state_co2_sector
    .set_index('State')                         # label the rows by State
    .drop(index='Total')                        # remove the row labelled 'Total'
    .sort_values(by='Total', ascending=False)   # largest 'Total' first
    .drop(columns=['Total'])                    # the 'Total' column is only needed for sorting
    .iloc[:number_of_states]                    # keep the leading rows
)
top_state_sectors

In [None]:
# Based on example at https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781849513265/1/ch01lvl1sec17/plotting-stacked-bar-charts
# See also https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/bar_stacked.html

# One 15 x 10 figure containing a single subplot
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(1, 1, 1)

# X positions 0..n-1, one per row of top_state_sectors
ind = np.arange(len(top_state_sectors))

# Row and column labels as numpy arrays, used below for the tick labels and the legend
row_labels = top_state_sectors.index.values
column_labels = top_state_sectors.columns.values

# Draw one layer of bars per column. Each layer starts at the row-wise sum of all the
# columns to its left, which is what stacks the layers on top of one another
# (for the first column that sum is taken over zero columns).
for sector_number in range(top_state_sectors.shape[1]):
    sector_sums = top_state_sectors.iloc[:, :sector_number].sum(axis='columns')
    ax.bar(ind, top_state_sectors.iloc[:, sector_number], bottom=sector_sums)

# pyplot-level calls act on the most recently active subplot; there is only one here
plt.xticks(ind, row_labels)
plt.legend(column_labels)


There are many, many types of plots and options. See the [matplotlib gallery](https://matplotlib.org/3.1.1/gallery/index.html) for examples.

# Practice

See [this page](https://github.com/HeardLibrary/digital-scholarship/tree/master/data/codegraf) for information about the dataset.

In [None]:
# Despite the variable name this is a relative file path, not a web address
url = 'data/flight_data_set.csv'
flights = pd.read_csv(filepath_or_buffer=url)
flights.head(n=5)

Calculate the average values for the carriers and slice out the Minutes of Delay per flight. Create a bar chart of the resulting series.

In [None]:
# Average the numeric columns for each carrier, then discard the two columns that are not plotted.
# numeric_only=True matters on pandas >= 2.0, where mean() no longer silently skips text
# columns and raises TypeError instead.
# NOTE(review): presumably 'Date' is still a text column at this point -- confirm.
grouped = (
    flights
    .groupby('Carrier Name')
    .mean(numeric_only=True)
    .drop(columns=['Minutes of Delay', 'Number of Flights'])
)
# Horizontal bars sorted by 'Minutes of Delay per Flight' in ascending order
grouped.sort_values(by='Minutes of Delay per Flight', ascending=True).plot(kind='barh', figsize=(20,10))

Recreate the plot, but this time replace `NaN` values with zeros.

In [None]:
# Replace missing 'Minutes of Delay per Flight' values with 0. flights is rebound (rather than
# filled in place) but still ends up holding the filled values, as the original cell left it.
flights = flights.fillna({'Minutes of Delay per Flight': 0})
# numeric_only=True matters on pandas >= 2.0, where mean() no longer silently skips text
# columns and raises TypeError instead.
grouped = (
    flights
    .groupby('Carrier Name')
    .mean(numeric_only=True)
    .drop(columns=['Minutes of Delay', 'Number of Flights'])
)
# Horizontal bars sorted by 'Minutes of Delay per Flight' in ascending order
grouped.sort_values(by='Minutes of Delay per Flight', ascending=True).plot(kind='barh', figsize=(20,10))

Convert date column to a datetime object and group by Carrier Name.

In [None]:
# Work on a copy so flights keeps its original 'Date' values
date_flights = flights.copy()
date_flights['Date'] = pd.to_datetime(date_flights['Date'], format='%m/%d/%Y')
# Group by the bare column name rather than a one-element list. With a list key, recent
# pandas expects get_group() to be given a 1-tuple, so grouped.get_group('Delta') emits a
# FutureWarning (pandas 2.2) -- NOTE(review): expected to become an error in a later release.
grouped = date_flights.groupby('Carrier Name')
# head() on a GroupBy shows the first five rows of every group
grouped.head()

Slice only the Delta data and sum by date. Plot only the Minutes of Delay by date.

In [None]:
# Rows belonging to the 'Delta' group
delta = grouped.get_group('Delta')
# Sum the numeric columns per date and keep what is left after dropping the two columns
# that are not plotted. numeric_only=True matters on pandas >= 2.0, where sum() no longer
# drops text columns (such as 'Carrier Name') from the result on its own.
time_series = (
    delta
    .groupby('Date')
    .sum(numeric_only=True)
    .drop(columns=['Minutes of Delay per Flight', 'Number of Flights'])
)
print(time_series.head())
time_series.plot(kind='line', figsize=(20,10))


Let's see if this pattern holds across airlines. Group by both Carrier Name and Date rather than selecting only one airline. Limit output to Minutes of Delay data.

In [None]:
# Work on a copy so flights keeps its original 'Date' values
date_flights = flights.copy()
date_flights['Date'] = pd.to_datetime(date_flights['Date'], format='%m/%d/%Y')
# Sum per (carrier, date) pair and discard the two columns that are not needed.
# numeric_only=True keeps any text column out of the sum on pandas >= 2.0
# (NOTE(review): harmless if every remaining column is already numeric -- confirm the schema).
grouped = (
    date_flights
    .groupby(['Carrier Name', 'Date'])
    .sum(numeric_only=True)
    .drop(columns=['Minutes of Delay per Flight', 'Number of Flights'])
)
grouped.head()

Unstack Carrier Name so that we get a column for each carrier. Now if we redo the plot, Pandas will plot a line for each carrier.

In [None]:
# Move the 'Carrier Name' index level into the columns, giving one column per carrier
columned = grouped.unstack(level='Carrier Name')
print(columned.head())
# One line per column of the unstacked frame
columned.plot.line(figsize=(20, 10))

Let's compare the number of flights across airlines using a bar chart.

In [None]:
# Total the numeric columns per carrier and discard the two delay columns.
# numeric_only=True matters on pandas >= 2.0, where sum() would otherwise also try to
# aggregate text columns instead of leaving them out.
# NOTE(review): presumably 'Date' in flights is still unparsed text here -- confirm.
grouped = (
    flights
    .groupby('Carrier Name')
    .sum(numeric_only=True)
    .drop(columns=['Minutes of Delay per Flight', 'Minutes of Delay'])
)
grouped.plot(kind='bar', figsize=(20,10))