# Introduction to data visualisation

In [None]:
# import standard pandas and numpy packages
import pandas as pd
import numpy as np

# importing library and setting to generate figures
import matplotlib.pyplot as plt
%matplotlib inline

# importing seaborn
import seaborn as sns

# Quick start - very basic plots .....what's wrong with these?

## mathematical plot

In [None]:
# Toy data for the first plots
# x: 11 equally spaced values from 0 to 5, both end points included
x = np.linspace(start=0, stop=5, num=11)
# y: each x value multiplied by itself (x squared)
y = x * x

In [None]:
# Simplest possible line plot: y against x with every default setting
plt.plot(x, y)

In [None]:
# Adding basic colour and style using string notation ('r--' = red, dashed)
# NOTE(review): the arguments are passed as (y, x), so the axes are swapped
# compared with the previous plot — confirm this is intentional
plt.plot(y, x, 'r--')

## Introducing real world data from the gapminder site

In [None]:
# Read the CSV file into a dataframe
data = pd.read_csv('key_data.csv')
# Display the last 10 rows as a quick check of what was loaded
data.tail(n=10)

In [None]:
# Quick look at population against year, one line per plotted column
# plt.plot(data['year'], data['pop_ch'])
plt.plot(data['year'], data['pop_fr'])
plt.plot(data['year'], data['pop_ug'])

# Adding titles and axis labels

In [None]:
# The toy curve again, now with a title and a label on each axis
plt.plot(x, y)
plt.title('Title Here')
plt.xlabel('X Axis Title Here')
plt.ylabel('Y Axis Title Here')
plt.show()

# Changing line colours
https://matplotlib.org/2.0.2/api/colors_api.html

basic matplotlib colours - 
b: blue,
g: green,
r: red,
c: cyan,
m: magenta,
y: yellow,
k: black,
w: white,
### Shades of grey
color = a string holding a number between 0 (black) and 1 (white), e.g. color='0.75' for a light grey
### HTML color codes
https://htmlcolorcodes.com/
color = '#23EA1A'
### RGB color ratios
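A tuple of the three primary colours (red, green, blue), each with a value between 0 and 1,
e.g. (1, 0, 0) is 'pure' red and (0, 0.5, 0.5) is a bluey green.
RGBA is the same with a fourth value, where the 'a' stands for alpha and sets the transparency (0 is fully transparent, 1 is fully opaque).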

In [None]:
# Changing the line colour with an HTML hex code
# The x and y values are passed as columns of the dataframe
plt.plot(data['year'], data['pop_ug'], color='#23EA1A')
plt.title('Population of Uganda (tens of millions)')
plt.xlabel('Year')
plt.ylabel('Population')
plt.show()

# Changing line width, style and markers

In [None]:
# Every line is y = x + k; a larger k shifts the line up so each style stays visible

# Line width (linewidth, or lw for short)
for shift, width in enumerate((0.25, 0.50, 1.00, 2.00), start=1):
    plt.plot(x, x + shift, color="red", linewidth=width)

# Line style (linestyle, or ls for short): '-', '--', '-.', ':'
# (stepped lines are selected with drawstyle, e.g. drawstyle='steps')
# https://matplotlib.org/gallery/lines_bars_and_markers/line_styles_reference.html
for shift, style in zip((5, 6, 7), ('-', '-.', ':')):
    plt.plot(x, x + shift, color="green", lw=3, ls=style)

# Marker symbols: marker = '+', 'o', '*', 's', ',', '.', '1', '2', '3', '4', ...
# https://matplotlib.org/api/markers_api.html
for shift, style, symbol in ((9, '-', '+'), (10, '--', 'o'), (11, '-', 's'), (12, '--', '2')):
    plt.plot(x, x + shift, color="blue", lw=3, ls=style, marker=symbol)

# Marker size and colour: markersize, markerfacecolor, markeredgewidth, markeredgecolor
plt.plot(x, x + 13, color="purple", lw=1, ls='-', marker='o', markersize=2)
plt.plot(x, x + 14, color="purple", lw=1, ls='-', marker='o', markersize=4)
plt.plot(x, x + 15, color="purple", lw=1, ls='-', marker='o', markersize=8,
         markerfacecolor="red")
# The trailing semicolon stops the notebook echoing the returned line objects
plt.plot(x, x + 16, color="purple", lw=1, ls='-', marker='s', markersize=8,
         markerfacecolor="yellow", markeredgewidth=3, markeredgecolor="green");

# Tick size and labels

In [None]:
# Two population columns on one chart, using the default tick positions.
# Un-comment plt.xticks to choose the ticks yourself (eg every 25 years) and
# plt.ticklabel_format to switch the y axis to scientific notation.
plt.plot(data['year'], data['pop_ug'])
plt.plot(data['year'], data['pop_fr'])
plt.title('Population over time')
plt.xlabel('Year')
plt.ylabel('Population (billions)')
# plt.xticks([1900, 1925, 1950, 1975, 2000, 2025])
# plt.ticklabel_format(style='sci', axis='y', scilimits=(5, 5))
plt.show()

In [None]:
# Same chart with a third population column added.
# Un-comment plt.xticks to choose the ticks yourself (eg every 25 years) and
# plt.ticklabel_format to switch the y axis to scientific notation.
plt.plot(data['year'], data['pop_ug'])
plt.plot(data['year'], data['pop_fr'])
plt.plot(data['year'], data['pop_ch'])
plt.title('Population over time')
plt.xlabel('Year')
plt.ylabel('Population (billions)')
# plt.xticks([1900, 1925, 1950, 1975, 2000, 2025])
# plt.ticklabel_format(style='sci', axis='y', scilimits=(5, 5))
plt.show()

In [None]:
# Putting the y axis on a log scale so that lines of very different
# magnitude can be compared on one chart
plt.plot(data['year'], data['pop_ug'])
plt.plot(data['year'], data['pop_fr'])
plt.plot(data['year'], data['pop_ch'])
plt.title('Population over time')
plt.xlabel('Year')
plt.ylabel('log Population')
plt.yscale('log')
# plt.xticks([1900, 1925, 1950, 1975, 2000, 2025])
# plt.ticklabel_format(style='sci', axis='y', scilimits=(5, 5))
plt.show()

## Changing limits or range of the graph

In [None]:
# For example, only interested in looking at dates from 1960 to 2020
plt.plot(data.year, data.pop_ug)
plt.plot(data.year, data.pop_fr)
plt.xlabel('Year')
plt.ylabel('Population (tens of millions)')
plt.title('Population over time')
plt.xlim(1960, 2020)                     # show only the years 1960 to 2020 on the x axis
# np.arange leaves out its stop value, so stop at 2021 to get a tick on 2020 itself
plt.xticks(np.arange(1960, 2021, 5))     # setting ticks at 5 year intervals between 1960 and 2020
plt.show()

## Changing your spines

In [None]:
# Hide the right and top spines
# Same chart as before: only looking at dates from 1960 to 2020
plt.plot(data.year, data.pop_ug)
plt.plot(data.year, data.pop_fr)

plt.xlabel('Year')
plt.ylabel('Population (tens of millions)')
plt.title('Population over time')

plt.xlim(1960, 2020)
# np.arange leaves out its stop value, so stop at 2021 to get a tick on 2020 itself
plt.xticks(np.arange(1960, 2021, 5))     # setting ticks at 5 year intervals between 1960 and 2020

# The spines are the four border lines of the plot area. plt.gca() returns
# the current axes, whose spines can be switched off one side at a time.
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.show()

## The location of the legend. 
To generate a legend, simply call plt.legend(). Matplotlib will choose the style and location.
To position the legend box yourself, use the parameter loc= with one of the following strings or its numeric code:

'best'	0, 
'upper right'	1, 
'upper left'	2, 
'lower left'	3, 
'lower right'	4, 
'right'	5, 
'center left'	6, 
'center right'	7, 
'lower center'	8, 
'upper center'	9, 
'center'	10

In [None]:
# Adding a legend
# Each line needs a label for the legend to show; give it with the label parameter
# (recent Matplotlib versions do not pick up the column name automatically)
plt.plot(data.year, data.pop_ug, label='Uganda')
plt.plot(data.year, data.pop_fr, label='France')
plt.xlabel('Year')
plt.ylabel('Population (tens of millions)')
plt.title('Population over time')
plt.xlim(1960, 2020)
# np.arange leaves out its stop value, so stop at 2021 to get a tick on 2020 itself
plt.xticks(np.arange(1960, 2021, 5))

# add the method plt.legend() to generate a legend in the 'best' place
plt.legend()
# plt.legend(loc=5)  # refer to the above strings and codes
# plt.legend(bbox_to_anchor = (0.5, 0.5)) # proportion of the way across the x axis and up the y axis from 0 to 1
                                          # greater than 1 places legend outside the axis
plt.show()

# More complex plotting - figure and axes as separate objects

In order to adjust figure size, add multiple subplots to the same figure and to sync in with seaborn a slightly different format is required. 

Instead of treating the figure as the object by default we now consider the figure and the axis as different objects with their own methods

The following format allows us to create a figure (ie the full image) and a number of axes (even if that is only 1)

In [None]:
# Object oriented notation with a single axes object:
# plt.subplots returns the figure and its axes as two separate objects.
# figsize is (width, height) in inches and dpi is the resolution, so both can
# be tuned to the final output. Matplotlib's own defaults are
# figsize=(6.4, 4.8) and dpi=100 (a notebook backend may use different ones).

fig, ax = plt.subplots(dpi=200, figsize=(20, 4))
plt.plot(x, y)
plt.title('String Title Here')
plt.xlabel('X Axis Title Here')
plt.ylabel('Y Axis Title Here')
plt.show()

In [None]:
# The population chart on a wide (20 x 4) figure, with explicit legend labels
fig, ax = plt.subplots(figsize=(20, 4))
plt.plot(data['year'], data['pop_ug'], label='Uganda')
plt.plot(data['year'], data['pop_fr'], label='France')
plt.title('Population over time')
plt.xlabel('Year')
plt.ylabel('Population (tens of millions)')
plt.legend(loc=4)    # loc=4 is 'lower right'
plt.show()

# Plotting more than one axis in a figure
### must use the format 
    fig, ax = plt.subplots(nrows=  , ncols= , figsize=( , ) )
### must use ax1, ax2.... tuple to name axes
    fig, (ax1, ax2) = plt.subplots(nrows=  , ncols= , figsize=( , ) )
### must state number of rows,number of columns of axes
    nrows = number of rows of axes to include in the figure
    ncols = number of columns of axes to include in the figure
### The object we call methods on is now the axes rather than plt, so the labelling methods are prefixed with 'set_' (e.g. plt.xlabel becomes ax1.set_xlabel)

In [None]:
# For a plot with 2 axes side by side (1 row, 2 columns)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20,4), dpi=100)

# parameters affecting axis 1: thick green dashed line
ax1.plot(data.year, data.pop_ug, 'g--', label='Uganda', lw=4)
ax1.set_xlabel('Year')
ax1.set_ylabel('Population')
ax1.set_title('Population of Uganda')

# parameters affecting axis 2: red point markers
ax2.plot(data.year, data.pop_fr, 'r.', label = 'France')
ax2.set_xlabel('Year')
ax2.set_ylabel('Population')
ax2.set_title('Population of France')

# parameters affecting the whole figure
# plt.show() is used rather than fig.show(): with the inline notebook backend
# fig.show() only raises a "non-GUI backend" warning
plt.show()

In [None]:
# For a plot with 2 axes stacked vertically (2 rows, 1 column)
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(6,6), dpi=100)

# parameters affecting axis 1: thick green dashed line
ax1.plot(data.year, data.pop_ug, 'g--', label='Uganda', lw=4)
ax1.set_xlabel('Year')
ax1.set_ylabel('Population')
ax1.set_title('Population of Uganda')
# ax1.ticklabel_format(style='sci', axis='y',scilimits=(5,5))
# Removing the spines
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)

# parameters affecting axis 2: red point markers
ax2.plot(data.year, data.pop_fr, 'r.', label = 'France')
ax2.set_xlabel('Year')
ax2.set_ylabel('Population')
ax2.set_title('Population of France')
# removing the spines
ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)

# parameters affecting the whole figure
# y=1.05 places the overall title just above the top of the figure
fig.suptitle('Comparison of population changes between Uganda and France 1900 - 2000', y=1.05)
# tight_layout adjusts the spacing so the subplots' labels and titles fit
fig.tight_layout()
# plt.show() is used rather than fig.show(): with the inline notebook backend
# fig.show() only raises a "non-GUI backend" warning
plt.show()

# Special plot types

## Scatter plot

In [None]:
# Three scatter plots side by side: population (x) against life expectancy (y),
# one panel per country
fig, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(20, 4))

# (axes, country name for the title, column suffix, marker colour)
panels = [
    (ax1, 'Uganda', 'ug', 'b'),
    (ax2, 'China', 'ch', 'r'),
    (ax3, 'France', 'fr', 'm'),
]

# The panels differ only in country and colour, so draw them in a loop
for ax, country, suffix, colour in panels:
    ax.scatter(data['pop_' + suffix], data['life_exp_' + suffix], color=colour)
    ax.set_xlabel('Population')
    ax.set_ylabel('Life expectancy')
    ax.set_title(country + ': Population size vs life expectancy')

# plt.show() is used rather than fig.show(): with the inline notebook backend
# fig.show() only raises a "non-GUI backend" warning
plt.show()

# histogram

This shows the distribution of a single column of data. Ideal for test scores, ages etc so not perfect in this case but we can still make it work. NaNs or missing values have to be either dropped or replaced before this can work

In [None]:
# Histogram of a single column: how many rows fall into each life expectancy bin
# Missing values (NaN) are dropped first because plt.hist cannot bin them
plt.hist(data.life_exp_ch.dropna())
plt.xlabel('Life expectancy in years')
plt.ylabel('Number of years')
plt.title('China: Number of years with given life expectancy 1900-2018')
plt.show()

# See also barplot and boxplot