In [1]:
# data source: https://www.gapminder.org/data/
# blog source: http://sergilehkyi.com/ru/interactive-data-visualization-with-python-using-bokeh/
# question: how to visualize data with > 3 dimensions? (+glyphs, colors)

import pandas as pd
import numpy as np
import os

from bokeh.io import curdoc
from bokeh.plotting import figure
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider
from bokeh.palettes import Spectral6
from bokeh.layouts import widgetbox, row

In [2]:
# Per-person CO2 emissions in wide format: one row per country, one column per year.
# (The stray `os.listdir()` debugging call was removed: its result was neither
# stored nor displayed, because only a cell's last expression is rendered.)
data = pd.read_csv('co2_emissions_tonnes_per_person.csv')
data.head()

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014
0,Afghanistan,,,,,,,,,,...,0.0529,0.0637,0.0854,0.154,0.242,0.294,0.412,0.35,0.316,0.299
1,Albania,,,,,,,,,,...,1.38,1.28,1.3,1.46,1.48,1.56,1.79,1.68,1.73,1.96
2,Algeria,,,,,,,,,,...,3.22,2.99,3.19,3.16,3.42,3.3,3.29,3.46,3.51,3.72
3,Andorra,,,,,,,,,,...,7.3,6.75,6.52,6.43,6.12,6.12,5.87,5.92,5.9,5.83
4,Angola,,,,,,,,,,...,0.98,1.1,1.2,1.18,1.23,1.24,1.25,1.33,1.25,1.29


In [3]:
# Gapminder indicators in long ("tidy") form; the preview below shows
# one row per country and year.
gapminder_path = 'gapminder_tidy.csv'
gapminder = pd.read_csv(gapminder_path)
gapminder.head(n=5)

Unnamed: 0,Country,Year,fertility,life,population,child_mortality,gdp,region
0,Afghanistan,1964,7.671,33.639,10474903.0,339.7,1182.0,South Asia
1,Afghanistan,1965,7.671,34.152,10697983.0,334.1,1182.0,South Asia
2,Afghanistan,1966,7.671,34.662,10927724.0,328.7,1168.0,South Asia
3,Afghanistan,1967,7.671,35.17,11163656.0,323.3,1173.0,South Asia
4,Afghanistan,1968,7.671,35.674,11411022.0,318.1,1187.0,South Asia


In [4]:
# Lookup table: each distinct (Country, region) pair found in the gapminder frame.
df = gapminder.loc[:, ['Country', 'region']].drop_duplicates()

# Attach a region to every CO2 row. The inner join keeps only countries present
# in both frames; the duplicated join key from the right side is then dropped.
data_with_regions = (
    data
    .merge(df, how='inner', left_on='country', right_on='Country')
    .drop(columns=['Country'])
)
data_with_regions.head(3)

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,region
0,Afghanistan,,,,,,,,,,...,0.0637,0.0854,0.154,0.242,0.294,0.412,0.35,0.316,0.299,South Asia
1,Albania,,,,,,,,,,...,1.28,1.3,1.46,1.48,1.56,1.79,1.68,1.73,1.96,Europe & Central Asia
2,Algeria,,,,,,,,,,...,2.99,3.19,3.16,3.42,3.3,3.29,3.46,3.51,3.72,Middle East & North Africa


In [5]:
# Wide -> long: every year column becomes a (variable, value) row,
# while country and region are kept as identifier columns.
new_df = data_with_regions.melt(id_vars=['country', 'region'])
new_df.head(n=5)

Unnamed: 0,country,region,variable,value
0,Afghanistan,South Asia,1800,
1,Albania,Europe & Central Asia,1800,
2,Algeria,Middle East & North Africa,1800,
3,Angola,Sub-Saharan Africa,1800,
4,Antigua and Barbuda,America,1800,


In [6]:
# Give the melted columns meaningful names (replaces 'variable' / 'value').
columns = ['country', 'region', 'year', 'co2']
new_df.columns = columns

# Keep only rows after 1963. The year labels are still strings at this point,
# so they are cast to integers just for the comparison.
# NOTE(review): the cutoff presumably matches the first year shown in the
# gapminder preview (1964) -- confirm.
after_1963 = new_df['year'].astype('int64') > 1963
upd_new_df = new_df[after_1963]
upd_new_df.info()  # 'year' is still reported as an object column here

# Order rows by country then year, and only then store year as int64 so it can
# be matched against the integer 'year' column of the GDP frame.
upd_new_df = upd_new_df.sort_values(by=['country', 'year'])
upd_new_df['year'] = upd_new_df['year'].astype('int64')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 28372 to 37194
Data columns (total 4 columns):
country    8823 non-null object
region     8823 non-null object
year       8823 non-null object
co2        8375 non-null float64
dtypes: float64(1), object(3)
memory usage: 344.6+ KB


In [7]:
# GDP per country and year, with the key columns lower-cased so they line up
# with the 'country' / 'year' columns of the CO2 frame.
df_gdp = (
    gapminder[['Country', 'Year', 'gdp']]
    .rename(columns={'Country': 'country', 'Year': 'year'})
)
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10111 entries, 0 to 10110
Data columns (total 3 columns):
country    10111 non-null object
year       10111 non-null int64
gdp        9000 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 237.1+ KB


In [8]:
# Add GDP to each (country, year) CO2 row. The left join keeps every CO2 row;
# dropna() then removes any row that has a missing value in any column.
final_df = (
    upd_new_df
    .merge(df_gdp, how='left', on=['country', 'year'])
    .dropna()
)
final_df.head()

Unnamed: 0,country,region,year,co2,gdp
0,Afghanistan,South Asia,1964,0.0863,1182.0
1,Afghanistan,South Asia,1965,0.101,1182.0
2,Afghanistan,South Asia,1966,0.108,1168.0
3,Afghanistan,South Asia,1967,0.124,1173.0
4,Afghanistan,South Asia,1968,0.116,1187.0


In [9]:
# Correlation matrix between CO2 emissions and GDP over all rows of final_df.
np_co2, np_gdp = (np.array(final_df[name]) for name in ('co2', 'gdp'))
np.corrcoef(np_co2, np_gdp)

array([[1.        , 0.78219731],
       [0.78219731, 1.        ]])

In [10]:
# Map each distinct region (in order of first appearance) to a palette colour.
# NOTE(review): Spectral6 supplies six colours -- confirm the data has at most
# six distinct regions.
regions_list = list(final_df['region'].unique())
color_mapper = CategoricalColorMapper(palette=Spectral6, factors=regions_list)

In [11]:
# Seed the data source with a single year (1964); other years are swapped in
# by the slider callback.
rows_1964 = final_df[final_df['year'] == 1964]
source = ColumnDataSource(data=dict(
    x=rows_1964.gdp,
    y=rows_1964.co2,
    country=rows_1964.country,
    region=rows_1964.region,
))

# Axis bounds are taken over every year in final_df, not just the 1964 sample.
xmin, xmax = min(final_df.gdp), max(final_df.gdp)
ymin, ymax = min(final_df.co2), max(final_df.co2)

In [12]:
# Scatter of GDP (x) against CO2 emissions (y, log scale), coloured by region.
# NOTE(review): a log axis needs a strictly positive lower bound -- confirm
# that ymin (the smallest co2 value) is > 0.
plot = figure(
    title='Gapminder Data for 1964',
    plot_width=1000,
    plot_height=600,
    x_range=(xmin, xmax),
    y_range=(ymin, ymax),
    y_axis_type='log',
)

# Colour is resolved per point from its 'region' value via the colour mapper.
region_color = {'field': 'region', 'transform': color_mapper}
plot.circle(
    x='x',
    y='y',
    source=source,
    size=7,
    fill_alpha=0.8,
    color=region_color,
    legend='region',
)

plot.xaxis.axis_label = 'Income per person (Gross domestic product per person)'
plot.yaxis.axis_label = 'CO2 emissions (tonnes per person)'
plot.legend.location = 'bottom_right'

In [13]:
# Year selector covering the full range of years in final_df, starting at the
# earliest one and moving one year per step.
first_year, last_year = min(final_df.year), max(final_df.year)
slider = Slider(
    title='Year',
    start=first_year,
    end=last_year,
    value=first_year,
    step=1,
)

def update_plot(attr, old, new):
    """Refresh the scatter plot for the year currently selected on the slider.

    Registered as a Bokeh property-change callback; the ``attr``, ``old`` and
    ``new`` arguments are required by that callback signature but are not used,
    because the year is read directly from ``slider.value``.

    Side effects: replaces ``source.data`` with the rows of ``final_df`` for the
    selected year and rewrites the plot title.
    """
    yr = slider.value

    # Filter once and reuse the subset for every column, instead of rebuilding
    # the same boolean mask four times on each slider movement.
    rows = final_df[final_df['year'] == yr]
    source.data = {
        'x': rows.gdp,
        'y': rows.co2,
        'country': rows.country,
        'region': rows.region,
    }

    # Same capitalisation as the figure's initial title ('Gapminder Data for 1964').
    plot.title.text = 'Gapminder Data for %d' % yr
    
# Tooltip shown when hovering a point: '@name' pulls the value of that column
# from the plot's data source.
hover_fields = [
    ('Country', '@country'),
    ('GDP', '@x'),
    ('CO2 emission', '@y'),
]
hover = HoverTool(tooltips=hover_fields)
plot.add_tools(hover)

# Re-run update_plot whenever the slider's value changes.
slider.on_change('value', update_plot)

In [14]:
# Place the slider to the left of the plot and register the result as the
# root of the current Bokeh document.
controls = widgetbox(slider)
layout = row(controls, plot)
curdoc().add_root(layout)

# Run with:
# bokeh serve --show CO2_emissions_in_relation_to_gdp.ipynb