## Data Visualization using Bokeh

In [1]:
# Standard imports
from bokeh.io import output_notebook, show
output_notebook()
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool
import numpy as np

In [2]:
# Download Bokeh's sample datasets into the local Bokeh data directory
# (the log printed below lists each file as it is fetched). The stock data
# imported later in this notebook comes from bokeh.sampledata.stocks.
# NOTE(review): presumably only needs to run once per machine — confirm.
import bokeh.sampledata
bokeh.sampledata.download()

Using data directory: C:\Users\GADKARI\.bokeh\data
Downloading: CGM.csv (1589982 bytes)
   1589982 [100.00%]
Downloading: US_Counties.zip (3182088 bytes)
   3182088 [100.00%]
Unpacking: US_Counties.csv
Downloading: us_cities.json (713565 bytes)
    713565 [100.00%]
Downloading: unemployment09.csv (253301 bytes)
    253301 [100.00%]
Downloading: AAPL.csv (166698 bytes)
    166698 [100.00%]
Downloading: FB.csv (9706 bytes)
      9706 [100.00%]
Downloading: GOOG.csv (113894 bytes)
    113894 [100.00%]
Downloading: IBM.csv (165625 bytes)
    165625 [100.00%]
Downloading: MSFT.csv (161614 bytes)
    161614 [100.00%]
Downloading: WPP2012_SA_DB03_POPULATION_QUINQUENNIAL.zip (5148539 bytes)
   5148539 [100.00%]
Unpacking: WPP2012_SA_DB03_POPULATION_QUINQUENNIAL.csv
Downloading: gapminder_fertility.csv (64346 bytes)
     64346 [100.00%]
Downloading: gapminder_population.csv (94509 bytes)
     94509 [100.00%]
Downloading: gapminder_life_expectancy.csv (73243 bytes)
     73243 [100.00%]
Downloadi

In [3]:
# Sample data for the line-plot examples: a damped cosine evaluated on a
# regular grid of time values.
def f(t):
    """Return exp(-t) * cos(2*pi*t) for a scalar or NumPy array ``t``."""
    decay = np.exp(-t)
    oscillation = np.cos(2*np.pi*t)
    return decay * oscillation

# Time axis shared by the line plots: 0.0 up to (not including) 5.0, step 0.02.
t1 = np.arange(0.0, 5.0, 0.02)

In [4]:
# Line plot of the damped cosine f(t1).
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p1 = figure(width=400, height=400)
p1.line(t1, f(t1), line_width=2)

show(p1)

Exercise: Generate 'cosine' data and plot it using Bokeh line plot (see matplotlib notebook for generating cosine data)

In [5]:
# Exercise solution: plain (undamped) cosine over the same time axis t1.
# `width`/`height` replace `plot_width`/`plot_height`, which were removed in
# Bokeh 3.0.
p2 = figure(width=400, height=400)
p2.line(t1, np.cos(2*np.pi*t1), line_width=2)
show(p2)

In [6]:
# Overlay both curves on one figure: the damped cosine in semi-transparent red
# and the plain cosine in the default line colour.
# `width`/`height` replace `plot_width`/`plot_height`, which were removed in
# Bokeh 3.0.
p3 = figure(width=400, height=400)
p3.line(t1, f(t1), line_width=2, color='red', alpha=0.5)
p3.line(t1, np.cos(2*np.pi*t1), line_width=2)

show(p3)

In [7]:
from bokeh.sampledata.stocks import AAPL, IBM, MSFT, GOOG
from bokeh.palettes import Spectral4
import pandas as pd

In [8]:
# Inspect the container type of the sample stock data (the cell output shows `dict`).
type(AAPL)

dict

In [9]:
# Interactive legend demo: draw the closing price of four sample stocks and let
# the reader hide/show each line by clicking its legend entry.
#
# Compatibility fixes:
#   * `width`/`height` replace `plot_width`/`plot_height` (removed in Bokeh 3.0).
#   * `legend_label=` replaces the glyph keyword `legend=` (deprecated in
#     Bokeh 1.4, removed in 3.0). This requires Bokeh >= 1.4.
p = figure(width=800, height=250, x_axis_type="datetime")
p.title.text = 'Click on legend entries to hide the corresponding lines'

for data, name, color in zip([AAPL, IBM, MSFT, GOOG], ["AAPL", "IBM", "MSFT", "GOOG"], Spectral4):
    # Each sample dataset is a dict of columns; wrap it in a DataFrame.
    df = pd.DataFrame(data)

    # Convert the 'date' column to datetimes for the datetime x-axis.
    df['date'] = pd.to_datetime(df['date'])
    p.line(df['date'], df['close'], line_width=2, color=color, alpha=0.8, legend_label=name)

p.legend.location = "top_left"
p.legend.click_policy="hide"

# NOTE(review): output_file() configures HTML file output in addition to the
# notebook output enabled at the top of the notebook; presumably every later
# show() call will also write this file — confirm that is intended.
output_file("interactive_legend.html", title="interactive_legend.py example")

show(p)

Exercise: Use pandas_datareader to download three stocks: NUE, X, and STLD. You can download them individually. Create a plot similar to the one above for the three stocks.

In [11]:
import pandas_datareader.data as web
from datetime import datetime
from bokeh.palettes import Spectral4
import pandas as pd
from bokeh.io import output_notebook, show

from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool


# Download price history for the three steel stocks and move the index into a
# regular 'Date' column. reset_index() returns a new frame, so no in-place
# mutation is needed.
# NOTE(review): no start/end dates are passed, so the date range is whatever
# pandas_datareader defaults to, and the 'yahoo' source must be reachable —
# confirm both before relying on this cell.
NUE = web.DataReader('NUE','yahoo').reset_index(drop=False)
X = web.DataReader('X','yahoo').reset_index(drop=False)
STLD = web.DataReader('STLD','yahoo').reset_index(drop=False)

# Compatibility fixes:
#   * `width`/`height` replace `plot_width`/`plot_height` (removed in Bokeh 3.0).
#   * `legend_label=` replaces the glyph keyword `legend=` (deprecated in
#     Bokeh 1.4, removed in 3.0). This requires Bokeh >= 1.4.
p4 = figure(width=800, height=250, x_axis_type="datetime")
p4.title.text = 'Click on legend entries to hide the corresponding lines'

# zip() stops at the shortest input, so only the first three palette colours are used.
for data, name, color in zip([NUE, X, STLD], ["NUE", "X", "STLD"], Spectral4):
    # The frames are already DataFrames, so they are plotted directly;
    # to_datetime guards the datetime x-axis without modifying the frame.
    p4.line(pd.to_datetime(data['Date']), data['Close'], line_width=2, color=color, alpha=0.8, legend_label=name)

p4.legend.location = "top_left"
p4.legend.click_policy="hide"

# NOTE(review): same file name and title as the earlier sample-stock plot, so
# this overwrites that HTML file — confirm that is intended.
output_file("interactive_legend.html", title="interactive_legend.py example")

show(p4)



In [12]:
#### Hover Tool

In [13]:
# Hover-demo data: three equal-length columns held in a plain dict and handed
# to a ColumnDataSource so glyphs and tooltips can refer to them by name.
source = ColumnDataSource(data={
    "x": [1, 2, 3, 4, 5],
    "y": [2, 5, 8, 2, 7],
    "desc": ['A', 'b', 'C', 'd', 'E'],
})

In [14]:
# Tooltip rows as (label, template) pairs: "$" fields are built-in specials
# (point index, cursor position) and "@" fields read a data-source column.
hover_rows = [("index", "$index"), ("(x,y)", "($x, $y)"), ("desc", "@desc")]
hover = HoverTool(tooltips=hover_rows)

Field names that begin with $ are “special fields”. These often correspond to values that are intrinsic to the plot, such as the coordinates of the mouse in data or screen space. These special fields are listed here:

\$index:	index of selected point in the data source

\$x:	x-coordinate under the cursor in data space

\$y:	y-coordinate under the cursor in data space

Field names that begin with @ are associated with columns in a ColumnDataSource. Note that if a column name contains spaces, then it must be surrounded by curly braces, e.g. @{adjusted close} will display values from a column named "adjusted close".

In [15]:
# Scatter of the demo points with the hover tool as the figure's only tool.
# Compatibility fixes:
#   * `width`/`height` replace `plot_width`/`plot_height` (removed in Bokeh 3.0).
#   * scatter() (default marker: circle) replaces circle(size=...), which is
#     deprecated as of Bokeh 3.4.
p5 = figure(width=400, height=400, tools=[hover],
           title="Mouse over the dots")

p5.scatter('x', 'y', size=20, source=source)

show(p5)

Exercise: Load the tips data from the seaborn package and draw a scatter plot of total_bill against tip, with a hover tool that lists the sex, smoker, day, and time fields for each data point.

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt
# Load seaborn's example "tips" dataset.
tips = sns.load_dataset("tips")

# Build the data source straight from the DataFrame columns.
# The earlier version copied every column element-by-element in a Python loop
# and called format(value, '.2f') without using the result, so no rounding ever
# took place; .tolist() produces the same plain-Python lists in one step.
source1 = ColumnDataSource(data=dict(
    x=tips['total_bill'].tolist(),
    y=tips['tip'].tolist(),
    sex=tips['sex'].tolist(),
    smoker=tips['smoker'].tolist(),
    day=tips['day'].tolist(),
    time=tips['time'].tolist(),
))

# "$" fields are hover specials (point index, cursor position); "@" fields are
# looked up in source1 by column name.
hover1 = HoverTool(tooltips=[
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("sex", "@sex"),
    ("smoker", "@smoker"),
    ("day", "@day"),
    ("time", "@time"),
])

# Compatibility fixes:
#   * `width`/`height` replace `plot_width`/`plot_height` (removed in Bokeh 3.0).
#   * scatter() (default marker: circle) replaces circle(size=...), which is
#     deprecated as of Bokeh 3.4.
# Axis labels are set so the figure can be read on its own.
p6 = figure(width=400, height=400, tools=[hover1],
           title="Scatter Plot for tips dataset",
           x_axis_label="total_bill", y_axis_label="tip")

p6.scatter('x', 'y', size=10, source=source1)

show(p6)


Exercise: Use Bokeh to visualize the housing data. Select the fields you would like to explore and see how Bokeh features such as hovering can help make a better visualization.

In [32]:
# Housing exploration: how might neighbourhood, year built, lot configuration,
# house style, total square feet and sale condition relate to the sale price?
#   x-axis: total square feet (1st floor + 2nd floor), divided by 1000
#   y-axis: sale price, divided by 1000
# Both axes are only rescaled to thousands for readability, not statistically
# normalised.
ds = pd.read_csv("train_house.csv")
total = ds['1stFlrSF'] + ds['2ndFlrSF']

# Columns are converted with vectorised arithmetic and .tolist() instead of
# element-by-element loops; this also avoids overwriting generic names such as
# `data`, `style` and `year` in the notebook namespace.
source2 = ColumnDataSource(data=dict(
    x=(total / 1000).tolist(),
    y=(ds['SalePrice'] / 1000).tolist(),
    Neighbourhood=ds['Neighborhood'].tolist(),
    Housestyle=ds['HouseStyle'].tolist(),
    YearBuilt=ds['YearBuilt'].tolist(),
    LotConfiguration=ds['LotConfig'].tolist(),
    SaleCondition=ds['SaleCondition'].tolist(),
))

# "$" fields are hover specials (point index, cursor position); "@" fields are
# looked up in source2 by column name.
hover2 = HoverTool(tooltips=[
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("Neighbourhood", "@Neighbourhood"),
    ("HouseStyle", "@Housestyle"),
    ("YearBuilt", "@YearBuilt"),
    ("LotConfiguration", "@LotConfiguration"),
    ("SaleCondition", "@SaleCondition"),
])

# Compatibility fixes:
#   * `width`/`height` replace `plot_width`/`plot_height` (removed in Bokeh 3.0).
#   * scatter() (default marker: circle) replaces circle(size=...), which is
#     deprecated as of Bokeh 3.4.
# Axis labels record the /1000 scaling applied above.
p7 = figure(width=400, height=400, tools=[hover2],
           title="Scatter Plot for housing dataset",
           x_axis_label="Total floor area (thousands of sq ft)",
           y_axis_label="Sale price (thousands)")

p7.scatter('x', 'y', size=5, source=source2)

show(p7)

In [16]:
# read housing data
# Loads train_house.csv into `ds` and prints a per-column summary
# (non-null counts and dtypes), as shown in the output below.
# NOTE(review): an earlier cell in this notebook reads the same CSV into `ds`;
# the execution counts suggest the cells were run out of order.
ds=pd.read_csv("train_house.csv")
ds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [17]:
# Create and deploy interactive data applications

# Embed a hosted Bokeh "sliders" application in the notebook through an
# IFrame sized 900x500.
# NOTE(review): this depends on the external demo server being reachable at
# this URL — confirm it is still live.
from IPython.display import IFrame
IFrame('https://demo.bokehplots.com/apps/sliders', width=900, height=500)