## Linyao Li (linyaol2)

In [1]:
import os

import bqplot
import bqplot.pyplot
import ipywidgets
import matplotlib.colors as mpl_colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import traitlets
from ipywidgets import Dropdown
from ipywidgets import VBox, HBox, Layout
%matplotlib inline

In [2]:
# Location of the building inventory CSV. Set the BUILDING_INVENTORY_CSV
# environment variable to run the notebook on another machine; when it is not
# set, the original local path is used so existing runs behave as before.
DATA_PATH = os.environ.get("BUILDING_INVENTORY_CSV",
                           "/Users/linyaoli/Downloads/building_inventory.csv")
buildings = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check that the CSV loaded as expected
buildings.head(n=5)

Unnamed: 0,Agency Name,Location Name,Address,City,Zip code,County,Congress Dist,Congressional Full Name,Rep Dist,Rep Full Name,...,Bldg Status,Year Acquired,Year Constructed,Square Footage,Total Floors,Floors Above Grade,Floors Below Grade,Usage Description,Usage Description 2,Usage Description 3
0,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,In Use,1975,1975,144,1,1,0,Unusual,Unusual,Not provided
1,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,In Use,2004,2004,144,1,1,0,Unusual,Unusual,Not provided
2,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,In Use,2004,2004,144,1,1,0,Unusual,Unusual,Not provided
3,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,In Use,2004,2004,144,1,1,0,Unusual,Unusual,Not provided
4,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,In Use,2004,2004,144,1,1,0,Unusual,Unusual,Not provided


In [4]:
# Explore the dataset: min / max / number of distinct congressional districts,
# the number of distinct agencies, and the earliest positive acquisition year
congress_dist = buildings['Congress Dist']
print(congress_dist.min(), congress_dist.max(), congress_dist.nunique())
print(buildings['Agency Name'].nunique())
year_acquired = buildings['Year Acquired']
print(year_acquired[year_acquired > 0].min())

0 18 19
35
1753


In [5]:
# Assign each distinct agency name an integer index, in order of first appearance.
# unique() is evaluated once here instead of twice on every loop iteration.
dic = {name: idx for idx, name in enumerate(buildings['Agency Name'].unique())}
# Map the categorical agency names onto their numeric IDs
buildings['Agency ID'] = buildings['Agency Name'].map(dic)
buildings.head()

Unnamed: 0,Agency Name,Location Name,Address,City,Zip code,County,Congress Dist,Congressional Full Name,Rep Dist,Rep Full Name,...,Year Acquired,Year Constructed,Square Footage,Total Floors,Floors Above Grade,Floors Below Grade,Usage Description,Usage Description 2,Usage Description 3,Agency ID
0,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,1975,1975,144,1,1,0,Unusual,Unusual,Not provided,0
1,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,2004,2004,144,1,1,0,Unusual,Unusual,Not provided,0
2,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,2004,2004,144,1,1,0,Unusual,Unusual,Not provided,0
3,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,2004,2004,144,1,1,0,Unusual,Unusual,Not provided,0
4,Department of Natural Resources,Anderson Lake Conservation Area - Fulton County,Anderson Lake C.a.,Astoria,61501,Fulton,17,Cheri Bustos,93,Hammond Norine K.,...,2004,2004,144,1,1,0,Unusual,Unusual,Not provided,0


## Dashboard

In [6]:
# (I) CREATE LABEL
# Text widget placed at the top of the dashboard; it starts out empty
mySelectedLabel = ipywidgets.Label(value='')

### Left component : Grid heat map

#### Rows: congressional district
#### Columns: the governmental department (Agency Name)
#### Values: sum of total square footage

In [7]:
# Make every combination of governmental department and congressional district
# a separate cell. The bin counts are derived from the data instead of being
# hard-coded, so the grid still has one bin per agency / district if the CSV changes.
nagen = buildings['Agency ID'].nunique()
ncong = int(buildings['Congress Dist'].max() - buildings['Congress Dist'].min()) + 1
# Sum the square footage falling in each (agency, district) bin
hist2d, agen_edges, cong_edges = np.histogram2d(buildings['Agency ID'],
                                               buildings['Congress Dist'],
                                               weights=buildings['Square Footage'],
                                               bins=[nagen, ncong])
# Centers of the agency ID bins and of the congressional district bins
agen_centers = (agen_edges[:-1] + agen_edges[1:]) / 2
cong_centers = (cong_edges[:-1] + cong_edges[1:]) / 2
# Transpose so that rows are congressional districts and columns are agencies
hist2d = hist2d.T
# Replace empty (non-positive) cells with NaN so the log below stays defined,
# then take log10 of the remaining totals
hist2d = np.log10(np.where(hist2d > 0, hist2d, np.nan))

In [8]:
#(II) HEAT MAP
# (1) Scales - x & y for the grid position, color for the cell value.
#     The color scale spans the finite range of hist2d (NaN cells are ignored).
x_sc = bqplot.LinearScale()
y_sc = bqplot.LinearScale()
col_sc = bqplot.ColorScale(min=np.nanmin(hist2d),
                           max=np.nanmax(hist2d),
                           scheme="RdPu")

# (2) Axes - x & y, plus a vertical color bar on the right
x_ax = bqplot.Axis(label='Agency ID', scale=x_sc)
y_ax = bqplot.Axis(label='Congress Dist',
                   scale=y_sc,
                   orientation='vertical')
c_ax = bqplot.ColorAxis(scale=col_sc,
                        side='right',
                        orientation='vertical')

# (3) Marks - rows are congressional district bins, columns are agency bins;
#     clicking a cell selects it
heat_map_scales = {'row': y_sc, 'column': x_sc, 'color': col_sc}
heat_map = bqplot.GridHeatMap(row=cong_centers,
                              column=agen_centers,
                              color=hist2d,
                              scales=heat_map_scales,
                              interactions={'click': 'select'},
                              anchor_style={'fill': 'blue'},
                              selected_style={'opacity': 1.0},
                              unselected_style={'opacity': 1.0})

### Right component : Line plot

#### x: year
#### y: total square footage acquired that year

In [9]:
# (III) LINE PLOT

# (1) Scales
x_scl = bqplot.LinearScale()
y_scl = bqplot.LogScale()
# (2) Axis
ax_xcl = bqplot.Axis(label='Year', scale=x_scl)
ax_ycl = bqplot.Axis(label='Total square footage', scale=y_scl,
                    orientation='vertical', side='left')
# (3) Marks
# Heat map cell shown before the user clicks anything:
# row i (congressional district bin), column j (agency bin)
i, j = 17, 0
agen = [agen_edges[j], agen_edges[j+1]]
cong = [cong_edges[i], cong_edges[i+1]]
data_mask = ((buildings['Congress Dist'] >= cong[0]) & (buildings['Congress Dist'] <= cong[1]) &
             (buildings['Agency ID'] >= agen[0]) & (buildings['Agency ID'] <= agen[1]))
# Total square footage acquired per year for the buildings in this cell.
# The previous version passed the sorted years to buildings.loc, which looks up
# rows whose *index label* equals the year, so y came from unrelated buildings.
# Rows without a positive acquisition year are ignored; groupby returns the
# years in ascending order.
sqft_by_year = (buildings.loc[data_mask & (buildings['Year Acquired'] > 0)]
                .groupby('Year Acquired')['Square Footage']
                .sum())
# A log-scaled y axis cannot display non-positive totals
sqft_by_year = sqft_by_year[sqft_by_year > 0]
squ_line = bqplot.Lines(x=sqft_by_year.index.values,
                        y=sqft_by_year.values,
                        scales={'x': x_scl, 'y': y_scl}, marker='circle', marker_size=32)

In [10]:
# (IV) LINKING TOGETHER DASHBOARD WITH INTERACTIVITY
def get_data_value(change):
    """Update the label and the line plot when a heat map cell is selected.

    Parameters
    ----------
    change : dict
        Change notification from the observed heat map; ``change['owner']`` is
        the heat map whose ``selected`` attribute holds the (row, column)
        indices of the selected cells.

    Nothing happens unless exactly one cell is selected. The label always shows
    the log10 total of the selected cell. The line plot is refreshed only when
    the cell has at least two acquisition years with a positive total;
    otherwise the label says so and the previous line is left in place.
    """
    selected = change['owner'].selected
    # `selected` is None when the selection is cleared, so guard before len()
    if selected is None or len(selected) != 1:
        return
    i, j = selected[0]
    v = hist2d[i, j]  # grab data value
    label_text = 'Total square footage in log = ' + str(v)

    # Select the buildings that fall inside the clicked (district, agency) bin
    agen = [agen_edges[j], agen_edges[j+1]]
    cong = [cong_edges[i], cong_edges[i+1]]
    data_mask = ((buildings['Congress Dist'] >= cong[0]) & (buildings['Congress Dist'] <= cong[1]) &
                 (buildings['Agency ID'] >= agen[0]) & (buildings['Agency ID'] <= agen[1]))

    # Total square footage acquired per year. The previous version passed the
    # sorted years to buildings.loc, which looks up rows by *index label*, so
    # the plotted values came from unrelated buildings.
    sqft_by_year = (buildings.loc[data_mask & (buildings['Year Acquired'] > 0)]
                    .groupby('Year Acquired')['Square Footage']
                    .sum())
    # The y axis is logarithmic, so non-positive totals cannot be drawn
    sqft_by_year = sqft_by_year[sqft_by_year > 0]

    if len(sqft_by_year) < 2:
        # Tell the user why the plot did not change instead of skipping silently
        mySelectedLabel.value = (label_text +
                                 ' (fewer than two acquisition years with data; line plot not updated)')
        return

    mySelectedLabel.value = label_text
    # Send x and y to the front end together so it never sees mismatched lengths
    with squ_line.hold_sync():
        squ_line.x = sqft_by_year.index.values
        squ_line.y = sqft_by_year.values
heat_map.observe(get_data_value, 'selected')

In [11]:
# (V) FIGURE
# One figure per dashboard panel: the line plot and the grid heat map
fig_squ = bqplot.Figure(axes=[ax_xcl, ax_ycl], marks=[squ_line])
fig_heatmap = bqplot.Figure(axes=[c_ax, y_ax, x_ax], marks=[heat_map])

##### To add interactivity: change the style for the grid heat map:

In [12]:
# A list of available color schemes offered in the dropdown.
# Every entry needs its own trailing comma: Python silently concatenates
# adjacent string literals, which previously merged 'magma' and 'Spectral'
# into the single, invalid name 'magmaSpectral'.
div_schemes = [
    'RdPu',
    'OrRd',
    'PuBu',
    'BuPu',
    'Oranges',
    'BuGn',
    'YlOrBr',
    'YlGn',
    'Reds',
    'Greens',
    'YlGnBu',
    'Purples',
    'GnBu',
    'Greys',
    'YlOrRd',
    'PuRd',
    'Blues',
    'PuBuGn',
    'viridis',
    'plasma',
    'inferno',
    'magma',
    'Spectral',
    'RdYlGn',
    'RdBu',
    'PiYG',
    'PRGn',
    'RdYlBu',
    'BrBG',
    'RdGy',
    'PuOr',
]

In [13]:
# Let the user switch the color scheme of the heat map
# reference: https://github.com/bloomberg/bqplot/blob/master/examples/Scales/Color%20Scales.ipynb
def change_scheme(fig, schemes):
    """Wrap a figure with a dropdown that controls its color scheme.

    Parameters
    ----------
    fig : figure whose first mark has a 'color' entry in its ``scales``.
    schemes : options offered by the dropdown.

    Returns
    -------
    A VBox holding the dropdown above the figure.
    """
    # The color scale is taken from the first mark of the figure
    color_scale = fig.marks[0].scales['color']

    picker = Dropdown(options=schemes, description='Scheme')

    def _apply_selection(*_):
        # Copy the dropdown's current choice onto the color scale
        color_scale.scheme = picker.value

    # Apply the initial choice once, then again on every later change
    _apply_selection()
    picker.observe(_apply_selection, 'value')
    return VBox([picker, fig])

In [14]:
# Arrange the dashboard
# Both figures get the same minimum width so they sit side by side
for panel in (fig_heatmap, fig_squ):
    panel.layout.min_width = '500px'

# Note: if you select a new cell in the heat map and the line plot stays the same,
# the cell probably holds data the line plot cannot display
# (for example, there is no "year" information for that cell).
# In that case, just pick another cell that can be displayed properly.
heatmap_with_picker = change_scheme(fig_heatmap, div_schemes)
plots_row = ipywidgets.HBox([heatmap_with_picker, fig_squ])
myDashboard = ipywidgets.VBox([mySelectedLabel, plots_row])
myDashboard

VBox(children=(Label(value=''), HBox(children=(VBox(children=(Dropdown(description='Scheme', options=('RdPu', …

### For the "things to think about" part
Q: (1) Can you keep the x and y ranges static on the line plot?
(2) Can you change the style?

(1) We can keep the x and y ranges static on the line plot by finding the minimum and maximum values of x and y in the dataset and assigning them when we create the scales. For the y-axis, because the values are shown on a log scale, the large range is reduced to a reasonable one, so keeping the y range static may not hurt the visualization. For the x-axis, however, the years span a wide range, from 1753 to 2019, so most of the space in the line plot would be blank and the useful information would not be displayed clearly. That is why I chose not to do it that way in this case, but the ranges can be kept static if necessary.

(2) We can add a dropdown list, as shown in the dashboard above, that offers many color schemes for the grid heat map. Users change the scheme by clicking the name of the scheme they want. Whenever the dropdown list "observes" a change, the "change scheme" function updates the color scale, so the graph is redrawn in the new color scheme.