In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import linregress
import matplotlib.pyplot as plt
import matplotlib.cm as cm 
from matplotlib.colors import ListedColormap
#bokeh
from bokeh.io import output_file, output_notebook, reset_output, show
from bokeh.layouts import row, column, widgetbox
from bokeh.models.widgets import Select
from bokeh.models import BasicTicker, ColorBar, CustomJS, ColumnDataSource, LinearColorMapper
from bokeh.palettes import RdBu
from bokeh.plotting import figure
from bokeh.transform import transform

In [2]:
# Clear any Bokeh output mode that may already be active in this kernel
# (for example output to a file), so earlier settings do not leak into this run.
reset_output()
# Render Bokeh plots inline, inside the notebook.
output_notebook()
# If the notebook server stops sending output for large plots, start it with a
# higher IOPub data rate limit, e.g.:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

In [3]:
# Read the two raw inputs: the YCOM survey table and the county-level census table.
ycom_csv = 'YCOM_2018_Data.csv'
census_csv = 'us-census-demographic-data/acs2015_county_data.csv'
ycom = pd.read_csv(ycom_csv, encoding='latin-1')
census = pd.read_csv(census_csv)

In [4]:
# Deselect Puerto Rico from census, since the YCOM data doesn't cover it.
# Only the first 3142 rows are kept (assumes the Puerto Rico rows come after
# them in the file -- TODO confirm if the census file is ever replaced).
census = census.head(3142)

In [5]:
# Express Men, Women, Citizen and Employed as a percentage of TotalPop,
# because a share of the population is more meaningful than a raw head count.
# NOTE: this overwrites the columns in place, so running the cell a second time
# without reloading census rescales the already-scaled values.
for count_col in ['Men', 'Women', 'Citizen', 'Employed']:
    census[count_col] = 100*census[count_col]/census.TotalPop

In [6]:
# Keep only the county-level rows of the YCOM table and renumber them from 0.
is_county = ycom['GeoType'] == 'County'
ycom_county = ycom[is_county].reset_index(drop=True)

In [7]:
# Split the 'GeoName' column (expected form: '<county>, <state>') on the comma
# and add the two parts to 'ycom_county' as separate 'State' and 'County' columns.
# expand=True keeps the row index of 'ycom_county', so the parts stay aligned
# with their rows, and .str.strip() removes the blank that follows the comma.
# 'State' is added before 'County' on purpose: later cells rely on these being
# the last two columns, in this order.
geo_parts = ycom_county['GeoName'].str.split(',', expand=True)
ycom_county['State'] = geo_parts[1].str.strip()
ycom_county['County'] = geo_parts[0].str.strip()

In [8]:
# Remove the words 'County' and 'Parish' from the county names, then trim the
# whitespace that is left behind at either end.
ycom_county['County'] = (
    ycom_county['County']
    .str.replace('County', '')
    .str.replace('Parish', '')
    .str.strip()
)

In [9]:
# Check that both datasets list the same counties in the same order (and carry
# the same row index). The cells below pair YCOM and census rows one-to-one, so
# a mismatch must stop the notebook here instead of passing silently and
# producing statistics on misaligned rows.
counties_match = ycom_county['County'].equals(census['County'])
assert counties_match, (
    "County names in ycom_county and census differ or are in a different order"
)
print('Yaaaaay!')

Yaaaaay!


In [10]:
# Put the YCOM and census columns side by side in a single dataframe
# (rows are matched on the index).
# Note: both inputs carry a 'County' column, so that name appears twice in data2.
data2 = pd.concat([ycom_county, census], axis='columns')

In [11]:
# Sanity check of the statistics on one hand-picked pair of variables:
# regress the census 'Drive' column on the YCOM 'worriedOppose' column.
pair_columns = [ycom_county['worriedOppose'], census['Drive']]
datain = pd.concat(pair_columns, axis='columns')
stats_datain = linregress(x=datain['worriedOppose'], y=datain['Drive'])
stats_datain

LinregressResult(slope=0.58483955162064671, intercept=52.336859719026997, rvalue=0.47036081410347225, pvalue=9.6571823750892251e-173, stderr=0.019581357331979458)

In [12]:
# Names of the variables to correlate.
# YCOM: skip the first 3 columns and the last 2 (the 'State' and 'County'
# columns appended to ycom_county earlier in the notebook).
# Census: skip the first 4 columns.
n_ycom = ycom_county.columns[3:-2].tolist()
n_census = census.columns[4:].tolist()

In [13]:
# Linear regression of every census variable against every YCOM variable.
# The last axis holds the five linregress outputs in order: slope, intercept,
# rvalue (Pearson's correlation coefficient), pvalue (2-tailed) and stderr.
stats_shape = (len(n_ycom), len(n_census), 5)
stats_outputs = np.zeros(stats_shape)
stats_outputs_standard = np.zeros(stats_shape)
for x, ycom_name in enumerate(n_ycom):
    ycom_column = ycom_county[ycom_name]
    for y, census_name in enumerate(n_census):
        census_column = census[census_name]
        # NaNs show up when the census index is 9, 10, 14, i.e. income,
        # incomeErr, childpoverty. The reason is Loving, Texas (not kidding),
        # ind=2673, a county with no data for these variables.
        # N.B. a county with a missing census value for the given variable is
        # ignored for that calculation (missing YCOM values are not filtered).
        has_census_value = census_column.notnull()
        ycom_notnull = ycom_column[has_census_value]
        census_notnull = census_column[has_census_value]

        # Same regression on standardized variables:
        # standardized_column = (column - mean(column)) / std(column)
        ycom_standard = (ycom_notnull - np.mean(ycom_notnull)) / np.std(ycom_notnull)
        census_standard = (census_notnull - np.mean(census_notnull)) / np.std(census_notnull)

        stats_outputs[x, y, :] = linregress(ycom_notnull, census_notnull)
        stats_outputs_standard[x, y, :] = linregress(ycom_standard, census_standard)

print('size of stats array created',
      '#ycom,#census,#stattypes',
      stats_outputs.shape,
      sep='\n')

size of stats array created
#ycom,#census,#stattypes
(56, 33, 5)


In [14]:
# Dataframe of correlation coefficients only (rvalue is the third linregress
# output, index 2 on the last axis): one row per YCOM variable, one column per
# census variable.
cors = pd.DataFrame(data=stats_outputs[..., 2], index=n_ycom, columns=n_census)

In [18]:
#bokeh heatmap

# Palette for the correlation coefficients; swap in your own palette here.
colors = RdBu[11]

cors.index.name = 'ycom'
cors.columns.name = 'census'

# Long format: one row per (ycom, census) pair with the coefficient in 'value'.
cors_stack = cors.stack().rename("value").reset_index()

# Category lists for the two axes, in order of first appearance.
census_vars = list(cors_stack.census.drop_duplicates())
ycom_vars = list(cors_stack.ycom.drop_duplicates())

# Map colour to value over the full -1..1 range of a correlation coefficient,
# rather than the min/max actually observed, so that 0 sits mid-palette.
mapper = LinearColorMapper(palette=colors, low=-1, high=1)

# Define a figure. The hover labels name the column they actually display, and
# R is shown as a plain two-decimal number (it is a coefficient, not a percentage).
plot_heatmap = figure(
    plot_width=600,
    plot_height=400,
    title="",
    x_range=ycom_vars,
    y_range=census_vars,
    toolbar_location=None,
    tools="",
    x_axis_location="below",
    tooltips=[('YCOM', '@ycom'), ('Census', '@census'), ('R', '@value{0.00}')])

plot_heatmap.axis.major_label_text_font_size = "5pt"
plot_heatmap.xaxis.major_label_orientation = 1.2

# One rectangle per (ycom, census) pair, filled according to the coefficient.
plot_heatmap.rect(
    x="ycom",
    y="census",
    width=1,
    height=1,
    source=ColumnDataSource(cors_stack),
    line_color=None,
    fill_color=transform('value', mapper))

# Add the colour legend. len() already returns an int; the np.int alias that
# used to wrap it has been removed from NumPy.
color_bar = ColorBar(
    color_mapper=mapper,
    location=(0, 0),
    ticker=BasicTicker(desired_num_ticks=len(colors)))

plot_heatmap.add_layout(color_bar, 'right')

show(plot_heatmap)

In [19]:
#interactive scatter plots

# Both dropdowns start on the first variable of their list, so the scatter shows
# real data straight away instead of a single placeholder point.
initial_census = n_census[0]
initial_ycom = n_ycom[0]

# Data sent to the browser:
# - only the columns offered in the dropdowns (the rest of data2 is never read
#   by the callbacks), which also keeps the payload small;
# - only 1 in 2 data points, because otherwise things often crash;
# - plus 'x' and 'y', the columns the scatter actually draws. The JS callbacks
#   below overwrite them with whatever is chosen in the dropdowns.
# data2 itself is left untouched.
df = data2.iloc[1::2][n_census + n_ycom]
df = df.assign(x=df[initial_census], y=df[initial_ycom])

# telling plot what source to use
source = ColumnDataSource(df)

# setting up dropdowns
census_menu = Select(options=n_census, value=initial_census, title='Census Variables')
ycom_menu = Select(options=n_ycom, value=initial_ycom, title='YCOM Variables')

# generating plot (census variable on x, YCOM variable on y)
plot_scatter = figure(plot_width=350, plot_height=350)
plot_scatter.scatter('x', 'y', source=source)
plot_scatter.xaxis.axis_label = initial_census
plot_scatter.yaxis.axis_label = initial_ycom

callback_census = CustomJS(args={'source': source, 'axis': plot_scatter.xaxis[0]}, code="""
        // cb_obj is the Select widget that fired the callback;
        // cb_obj.value is the selected column name.

        // the data of the column data source that is linked to the plot
        var data = source.data;

        // use the selected column for the x values and label the axis with it
        data['x'] = data[cb_obj.value];
        axis.axis_label = cb_obj.value;

        // register the change - this is required to process the change in
        // the x values
        source.change.emit();
""")

callback_ycom = CustomJS(args={'source': source, 'axis': plot_scatter.yaxis[0]}, code="""
        var data = source.data;

        // use the selected column for the y values and label the axis with it
        data['y'] = data[cb_obj.value];
        axis.axis_label = cb_obj.value;

        // register the change - this is required to process the change in
        // the y values
        source.change.emit();
""")

# Run the callbacks each time the selected option changes. js_on_change is used
# instead of assigning to the widgets' `callback` property, which newer Bokeh
# releases no longer provide.
census_menu.js_on_change('value', callback_census)
ycom_menu.js_on_change('value', callback_ycom)

# Heatmap on the left; dropdowns stacked above the scatter on the right.
layout_rhs = column(census_menu, ycom_menu, plot_scatter)
show(row(plot_heatmap, layout_rhs))
        