In [1]:
# Standard library imports

# Third party imports
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import linregress
import matplotlib.pyplot as plt
import matplotlib.cm as cm 
from matplotlib.colors import ListedColormap
# Bokeh imports
from bokeh.io import output_file, output_notebook, reset_output, show
from bokeh.layouts import row, column, widgetbox
from bokeh.models.widgets import Select,Panel, Tabs
from bokeh.models import BasicTicker, ColorBar, CustomJS, ColumnDataSource, LinearAxis, LinearColorMapper
from bokeh.palettes import RdBu
from bokeh.plotting import figure, save
from bokeh.transform import transform

# Local imports
import calculate_statistics
import prepare_data
import plot_heatmap

In [2]:
# Output configuration.
# Alternative (disabled): draw inside the notebook instead of writing to a file.
#   reset_output()     -> stops outputting to a file, in case that was started earlier
#   output_notebook()  -> puts output within the notebook
# If the notebook server stops sending output, start it with a higher data rate limit:
#   jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000

# Send Bokeh output to an HTML file
output_file(filename="heatmap.html")

In [3]:
# Preparing census data

# Load the census table and pass it through the preparation steps, in order:
#   1. scale Men, Women, Employed and Citizen by TotalPop to get a percentage
#   2. remove counties not in ycom data (i.e. puerto rico)
#   3. remove counties not in land area data
census = (
    pd.read_csv('../data/acs2015_county_data.csv')
    .pipe(prepare_data.scale_census_variables)
    .pipe(prepare_data.remove_census_not_in_ycom)
    .pipe(prepare_data.remove_not_in_land_area)
)

# List of census variables: every column label after the first three
n_census = list(census.columns[3:])

In [4]:
# Preparing YCOM data

# Read the ycom data and its metadata table
ycom = pd.read_csv('../data/YCOM_2018_Data.csv', encoding='latin-1')
ycom_meta = pd.read_csv('../data/YCOM_2018_Metadata.csv', encoding='latin-1')

# County level data matching census county names, with counties
# that are not in the land area data removed
ycom_county = (
    ycom
    .pipe(prepare_data.get_ycom_counties)
    .pipe(prepare_data.remove_not_in_land_area)
)

# List of YCOM variables: column labels without the first three and the last two
n_ycom = list(ycom_county.columns[3:-2])

# Edit the YCOM variable descriptions, then list them (skipping the first three)
ycom_meta = prepare_data.fix_ycom_descriptions(ycom_meta)
n_ycom_meta = list(ycom_meta['VARIABLE DESCRIPTION'].iloc[3:])

In [5]:
# Preparing land area data

# Load the land area spreadsheet and pass it through the preparation steps, in order:
#   1. select only counties
#   2. remove rows which are in land area but not census
#   3. fix county names so that they match those in census data
#   4. add land area values where missing
land_area_data = (
    pd.read_excel('../data/LND01.xls')
    .pipe(prepare_data.select_land_area_county)
    .pipe(prepare_data.remove_land_area_not_in_census)
    .pipe(prepare_data.fix_land_area_county_names, census)
    .pipe(prepare_data.add_missing_land_areas)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  land_area_data['County'] = land_area_county


In [6]:
# Things to use in unit tests

# Are the ycom counties the same as, and in the same order as, the census counties?
if (ycom_county['County'] == census['County']).all():
    print('Cool, lets put this in the unit tests')

# Same check for the land area counties against the census counties
if (land_area_data['County'] == census['County']).all():
    print('Cool, lets put this in the unit tests')

# Backup test: shows non-matching county names and their indices if there are any
# (an empty Series means every name matched)
pd.concat([land_area_data['County'], census['County']]).drop_duplicates(keep=False)

Cool, lets put this in the unit tests
Cool, lets put this in the unit tests


Series([], Name: County, dtype: object)

In [7]:
# Getting one dataframe from the three datasets

# Add 'LogPopDensity' to the list of census variable names.
# Guarded so that re-running this cell does not append a duplicate entry.
if 'LogPopDensity' not in n_census:
    n_census.append('LogPopDensity')

# Join the ycom, census and land area data into a single dataframe
combined_data = prepare_data.join_data(ycom_county, census, land_area_data)

In [8]:
# Generate correlation (R), regression (b) and pvalues for relationships between variables
# NOTE(review): n_census has had 'LogPopDensity' appended by this point; presumably
# `census` carries that column by now (added in join_data?) -- confirm.
stats_outputs = calculate_statistics.calculate_stats_outputs(n_ycom, n_census, ycom_county, census)
stats_outputs_standard = calculate_statistics.calculate_stats_outputs_standard(n_ycom, n_census, ycom_county, census)

In [9]:
# Each table has one row per YCOM variable and one column per census variable.

# Regression coefficients, taken from the standardized outputs (slice 0).
# These are kinda standardized, i.e. they show what % change in an opinion
# is given a 1 standard deviation change in a census variable.
regs = pd.DataFrame(data=stats_outputs_standard[:, :, 0], index=n_ycom, columns=n_census)

# Correlation coefficients (slice 2 of the unstandardized outputs)
cors = pd.DataFrame(data=stats_outputs[:, :, 2], index=n_ycom, columns=n_census)

# p values (slice 3 of the unstandardized outputs)
pval = pd.DataFrame(data=stats_outputs[:, :, 3], index=n_ycom, columns=n_census)

In [10]:
# Prepare dataframe in the right format for heatmap:
# passes the correlation (cors), regression (regs) and p value (pval) tables
# to plot_heatmap.stack_stats and keeps the result for the heatmap figures.
all_stack = plot_heatmap.stack_stats(cors, regs, pval)

In [12]:
# Create and plot heatmap of either 'R' (correlation), 'b' (regression) or 'pval' (p value) statistics

# One heatmap figure per statistic
heatmap_plot_r, heatmap_plot_b, heatmap_plot_p = (
    plot_heatmap.create_heatmap_fig(all_stack, stat_name)
    for stat_name in ('R', 'b', 'pval')
)

# Wrap each figure in its own titled tab
tab1, tab2, tab3 = (
    Panel(child=heatmap_fig, title=tab_title)
    for heatmap_fig, tab_title in (
        (heatmap_plot_r, 'Correlation'),
        (heatmap_plot_b, 'Regression'),
        (heatmap_plot_p, 'p value'),
    )
)

tabs = Tabs(tabs=[tab1, tab2, tab3])
show(tabs)

# Save the tabbed layout to an HTML file; the returned path is the cell output
save(obj=tabs, filename='heatmap.html')

'/Users/robin/Desktop/Classes/Python/Climate opinions/project/climaps/heatmap.html'

In [13]:
# Interactive scatter plots
output_file("scatter.html")

# Creating extra columns which are going to be filled with whatever data is chosen from dropdown
# (99 is only a placeholder value until a variable is selected)
combined_data['x'] = 99
combined_data['y'] = 99

# Setting sources for scatter plots
# (Taking every other data point because otherwise too much memory is used)
source = ColumnDataSource(combined_data[1::2])
source_ycom_meta = ColumnDataSource(ycom_meta)

# Generating scatter plot
scatter_plot = figure(plot_width=350, plot_height=350)
scatter_plot.scatter('x', 'y', source=source)

# Hiding the default axes. `visible` is a boolean property, so it is set to False
# rather than None (None is not a valid boolean value and may be rejected by
# Bokeh's property validation).
scatter_plot.xaxis.visible = False
scatter_plot.yaxis.visible = False

# Adding some axes that can have their labels dynamically updated
xaxis = LinearAxis(axis_label="Census Variable")
yaxis = LinearAxis(axis_label="YCOM Variable")
scatter_plot.add_layout(xaxis, 'below')
scatter_plot.add_layout(yaxis, 'left')

# Creating javascript callbacks allowing for scatter plot to automatically update
callback_census = plot_heatmap.set_callback_census(source, xaxis)
callback_ycom = plot_heatmap.set_callback_ycom(source, yaxis, source_ycom_meta)

# Setting up dropdowns
census_menu = plot_heatmap.create_dropdown_census(n_census, callback_census)
ycom_menu = plot_heatmap.create_dropdown_ycom(n_ycom_meta, callback_ycom)

# Plotting scatter: both dropdowns stacked above the plot
layout_scatter = column(census_menu, ycom_menu, scatter_plot)
show(layout_scatter)

# Save the layout to an HTML file; the returned path is the cell output
save(obj=layout_scatter, filename='scatter.html')

'/Users/robin/Desktop/Classes/Python/Climate opinions/project/climaps/scatter.html'

In [1]:
# Showing whole figure (only works for plotting in notebook)
#layout_rhs=column(census_menu, ycom_menu,scatter_plot)
#show(row(tabs,layout_rhs))
#show(column(census_menu, ycom_menu, scatter_plot))
#save(obj=layout_rhs,filename='scatter.html')

In [16]:
# To-do list:
# Add a best-fit line to the scatter plot (will probably have to add another source)
# Add units to the plot_scatter axis labels for census variables
# Add more thorough descriptions of variables on hover
    #use ycom_meta['VARIABLE DESCRIPTION'][ycom_meta['YCOM VARIABLE NAME']=='TotalPop']
# Add a display of R, p value and regression coefficient for the chosen combination of variables on plot_scatter
# https://stackoverflow.com/questions/46884648/storing-3-dimensional-data-in-pandas-dataframe

# To offer a button that switches the heatmap between statistics (e.g. the regression coefficient):
# Multiple sources can be used, right? So have one source be cors, one be regs, one be pvals
# Have a dummy variable which is updated from one of these sources depending on the button that is pressed

# It would be cool if hovering over a point on the scatter plot showed the county/state