In [2]:
#add necessary libraries
import networkx as nx #library supporting networks
import matplotlib.pyplot as plt #plotting
import pandas as pd
import geopandas as gpd
import numpy as np
import scipy.stats as stat
from scipy import optimize
import pysal as ps
from zipfile import ZipFile
from StringIO import StringIO
#make sure plots are embedded into the notebook
%pylab inline 
import statsmodels.formula.api as smf

Populating the interactive namespace from numpy and matplotlib


## 1. Data Loading

In [3]:
# load state gdp data (2014)
gdp = pd.read_csv('bea_gdp_by_state_in_millions.csv', header = 0, names=['fips', 'state', 'gdp'], index_col=['state'])
print gdp.head()
print gdp.columns
print len(gdp)

            fips      gdp
state                    
Alabama     1000   199440
Alaska      2000    57080
Arizona     4000   284156
Arkansas    5000   121395
California  6000  2311616
Index([u'fips', u'gdp'], dtype='object')
51


In [4]:
# load state capitals
caps = pd.read_csv('Capitals.csv', header = 0, index_col=['state'])
print caps.head()
print caps.columns
print len(caps)

            id abbrev      capital   latitude   longitude  population
state                                                                
Alabama      1     AL   Montgomery  32.380120  -86.300629      205764
Alaska       2     AK       Juneau  58.299740 -134.406794       31275
Arizona      4     AZ      Phoenix  33.448260 -112.075774     1445632
Arkansas     5     AR  Little Rock  34.748655  -92.274494      193524
California   6     CA   Sacramento  38.579065 -121.491014      466488
Index([u'id', u'abbrev', u'capital', u'latitude', u'longitude', u'population'], dtype='object')
50


In [5]:
# load state population centers
popcenter = pd.read_csv('CenPop2010_Mean_ST.txt', index_col=['STNAME'])
print popcenter.head()
print popcenter.columns

            STATEFP  POPULATION   LATITUDE   LONGITUDE
STNAME                                                
Alabama           1     4779736  33.008097  -86.756826
Alaska            2      710231  61.399882 -148.873973
Arizona           4     6392017  33.368266 -111.864310
Arkansas          5     2915918  35.142580  -92.655243
California        6    37253956  35.463595 -119.325359
Index([u'STATEFP', u'POPULATION', u'LATITUDE', u'LONGITUDE'], dtype='object')


In [57]:
# load state tax rates (all types)
tax = pd.read_excel('Taxes rates by state.xlsx', index_col=['State'])
print len(tax)

# clean col names to make easier to work with
tax.columns = ['State_Sales', 'Avg_Local_Sales', 'Combined_Sales', 'Max_Local_Sales'
               , 'Property', 'Income_Low', 'Income_High', 'Mature_Firm_HQ', 'New_Firm_HQ']
print tax.columns
#print tax.head()

# clean index names
tax.index =  [state.replace("\"", "") for state in tax.index]
#print tax.index

# convert percentages to floats
tax['New_Firm_HQ'] = tax['New_Firm_HQ'].replace('%','',regex=True).astype('float')/100
tax['Mature_Firm_HQ'] = tax['Mature_Firm_HQ'].replace('%','',regex=True).astype('float')/100

print tax.head()

51
Index([u'State_Sales', u'Avg_Local_Sales', u'Combined_Sales',
       u'Max_Local_Sales', u'Property', u'Income_Low', u'Income_High',
       u'Mature_Firm_HQ', u'New_Firm_HQ'],
      dtype='object')
Index([             u'Alabama',               u'Alaska',
                    u'Arizona',             u'Arkansas',
                 u'California',             u'Colorado',
                u'Connecticut',             u'Delaware',
                    u'Florida',              u'Georgia',
                    u'Hawaii ',                u'Idaho',
                   u'Illinois',              u'Indiana',
                       u'Iowa',               u'Kansas',
                   u'Kentucky',            u'Louisiana',
                      u'Maine',             u'Maryland',
              u'Massachusetts',             u'Michigan',
                  u'Minnesota',          u'Mississippi',
                   u'Missouri',             u'Montana ',
                   u'Nebraska',               u'Nevada',
 

In [37]:
# This is only for the shapes that will be used by PySAL to 
# build the spatial weights matrix
data = gpd.read_file('cb_2014_us_state_5m/cb_2014_us_state_5m.shp')
psGeom = ps.open('cb_2014_us_state_5m/cb_2014_us_state_5m.shp', 'r')

print data.columns

Index([u'AFFGEOID',    u'ALAND',   u'AWATER',    u'GEOID',     u'LSAD',
           u'NAME',  u'STATEFP',  u'STATENS',   u'STUSPS', u'geometry'],
      dtype='object')


## 2. Build spatial weight matrices

### 2.1. Rook 

In [39]:
# We are building the spatial weight matrix and using the 
# state names as IDs of the matrix.

R = ps.buildContiguity(psGeom, criterion='rook', ids=data['NAME'].values.tolist())
R.transform = 'R' # normalize

Island ids:  [u'Puerto Rico', u'Commonwealth of the Northern Mariana Islands', u'Alaska', u'Hawaii', u'United States Virgin Islands', u'American Samoa', u'Guam']


In [41]:
#for (loc, neighbors) in R:
    #print loc, neighbors

### 2.2. Queen 

In [42]:
# We are building the spatial weight matrix and using the 
# state names as IDs of the matrix. Noted that we
# running a 'queen', shared vertices, neighborhood test.

Q = ps.buildContiguity(psGeom, criterion='queen', ids=data['NAME'].values.tolist())
Q.transform = 'R' # normalize

Island ids:  [u'Puerto Rico', u'Commonwealth of the Northern Mariana Islands', u'Alaska', u'Hawaii', u'United States Virgin Islands', u'American Samoa', u'Guam']


In [47]:
#for (loc, neighbors) in Q:
    #print loc, neighbors

### 2.3. Distance-weighted by state capitals (50 by 50 matrix) 

### 2.4. Distance-weighted by population centers (50 by 50 matrix) 

### 2.5. Gravity model (???) 

## 3. Calculate spatial auto-correlations for tax rates (each type)
### Morans I (see lab9_sa from NYU Classes)

In [None]:
# Template

# Y is the normalized list of values

# Y = data['percent'].values
# Y = (Y-Y.mean())/Y.std() # <<<---- normalization


# W is the standardized dict. of all the weights. This can be 
# done by specifying 'R' as the matrix transformation. All the weights
# should add up to 1.

# W.transform = 'R'


# Execute the Moran's I calculation

# mi = ps.Moran(Y, W)

# This is the Moran's I value, that would tell us whether tax rates
# among states are clustered, or not.

# mi.I

# Check the p-value of the calculation. This has to be < 0.05 for our
# calculation to be statistically significant.

# mi.p_sim

### 2.1. Rook 

In [51]:
# let's try this for the rook method and state sales
subset = tax[~tax['State_Sales'].isnull()]['State_Sales']
print subset.keys()

Y = subset.values
Y = (Y-Y.mean())/Y.std() # <<<---- normalization

print Y

for (x,y) in R:
    print type(x), x,y
#mi_r = ps.Moran(R, )

Index([              u'Alabama',               u'Arizona',
                    u'Arkansas',            u'California',
                    u'Colorado',           u'Connecticut',
                     u'Florida',               u'Georgia',
                     u'Hawaii ',                 u'Idaho',
                    u'Illinois',               u'Indiana',
                        u'Iowa',                u'Kansas',
                    u'Kentucky',             u'Louisiana',
                       u'Maine',              u'Maryland',
               u'Massachusetts',              u'Michigan',
                   u'Minnesota',           u'Mississippi',
                    u'Missouri',              u'Nebraska',
                      u'Nevada',           u'New Jersey ',
                 u'New Mexico ',              u'New York',
              u'North Carolina',          u'North Dakota',
                        u'Ohio',              u'Oklahoma',
                u'Pennsylvania',          u'Rhode Island

### First, normalize all tax rates