# ``mggg-states`` Data QA

In [None]:
!pip install numpy
!pip install pandas
!pip install geopandas

!pip install git+https://github.com/KeiferC/gdutils.git # install gdutils package

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import json

import gdutils.datamine as dm
import gdutils.dataqa as dq

## Data Collection

In [None]:
# This is going to take a while to complete
dm.clone_gh_repos(account='mggg-states', 
                  account_type='orgs', 
                  outpath='shps/')

In [4]:
mggg_shapefiles = dm.list_files_of_type('.zip', 'shps/')
# ^ used to see what shapefiles to work with

## Data Standardization Check

### Generate Standards

In [5]:
with open('naming_convention.json') as json_file:
    standards_raw = json.load(json_file)

In [6]:
offices = dm.get_keys_by_category(standards_raw, 'offices')
parties = dm.get_keys_by_category(standards_raw, 'parties')
counts = dm.get_keys_by_category(standards_raw, 'counts')
others = dm.get_keys_by_category(standards_raw, 
            ['geographies', 'demographics', 'districts', 'other'])

In [7]:
elections = [office + format(year, '02') + party 
                 for office in offices
                 for year in range(0, 21)
                 for party in parties 
                 if not (office == 'PRES' and year % 4 != 0)]

counts = [count + format(year, '02') for count in counts 
                                     for year in range(0, 20)]

In [8]:
standards = elections + counts + others

### Compare Data with Standards

In [9]:
def gdf_from_zip(path: str) -> gpd.GeoDataFrame:
    return gpd.read_file('zip://' + path)

states_gdf = []
all_discrepancies = []

_Note:_ All states are compared manually below so that an individual can examine at state by itself if wanted

#### Alaska

In [10]:
ak_gdf = gdf_from_zip('shps/AK-shapefiles.git/AK_precincts.zip')

In [11]:
(ak_matches, ak_discrepancies) = dq.compare_column_names(ak_gdf, standards)
states_gdf.append(('AK', ak_gdf, ak_matches, None))
all_discrepancies.append(('AK', ak_discrepancies))

#### Arizona

In [12]:
az_gdf = gdf_from_zip('shps/AZ-shapefiles.git/az_precincts.zip')

In [13]:
(az_matches, az_discrepancies) = dq.compare_column_names(az_gdf, standards)
states_gdf.append(('AZ', az_gdf, az_matches, 
                   "Shapefile name does not fit convention."))
all_discrepancies.append(('AZ', az_discrepancies))

#### Colorado

In [14]:
co_gdf = gdf_from_zip('shps/CO-shapefiles.git/CO_precincts.zip')

In [15]:
(co_matches, co_discrepancies) = dq.compare_column_names(co_gdf, standards)
states_gdf.append(('CO', co_gdf, co_matches, None))
all_discrepancies.append(('CO', co_discrepancies))

#### Connecticut

In [16]:
ct_gdf = gdf_from_zip('shps/CT-shapefiles.git/CT_precincts.zip')

In [17]:
(ct_matches, ct_discrepancies) = dq.compare_column_names(ct_gdf, standards)
states_gdf.append(('CT', ct_gdf, ct_matches, None))
all_discrepancies.append(('CT', ct_discrepancies))

#### Delaware

In [18]:
de_gdf = gdf_from_zip('shps/DE-shapefiles.git/DE_precincts.zip')

In [19]:
(de_matches, de_discrepancies) = dq.compare_column_names(de_gdf, standards)
states_gdf.append(('DE', de_gdf, de_matches, None))
all_discrepancies.append(('DE', de_discrepancies))

#### Georgia

In [20]:
ga_gdf = gdf_from_zip('shps/GA-shapefiles.git/GA_precincts.zip')

In [21]:
(ga_matches, ga_discrepancies) = dq.compare_column_names(ga_gdf, standards)
states_gdf.append(('GA', ga_gdf, ga_matches, None))
all_discrepancies.append(('GA', ga_discrepancies))

#### Hawaii

In [22]:
hi_gdf = gdf_from_zip('shps/HI-shapefiles.git/HI_precincts.zip')

In [23]:
(hi_matches, hi_discrepancies) = dq.compare_column_names(hi_gdf, standards)
states_gdf.append(('HI', hi_gdf, hi_matches, None))
all_discrepancies.append(('HI', hi_discrepancies))

#### Indiana

In [24]:
ia_gdf = gdf_from_zip('shps/IA-shapefiles.git/IA_counties.zip')

In [25]:
(ia_matches, ia_discrepancies) = dq.compare_column_names(ia_gdf, standards)
states_gdf.append(('IA', ia_gdf, ia_matches, None))
all_discrepancies.append(('IA', ia_discrepancies))

#### Illinois
_Note:_ Chicago only

In [26]:
il_gdf = gdf_from_zip('shps/IL-shapefiles.git/Chicago/Chicago_Precincts.zip')

In [27]:
(il_matches, il_discrepancies) = dq.compare_column_names(il_gdf, standards)
states_gdf.append(('IL', il_gdf, il_matches, 
                   "Chicago only. Shapefile name does not fit convention."))
all_discrepancies.append(('IL', il_discrepancies))

#### Massachusetts 
_Note:_ Multiple shapefiles exist. Includes (``'MA_no_islands*'``).

In [28]:
ma_gdf = gdf_from_zip('shps/MA-shapefiles.git/MA_precincts_12_16.zip')

In [29]:
(ma_matches, ma_discrepancies) = dq.compare_column_names(ma_gdf, standards)
states_gdf.append(('MA', ma_gdf, ma_matches, 
                   "Multiple shapefiles. Using MA_precincts_12_16.zip. Shapefile name does not fit convention."))
all_discrepancies.append(('MA', ma_discrepancies))

#### Maryland
_Note:_ Not sure difference between ``MD_precincts`` and ``MD_precincts_abs``.

In [30]:
md_gdf = gdf_from_zip('shps/MD-shapefiles.git/MD_precincts.zip')

In [31]:
(md_matches, md_discrepancies) = dq.compare_column_names(md_gdf, standards)
states_gdf.append(('MD', md_gdf, md_matches, 
                   "Multiple shapefiles. Using MD_precincts.zip"))
all_discrepancies.append(('MD', md_discrepancies))

#### Michigan

In [32]:
mi_gdf = gdf_from_zip('shps/MI-shapefiles.git/MI_precincts.zip')

In [33]:
(mi_matches, mi_discrepancies) = dq.compare_column_names(mi_gdf, standards)
states_gdf.append(('MI', mi_gdf, mi_matches, None))
all_discrepancies.append(('MI', mi_discrepancies))

#### Minnesota
_Note:_ Multiple files exist. Assuming ``MN16`` is the most up-to-date.

In [34]:
mn_gdf = gdf_from_zip('shps/MN-shapefiles.git/MN16.zip')

In [35]:
(mn_matches, mn_discrepancies) = dq.compare_column_names(mn_gdf, standards)
states_gdf.append(('MN', mn_gdf, mn_matches, 
                   "Multiple shapefiles. Using MN16.zip. Shapefile name does not fit convention."))
all_discrepancies.append(('MN', mn_discrepancies))

#### North Carolina
_Note:_ Unsure if VTDs in shapefile represent precincts accurately.

In [36]:
nc_gdf = gdf_from_zip('shps/NC-shapefiles.git/NC_VTD.zip')

In [37]:
(nc_matches, nc_discrepancies) = dq.compare_column_names(nc_gdf, standards)
states_gdf.append(('NC', nc_gdf, nc_matches, 
                   "Multiple shapefiles. Using NC_VTD.zip. Shapefile name does not fit convention."))
all_discrepancies.append(('NC', nc_discrepancies))

#### New Mexico

In [38]:
nm_gdf = gdf_from_zip('shps/NM-shapefiles.git/new_mexico_precincts.zip')

In [39]:
(nm_matches, nm_discrepancies) = dq.compare_column_names(nm_gdf, standards)
states_gdf.append(('NM', nm_gdf, nm_matches, 
                   "Shapefile name does not fit convention."))
all_discrepancies.append(('NM', nm_discrepancies))

#### Ohio

In [40]:
oh_gdf = gdf_from_zip('shps/OH-shapefiles.git/OH_precincts.zip')

In [41]:
(oh_matches, oh_discrepancies) = dq.compare_column_names(oh_gdf, standards)
states_gdf.append(('OH', oh_gdf, oh_matches, None))
all_discrepancies.append(('OH', oh_discrepancies))

#### Oklahoma

In [42]:
ok_gdf = gdf_from_zip('shps/OK-shapefiles.git/OK_precincts.zip')

In [43]:
(ok_matches, ok_discrepancies) = dq.compare_column_names(ok_gdf, standards)
states_gdf.append(('OK', ok_gdf, ok_matches, None))
all_discrepancies.append(('OK', ok_discrepancies))

#### Oregon

In [44]:
or_gdf = gdf_from_zip('shps/OR-shapefiles.git/OR_precincts.zip')

In [45]:
(or_matches, or_discrepancies) = dq.compare_column_names(or_gdf, standards)
states_gdf.append(('OR', or_gdf, or_matches, None))
all_discrepancies.append(('OR', or_discrepancies))

#### Pennsylvania
_Note:_ Unsure of VTDs in shapefile accurately represent precincts.

In [46]:
pa_gdf = gdf_from_zip('shps/PA-shapefiles.git/PA_VTDs.zip')

In [47]:
(pa_matches, pa_discrepancies) = dq.compare_column_names(pa_gdf, standards)
states_gdf.append(('PA', pa_gdf, pa_matches, 
                   "Shapefile uses VTDs. Shapefile name does not fit convention."))
all_discrepancies.append(('PA', pa_discrepancies))

#### Rhode Island

In [48]:
ri_gdf = gdf_from_zip('shps/RI-shapefiles.git/RI_precincts.zip')

In [49]:
(ri_matches, ri_discrepancies) = dq.compare_column_names(ri_gdf, standards)
states_gdf.append(('RI', ri_gdf, ri_matches, None))
all_discrepancies.append(('RI', ri_discrepancies))

#### Texas 
_Note:_ Shapefile is stored externally at site: https://people.csail.mit.edu/ddeford/TX_vtds.zip

In [50]:
tx_gdf = gpd.read_file('https://people.csail.mit.edu/ddeford/TX_vtds.zip')

In [51]:
(tx_matches, tx_discrepancies) = dq.compare_column_names(tx_gdf, standards)
states_gdf.append(('TX', tx_gdf, tx_matches, 
                   "Shapefile stored externally (MIT). Shapefile name does not fit convention."))
all_discrepancies.append(('TX', tx_discrepancies))

#### Utah

In [52]:
ut_gdf = gdf_from_zip('shps/UT-shapefiles.git/UT_precincts.zip')

In [53]:
(ut_matches, ut_discrepancies) = dq.compare_column_names(ut_gdf, standards)
states_gdf.append(('UT', ut_gdf, ut_matches, None))
all_discrepancies.append(('UT', ut_discrepancies))

#### Virginia

In [54]:
va_gdf = gdf_from_zip('shps/VA-shapefiles.git/VA_precincts.zip')

In [55]:
(va_matches, va_discrepancies) = dq.compare_column_names(va_gdf, standards)
states_gdf.append(('VA', va_gdf, va_matches, None))
all_discrepancies.append(('VA', va_discrepancies))

#### Vermont
_Note:_ Unsure if towns represent precincts.

In [56]:
vt_gdf = gdf_from_zip('shps/VT-shapefiles.git/VT_towns.zip')

In [57]:
(vt_matches, vt_discrepancies) = dq.compare_column_names(vt_gdf, standards)
states_gdf.append(('VT', vt_gdf, vt_matches, 
                   "Uses towns as counting unit. Shapefile name does not fit convention."))
all_discrepancies.append(('VT', vt_discrepancies))

#### Wisconsin

In [58]:
wi_gdf = gdf_from_zip('shps/WI-shapefiles.git/WI_wards_12_16.zip')

In [59]:
(wi_matches, wi_discrepancies) = dq.compare_column_names(wi_gdf, standards)
states_gdf.append(('WI', wi_gdf, wi_matches, 
                   "Shapefile name does not fit convention."))
all_discrepancies.append(('WI', wi_discrepancies))

### Discrepancies Summary

In [60]:
print("======== List of Discrepancies by State ========")
print()

for state, discrs in all_discrepancies:
    print('{} : ---- \n{}\n'.format(state, discrs))


AK : ---- 
{'NAME', 'DISTRICT', 'ID', 'BLACK', '2MORE', 'OTHER', 'WHITE', 'NHPI', 'PRES16C', 'AREA', 'POPULATION', 'ASIAN', 'AMIN'}

AZ : ---- 
{'PCTNAME', 'CNTYABV'}

CO : ---- 
{'NAME', 'CD116FP', 'PRECID', 'REG18R', 'REG18D', 'SLDUST', 'SLDLST', 'VTDST'}

CT : ---- 
{'NAME10', 'COUNTYFP10', 'NAMELSAD10', 'STATEFP10'}

DE : ---- 
{'District_1', 'POPULATION', 'ID', 'EDRD_2012'}

GA : ---- 
{'DISTRICT', 'ID', 'PRECINCT_N', 'FIPS1', 'FIPS2', 'PRECINCT_I', 'CTYNAME'}

HI : ---- 
{'NAME', 'BALLOTTYPE', 'ZEROPOP', 'DPTYPE'}

IA : ---- 
{'PRES12OTH', 'PRES04OTH', 'PRES16OTH', 'PRES00OTH', 'NAME10', 'INTPTLON10', 'AWATER10', 'INTPTLAT10', 'PRES08OTH', 'N_NHPI', 'COUNTYFP10', 'ALAND10', 'STATEFP10', 'NAMELSAD10', 'GEOID10'}

IL : ---- 
{'10K_15K', 'RO_G15', 'shape_area', 'MCCART_19', 'EMAN_G15', 'shape_len', '60K_75K', 'TOTV_RO15', 'WILS_G15', 'LESS_10K', 'KOZLAR_19', 'WALLS_G15', 'GARCIA_G15', 'TOTV_19', 'JOYCE_19', 'VALLAS_19', 'full_text', '25K_30K', '50K_60K', '200K_MORE', '45K_50K', 'wa

In [61]:
def flatten(xs):
    return [x for sublist in xs for x in sublist]

bulk_discrepancies = flatten([discrs for _, discrs in all_discrepancies])
bulk_discrepancies = list(set(bulk_discrepancies))

print("======== List of all unique discrepancies ========")
print("Total number of unique discrepancies: ", len(bulk_discrepancies))
bulk_discrepancies

Total number of unique discrepancies:  485


['PRES12OTH',
 'BHVAP',
 '538GOP',
 'GOVIND214',
 'TRSSCT14',
 'ALAND10',
 'HDIST_11',
 'WAGREP14',
 'WALLS_G15',
 'CTU_TYPE',
 'EL14G_US_4',
 'TOTTO16',
 'BLACK18',
 'WSAIND12',
 'EL14G_US_3',
 'CON',
 '125K_150K',
 'RO_E15',
 'GOVIND414',
 'WSAREP212',
 'TRSTOT14',
 'USHSCT14',
 'G18RHOR',
 '75K_100K',
 'CNTYVTD',
 'WAGIND14',
 'JUDDIST',
 'CountyID',
 'ShapeSTLen',
 'GOVTOT12',
 'Shape__Are',
 'WAGDEM12',
 'ADJ_OTHER',
 'SLDUST',
 'county_nam',
 'T16PRESR',
 'TOTVR16',
 'PREIND1116',
 'CTYNAME',
 'USHDEM14',
 'WSASCT12',
 'MTFCC10',
 'PREIND716',
 'ADJNHPIVAP',
 'CDDIST',
 'VTD',
 'HISPVAP',
 'USHIND14',
 'FUNCSTAT10',
 'ADJ_POP',
 'ADJ_2MORE',
 'WSAIND212',
 'WSSDEM12',
 'CHICO_19',
 'G16RPRS',
 'FIPS',
 'WAGTOT14',
 'VTDI10',
 'OBJECTID',
 'MCD',
 'WSATOT16',
 'CD116FP',
 'CTYCOMDIST',
 'REGVOT16',
 'USS12R',
 'RSHARE',
 'G18OSEN',
 'TOTV12',
 'EL08G_US_1',
 'EL12G_GV_D',
 'CD_2011',
 'PREIND212',
 'CDADEM12',
 'PISLAND',
 'TOTTO14',
 'EL08G_GV_D',
 'PRES00OTH',
 'Shape_Le_1',
 'T

## Election Data Summation

In [62]:
print("======== Election Data Summations by State ========")
print()
print("Format:")
print("Election = Total Votes \t\ttype: datatype of votes value")
print()

for state, gdf, matches, notes in states_gdf:
    print("---- {} ----".format(state))
    if notes is not None:
        print("# Note: {}".format(notes))
    
    cols_to_sum = set(matches).intersection(
                        set(elections).union(set(counts)))
        
    sums = dq.sum_column_values(gdf, cols_to_sum)
    for col, val in sums:
        print('{:10} = {:20} \t\ttype: {}'.format(col, val, type(val)))
    
    print()


Format:
Election = Total Votes 		type: datatype of votes value

---- AK ----
GOV18D     =                80954 		type: <class 'numpy.int64'>
GOV18L     =                 3970 		type: <class 'numpy.int64'>
GOV18R     =               100372 		type: <class 'numpy.int64'>
PRES16D    =                69097 		type: <class 'numpy.int64'>
PRES16G    =                 3782 		type: <class 'numpy.int64'>
PRES16L    =                12004 		type: <class 'numpy.int64'>
PRES16R    =               103457 		type: <class 'numpy.int64'>
SEN16D     =                19602 		type: <class 'numpy.int64'>
SEN16L     =                60768 		type: <class 'numpy.int64'>
SEN16R     =                87854 		type: <class 'numpy.int64'>
USH14D     =                77004 		type: <class 'numpy.int64'>
USH14L     =                14715 		type: <class 'numpy.int64'>
USH14R     =               102464 		type: <class 'numpy.int64'>
USH16D     =                67274 		type: <class 'numpy.int64'>
USH16L     =              

## Demographics Data Summation

In [65]:
print("======== Demographics Summations by State ========")
print()
print("Format:")
print("Demographic = Population Count \ttype: datatype of population count value")
print()

for state, gdf, matches, notes in states_gdf:
    print("---- {} ----".format(state))
    if notes is not None:
        print("# Note: {}".format(notes))
    
    cols_to_sum = set(matches).intersection(
                        dm.get_keys_by_category(standards_raw, 'demographics'))
        
    sums = dq.sum_column_values(gdf, cols_to_sum)
    for col, val in sums:
        print('{:10} = {:20} \t\ttype: {}'.format(col, val, str(type(val))))
    
    print()


Format:
Demographic = Population Count 	type: datatype of population count value

---- AK ----
2MOREVAP   =                25525 		type: <class 'numpy.int64'>
AMINVAP    =                70630 		type: <class 'numpy.int64'>
ASIANVAP   =                28312 		type: <class 'numpy.int64'>
BVAP       =                16904 		type: <class 'numpy.int64'>
NHPIVAP    =                 4599 		type: <class 'numpy.int64'>
OTHERVAP   =                 7988 		type: <class 'numpy.int64'>
TOTPOP     =               710231 		type: <class 'numpy.int64'>
VAP        =               522853 		type: <class 'numpy.int64'>
WVAP       =               368895 		type: <class 'numpy.int64'>

---- AZ ----
# Note: Shapefile name does not fit convention.
2MOREVAP   =              60372.0 		type: <class 'numpy.float64'>
AMINVAP    =             175207.0 		type: <class 'numpy.float64'>
ASIANVAP   =             132317.0 		type: <class 'numpy.float64'>
BVAP       =             172249.0 		type: <class 'numpy.float64'>
HI

## Clean Directory

In [64]:
#!echo y | rm -r ./*-shapefiles/
dm.remove_repos('shps/')