In [20]:
import pandas as pd
import math

In [5]:
general_df = pd.read_csv('openelections-data-ga/2016/20161108__ga__general.csv')
general_df

Unnamed: 0,county,office,district,party,candidate,votes,election_day,absentee,early_voting,provisional
0,,President of the United States,,REP,DONALD J. TRUMP,2089104,863089,102766,1120743,2506
1,Appling,President of the United States,,REP,DONALD J. TRUMP,5494,2948,362,2182,2
2,Atkinson,President of the United States,,REP,DONALD J. TRUMP,1878,843,58,972,5
3,Bacon,President of the United States,,REP,DONALD J. TRUMP,3364,1142,134,2082,6
4,Baker,President of the United States,,REP,DONALD J. TRUMP,775,396,76,302,1
5,Baldwin,President of the United States,,REP,DONALD J. TRUMP,7697,2929,330,4433,5
6,Banks,President of the United States,,REP,DONALD J. TRUMP,6134,3079,195,2858,2
7,Barrow,President of the United States,,REP,DONALD J. TRUMP,21108,8881,755,11471,1
8,Bartow,President of the United States,,REP,DONALD J. TRUMP,29911,15525,1038,13327,21
9,Ben Hill,President of the United States,,REP,DONALD J. TRUMP,3739,907,115,2712,5


In [31]:
# remove any nans from county_names by making sure we only consider strings
county_names = [county for county in general_df['county'].drop_duplicates() if isinstance(county, str)]
# confirm we have exactly 159 counties
assert len(county_names) == 159
# lower case each string
county_names = list(map(str.lower, county_names))
county_names = [county.replace(' ', '_') for county in county_names]
county_names

['appling',
 'atkinson',
 'bacon',
 'baker',
 'baldwin',
 'banks',
 'barrow',
 'bartow',
 'ben_hill',
 'berrien',
 'bibb',
 'bleckley',
 'brantley',
 'brooks',
 'bryan',
 'bulloch',
 'burke',
 'butts',
 'calhoun',
 'camden',
 'candler',
 'carroll',
 'catoosa',
 'charlton',
 'chatham',
 'chattahoochee',
 'chattooga',
 'cherokee',
 'clarke',
 'clay',
 'clayton',
 'clinch',
 'cobb',
 'coffee',
 'colquitt',
 'columbia',
 'cook',
 'coweta',
 'crawford',
 'crisp',
 'dade',
 'dawson',
 'decatur',
 'dekalb',
 'dodge',
 'dooly',
 'dougherty',
 'douglas',
 'early',
 'echols',
 'effingham',
 'elbert',
 'emanuel',
 'evans',
 'fannin',
 'fayette',
 'floyd',
 'forsyth',
 'franklin',
 'fulton',
 'gilmer',
 'glascock',
 'glynn',
 'gordon',
 'grady',
 'greene',
 'gwinnett',
 'habersham',
 'hall',
 'hancock',
 'haralson',
 'harris',
 'hart',
 'heard',
 'henry',
 'houston',
 'irwin',
 'jackson',
 'jasper',
 'jeff_davis',
 'jefferson',
 'jenkins',
 'johnson',
 'jones',
 'lamar',
 'lanier',
 'laurens',
 'l

In [32]:
# read csv for each county
dataframes = {county: pd.read_csv('openelections-data-ga/2016/20161108__ga__general__{}__precinct.csv'.format(county)) for county in county_names}
dataframes

{'appling':       county precinct                                             office  \
 0    Appling      NaN                     President of the United States   
 1    Appling       1B                     President of the United States   
 2    Appling       1C                     President of the United States   
 3    Appling        2                     President of the United States   
 4    Appling       3A                     President of the United States   
 5    Appling      3A1                     President of the United States   
 6    Appling       3B                     President of the United States   
 7    Appling       3C                     President of the United States   
 8    Appling       4A                     President of the United States   
 9    Appling       4B                     President of the United States   
 10   Appling       4C                     President of the United States   
 11   Appling       4D                     President of the Unite

In [35]:
# only consider house of representative rows
rep_keyword = 'U.S. Representative'

# lambda that returns only the rows containing US representatives
rep_rows = lambda county: dataframes[county][dataframes[county]['office'] == rep_keyword]

dataframes_rep = {county: rep_rows(county) for county in county_names}
dataframes_rep

{'appling':       county precinct               office district party  \
 120  Appling      NaN  U.S. Representative       12  (REP   
 121  Appling       1B  U.S. Representative       12  (REP   
 122  Appling       1C  U.S. Representative       12  (REP   
 123  Appling        2  U.S. Representative       12  (REP   
 124  Appling       3A  U.S. Representative       12  (REP   
 125  Appling      3A1  U.S. Representative       12  (REP   
 126  Appling       3B  U.S. Representative       12  (REP   
 127  Appling       3C  U.S. Representative       12  (REP   
 128  Appling       4A  U.S. Representative       12  (REP   
 129  Appling       4B  U.S. Representative       12  (REP   
 130  Appling       4C  U.S. Representative       12  (REP   
 131  Appling       4D  U.S. Representative       12  (REP   
 132  Appling       5A  U.S. Representative       12  (REP   
 133  Appling       5B  U.S. Representative       12  (REP   
 134  Appling       5C  U.S. Representative       12  (REP 

In [45]:
# find all precinct names and remove nans in the process
# we put these in a set since they may be duplicated across rows
precinct_names = {precinct for df in dataframes_rep.values() for precinct in df['precinct'] if isinstance(precinct, str)}
precinct_names

{'DORSETT SHOALS',
 'LAKE CREEK',
 'TH',
 'BAYCREEK J',
 'BAYCREEK K',
 'Lamar Reese Elem School',
 'WHITEFOORD ELEM',
 'Precinct 4',
 'LITTLE CREEK',
 'Glenwood',
 'EMC (714BN)',
 'PINCKNEYVILLE C',
 'Austell 1A',
 'HAWKINS',
 'RW20',
 'BEREA-STEADMAN',
 'Macland 01',
 'NORTH DECATUR',
 'Bessie Thomas Center',
 'Attapulgus',
 'EAGES LANDING BAPTIST',
 '10P',
 'Harlem Baptist Church-12',
 'Post Oak 01',
 'CLAYTON',
 '8-08 Resurrection of Our Lord Church',
 'ROCKYCREEK A',
 'Owltown',
 'Remington ESC',
 'MORROW 9',
 'BOY SCOUTS HDQTRS',
 'VALLEY',
 'West Hardwick',
 'Devereux Fire Station',
 '08G',
 'Darton College',
 'Sterling Church of God',
 '01D',
 'Statesboro',
 '05D',
 '02L2',
 'Martinez Baptist',
 'TRENTON',
 'SHILOH',
 'TOONIGH',
 '5C',
 'Jackson Heights Elem',
 'Dupont',
 '03I',
 'Nunez',
 'EDGEWOOD BAPTIST',
 '28-Friendship I',
 'LAWRENCEVILLE G',
 'TOCCOA',
 'Central',
 '103',
 'SUTALLEE',
 'Doles',
 'BRIAR VISTA ELEM',
 'Cooperville Fire Station',
 'Bethel',
 'SC08B',
 'SUWA

In [52]:
len(precinct_names)

2629

In [50]:
open_precinct_names = set(map(str.lower, precinct_names))
open_precinct_names

{'shakerag west',
 'winters chapel',
 'ap05',
 'south newport',
 '04f',
 'ivy log',
 'gl',
 '2c',
 'bay',
 'pinckneyville j',
 'lucille',
 'red oak',
 'ml021',
 'winnona park elem',
 'white creek',
 'broken arrow',
 'bells ferry 02',
 'tilly mill road',
 'rutland 2',
 'shaw',
 'olde towne',
 'st. mark',
 'barksdale',
 'elberton',
 'lowell (1163)',
 'pinckneyville x',
 '16 otwell-9',
 'fair',
 '7-11 savannah first seventh day adventist church',
 '2-05 holy spirit lutheran church',
 'salvation army',
 'r t jones',
 'timber ridge 01',
 'pinckneyville a1',
 'hollingsworth',
 'grantville',
 'ep04-05',
 '12l',
 'vinings 01',
 'fischer road',
 'st. george',
 'ponce de leon',
 '8-15 garden city recreation center',
 'kiokee baptist church-12',
 'sugar hill d',
 'ss03',
 'vanderhorst',
 '3-02 rose of sharon',
 'christ the king luth ch',
 'tails creek',
 'chesnut elem',
 'jc07',
 'ashford dunwoody rd',
 'glyndale school',
 'kennesaw 5a',
 'black creek',
 'ellerslie',
 'berkshire o',
 '8-05 w broa

In [51]:
len(open_precinct_names)

2600

In [43]:
[(county, precinct) for county, df in dataframes_rep.items() for precinct in df['precinct'] if precinct == 'HOWARD 1']

[('bibb', 'HOWARD 1'), ('bibb', 'HOWARD 1')]

In [48]:
ga_metadata = pd.read_json('data/GeorgiaMetadata.json', orient='index')
ga_metadata

Unnamed: 0,AREA,COUNTY,COUNTY_NAM,CTYNAME,CTYNUMBER,CTYSOSID,DATA,DISTRICT,FIPS1,FIPS2,...,G16PSCLHos,G16PSCREch,G16USSDBar,G16USSLBuc,G16USSRIsa,ID,POPULATION,PRECINCT_I,PRECINCT_N,coloring
0,1.86,36.0,COLUMBIA,Columbia,36.0,073131,690,073131,13073.0,73.0,...,66,477,78,23,453,6094217,1013,131,JOURNEY COMM. CHURCH,12
1,1.64,36.0,COLUMBIA,Columbia,36.0,073064,714,073064,13073.0,73.0,...,464,1527,643,77,1345,6094266,3858,064,GRACE BAPTIST CHURCH,10
2,5.60,36.0,COLUMBIA,Columbia,36.0,073061,711,073061,13073.0,73.0,...,260,1508,299,88,1392,6094297,1920,061,GREENBRIER HIGH,10
3,0.87,36.0,COLUMBIA,Columbia,36.0,073063,713,073063,13073.0,73.0,...,156,716,219,34,648,6094341,1770,063,RIVERSIDE ELEMENTARY,12
4,3.78,36.0,COLUMBIA,Columbia,36.0,073132,691,073132,13073.0,73.0,...,268,1495,360,59,1379,6094377,3131,132,WESLEY METHODIST,12
5,3.09,36.0,COLUMBIA,Columbia,36.0,073135,693,073135,13073.0,73.0,...,301,1539,368,68,1434,6094444,3518,135,"CHRIST CHURCH, PRESBYTERIAN",12
6,3.18,36.0,COLUMBIA,Columbia,36.0,073107,681,073107,13073.0,73.0,...,203,832,299,31,758,6094489,1902,107,GOSPEL WATER BRANCH,12
7,1.38,31.0,CLAYTON,Clayton,31.0,063RD12,502,063RD12,13063.0,63.0,...,984,324,1324,36,162,888321,3720,RD12,RIVERDALE 12,13
8,42.90,36.0,COLUMBIA,Columbia,36.0,073040,707,073040,13073.0,73.0,...,111,696,166,27,644,6058629,1719,040,EUBANK/BLANCHARD CTR,10
9,32.42,90.0,LINCOLN,Lincoln,90.0,1813-B,2078,1813-B,13181.0,181.0,...,127,371,252,9,344,6058690,1257,3-B,TABERNACLE,10


In [54]:
metadata_precinct_names = set(ga_metadata['PRECINCT_N'].str.lower())
metadata_precinct_names

{'shakerag west',
 'ap05',
 'south newport',
 '2c',
 'ivy log',
 'bay',
 'gl',
 '04f',
 'pinckneyville j',
 'red oak',
 'ml021',
 'broken arrow',
 'white creek',
 'bells ferry 02',
 'rutland 2',
 'shaw',
 'st. mark',
 'elberton',
 'pinckneyville x',
 'saint francis episcopal church',
 'fair',
 'salvation army',
 'timber ridge 01',
 'r t jones',
 'pinckneyville a1',
 'hollingsworth',
 'grantville',
 'bramlett elementary school',
 '12l',
 'vinings 01',
 'fischer road',
 'st. george',
 'sugar hill d',
 'ss03',
 'vanderhorst',
 'mccra',
 'tails creek',
 'jc07',
 'glyndale school',
 'kennesaw 5a',
 'black creek',
 'ellerslie',
 'oakwood i',
 'berkshire o',
 'camp creek',
 'st andrews/midland',
 'chestnut ridge 01',
 'univeter',
 'pucketts a',
 'youngs grove',
 'renfroe middle (dec)',
 'marshallville',
 'chalker 01',
 'sirmans',
 '103',
 'church of our savior',
 'rock spring',
 'tibet',
 'sc18a',
 'eddin',
 '5d',
 'ragan',
 'berkshire q',
 'talmo',
 'twin city',
 'stark',
 'radium middle sch

In [55]:
len(metadata_precinct_names)

2558

These are the same results as with the MIT Election data

In [57]:
matched = 0
unmatched = []
for name in metadata_precinct_names:
    if name in open_precinct_names:
        matched += 1
    else:
        unmatched.append(name)
matched, len(unmatched)

(2108, 450)

In [62]:
from fuzzywuzzy import fuzz
from collections import defaultdict

In [64]:
match_dict = {}
matchee_dict = defaultdict(list)
open_precinct_list = list(open_precinct_names)
for name in metadata_precinct_names:
    ratios = list(map(lambda x: fuzz.ratio(name, x), open_precinct_list))
    f = lambda i: ratios[i]
    argmax_index = max(range(len(open_precinct_list)), key=f)
    #print(argmax_index)
    match_dict[name] = open_precinct_list[argmax_index]
    matchee_dict[open_precinct_list[argmax_index]].append(name)

match_dict

{'shakerag west': 'shakerag west',
 'ap05': 'ap05',
 'south newport': 'south newport',
 '2c': '2c',
 'ivy log': 'ivy log',
 'bay': 'bay',
 'gl': 'gl',
 '04f': '04f',
 'pinckneyville j': 'pinckneyville j',
 'red oak': 'red oak',
 'ml021': 'ml021',
 'broken arrow': 'broken arrow',
 'white creek': 'white creek',
 'bells ferry 02': 'bells ferry 02',
 'rutland 2': 'rutland 2',
 'shaw': 'shaw',
 'st. mark': 'st. mark',
 'elberton': 'elberton',
 'pinckneyville x': 'pinckneyville x',
 'saint francis episcopal church': '4-05 saint francis episcopal',
 'fair': 'fair',
 'salvation army': 'salvation army',
 'timber ridge 01': 'timber ridge 01',
 'r t jones': 'r t jones',
 'pinckneyville a1': 'pinckneyville a1',
 'hollingsworth': 'hollingsworth',
 'grantville': 'grantville',
 'bramlett elementary school': '03 bramlett elementary school',
 '12l': '12l',
 'vinings 01': 'vinings 01',
 'fischer road': 'fischer road',
 'st. george': 'st. george',
 'sugar hill d': 'sugar hill d',
 'ss03': 'ss03',
 'vande

In [61]:
len(match_dict)

2558

In [68]:
len(matchee_dict)

2430

In [65]:
matchee_dict

defaultdict(list,
            {'shakerag west': ['shakerag west', None],
             'ap05': ['ap05'],
             'south newport': ['south newport'],
             '2c': ['2c', '6c'],
             'ivy log': ['ivy log'],
             'bay': ['bay', 'ba'],
             'gl': ['gl'],
             '04f': ['04f', '04'],
             'pinckneyville j': ['pinckneyville j'],
             'red oak': ['red oak'],
             'ml021': ['ml021'],
             'broken arrow': ['broken arrow'],
             'white creek': ['white creek'],
             'bells ferry 02': ['bells ferry 02'],
             'rutland 2': ['rutland 2'],
             'shaw': ['shaw', 'sh', 'sa'],
             'st. mark': ['st. mark'],
             'elberton': ['elberton'],
             'pinckneyville x': ['pinckneyville x'],
             '4-05 saint francis episcopal': ['saint francis episcopal church'],
             'fair': ['fair'],
             'salvation army': ['salvation army'],
             'timber ridge 01': ['ti

In [67]:
[name for name in open_precinct_names if name.find('redbone') != -1]

['redbone']

In [69]:
{match: matchee for match, matchee in matchee_dict.items() if len(matchee) > 1}

{'shakerag west': ['shakerag west', None],
 '2c': ['2c', '6c'],
 'bay': ['bay', 'ba'],
 '04f': ['04f', '04'],
 'shaw': ['shaw', 'sh', 'sa'],
 '12l': ['12l', '12'],
 'mcrae': ['mccra', 'mcrae'],
 '103': ['103', '10', '13', '03'],
 'tibet': ['tibet', 'bt'],
 '05d': ['5d', '05d'],
 'howard 4': ['howard 4', 'howard 2'],
 'spence': ['spence', 'sp'],
 'sandersville': ['sandersville', 'andersonville', 'sandhill'],
 'trion': ['trion', 'mt zion'],
 '701': ['701', '01'],
 'chapel hill': ['church at chapel hill', 'chapel hill'],
 '02g': ['02g', '02'],
 'ft. perry': ['fort perry', 'st. peter'],
 'empire': ['empire', 'empir'],
 '208': ['208', '08', '20'],
 '09e': ['09e', '09'],
 '07d': ['07d', '07'],
 'riverside elementary': ['riverview health reh', 'riverside elementary'],
 'lord': ['lord', 'lo'],
 '11p': ['11p', '11'],
 'midvale road': ['middle 9th-gay', 'midvale road'],
 'first baptist': ['#6 first baptist',
  'first baptist douglasville',
  'first baptist'],
 'tesnatee': ['tate', 'tesnatee'],
 

In [78]:
[name for name in open_precinct_names if name.find('butler') != -1]

['5-03 butler presbyterian ed. building',
 '#1 butler fire dept.',
 '8-11 butler school']

In [73]:
[name for name in open_precinct_names if name.find('red') != -1]

['red oak',
 'redstone',
 'redan-trotti library',
 'red bud',
 'redan middle',
 'red hill',
 'redeemer church',
 'redbone',
 'mildred',
 'red rock',
 'redan road',
 'kittredge elem',
 'redan elem']