In [53]:
import pandas as pd
import numpy as np
import os
import csv
import re
from collections import Counter
pd.options.display.max_rows = 1500
pd.options.display.max_columns = 30

In [54]:
# Load the raw Kentucky precinct returns.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
raw=pd.read_csv('/Users/declanchin/Desktop/MEDSL/2020-precincts/precinct/KY/raw/KY2020_mostrecent.csv',low_memory=False)

# Fixes counties where several distinct contests share the exact same race name
# (e.g. two separate questions both named "question"); only 'race.id' tells
# them apart.
# Each entry of fix_dup_list has the shape
#   [county, (race, race_id), (race, race_id), ...]
# and holds every (race, race.id) pair whose race name maps to more than one id
# in that county. The pairs are sorted so the numbering assigned below is the
# same on every run: iterating a set of strings is not order-stable across
# interpreter runs.
fix_dup_list = []
for county in raw['county'].unique():
    county_rows = raw[raw['county'] == county]
    race_pairs = set(zip(county_rows['race'], county_rows['race.id']))
    ids_per_race = Counter(race for race, _ in race_pairs)
    # Non-string race names (missing values) cannot take a text suffix; skip them.
    repeated_races = {race for race, n_ids in ids_per_race.items()
                      if n_ids > 1 and isinstance(race, str)}
    if repeated_races:
        fix_dup_list.append([county] + sorted(pair for pair in race_pairs if pair[0] in repeated_races))

# Suffix every duplicated race with ' - 1', ' - 2', ... numbering each race
# name separately, so any number of duplicates (and more than one duplicated
# race per county) is handled.
for duplicate in fix_dup_list:
    county = duplicate[0]
    position = Counter()
    for race, race_id in duplicate[1:]:
        position[race] += 1
        same_contest = (raw['county'] == county) & (raw['race'] == race) & (raw['race.id'] == race_id)
        raw.loc[same_contest, 'race'] = race + ' - ' + str(position[race])

In [55]:
# Candidate -> party lookup, merged onto the results by candidate name in get_party_writein.
crosswalk = pd.read_excel('/Users/declanchin/Desktop/MEDSL/2020-precincts/precinct/KY/ky-party-candidate-crosswalk.xlsx')
# regex=False removes literal periods only, matching what fix_candidate does to
# the results. As a regular expression '.' matches every character, which
# blanks out all candidate names and breaks the merge.
crosswalk['candidate'] = crosswalk['candidate'].str.upper().str.replace('.','', regex=False)
# Append one extra candidate under index label -1; the values follow the
# crosswalk's column order (presumably candidate, party, and one unused column
# -- confirm against the spreadsheet).
crosswalk.loc[-1] = ['MATTHEW RYAN BEST', 'Democratic Party', np.nan]

In [56]:
def drop_duplicates():
    """Build the working results frame from the module-level ``raw`` frame.

    Steps, in order:
      * keep and rename the seven columns used downstream, upper-casing
        office and county;
      * keep only rows whose mode is 'total' and whose precinct is not '-1';
      * for every county that has an 'Abse' precinct (except CLAY): keep only
        the rows whose precinct is one of the four mode codes, move that code
        into ``mode`` and set the precinct to 'COUNTY FLOATING';
      * rename the 'BALLOT' precinct to 'COUNTY FLOATING';
      * for CLAY: copy the mode code into ``mode`` for the coded rows while
        keeping its other precinct rows, then set the precinct of every
        non-TOTAL row to 'COUNTY FLOATING';
      * upper-case ``mode`` and expand the codes to their long names.

    Returns:
        DataFrame with columns office, candidate, county, precinct, mode,
        votes, registration.
    """
    mode_codes = ['Elec', 'Abse', 'Earl', 'Pres']

    df = raw.loc[:, ['race','candidates','county','precinct','type','votes','registration']]
    df.columns = ['office','candidate','county','precinct','mode','votes','registration']
    for column in ('office', 'county'):
        df[column] = df[column].str.upper()

    # drop duplicates and empty rows for floating counties
    is_total_row = (df['mode'] == 'total') & (df['precinct'] != '-1')
    df = df[is_total_row]

    counties_with_mode = [name for name in df['county'].unique()
                          if 'Abse' in df[df['county'] == name]['precinct'].unique()]
    for name in counties_with_mode:
        if name == "CLAY":
            continue
        in_county = df['county'] == name
        df = df[~(in_county & ~df['precinct'].isin(mode_codes))]
        # the mask has to be rebuilt because rows were just removed
        in_county = df['county'] == name
        df['mode'] = np.where(in_county, df['precinct'], df['mode'])
        df['precinct'] = np.where(in_county, 'COUNTY FLOATING', df['precinct'])

    df['precinct'] = np.where(df['precinct'] == 'BALLOT', 'COUNTY FLOATING', df['precinct'])

    # CLAY keeps its real precinct rows; only the coded rows take the code as mode
    in_clay = df['county'] == 'CLAY'
    for code in mode_codes:
        df.loc[in_clay & (df['precinct'] == code), 'mode'] = code

    df['mode'] = df['mode'].str.upper()
    df = df.replace(['ELEC', 'ABSE', 'EARL','PRES'],['ELECTION DAY', 'ABSENTEE', 'EARLY','PRES'])
    clay_mode_rows = (df['county'] == 'CLAY') & (df['mode'] != 'TOTAL')
    df.loc[clay_mode_rows, 'precinct'] = 'COUNTY FLOATING'
    return df

In [57]:
def fix_candidate(x):
    """Normalise one candidate name.

    Periods are removed and the name is upper-cased. After that, the first
    matching rule wins:
      * names containing '/' keep only the part before the first '/';
      * 'WRITE-IN' becomes 'WRITEIN';
      * single quotes next to a space become double quotes;
      * two specific parenthesised nicknames are rewritten with double quotes.
    Any other name is returned as normalised.
    """
    name = x.replace('.', '').upper()
    if "/" in name:
        return name.split('/')[0].strip()
    if name == 'WRITE-IN':
        return "WRITEIN"
    if "' " in name or " '" in name:
        return name.replace("' ", '" ').replace(" '", ' "')
    parenthesised_nicknames = {
        'KATHY (SUSIE) MILLS': 'KATHY "SUSIE" MILLS',
        'JAMES E (BILL) NAPIER': 'JAMES E "BILL" NAPIER',
    }
    return parenthesised_nicknames.get(name, name)

In [58]:
def get_registration(df):
    """Return one 'REGISTERED VOTERS' row per county/precinct pair.

    The ``registration`` value of the first row of each (county, precinct)
    pair is reported in the ``votes`` column; pairs whose first value is not
    greater than zero are left out. ``office`` is set to 'REGISTERED VOTERS'
    and ``candidate`` to the empty string.

    Returns:
        DataFrame with columns office, candidate, county, precinct, mode, votes.
    """
    key_columns = ['office', 'candidate', 'county', 'precinct', 'mode']
    # registration becomes the reported vote count; the index is renumbered from 0
    registration_rows = (
        df[key_columns + ['registration']]
        .rename(columns={'registration': 'votes'})
        .reset_index(drop=True)
    )
    first_per_precinct = registration_rows.drop_duplicates(subset=['county', 'precinct'])
    has_registration = first_per_precinct['votes'] > 0
    return first_per_precinct[has_registration].assign(office='REGISTERED VOTERS', candidate='')

In [59]:
# strips district from office field and adds to district. Same for circuit courts, divisions for School board
def fix_office_district(x):
    """Return the office name with its district/ward/division part removed.

    ``x`` is an office string. The first matching rule wins:
      * DISTRICT (but not COUNCIL / EDUCATION / ELECTED): keep the text before
        the first number, minus a trailing comma and the word ' DISTRICT';
      * WARD: cut 'WARD <n>' out of the name and re-join the remaining pieces;
      * METRO COUNCIL: 'METRO COUNCIL ' followed by the first number;
      * URBAN COUNTY COUNCIL: that fixed name;
      * CIRCUIT: the text before the first comma;
      * EDUCATION: everything up to and including 'EDUCATION'.
    Any other string, including '', is returned unchanged.

    Raises:
        IndexError: if a DISTRICT, WARD or METRO COUNCIL office has no digits.
    """
    if "DISTRICT" in x and 'COUNCIL' not in x and 'EDUCATION' not in x and 'ELECTED' not in x:
        first_number = re.findall(r'\d+', x)[0]
        return x.split(first_number)[0].strip().strip(',').replace(' DISTRICT', '')
    if 'WARD' in x:
        # Operate on x itself: a hard-coded Hopkinsville name here used to turn
        # every ward office into 'CITY COUNCIL CITY OF HOPKINSVILLE'.
        ward_label = 'WARD ' + re.findall(r'\d+', x)[0]
        pieces = [piece.strip() for piece in x.split(ward_label)]
        # empty pieces are dropped so no stray spaces are left behind
        return ' '.join(piece for piece in pieces if piece)
    if "METRO COUNCIL" in x:
        return "METRO COUNCIL " + re.findall(r'\d+', x)[0]
    if "URBAN COUNTY COUNCIL" in x:
        return "URBAN COUNTY COUNCIL"
    if "CIRCUIT" in x:
        return x.split(',')[0]
    if "EDUCATION" in x:
        return x.split('EDUCATION')[0] + 'EDUCATION'
    return x

In [60]:
def fix_office(x):
    """Map an office name to its standard label.

    Five federal/state offices are renamed by exact match. Otherwise, names
    containing 'UNEXPIRED' lose the text '(UNEXPIRED TERM)' and surrounding
    whitespace. Everything else is returned unchanged.
    """
    standard_names = {
        'PRESIDENT AND VICE PRESIDENT OF THE UNITED STATES': 'US PRESIDENT',
        'US REPRESENTATIVE': 'US HOUSE',
        'STATE REPRESENTATIVE': 'STATE HOUSE',
        'US SENATOR': 'US SENATE',
        'STATE SENATOR': 'STATE SENATE',
    }
    if x in standard_names:
        return standard_names[x]
    if "UNEXPIRED" in x:
        return x.replace('(UNEXPIRED TERM)', '').strip()
    return x

In [61]:
def fix_district(x):
    """Turn an office string into its district label.

    The first matching rule wins:
      * contains 'PERSONS'          -> ''
      * DISTRICT and DIVISION       -> '<first number, 3 digits>, DIVISION <second number>'
      * DISTRICT                    -> first number zero-padded to 3 digits
      * WARD                        -> 'WARD <first number>'
      * COUNCIL                     -> first number zero-padded to 3 digits
      * CIRCUIT                     -> 'CIRCUIT <first number>, DIVISION <second number>'
      * DIVISION                    -> 'DIVISION <first number>'
    Any other string, including '', is returned unchanged.

    Raises:
        IndexError: if a matching rule needs a number the string does not contain.
    """
    if 'PERSONS' in x:
        return ''

    numbers = re.findall(r'\d+', x)
    has_district = "DISTRICT" in x
    has_division = "DIVISION" in x

    if has_district and has_division:
        return numbers[0].zfill(3) + ', DIVISION ' + numbers[1]
    if has_district:
        return numbers[0].zfill(3)
    if "WARD" in x:
        return 'WARD ' + numbers[0]
    if "COUNCIL" in x:
        return numbers[0].zfill(3)
    if "CIRCUIT" in x:
        return "CIRCUIT " + numbers[0] + ', DIVISION ' + numbers[1]
    if has_division:
        # CIRCUIT strings have already returned above
        return 'DIVISION ' + numbers[0]
    return x

In [62]:
def get_party_writein(df):
    """Attach party and write-in columns to the results.

    Left-merges the module-level ``crosswalk`` on ``candidate``. The party is
    then upper-cased, ' PARTY' is removed, missing values become '', and
    'DEMOCRATIC' is shortened to 'DEMOCRAT'. A ``writein`` column is added:
    the string 'TRUE' where the candidate is 'WRITEIN', else 'FALSE'. Finally
    ``party`` is renamed to ``party_detailed`` and ``county`` to
    ``county_name``.
    """
    merged = df.merge(crosswalk, how='left', on='candidate')
    party = merged['party'].str.upper().str.replace(' PARTY','').fillna('')
    merged['party'] = party.str.replace('DEMOCRATIC', "DEMOCRAT")
    merged['writein'] = np.where(merged['candidate'] == 'WRITEIN', 'TRUE', 'FALSE')
    return merged.rename(columns={'party':'party_detailed','county':'county_name'})

In [63]:
def get_party_simplified(x):
    """Collapse a detailed party label to the simplified set.

    The four recognised parties and the empty string pass through unchanged;
    every other value becomes 'OTHER'.
    """
    recognised = ('DEMOCRAT', 'REPUBLICAN', 'NONPARTISAN', 'LIBERTARIAN')
    if x in recognised:
        return x
    return '' if x == '' else "OTHER"

In [64]:
def get_special(x):
    """Return the string 'TRUE' when the office name contains 'UNEXPIRED', else 'FALSE'."""
    return 'TRUE' if "UNEXPIRED" in x else "FALSE"

In [65]:
def get_dataverse(x):
    """Classify an office into its dataverse bucket.

    Returns 'PRESIDENT', 'HOUSE' or 'SENATE' for the three federal offices,
    'STATE' for the listed state-level offices and anything containing
    'CIRCUIT', '' for the 'REGISTERED VOTERS' pseudo-office, and 'LOCAL'
    for everything else.
    """
    federal = {'US PRESIDENT': 'PRESIDENT', 'US HOUSE': 'HOUSE', 'US SENATE': 'SENATE'}
    state_level = (
        'STATE SENATE', 'STATE HOUSE',
        'JUDGE OF THE COURT OF APPEALS', 'JUSTICE OF THE SUPREME COURT',
        'CONSTITUTIONAL AMENDMENT 1', 'CONSTITUTIONAL AMENDMENT 2',
    )
    if x in federal:
        return federal[x]
    if x in state_level or 'CIRCUIT' in x:
        return 'STATE'
    return '' if x == 'REGISTERED VOTERS' else 'LOCAL'

In [66]:
# all functions
# Runs the cleaning functions in a fixed order; later steps read columns
# written by earlier ones, so the order of these statements matters.
df = drop_duplicates()
df.candidate = df.candidate.apply(fix_candidate)
# Append one REGISTERED VOTERS row per county/precinct, then drop the
# registration column, which is no longer needed once those rows exist.
df = pd.concat([df,get_registration(df)]).drop(columns='registration').reset_index(drop=True)
# Must run before fix_office, which removes the '(UNEXPIRED TERM)' text that
# get_special looks for.
df['special'] = df.office.apply(get_special)
# Provisional district: the full office string when it contains a digit and
# one of DISTRICT / WARD / COUNCIL / DIVISION, otherwise ''.
districts = [i if ((any(str.isdigit(c) for c in i))&(('DISTRICT' in i)|('WARD' in i)|('COUNCIL' in i)|('DIVISION' in i))) else '' for i in df['office']]
df['district'] = districts
# Only offices that carry a district get their name rewritten; the rest keep
# the original office string.
office_no_district = df.district.apply(fix_office_district)
df['office'] = np.where(df['district']!='', office_no_district,df['office'])
df.office = df.office.apply(fix_office)  
# Reduce the provisional district (still the full office string) to its label.
df['district'] = df.district.apply(fix_district)
# These four contests are labelled STATEWIDE regardless of the parsed district.
df['district']= np.where(((df['office']=='US PRESIDENT')|(df['office']== 'US SENATE')|
                         (df['office']=='CONSTITUTIONAL AMENDMENT 1')|
                         (df['office']=='CONSTITUTIONAL AMENDMENT 2')), 'STATEWIDE',df['district'])
#party info
df = get_party_writein(df)
# Any office whose name contains JUDGE, JUSTICE or CLERK is forced to
# NONPARTISAN, overriding whatever the crosswalk supplied.
df['party_detailed'] = np.where(((df['office'].str.contains('JUDGE'))|(df['office'].str.contains('JUSTICE'))|(df['office'].str.contains('CLERK'))),
                               'NONPARTISAN', df['party_detailed'])
df['party_simplified'] = df.party_detailed.apply(get_party_simplified)
df['dataverse'] = df.office.apply(get_dataverse)

In [67]:
# add year, stage, state, date, office, jurisdiction,special
df['jurisdiction_name'] = df['county_name'].str.upper()
jurisdiction_fips = pd.read_csv('/Users/declanchin/Desktop/MEDSL/2020-precincts/help-files/jurisdiction-fips-codes.csv')
jurisdiction_fips = jurisdiction_fips[jurisdiction_fips['state']=='Kentucky'].drop(columns='state')
df=df.merge(jurisdiction_fips, on='jurisdiction_name', how='left')
# Remove only a trailing '.0' left by float formatting. str.strip('\.0') treats
# its argument as a set of characters and would also eat real leading/trailing
# zeros of a FIPS code (e.g. '21010.0' -> '2101').
df['jurisdiction_fips'] = df['jurisdiction_fips'].fillna('').astype(str).str.replace(r'\.0$', '', regex=True)
county_fips=pd.read_csv('/Users/declanchin/Desktop/MEDSL/2020-precincts/help-files/county-fips-codes.csv')
county_fips = county_fips[county_fips['state']=='Kentucky'].drop(columns='state')
df=df.merge(county_fips, on='county_name', how='left')
df['year']= 2020
df['state'] = 'KENTUCKY'
df['date']= '2020-11-03' #np.where(((df['office']=='STATE HOUSE')&(df['special']==True)), '2020-02-25','2020-11-03')
df['stage']='GEN'
# magnitude: 4 for CITY COMMISSIONERS contests, 0 for the REGISTERED VOTERS rows, 1 otherwise
df['magnitude']= np.where(df['office'].str.contains('CITY COMMISSIONERS'), 4, 1)
df['magnitude']= np.where(df['office']=='REGISTERED VOTERS', 0, df['magnitude'])
df['votes']=df['votes'].astype(int)
df['readme_check']=np.where(df['precinct']=='COUNTY FLOATING', 'TRUE', 'FALSE')
# state codes
state_codes = pd.read_csv('/Users/declanchin/Desktop/MEDSL/2020-precincts/help-files/merge_on_statecodes.csv')
# .copy() so the assignment below writes to an independent frame rather than a
# filtered view (avoids pandas' SettingWithCopyWarning).
state_codes = state_codes[state_codes['state']=='Kentucky'].copy()
state_codes['state'] = state_codes['state'].str.upper()
df=df.merge(state_codes, on='state')
df=df.fillna('')
#fixes
df['office']=df['office'].str.strip()
#corrections
# NOTE(review): readme_check was computed above, before this rename, so rows
# renamed here keep readme_check == 'FALSE' -- confirm this is intended.
df['precinct'] = df['precinct'].replace('Pres','COUNTY FLOATING')
# A CIRCUIT JUDGE row without a parsed district is relabelled as the family
# court contest and given its circuit/division explicitly.
df['office'] = np.where(((df['office']=='CIRCUIT JUDGE')&((df['district']==""))),
                       'CIRCUIT JUDGE FAMILY COURT', df['office'])
df['district'] = np.where(((df['office']=='CIRCUIT JUDGE FAMILY COURT')&((df['district']==""))),
                         'CIRCUIT 30, DIVISION 3',df['district'] )

In [68]:
# Two candidates withdrew in the same county floating precinct, so their rows
# would otherwise be indistinguishable; number them. The two-item list must
# match exactly two rows.
withdrawn_rows = ((df['candidate'] == 'WITHDRAWN') &
                  (df['office'] == 'WOODLAND HILLS COMMISSIONER WOODLAND HILLS'))
df.loc[withdrawn_rows, 'candidate'] = ['WITHDRAWN - 1', 'WITHDRAWN - 2']

In [69]:
# Keep only the published columns, in the published order.
output_columns = [
    'precinct', 'office', 'party_detailed', 'party_simplified', 'mode',
    'votes', 'county_name', 'county_fips', 'jurisdiction_name',
    'jurisdiction_fips', 'candidate', 'district', 'magnitude', 'dataverse',
    'year', 'stage', 'state', 'special', 'writein', 'state_po',
    'state_fips', 'state_cen', 'state_ic', 'date', 'readme_check',
]
df = df.loc[:, output_columns]

In [70]:
# Write the cleaned precinct file: no index column, and every non-numeric
# field quoted (csv.QUOTE_NONNUMERIC).
df.to_csv('/Users/declanchin/Desktop/MEDSL/2020-precincts/precinct/KY/2020-ky-precinct-general.csv', index=False,quoting=csv.QUOTE_NONNUMERIC)

### NOTES
- Only election day voting?
- Missing three special elections.
- Precincts labelled "-1" look like vote totals summed across either precincts or voting modes.
- ANSWER: use `readme_check` for these counties. They centralized their voting process by having voters return ballots to the county.
- Leaving in the "Pres" precinct for now.
- Clay County has both mode and precinct.

In [71]:
# !jupyter nbconvert --to script 'KY_cleaning.ipynb'

[NbConvertApp] Converting notebook KY_cleaning.ipynb to script
[NbConvertApp] Writing 11491 bytes to KY_cleaning.py
