In [12]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
#Formatting Functions

def comma_del(my_str):
    """Convert a comma-grouped integer string into an int.

    Example: '1,456,552' -> 1456552

    Args:
        my_str: A string of digits with optional "," separators, or a value
            that is already numeric.

    Returns:
        The parsed int for string input. Numeric input (int, float, NaN,
        NumPy scalar) is returned unchanged.

    Raises:
        ValueError: If the string is not an integer once commas are removed.
    """
    # Local import keeps this helper self-contained.
    import numbers

    # isinstance against numbers.Real also accepts NumPy scalars such as
    # np.int64; an exact type(...) == int comparison rejects them and the
    # value would then fail on the string handling below.
    if isinstance(my_str, numbers.Real):
        return my_str
    return int(my_str.replace(",", ""))

def remove_zeros(my_str):
    """Strip leading zeros from a string representation of an integer.

    Example: '0000151' -> '151'

    Args:
        my_str: The string to strip.

    Returns:
        The string without its leading "0" characters. A string consisting
        only of zeros collapses to "0" and an empty string is returned
        unchanged, so the function never returns None.
    """
    stripped = my_str.lstrip("0")
    # Either something is left after stripping, or the input was empty.
    if stripped or not my_str:
        return stripped
    # Input was all zeros: keep a single "0" rather than losing the value.
    return "0"


In [14]:
#Iterates through all csv files in 'County_Data_Clean' and
#produces a dictionary containing county abbreviations and file names
#Example:   {... ,'SEM':'SEM_Data_Clean.csv', ... }
#Only entries ending in ".csv" are kept, so other directory contents
#(hidden files, checkpoint folders) never reach read_csv. The names are
#sorted because os.listdir returns them in arbitrary order, which would
#otherwise make the county order differ between runs.
files = sorted(
    name for name in os.listdir("County_Data_Clean")
    if name.lower().endswith(".csv")
)
counties = [x.split("_")[0] for x in files]
file_dict = dict(zip(counties, files))

#Outputs a CSV of county abbreviations
pd.DataFrame(counties).to_csv("Counties.csv")

#Source: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory

In [15]:
#Loads the precinct-level voter registration table and discards the
#blank row stored under index label 5881.
precinct_df = pd.read_csv("2018gen_precinct.csv").drop(index=5881)

In [16]:
#Maps every county abbreviation to the data frame read from its cleaned file
#Example: {..., "SEM":<data frame with clean Seminole County data>, ...}
county_df_dict = {
    abbrev: pd.read_csv(
        os.path.join("County_Data_Clean", file_dict[abbrev]),
        index_col=0,
    )
    for abbrev in counties
}

In [21]:
#Removes Leading Zeros from Precinct Data
#Series.map returns a new Series instead of modifying the column, so the
#result must be assigned back; otherwise the stripped values are discarded.
#The values are passed through str first, the same way the county frames are
#prepared, so both sides of the later merge hold identically formatted keys.
precinct_df['Precinct Number'] = precinct_df['Precinct Number'].map(str).map(remove_zeros)
precinct_df

Unnamed: 0,County Code,Precinct Number,Republican Party of Florida,Florida Democratic Party,Other,Total
0,ALA,1,584,538,287,1409
1,ALA,2,739,951,379,2069
2,ALA,3,1626,1520,961,4107
3,ALA,4,1406,1040,671,3117
4,ALA,5,733,1552,841,3126
...,...,...,...,...,...,...
5876,WAS,5,729,546,247,1522
5877,WAS,6,221,318,85,624
5878,WAS,7,810,507,226,1543
5879,WAS,8,764,696,175,1635


In [22]:
#Gives each county frame a "Precinct Number" column of zero-stripped strings
#so that it can serve as a merge key
for county, county_df in county_df_dict.items():
    county_df = county_df.rename(columns={"Unique PCT Identifier": "Precinct Number"})
    precinct_labels = county_df['Precinct Number'].map(str)
    county_df['Precinct Number'] = precinct_labels.map(remove_zeros)
    county_df_dict[county] = county_df

In [23]:
#Splits the precinct registration table into one data frame per county code.
precinct_dict = {
    code: precinct_df.loc[precinct_df['County Code'] == code]
    for code in counties
}

In [32]:
#Joins each county's election rows with that county's registration rows
#(inner join on precinct number and county code), then relabels the party
#registration columns.
merge_keys = ["Precinct Number", "County Code"]
registration_labels = {
    "Republican Party of Florida": "Registered Republicans",
    "Florida Democratic Party": "Registered Democrats",
    "Other": "Registered Other",
}
for county in counties:
    merged = county_df_dict[county].merge(
        precinct_dict[county], on=merge_keys, how="inner"
    )
    county_df_dict[county] = merged.rename(columns=registration_labels)

In [33]:
#Keeps only the columns listed below, in this order, for every county frame.
ordered_columns = [
    "County Code",
    "Election Date",
    "Precinct Number",
    "Precinct Polling Location",
    "Registered Republicans",
    "Registered Democrats",
    "Registered Other",
    "Total Registered",
    "Contest Name",
    "District",
    "Candidate",
    "Party",
    "Vote Total",
]
for county in counties:
    county_df_dict[county] = county_df_dict[county][ordered_columns]

In [34]:
#Stacks every county data frame into a single frame with a fresh row index
election_list = list(county_df_dict.values())
election_data = pd.concat(election_list, ignore_index=True)

In [35]:
#Runs comma_del over every count column so that comma-grouped strings
#(e.g. '1,234') become ints; values that are already numeric pass through.
count_columns = (
    'Registered Republicans',
    'Registered Democrats',
    'Registered Other',
    'Total Registered',
    'Vote Total',
)
for col in count_columns:
    election_data[col] = list(map(comma_del, election_data[col]))

In [36]:
#Replaces null values in Total_Registered with sum of Registered voters by party
#Done column-wise: the three party columns are added once and fillna copies
#the sum only into rows whose Total Registered is null. Adding with "+" keeps
#the sum null whenever one of the party counts is null, exactly as a per-row
#addition would.
party_total = (
    election_data['Registered Republicans']
    + election_data['Registered Democrats']
    + election_data['Registered Other']
)
election_data['Total Registered'] = election_data['Total Registered'].fillna(party_total)

In [37]:
# Builds a "<county code>--<precinct number>" key for every row
county_prefix = election_data["County Code"] + "--"
election_data["Precinct_Full_ID"] = county_prefix + election_data["Precinct Number"].map(str)

In [39]:
#Writes the combined election table to disk
election_data.to_csv(path_or_buf="election_data_clean.csv")

In [38]:
#Groups the election rows by contest and candidate
election_gb = election_data.groupby(by=['Contest Name', 'Candidate'])

In [None]:
#Collects the county codes whose rows contain at least one null value.
#Note: this rebinds `counties` to the county codes present in election_data.
county_gb = election_data.groupby('County Code')
counties = list(county_gb.groups.keys())
null_counties = []
for county in counties:
    null_df = county_gb.get_group(county).isnull()
    if null_df.sum().sum() != 0:
        null_counties.append(county)
        
        

In [None]:
# for county in null_counties:
#     plt.figure()
#     sns.heatmap(county_gb.get_group(county).isnull(),cbar=False)
#     plt.title(county)

In [None]:
#Works on an independent copy so precinct_df itself is left untouched
precinct_df2 = precinct_df.copy(deep=True)

In [None]:
#Builds the same "<county code>--<precinct number>" key on the registration copy
registration_prefix = precinct_df2["County Code"] + "--"
precinct_df2["Precinct_Full_ID"] = registration_prefix + precinct_df2["Precinct Number"].map(str)

In [None]:
#Determines Precincts Missing Voter Registration Data
#The registration IDs are collected into a set once, so each membership test
#is a single lookup instead of rebuilding the unique() array per precinct.
registered_ids = set(precinct_df2['Precinct_Full_ID'].unique())
missing_from_precinct_data = [
    x for x in election_data['Precinct_Full_ID'].unique()
    if x not in registered_ids
]

#Election rows for each precinct that has no registration record
missing_pcts = {
    x: election_data[election_data['Precinct_Full_ID']==x]
    for x in missing_from_precinct_data
}

#Determines the precincts with votes but not voter registration data
#(all other precincts from missing_pcts will be dropped)
#NOTE(review): this loop originally iterated over `lst2`, a name that is never
#defined in the notebook and raises NameError on a fresh kernel. It presumably
#referred to missing_from_precinct_data - confirm.
missing_registration = []
for x in missing_from_precinct_data:
    id_parts = x.split("--")
    county = id_parts[0]
    pct = id_parts[1]
    county_rows = county_df_dict[county]
    pct_rows = county_rows[county_rows['Precinct Number']==pct]
    #Mean vote total and mean registered total for the precinct, rounded to 0.1
    vt = round(pct_rows['Vote Total'].mean(),1)
    tr = round(pct_rows['Total Registered'].mean(),1)
    if vt+tr!=0:
        missing_registration.append((x,vt,tr))


In [None]:
#Lists the Precinct_Full_ID of every row whose vote total is null
rows_without_votes = election_data['Vote Total'].isnull()
missing_votes = election_data.loc[rows_without_votes, 'Precinct_Full_ID'].tolist()

In [26]:
#Keeps a direct handle on the 'OKA' entry of county_df_dict
oka = county_df_dict['OKA']

In [24]:
#Keeps a direct handle on the 'OKA' entry of precinct_dict
poka = precinct_dict['OKA']


In [29]:
#Narrows the OKA frame to ' District 3', then displays its precinct "1" rows
in_district_3 = oka['District'] == ' District 3'
doka = oka.loc[in_district_3]
doka.loc[doka['Precinct Number'] == "1"]

Unnamed: 0,County Code,Election Date,Precinct Number,Precinct Polling Location,Total Registered,Contest Name,District,Contest Code,Candidate,Party,Vote Total
36,OKA,11/06/2018,1,1,2350,State Representative,District 3,260030,Jayer Williamson,REP,1458
37,OKA,11/06/2018,1,1,2350,State Representative,District 3,260030,Bobbi Osborne,NPA,165
38,OKA,11/06/2018,1,1,2350,State Representative,District 3,260030,OverVotes,,0
39,OKA,11/06/2018,1,1,2350,State Representative,District 3,260030,UnderVotes,,68
