Cleans data for 2017 mayor ballots

In [1]:
import pandas as pd
import numpy as np
import os
from pathfinder import csvfinder

csv_file = r"2017-Mayor-Ballot-Records.csv" #Change file name here/other data cleaning tasks below when dealing with different datasets
file_path = csvfinder(csv_file)

#Load the CSV file
data = pd.read_csv(file_path)

In [2]:
data = pd.read_csv(file_path)
data.rename(columns={
    '1st Choice': '1st',
    '2nd Choice': '2nd',
    '3rd Choice': '3rd',
    'Precinct': 'Precinct',
    'Count': 'Count'
}, inplace=True)
data.to_csv(file_path, index = False)

Remove invalid votes

In [52]:
#Define the list of values to filter out
undesired_values = ["undervote", "overvote", "defective"]

cleaned_data = data.copy()

#Replace undesired values with NA
cleaned_data.replace(undesired_values, np.nan, inplace=True);

# Filter out rows where all of the specified columns have undesired values
# cleaned_data = data[~data[['1st', '2nd', '3rd']].isin(undesired_values).any(axis=1)]
#cleaned_data = data[~(data['1st'].isin(undesired_values) &
 #                     data['2nd'].isin(undesired_values) &
 #                     data['3rd'].isin(undesired_values))]
cleaned_data = cleaned_data.dropna(subset=['1st', '2nd', '3rd'], how='all')

#Drop column Precinct
cleaned_data = cleaned_data.drop(columns=['Precinct'])

#Display the first few rows of the cleaned dataframe to verify the changes
cleaned_data

data.to_csv(file_path, index = False)

KeyError: "['Precinct'] not found in axis"

In [53]:
print(cleaned_data.groupby(['1st', '2nd', '3rd']).agg({'Count': 'sum'}).reset_index().sort_values(by='Count', ascending=True, ignore_index=True))

                       1st                 2nd                 3rd  Count
0                 L.A. Nik          Al Flowers        Betsy Hodges      1
1     Captain Jack Sparrow            L.A. Nik    Gregg A. Iverson      1
2         Gregg A. Iverson                Null            Tom Hoch      1
3          Ronald Lischeid        Aswar Rahman                Null      1
4     Captain Jack Sparrow            L.A. Nik  Nekima Levy-Pounds      1
...                    ...                 ...                 ...    ...
2359            Jacob Frey            Tom Hoch        Betsy Hodges   2676
2360              Tom Hoch                Null                Null   2886
2361          Betsy Hodges                Null                Null   3223
2362          Raymond Dehn  Nekima Levy-Pounds        Betsy Hodges   3410
2363            Jacob Frey                Null                Null   4672

[2364 rows x 4 columns]


Edit invalid choices in votes

In [54]:
# Identify rows where 1st == 2nd == 3rd
condition_all = (cleaned_data['1st'] == cleaned_data['2nd']) & (cleaned_data['1st'] == cleaned_data['3rd']) & (cleaned_data["1st"] != "UWI")

# Change col2 and col3 to NaN for those rows
cleaned_data.loc[condition_all, ['2nd', '3rd']] = np.nan

# Identify rows
condition_1st_3rd_equal = (cleaned_data['1st'] == cleaned_data['3rd']) & (cleaned_data["1st"] != "UWI")
condition_2nd_3rd_equal = (cleaned_data['2nd'] == cleaned_data['3rd']) & (cleaned_data["2nd"] != "UWI")
condition_1st_2nd_equal = (cleaned_data['1st'] == cleaned_data['2nd']) & (cleaned_data["1st"] != "UWI")

# Change 3rd to NaN for those rows
cleaned_data.loc[condition_1st_3rd_equal, '3rd'] = np.nan
cleaned_data.loc[condition_2nd_3rd_equal, '3rd'] = np.nan
cleaned_data.loc[condition_1st_2nd_equal, '2nd'] = np.nan

# Check if '1st' and '2nd' are NaN
condition = cleaned_data['1st'].isna() & cleaned_data['2nd'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '1st'] = cleaned_data.loc[condition, '3rd']
cleaned_data.loc[condition, '3rd'] = np.nan

# Check if '2nd' are NaN
condition = cleaned_data['2nd'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '2nd'] = cleaned_data.loc[condition, '3rd']
cleaned_data.loc[condition, '3rd'] = np.nan

# Check if '1st' are NaN
condition = cleaned_data['1st'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '1st'] = cleaned_data.loc[condition, '2nd']
cleaned_data.loc[condition, '2nd'] = np.nan

cleaned_data

cleaned_data.to_csv(file_path, index = False)

Aggregate Data

In [55]:
cleaned_data = cleaned_data.fillna('Null')
aggregated_data = cleaned_data.groupby(['1st', '2nd', '3rd']).agg({'Count': 'sum'}).reset_index().sort_values(by='1st', ascending=True, ignore_index=True)
print(aggregated_data)

cleaned_data.to_csv(file_path, index = False)

                   1st                   2nd                   3rd  Count
0           Al Flowers          Aswar Rahman          Betsy Hodges      4
1           Al Flowers    Nekima Levy-Pounds       Ronald Lischeid      2
2           Al Flowers    Nekima Levy-Pounds          Raymond Dehn     18
3           Al Flowers    Nekima Levy-Pounds                  Null     25
4           Al Flowers    Nekima Levy-Pounds            Jacob Frey     16
...                ...                   ...                   ...    ...
2359  Troy Benjegerdes          Charlie Gers  Captain Jack Sparrow      1
2360  Troy Benjegerdes          Charlie Gers          Betsy Hodges      1
2361  Troy Benjegerdes  Captain Jack Sparrow              Tom Hoch      1
2362  Troy Benjegerdes  Captain Jack Sparrow          Raymond Dehn      1
2363  Troy Benjegerdes              Tom Hoch          Raymond Dehn      2

[2364 rows x 4 columns]


Round 1 1st Rank

In [56]:
# Group by the first-ranked candidate and sum the counts, then convert to DataFrame
first_rank_counts_df = aggregated_data.groupby('1st')['Count'].sum().reset_index()

# Rename the columns
first_rank_counts_df.columns = ['Candidate', 'Count']

# Set the index to start from 1
first_rank_counts_df.index = first_rank_counts_df.index + 1

print(first_rank_counts_df)

cleaned_data.to_csv(file_path, index = False)

                Candidate  Count
1              Al Flowers    713
2            Aswar Rahman    756
3            Betsy Hodges  18921
4    Captain Jack Sparrow    443
5            Charlie Gers   1238
6   Christopher Zimmerman      1
7       David John Wilson    224
8         David Rosenfeld    480
9        Gregg A. Iverson    337
10            Ian Simpson    119
11             Jacob Frey  26124
12               L.A. Nik    616
13     Nekima Levy-Pounds  15718
14           Raymond Dehn  18104
15        Ronald Lischeid    325
16               Tom Hoch  20131
17       Troy Benjegerdes    185


Next round: remove UWI

In [57]:
cleaned_data.replace("UWI", np.nan, inplace=True)
cleaned_data.replace("Null", np.nan, inplace=True)

#Repeat Steps
cleaned_data = cleaned_data.dropna(subset=['1st', '2nd', '3rd'], how='all')

#Repeat Steps

# Check if '1st' and '2nd' are NaN
condition = cleaned_data['1st'].isna() & cleaned_data['2nd'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '1st'] = cleaned_data.loc[condition, '3rd']
cleaned_data.loc[condition, '3rd'] = np.nan

# Check if '2nd' are NaN
condition = cleaned_data['2nd'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '2nd'] = cleaned_data.loc[condition, '3rd']
cleaned_data.loc[condition, '3rd'] = np.nan

# Check if '1st' are NaN
condition = cleaned_data['1st'].isna()

# Apply the condition and update the values
cleaned_data.loc[condition, '1st'] = cleaned_data.loc[condition, '2nd']
cleaned_data.loc[condition, '2nd'] = np.nan

print(cleaned_data)

cleaned_data.to_csv(file_path, index = False)

                       1st                 2nd                   3rd  Count
0               Jacob Frey  Nekima Levy-Pounds          Raymond Dehn      1
1                 Tom Hoch          Jacob Frey          Betsy Hodges      1
2               Jacob Frey                 NaN                   NaN      1
3       Nekima Levy-Pounds        Betsy Hodges          Raymond Dehn      1
4               Jacob Frey                 NaN                   NaN      1
...                    ...                 ...                   ...    ...
104430          Jacob Frey    Gregg A. Iverson  Captain Jack Sparrow      1
104431          Jacob Frey  Nekima Levy-Pounds              Tom Hoch      1
104432        Raymond Dehn  Nekima Levy-Pounds          Betsy Hodges      1
104433        Betsy Hodges                 NaN                   NaN      1
104434  Nekima Levy-Pounds          Jacob Frey                   NaN      1

[104435 rows x 4 columns]


Aggregate

In [58]:
cleaned_data = cleaned_data.fillna('Null')
aggregated_data = cleaned_data.groupby(['1st', '2nd', '3rd']).agg({'Count': 'sum'}).reset_index().sort_values(by='1st', ascending=True, ignore_index=True)
print(aggregated_data)

cleaned_data.to_csv(file_path, index = False)

                   1st                   2nd              3rd  Count
0           Al Flowers          Aswar Rahman     Betsy Hodges      4
1           Al Flowers    Nekima Levy-Pounds  Ronald Lischeid      2
2           Al Flowers    Nekima Levy-Pounds     Raymond Dehn     18
3           Al Flowers    Nekima Levy-Pounds             Null     25
4           Al Flowers    Nekima Levy-Pounds       Jacob Frey     16
...                ...                   ...              ...    ...
2334  Troy Benjegerdes          Charlie Gers     Betsy Hodges      1
2335  Troy Benjegerdes  Captain Jack Sparrow         Tom Hoch      1
2336  Troy Benjegerdes  Captain Jack Sparrow  Ronald Lischeid      2
2337  Troy Benjegerdes            Jacob Frey     Aswar Rahman      2
2338  Troy Benjegerdes              Tom Hoch     Raymond Dehn      2

[2339 rows x 4 columns]


Round 2 first rank

In [59]:
# Group by the first-ranked candidate and sum the counts, then convert to DataFrame
first_rank_counts_df = aggregated_data.groupby('1st')['Count'].sum().reset_index()

# Rename the columns
first_rank_counts_df.columns = ['Candidate', 'Count']

# Set the index to start from 1
first_rank_counts_df.index = first_rank_counts_df.index + 1

print(first_rank_counts_df)

cleaned_data.to_csv(file_path, index = False)

                Candidate  Count
1              Al Flowers    713
2            Aswar Rahman    756
3            Betsy Hodges  18921
4    Captain Jack Sparrow    443
5            Charlie Gers   1238
6   Christopher Zimmerman      1
7       David John Wilson    224
8         David Rosenfeld    480
9        Gregg A. Iverson    337
10            Ian Simpson    119
11             Jacob Frey  26124
12               L.A. Nik    616
13     Nekima Levy-Pounds  15718
14           Raymond Dehn  18104
15        Ronald Lischeid    325
16               Tom Hoch  20131
17       Troy Benjegerdes    185


Aggregate

In [60]:
cleaned_data = cleaned_data.fillna('Null')
aggregated_data = cleaned_data.groupby(['1st', '2nd', '3rd']).agg({'Count': 'sum'}).reset_index().sort_values(by='1st', ascending=True, ignore_index=True)
print(aggregated_data)

cleaned_data.to_csv(file_path, index = False)

                   1st                   2nd              3rd  Count
0           Al Flowers          Aswar Rahman     Betsy Hodges      4
1           Al Flowers    Nekima Levy-Pounds  Ronald Lischeid      2
2           Al Flowers    Nekima Levy-Pounds     Raymond Dehn     18
3           Al Flowers    Nekima Levy-Pounds             Null     25
4           Al Flowers    Nekima Levy-Pounds       Jacob Frey     16
...                ...                   ...              ...    ...
2334  Troy Benjegerdes          Charlie Gers     Betsy Hodges      1
2335  Troy Benjegerdes  Captain Jack Sparrow         Tom Hoch      1
2336  Troy Benjegerdes  Captain Jack Sparrow  Ronald Lischeid      2
2337  Troy Benjegerdes            Jacob Frey     Aswar Rahman      2
2338  Troy Benjegerdes              Tom Hoch     Raymond Dehn      2

[2339 rows x 4 columns]


Round 3 first rank

In [61]:
# Group by the first-ranked candidate and sum the counts, then convert to DataFrame
first_rank_counts_df = aggregated_data.groupby('1st')['Count'].sum().reset_index()

# Rename the columns
first_rank_counts_df.columns = ['Candidate', 'Count']

# Set the index to start from 1
first_rank_counts_df.index = first_rank_counts_df.index + 1

print(first_rank_counts_df)

                Candidate  Count
1              Al Flowers    713
2            Aswar Rahman    756
3            Betsy Hodges  18921
4    Captain Jack Sparrow    443
5            Charlie Gers   1238
6   Christopher Zimmerman      1
7       David John Wilson    224
8         David Rosenfeld    480
9        Gregg A. Iverson    337
10            Ian Simpson    119
11             Jacob Frey  26124
12               L.A. Nik    616
13     Nekima Levy-Pounds  15718
14           Raymond Dehn  18104
15        Ronald Lischeid    325
16               Tom Hoch  20131
17       Troy Benjegerdes    185
