In [15]:
import pandas as pd
import glob
import os
from pprint import pprint

In [16]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MA\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [17]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__acton__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__auburn__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__barnstable__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__belmont__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__brookline__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__chelmsford__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__dedham__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__falmouth__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__freetown__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__somerset__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\

In [18]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MA\20160301__ma__primary__president__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20160908__ma__primary__precinct.csv


In [19]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
                # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["town"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,town,ward,precinct,office,district,party,candidate,votes
0,Mattapoisett,,MATTAPOISETT1,President,,Green-rainbow,Darryl Cherney,0
1,Aquinnah,,AQUINNAH1,President,,Republican,Jeb Bush,0
2,Lawrence,D,LAWRENCE1,President,,Republican,Carly Fiorina,0
3,Boston,21,BOSTON8,President,,Republican,Total Votes Cast,51
4,Scituate,,SCITUATE1,President,,Republican,Ben Carson,10
...,...,...,...,...,...,...,...,...
73227,Ipswich,,IPSWICH1,President,,Green-rainbow,Blank Votes,0
73228,Lynn,7,LYNN3,President,,Republican,Rick Santorum,0
73229,Swampscott,,SWAMPSCOTT3,President,,Green-rainbow,Blank Votes,0
73230,Fairhaven,,FAIRHAVEN5,President,,Democratic,No Preference,5


In [20]:
pri_combined_df["party"].value_counts(dropna=False)

party
Republican       36958
Green-rainbow    18882
Democratic       17390
NaN                  2
Name: count, dtype: int64

In [21]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,MATTAPOISETT1,Green-rainbow,Darryl Cherney,0
1,AQUINNAH1,Republican,Jeb Bush,0
2,LAWRENCE1,Republican,Carly Fiorina,0
3,BOSTON8,Republican,Total Votes Cast,51
4,SCITUATE1,Republican,Ben Carson,10
...,...,...,...,...
73227,IPSWICH1,Green-rainbow,Blank Votes,0
73228,LYNN3,Republican,Rick Santorum,0
73229,SWAMPSCOTT3,Green-rainbow,Blank Votes,0
73230,FAIRHAVEN5,Democratic,No Preference,5


In [22]:
primary_data.loc[:,"party"] = (
    primary_data["party"]
    .replace({
        "Democratic": "DEM",
        "Republican": "REP",
        "Green-rainbow": "GRN"
    })
    .fillna("IND")
)

primary_data["party"].value_counts(dropna=False)

party
REP    36958
GRN    18882
DEM    17390
IND        2
Name: count, dtype: int64

In [23]:
primary_data["candidate"].value_counts(dropna=False)

candidate
All Others            6446
Total Votes Cast      6446
Blank Votes           6446
No Preference         6446
Rand Paul             2174
Jim Gilmore           2174
Rick Santorum         2174
Marco Rubio           2174
Mike Huckabee         2174
Roque De La Fuente    2174
Hillary Clinton       2174
Martin O'Malley       2174
George Pataki         2174
John R. Kasich        2174
Jeb Bush              2174
Chris Christie        2174
Donald J. Trump       2174
Ted Cruz              2174
Bernie Sanders        2174
Ben Carson            2174
Carly Fiorina         2174
William P. Kreml      2098
Jill E. Stein         2098
Skcm Curry            2098
Kent Mesplay          2098
Darryl Cherney        2098
Name: count, dtype: int64

In [24]:
# Cleaning Candidates
primary_data = primary_data[~primary_data["candidate"].isin(["Blank Votes", "All Others","Total Votes Cast","No Preference"])] 

primary_data["candidate"].value_counts(dropna=False)

candidate
George Pataki         2174
Rand Paul             2174
Jim Gilmore           2174
Rick Santorum         2174
Marco Rubio           2174
Mike Huckabee         2174
Roque De La Fuente    2174
Hillary Clinton       2174
Martin O'Malley       2174
Jeb Bush              2174
John R. Kasich        2174
Chris Christie        2174
Donald J. Trump       2174
Ted Cruz              2174
Bernie Sanders        2174
Ben Carson            2174
Carly Fiorina         2174
William P. Kreml      2098
Jill E. Stein         2098
Skcm Curry            2098
Kent Mesplay          2098
Darryl Cherney        2098
Name: count, dtype: int64

In [25]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'Darryl Cherney': 'GRN', 'Jeb Bush': 'REP', 'Carly Fiorina': 'REP', 'Ben Carson': 'REP', 'Kent Mesplay': 'GRN', 'Bernie Sanders': 'DEM', 'Ted Cruz': 'REP', 'Donald J. Trump': 'REP', 'Jill E. Stein': 'GRN', 'Chris Christie': 'REP', 'Rand Paul': 'REP', 'George Pataki': 'REP', 'William P. Kreml': 'GRN', "Martin O'Malley": 'DEM', 'Hillary Clinton': 'DEM', 'Roque De La Fuente': 'DEM', 'Skcm Curry': 'GRN', 'Mike Huckabee': 'REP', 'Marco Rubio': 'REP', 'Rick Santorum': 'REP', 'Jim Gilmore': 'REP', 'John R. Kasich': 'REP'}


In [26]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_grn_CHERNEY,pri_grn_CURRY,pri_grn_KREML,pri_grn_MESPLAY,pri_grn_STEIN,...,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ABINGTON1,193,0,3,268,0,0,0,0,0,...,36,3,1,0,61,0,2,45,0,235
1,ABINGTON2,195,0,3,271,0,0,0,0,1,...,34,0,0,0,46,0,1,48,0,257
2,ABINGTON3,246,2,5,273,0,0,0,0,0,...,48,0,0,1,56,0,2,52,0,264
3,ABINGTON4,265,1,2,267,0,0,0,0,0,...,56,0,0,2,68,0,0,62,0,267
4,ABINGTON5,260,1,2,311,0,0,0,0,1,...,37,0,0,1,72,0,1,61,1,245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1285,YARMOUTH3,276,0,2,217,0,0,0,0,0,...,27,2,0,0,55,1,1,54,0,206
1286,YARMOUTH4,343,1,3,325,0,0,0,0,0,...,43,2,0,0,71,0,1,74,0,319
1287,YARMOUTH5,222,0,1,260,0,0,0,0,0,...,33,2,0,2,45,0,0,58,0,239
1288,YARMOUTH6,251,2,0,236,0,0,0,0,0,...,31,1,1,0,47,0,0,64,1,244


In [27]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["town"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,town,precinct,office,district,party,candidate,votes,ward
0,Middlesex,Acton,ACTON1,President,,,BLANK,37,
1,Middlesex,Acton,ACTON2,President,,,BLANK,33,
2,Middlesex,Acton,ACTON3,President,,,BLANK,29,
3,Middlesex,Acton,ACTON4,President,,,BLANK,18,
4,Middlesex,Acton,ACTON5,President,,,BLANK,36,
...,...,...,...,...,...,...,...,...,...
28811,Worcester,Spencer,SPENCERTOTAL,President,,Libertarian,Johnson and Weld,296,
28812,Worcester,Spencer,SPENCERTOTAL,President,,Green-Rainbow,Stein and Baraka,83,
28813,Worcester,Spencer,SPENCERTOTAL,President,,Republican,Trump and Pence,3045,
28814,Worcester,Spencer,SPENCERTOTAL,President,,,McMullin and Johnson,10,


In [28]:

general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data


Unnamed: 0,precinct,party,candidate,votes
0,ACTON1,,BLANK,37
1,ACTON2,,BLANK,33
2,ACTON3,,BLANK,29
3,ACTON4,,BLANK,18
4,ACTON5,,BLANK,36
...,...,...,...,...
28811,SPENCERTOTAL,Libertarian,Johnson and Weld,296
28812,SPENCERTOTAL,Green-Rainbow,Stein and Baraka,83
28813,SPENCERTOTAL,Republican,Trump and Pence,3045
28814,SPENCERTOTAL,,McMullin and Johnson,10


In [29]:
general_data["candidate"].value_counts(dropna=False)

candidate
Stein and Baraka            2192
Johnson and Weld            2192
Clinton and Kaine           2192
Trump and Pence             2192
All Others                  2184
Kotlikoff and Leamer        2174
No Preference               2174
Moorehead and Lilly         2174
Blank Votes                 2174
Schoenke and Mitchel        2174
Feegbeh and O'Brien         2174
Mcmullin and Johnson        2174
Total Votes Cast            2174
Blanks                        35
JOHNSON and WELD              24
STEIN and BARAKA              24
TRUMP and PENCE               24
CLINTON and KAINE             24
Blank                         22
STEIN & BARAKA                21
JOHNSON & WELD                21
TRUMP & PENCE                 21
CLINTON & KAINE               15
MCMULLIN & JOHNSON            15
Write-In                      14
TRUMP-PENCE                   14
JOHNSTON-WELD                 14
BLANKS                        14
McMullin and Johnson          14
CLINTON-KAINE                 14


In [30]:
general_data = general_data[~general_data["candidate"].isin(["All Others", "No Preference","Blank Votes","Total Votes Cast","Blanks","Blank","Write-In","BLANKS","Other Write-in","All Other Write-in Votes","Write-in votes","Write in","BLANK","Scattering Write-ins","SCATTERED","All Other Write Ins","Write-in"])] 
general_data["candidate"] = (
    general_data["candidate"]
    .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
    .str.strip()
    .str.upper()
)

# Fixing McMullin

general_data.loc[
    (
        general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
        general_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
    ),
    "candidate"
] = "MCMULLIN"

# Fixing Sanders
# general_data.loc[
#     general_data["candidate"].str.contains("SANDERS", case=False, na=False),
#     "candidate"
# ] = "SANDERS"

general_data.loc[
    (
        general_data["candidate"].str.contains("SANDERS", case=False, na=False) |
        general_data["candidate"].str.contains("BERNIE S", case=False, na=False)
    ),
    "candidate"
] = "SANDERS"


general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = (


candidate
STEIN                2251
TRUMP                2251
CLINTON              2245
JOHNSON              2237
MCMULLIN             2215
MOOREHEAD            2174
SCHOENKE             2174
KOTLIKOFF            2174
FEEGBEH              2174
SANDERS                15
JOHNSTON               14
ROMNEY                  9
KASICH                  9
BLUMBERG                9
RYAN                    9
CLINTON AND KAINE       6
Name: count, dtype: int64

In [31]:
general_data["party"].value_counts(dropna=False)

party
(Write-In)           10870
Democratic            2251
Libertarian           2251
Republican            2251
Green-rainbow         2174
NaN                     92
Green-Rainbow           71
Green and Rainbow        6
Name: count, dtype: int64

In [32]:
general_data["party"] = general_data.apply(
    lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
    axis=1
)
def fill_party_from_general_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

general_data["party"] = general_data.apply(
    lambda row: fill_party_from_general_data(row, general_data),
    axis=1
)
general_data["party"] = (
    general_data["party"]
    .replace({
        "(Write-In)": "IND",
        "Libertarian": "LIB",
        "Democratic": "DEM",
        "Republican":"REP",
        "Green-rainbow": "GRN",
        "Green-Rainbow": "GRN",
        "Green and Rainbow": "GRN",
    })
    .fillna("IND")
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    10962
DEM     2251
LIB     2251
GRN     2251
REP     2251
Name: count, dtype: int64

In [33]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_dem_CLINTON,gen_dem_KAINE,gen_grn_STEIN,gen_ind_BLUMBERG,gen_ind_FEEGBEH,gen_ind_KASICH,gen_ind_KOTLIKOFF,gen_ind_MCMULLIN,gen_ind_MOOREHEAD,gen_ind_ROMNEY,gen_ind_RYAN,gen_ind_SANDERS,gen_ind_SCHOENKE,gen_lib_JOHNSON,gen_lib_JOHNSTON,gen_rep_TRUMP
0,ABINGTON1,818,0,25,0,0,0,0,0,0,0,0,0,0,97,0,717
1,ABINGTON2,739,0,27,0,0,0,0,0,0,0,0,0,0,67,0,785
2,ABINGTON3,773,0,18,0,0,0,0,0,0,0,0,0,0,80,0,808
3,ABINGTON4,877,0,16,0,0,0,0,0,0,0,0,0,0,87,0,878
4,ABINGTON5,908,0,16,0,0,0,0,0,0,0,0,0,0,86,0,829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,YARMOUTH3,904,0,18,0,0,0,0,0,0,0,0,0,0,63,0,663
1303,YARMOUTH4,1002,0,31,0,0,0,0,0,0,0,0,0,0,79,0,860
1304,YARMOUTH5,881,0,26,0,0,0,0,0,0,0,0,0,0,61,0,797
1305,YARMOUTH6,907,0,30,0,0,0,0,0,0,0,0,0,0,65,0,753


In [34]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_grn_CHERNEY,pri_grn_CURRY,pri_grn_KREML,pri_grn_MESPLAY,pri_grn_STEIN,...,gen_ind_MOOREHEAD,gen_ind_ROMNEY,gen_ind_RYAN,gen_ind_SANDERS,gen_ind_SCHOENKE,gen_lib_JOHNSON,gen_lib_JOHNSTON,gen_rep_TRUMP,dem_primary_total,general_total
0,ABINGTON1,193,0,3,268,0,0,0,0,0,...,0,0,0,0,0,97,0,717,464,1657
1,ABINGTON2,195,0,3,271,0,0,0,0,1,...,0,0,0,0,0,67,0,785,469,1618
2,ABINGTON3,246,2,5,273,0,0,0,0,0,...,0,0,0,0,0,80,0,808,526,1679
3,ABINGTON4,265,1,2,267,0,0,0,0,0,...,0,0,0,0,0,87,0,878,535,1858
4,ABINGTON5,260,1,2,311,0,0,0,0,1,...,0,0,0,0,0,86,0,829,574,1839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1285,YARMOUTH3,276,0,2,217,0,0,0,0,0,...,0,0,0,0,0,63,0,663,495,1648
1286,YARMOUTH4,343,1,3,325,0,0,0,0,0,...,0,0,0,0,0,79,0,860,672,1972
1287,YARMOUTH5,222,0,1,260,0,0,0,0,0,...,0,0,0,0,0,61,0,797,483,1765
1288,YARMOUTH6,251,2,0,236,0,0,0,0,0,...,0,0,0,0,0,65,0,753,489,1755


In [35]:
combined.to_csv("MA.csv", index=False)
