In [2]:
import pandas as pd
import glob
import os
from pprint import pprint

In [6]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MI\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [7]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MI\20161108__mi__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MI\20161108__mi__special__general__precinct.csv


In [8]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MI\20160308__mi__primary__president__precinct.csv


In [48]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Alcona,ALCONAALCONA TOWNSHIP 1,President,,REP,Jeb Bush,3
1,Alcona,ALCONACALEDONIA TOWNSHIP 1,President,,REP,Jeb Bush,0
2,Alcona,ALCONACURTIS TOWNSHIP 1,President,,REP,Jeb Bush,3
3,Alcona,ALCONAGREENBUSH TOWNSHIP 1,President,,REP,Jeb Bush,1
4,Alcona,ALCONAGUSTIN TOWNSHIP 1,President,,REP,Jeb Bush,0
...,...,...,...,...,...,...,...
19451,Kalamazoo,KALAMAZOOPORTAGE CITY 19,President,,DEM,Fuente,0
19452,Kalamazoo,KALAMAZOOPORTAGE CITY 19,President,,DEM,O'malley,0
19453,Kalamazoo,KALAMAZOOPORTAGE CITY 19,President,,DEM,Sanders,195
19454,Kalamazoo,KALAMAZOOPORTAGE CITY 19,President,,DEM,Uncommitted,9


In [49]:
pri_combined_df["party"].value_counts(dropna=False)

party
REP    14114
DEM     5342
Name: count, dtype: int64

In [50]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ALCONAALCONA TOWNSHIP 1,REP,Jeb Bush,3
1,ALCONACALEDONIA TOWNSHIP 1,REP,Jeb Bush,0
2,ALCONACURTIS TOWNSHIP 1,REP,Jeb Bush,3
3,ALCONAGREENBUSH TOWNSHIP 1,REP,Jeb Bush,1
4,ALCONAGUSTIN TOWNSHIP 1,REP,Jeb Bush,0
...,...,...,...,...
19451,KALAMAZOOPORTAGE CITY 19,DEM,Fuente,0
19452,KALAMAZOOPORTAGE CITY 19,DEM,O'malley,0
19453,KALAMAZOOPORTAGE CITY 19,DEM,Sanders,195
19454,KALAMAZOOPORTAGE CITY 19,DEM,Uncommitted,9


In [41]:
primary_data["party"].value_counts(dropna=False)

party
REP    14114
DEM     5342
Name: count, dtype: int64

In [42]:
primary_data["candidate"].value_counts(dropna=False)

candidate
Uncommitted                 1942
Rand Paul                    884
Marco Rubio                  884
Bernie Sanders               884
Martin J. O'Malley           884
Hillary Clinton              884
Donald J. Trump              884
John R. Kasich               884
George Pataki                884
Rick Santorum                884
Jeb Bush                     863
Ben Carson                   863
Mike Huckabee                863
Lindsey Graham               863
Carly Fiorina                863
Ted Cruz                     863
Chris Christie               863
Roque Rocky De La Fuente     520
Write-ins                    465
Rocky De La Fuente           364
Write-In                     119
O'malley                     108
Pataki                       108
Fuente                       108
Clinton                      108
Write-in                     108
Trump                        108
Santorum                     108
Rubio                        108
Paul                         108


In [52]:
#Remove Uncommited, Write-ins Join full name candidate with only last name 
primary_data = primary_data[~primary_data["candidate"].isin(["Uncommitted", "Write-ins","Write-in","Write-In","Ballots Cast","Write-In Votes"])] 

primary_data.loc[:,"candidate"] = (
    primary_data["candidate"].str.split().str[-1].str.upper()
)


primary_data["candidate"].value_counts(dropna=False)

candidate
PATAKI      992
PAUL        992
O'MALLEY    992
FUENTE      992
CLINTON     992
TRUMP       992
SANTORUM    992
RUBIO       992
SANDERS     992
KASICH      992
CARSON      971
HUCKABEE    971
GRAHAM      971
FIORINA     971
CRUZ        971
CHRISTIE    971
BUSH        971
Name: count, dtype: int64

In [53]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'BUSH': 'REP', 'CARSON': 'REP', 'CHRISTIE': 'REP', 'CRUZ': 'REP', 'FIORINA': 'REP', 'GRAHAM': 'REP', 'HUCKABEE': 'REP', 'KASICH': 'REP', 'PATAKI': 'REP', 'PAUL': 'REP', 'RUBIO': 'REP', 'SANTORUM': 'REP', 'TRUMP': 'REP', 'CLINTON': 'DEM', 'FUENTE': 'DEM', "O'MALLEY": 'DEM', 'SANDERS': 'DEM'}


In [54]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ALCONAALCONA TOWNSHIP 1,48,0,0,42,3,4,1,61,2,0,1,44,0,0,18,1,141
1,ALCONACALEDONIA TOWNSHIP 1,53,0,1,34,0,1,1,40,0,0,0,32,1,0,15,0,112
2,ALCONACURTIS TOWNSHIP 1,72,0,0,63,3,5,0,46,2,0,1,24,0,0,11,0,128
3,ALCONAGREENBUSH TOWNSHIP 1,67,0,1,66,1,5,1,39,0,0,0,46,0,2,17,1,121
4,ALCONAGUSTIN TOWNSHIP 1,27,0,0,24,0,0,0,23,0,0,0,18,1,0,5,0,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,SCHOOLCRAFTMANISTIQUE CITY 3,25,0,0,28,0,2,0,21,0,0,0,15,1,0,7,0,44
988,SCHOOLCRAFTMANISTIQUE TOWNSHIP 1,31,0,0,41,2,0,0,24,0,0,0,37,0,0,14,0,71
989,SCHOOLCRAFTMUELLER TOWNSHIP 1,12,0,0,12,0,0,0,10,0,0,0,7,0,0,3,0,12
990,SCHOOLCRAFTSENEY TOWNSHIP 1,4,0,0,3,0,0,0,5,0,0,0,5,0,0,2,0,22


In [56]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Alcona,ALCONAALCONA TOWNSHIP 1,President,,NPA,Tom Hoefling,0
1,Alcona,ALCONAALCONA TOWNSHIP 1,President,,NPA,Michael A. Maturen,0
2,Alcona,ALCONAALCONA TOWNSHIP 1,President,,NPA,Monica Moorehead,0
3,Alcona,ALCONAALCONA TOWNSHIP 1,President,,NPA,Evan Mcmullin,0
4,Alcona,ALCONAALCONA TOWNSHIP 1,President,,NPA,Laurence Kotlikoff,0
...,...,...,...,...,...,...,...
65996,Wexford,WEXFORD9999,President,,DEM,Hillary Clinton,0
65997,Wexford,WEXFORD9999,President,,NLP,Emidio Mimi Soltysik,0
65998,Wexford,WEXFORD9999,President,,REP,Donald J. Trump,0
65999,Wexford,WEXFORD9999,President,,UST,Darrell L. Castle,0


In [60]:

general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
NPA    35539
GRN     5077
DEM     5077
NLP     5077
REP     5077
UST     5077
LIB     5077
Name: count, dtype: int64

In [61]:
general_data["candidate"].value_counts(dropna=False)

candidate
Tom Hoefling            5077
Michael A. Maturen      5077
Monica Moorehead        5077
Evan Mcmullin           5077
Laurence Kotlikoff      5077
Ben Hartnell            5077
Cherunda Fox            5077
Jill Stein              5077
Hillary Clinton         5077
Emidio Mimi Soltysik    5077
Donald J. Trump         5077
Darrell L. Castle       5077
Gary Johnson            5077
Name: count, dtype: int64

In [59]:
# general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()
# general_data = general_data[~general_data["candidate"].isin(["Registered Voters", "Write-In"])] 
# general_data["candidate"].value_counts(dropna=False)

In [63]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
# def fill_party_from_general_data(row, df):
#     if pd.notna(row["party"]):
#         return row["party"]
    
#     # Try to find other rows with the same candidate and known party
#     matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
#     if not matches.empty:
#         return matches["party"].iloc[0]  # Return the first match's party
#     else:
#         return None  # Still unknown

# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_general_data(row, general_data),
#     axis=1
# )
general_data["party"] = (
    general_data["party"]
    .replace({
        "NPA": "IND",
        "Green": "GRN",
        "UST": "CON"
        
    })
    .fillna("IND")
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    35539
GRN     5077
DEM     5077
NLP     5077
REP     5077
CON     5077
LIB     5077
Name: count, dtype: int64

In [67]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FOX,gen_ind_HARTNELL,gen_ind_HOEFLING,gen_ind_KOTLIKOFF,gen_ind_MATUREN,gen_ind_MCMULLIN,gen_ind_MOOREHEAD,gen_lib_JOHNSON,gen_nlp_SOLTYSIK,gen_rep_TRUMP
0,ALCONA9999,0,0,0,0,0,0,0,16,3,0,0,0,0
1,ALCONAALCONA TOWNSHIP 1,4,199,2,0,0,0,0,0,0,0,13,0,500
2,ALCONACALEDONIA TOWNSHIP 1,5,178,4,0,0,0,0,0,0,0,14,0,441
3,ALCONACURTIS TOWNSHIP 1,3,214,13,0,0,0,0,0,0,0,8,0,401
4,ALCONAGREENBUSH TOWNSHIP 1,2,256,4,0,0,0,0,0,0,0,22,0,501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5072,WEXFORDSELMA TOWNSHIP 1,4,322,14,0,0,0,0,0,0,0,36,1,729
5073,WEXFORDSLAGLE TOWNSHIP 1,2,92,0,0,0,0,0,0,0,0,12,0,179
5074,WEXFORDSOUTH BRANCH TOWNSHIP 1,0,76,3,0,0,0,0,0,0,0,5,0,127
5075,WEXFORDSPRINGVILLE TOWNSHIP 1,0,176,10,0,0,0,0,0,0,0,28,0,494


In [71]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,...,gen_ind_HOEFLING,gen_ind_KOTLIKOFF,gen_ind_MATUREN,gen_ind_MCMULLIN,gen_ind_MOOREHEAD,gen_lib_JOHNSON,gen_nlp_SOLTYSIK,gen_rep_TRUMP,dem_primary_total,general_total
0,ALCONAALCONA TOWNSHIP 1,48,0,0,42,3,4,1,61,2,...,0,0,0,0,0,13,0,500,90,718
1,ALCONACALEDONIA TOWNSHIP 1,53,0,1,34,0,1,1,40,0,...,0,0,0,0,0,14,0,441,88,642
2,ALCONACURTIS TOWNSHIP 1,72,0,0,63,3,5,0,46,2,...,0,0,0,0,0,8,0,401,135,639
3,ALCONAGREENBUSH TOWNSHIP 1,67,0,1,66,1,5,1,39,0,...,0,0,0,0,0,22,0,501,134,785
4,ALCONAGUSTIN TOWNSHIP 1,27,0,0,24,0,0,0,23,0,...,0,0,0,0,0,16,0,249,51,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,SCHOOLCRAFTMANISTIQUE CITY 3,25,0,0,28,0,2,0,21,0,...,0,0,0,0,0,11,0,193,53,335
403,SCHOOLCRAFTMANISTIQUE TOWNSHIP 1,31,0,0,41,2,0,0,24,0,...,0,0,0,0,0,18,0,321,72,524
404,SCHOOLCRAFTMUELLER TOWNSHIP 1,12,0,0,12,0,0,0,10,0,...,0,0,0,0,0,4,0,97,24,141
405,SCHOOLCRAFTSENEY TOWNSHIP 1,4,0,0,3,0,0,0,5,0,...,0,0,0,0,0,5,0,55,7,71


In [72]:
combined.to_csv("MI.csv", index=False)
