In [40]:
import pandas as pd
import glob
import os
from pprint import pprint

In [41]:
# Collect every CSV file in the Washington (WA) 2016 data folder.
# sorted() makes the processing order deterministic; glob itself does not
# guarantee any particular order.
all_files = sorted(glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\WA\2016\*.csv"))


def select_files_by_keywords(paths, keywords):
    """Return the paths whose file name contains every keyword.

    Matching is case-insensitive and looks only at the file name
    (``os.path.basename``), never at the parent directories, so a folder
    that happens to be called e.g. "general" cannot cause false matches.

    Parameters
    ----------
    paths : iterable of str
        Candidate file paths.
    keywords : iterable of str
        Lower-case substrings that must all occur in the file name.

    Returns
    -------
    list of str
        The matching paths, in their original order.
    """
    keywords = list(keywords)
    selected = []
    for path in paths:
        file_name = os.path.basename(path).lower()
        if all(keyword in file_name for keyword in keywords):
            selected.append(path)
    return selected


# Files that contain both 'precinct' and 'general' in the file name
general_files = select_files_by_keywords(all_files, ["precinct", "general"])

# Files that contain 'precinct', 'primary' and 'president' in the file name
primary_files = select_files_by_keywords(all_files, ["precinct", "primary", "president"])


In [42]:
# List the general-election files that were selected, one path per line.
print("\n".join(["General files:", *general_files]))


General files:
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160209__wa__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160426__wa__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20161108__wa__general__precinct.csv


In [43]:
# List the primary files that were selected, one path per line
# (the header starts with a blank separator line).
print("\n".join(["\nPrimary files:", *primary_files]))


Primary files:
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160524__wa__primary_president__precinct.csv


In [44]:
# Load and clean every presidential-primary file.
primary_df_list = []

for csv_path in primary_files:
    try:
        frame = pd.read_csv(csv_path)

        # Keep only rows that carry a precinct value (when the column exists)
        if "precinct" in frame.columns:
            frame = frame.loc[frame["precinct"].notna()]

        # Exact duplicate rows are removed per file before collecting it
        primary_df_list.append(frame.drop_duplicates())

    except Exception as e:
        # A file that fails to load or clean is reported and skipped
        print(f"Error in {csv_path}: {e}")

# Stack the cleaned files, then:
#   * overwrite "precinct" with an upper-cased county + precinct_code key
#   * drop the original "party" column and use "office" as "party" instead
pri_combined_df = (
    pd.concat(primary_df_list, ignore_index=True)
    .assign(
        precinct=lambda d: (
            d["county"].astype(str) + d["precinct_code"].astype(str)
        ).str.upper()
    )
    .drop(columns=["party"])
    .rename(columns={"office": "party"})
)
pri_combined_df


Unnamed: 0,county,precinct_code,precinct,party,district,candidate,votes
0,King,3562,KING3562,Democratic Party,,Bernie Sanders,24
1,King,3562,KING3562,Democratic Party,,Hillary Clinton,117
2,King,3562,KING3562,Democratic Party,,Registered Voters,503
3,King,3562,KING3562,Democratic Party,,Write-In,0
4,King,3562,KING3562,Republican Party,,Ben Carson,1
...,...,...,...,...,...,...,...
52753,Yakima,5101,YAKIMA5101,President Republican Party,,Donald J. Trump,72
52754,Yakima,5202,YAKIMA5202,President Republican Party,,Ben Carson,0
52755,Yakima,5202,YAKIMA5202,President Republican Party,,Ted Cruz,0
52756,Yakima,5202,YAKIMA5202,President Republican Party,,John R. Kasich,0


In [45]:
# Tally rows per party label, counting missing values as their own bucket.
pri_combined_df.loc[:, "party"].value_counts(dropna=False)

party
President Republican Party    18392
Republican Party              15102
Democratic Party              10068
President Democratic Party     9196
Name: count, dtype: int64

In [46]:
# Keep only the columns needed downstream. The explicit .copy() makes
# primary_data an independent frame, so modifying it can neither write back
# into pri_combined_df nor trigger pandas' SettingWithCopyWarning.
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]].copy()

# Collapse the party labels to short codes; labels not listed here are
# left unchanged by replace().
primary_data["party"] = primary_data["party"].replace({
    "President Republican Party": "REP",
    "President Democratic Party": "DEM",
    "Democratic Party": "DEM",
    "Republican Party": "REP"
})

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,KING3562,DEM,Bernie Sanders,24
1,KING3562,DEM,Hillary Clinton,117
2,KING3562,DEM,Registered Voters,503
3,KING3562,DEM,Write-In,0
4,KING3562,REP,Ben Carson,1
...,...,...,...,...
52753,YAKIMA5101,REP,Donald J. Trump,72
52754,YAKIMA5202,REP,Ben Carson,0
52755,YAKIMA5202,REP,Ted Cruz,0
52756,YAKIMA5202,REP,John R. Kasich,0


In [47]:
# Tally rows per party code, counting missing values as their own bucket.
primary_data.loc[:, "party"].value_counts(dropna=False)

party
REP    33494
DEM    19264
Name: count, dtype: int64

In [48]:
# Drop the rows that are not individual candidates, then tally what is left.
non_candidate_labels = ["Registered Voters", "Write-In"]
is_non_candidate = primary_data["candidate"].isin(non_candidate_labels)
primary_data = primary_data[~is_non_candidate]
primary_data["candidate"].value_counts(dropna=False)

candidate
Bernie Sanders     7115
Hillary Clinton    7115
Ben Carson         7115
Donald J. Trump    7115
John R. Kasich     7115
Ted Cruz           7115
Name: count, dtype: int64

In [49]:
# Map each primary candidate to a party code.
# Rows missing either field are skipped. If a candidate appears with more
# than one party, the last row wins.
# (The former stand-alone `.unique()` call was removed: its result was
# discarded, so it had no effect.)
known_pairs = primary_data.dropna(subset=["candidate", "party"])
candidate_party_map = dict(zip(known_pairs["candidate"], known_pairs["party"]))
print(candidate_party_map)

{'Bernie Sanders': 'DEM', 'Hillary Clinton': 'DEM', 'Ben Carson': 'REP', 'Donald J. Trump': 'REP', 'John R. Kasich': 'REP', 'Ted Cruz': 'REP'}


In [50]:
# Build the output column name for every row: pri_<party>_<LAST TOKEN OF NAME>.
# assign() returns a new DataFrame instead of writing into a filtered slice,
# which is what raised pandas' SettingWithCopyWarning here.
# NOTE(review): only the last whitespace-separated token of the candidate name
# is used, so two same-party candidates sharing that token would be summed
# into a single column.
primary_data = primary_data.assign(
    candidate_column=lambda d: (
        "pri_"
        + d["party"].str.lower()
        + "_"
        + d["candidate"].str.split().str[-1].str.upper()
    )
)

# One row per precinct, one column per candidate_column value. Votes are
# summed, and precinct/candidate combinations without any row become 0.
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",
    fill_value=0,
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP
0,ADAMS111,11,6,2,6,2,45
1,ADAMS112,14,10,2,6,1,41
2,ADAMS113,17,11,8,8,2,59
3,ADAMS114,16,19,1,10,13,44
4,ADAMS115,18,11,4,5,11,53
...,...,...,...,...,...,...,...
7110,YAKIMA5020,26,11,6,15,12,100
7111,YAKIMA5101,19,26,0,6,5,72
7112,YAKIMA5202,0,0,0,0,0,0
7113,YAKIMA701,26,25,10,3,2,22


In [51]:
# Load every general-election file, keeping only the presidential rows.
gen_df_list = []

for csv_path in general_files:
    try:
        frame = pd.read_csv(csv_path)

        # Keep only rows that carry a precinct value (when the column exists)
        if "precinct" in frame.columns:
            frame = frame.loc[frame["precinct"].notna()]

        # Restrict to the presidential contest (when the column exists)
        if "office" in frame.columns:
            frame = frame.loc[frame["office"] == "President"]

        gen_df_list.append(frame)

    except Exception as e:
        # A file that fails to load or clean is reported and skipped
        print(f"Error in {csv_path}: {e}")

# Stack the cleaned files and overwrite "precinct" with an upper-cased
# county + precinct_code key.
gen_combined_df = pd.concat(gen_df_list, ignore_index=True).assign(
    precinct=lambda d: (
        d["county"].astype(str) + d["precinct_code"].astype(str)
    ).str.upper()
)
gen_combined_df


  df = pd.read_csv(file)


Unnamed: 0,county,precinct_code,precinct,office,district,party,candidate,votes
0,King,3562,KING3562,President,,Constitution,Darrell L. Castle & Scott N. Bradley,0
1,King,3562,KING3562,President,,Democratic,Hillary Clinton & Tim Kaine,266
2,King,3562,KING3562,President,,Green,Jill Stein & Ajamu Baraka,1
3,King,3562,KING3562,President,,Libertarian,Gary Johnson & Bill Weld,14
4,King,3562,KING3562,President,,,Registered Voters,519
...,...,...,...,...,...,...,...,...
54911,Yakima,5202,YAKIMA5202,President,,Socialist Workers,Alyson Kennedy / Osborne Hart,0
54912,Yakima,5202,YAKIMA5202,President,,Socialism & Liberation,Gloria Estela La Riva / Eugene Puryear,0
54913,Yakima,5202,YAKIMA5202,President,,Green,Jill Stein / Ajamu Baraka,0
54914,Yakima,5202,YAKIMA5202,President,,Constitution,Darrell L. Castle / Scott N. Bradley,0


In [52]:

# Narrow the general-election frame to the columns used below, then tally
# rows per party (missing values included).
general_columns = ["precinct", "party", "candidate", "votes"]
general_data = gen_combined_df[general_columns]
general_data["party"].value_counts(dropna=False)


party
Constitution              7126
Democratic                7126
Green                     7126
Libertarian               7126
Republican                7126
Socialism & Liberation    7126
Socialist Workers         7126
NaN                       5034
Name: count, dtype: int64

In [53]:
# Tally rows per raw candidate label, counting missing values as their own bucket.
general_data.loc[:, "candidate"].value_counts(dropna=False)

candidate
Hillary Clinton / Tim Kaine               4609
Donald J. Trump / Michael R. Pence        4609
Alyson Kennedy / Osborne Hart             4609
Gloria Estela La Riva / Eugene Puryear    4609
Jill Stein / Ajamu Baraka                 4609
Darrell L. Castle / Scott N. Bradley      4609
Gary Johnson / Bill Weld                  4609
Darrell L. Castle & Scott N. Bradley      2517
Hillary Clinton & Tim Kaine               2517
Jill Stein & Ajamu Baraka                 2517
Gary Johnson & Bill Weld                  2517
Registered Voters                         2517
Write-In                                  2517
Donald J. Trump & Michael R. Pence        2517
Gloria Estela La Riva & Eugene Puryear    2517
Alyson Kennedy & Osborne Hart             2517
Name: count, dtype: int64

In [54]:
# Keep only the text before the first "/" or "&" in each candidate label
# (the part before the running mate), trimmed of surrounding whitespace.
# assign() builds a new DataFrame instead of writing into a slice of
# gen_combined_df, which is what raised SettingWithCopyWarning here.
# expand=False makes str.extract return a Series for the single capture group.
general_data = general_data.assign(
    candidate=lambda d: d["candidate"].str.extract(r"^([^/&]+)", expand=False).str.strip()
)

# Drop the rows that are not individual candidates. The explicit .copy() gives
# an independent frame, so columns can be added to it later without warnings.
general_data = general_data[~general_data["candidate"].isin(["Registered Voters", "Write-In"])].copy()
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()


candidate
Darrell L. Castle        7126
Hillary Clinton          7126
Jill Stein               7126
Gary Johnson             7126
Donald J. Trump          7126
Gloria Estela La Riva    7126
Alyson Kennedy           7126
Name: count, dtype: int64

In [55]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
def fill_party_from_general_data(row, df):
    """Return the row's party, borrowing it from same-candidate rows if missing.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide "party" and "candidate".
    df : pandas.DataFrame
        Frame with "candidate" and "party" columns, searched for a fallback.

    Returns
    -------
    The row's own party when it is not null; otherwise the party of the first
    row in ``df`` that has the same candidate and a non-null party; ``None``
    when no such row exists.
    """
    own_party = row["party"]
    if pd.notna(own_party):
        return own_party

    # Look for rows of the same candidate whose party is known
    same_candidate = df["candidate"] == row["candidate"]
    known_parties = df.loc[same_candidate & df["party"].notna(), "party"]
    return None if known_parties.empty else known_parties.iloc[0]

# Fill missing parties from other rows of the same candidate.
# This is done with a vectorized lookup instead of a row-by-row apply, which
# ran Python code for every row, re-scanned the whole frame for every missing
# party, and does not work on an empty frame.
# The lookup keeps, per candidate, the first non-null party in frame order -
# the same value fill_party_from_general_data returns.
first_known_party = (
    general_data.dropna(subset=["candidate", "party"])
    .drop_duplicates(subset="candidate")
    .set_index("candidate")["party"]
)
filled_party = general_data["party"].fillna(
    general_data["candidate"].map(first_known_party)
)

# Recode party names to short codes; anything still missing becomes "IND".
# assign() returns a new DataFrame rather than writing into a filtered slice.
general_data = general_data.assign(
    party=filled_party
    .replace({
        "Democratic": "DEM",
        "Republican": "REP",
        "Libertarian": "LIB",
        "Green": "GRN",
        "Constitution": "CON",
        "American Delta": "AMD",
        "Prohibition": "PRO",
        "Socialism & Liberation": "SOL",
        "Socialist Workers": "SOW"
    })
    .fillna("IND")
)

general_data

Unnamed: 0,precinct,party,candidate,votes
0,KING3562,CON,Darrell L. Castle,0
1,KING3562,DEM,Hillary Clinton,266
2,KING3562,GRN,Jill Stein,1
3,KING3562,LIB,Gary Johnson,14
6,KING3562,REP,Donald J. Trump,185
...,...,...,...,...
54911,YAKIMA5202,SOW,Alyson Kennedy,0
54912,YAKIMA5202,SOL,Gloria Estela La Riva,0
54913,YAKIMA5202,GRN,Jill Stein,0
54914,YAKIMA5202,CON,Darrell L. Castle,0


In [56]:
# Build the output column name for every row: gen_<party>_<LAST TOKEN OF NAME>.
# assign() returns a new DataFrame instead of adding a column in place to a
# frame that was produced by filtering (the pattern pandas flags with
# SettingWithCopyWarning).
# NOTE(review): only the last whitespace-separated token of the candidate name
# is used (e.g. "Gloria Estela La Riva" -> "RIVA"), so two same-party
# candidates sharing that token would be summed into a single column.
general_data = general_data.assign(
    candidate_column=lambda d: (
        "gen_"
        + d["party"].str.lower()
        + "_"
        + d["candidate"].str.split().str[-1].str.upper()
    )
)

# One row per precinct, one column per candidate_column value. Votes are
# summed, and precinct/candidate combinations without any row become 0.
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",
    fill_value=0,
).reset_index()

general_result

candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,gen_sol_RIVA,gen_sow_KENNEDY
0,ADAMS111,2,24,1,4,109,0,0
1,ADAMS112,3,26,0,1,96,0,0
2,ADAMS113,0,56,1,12,117,0,0
3,ADAMS114,3,54,1,8,113,1,1
4,ADAMS115,1,48,1,10,123,0,0
...,...,...,...,...,...,...,...,...
7121,YAKIMA5020,1,95,4,16,240,0,1
7122,YAKIMA5101,0,91,2,14,176,0,0
7123,YAKIMA5202,0,0,0,0,0,0,0
7124,YAKIMA701,0,97,1,8,53,1,0


In [57]:
# Join primary and general results per precinct.
# how="inner" keeps only precincts present in BOTH tables; a precinct that
# appears in just one of them is dropped from the output.
# validate="one_to_one" makes pandas raise MergeError if either side ever
# contains a repeated precinct key, instead of silently multiplying rows.
combined = pd.merge(
    primary_result,
    general_result,
    on="precinct",
    how="inner",
    validate="one_to_one",
)

# Totals are summed over the per-candidate columns, selected by name PREFIX.
# (filter(like=...) matches the text anywhere in a column name.) The snapshot
# is taken before any total column is added, so totals never feed each other.
per_candidate = combined.drop(columns=["precinct"])
column_names = per_candidate.columns

combined["rep_primary_total"] = per_candidate.loc[:, column_names.str.startswith("pri_rep_")].sum(axis=1)
combined["dem_primary_total"] = per_candidate.loc[:, column_names.str.startswith("pri_dem_")].sum(axis=1)
combined["general_total"] = per_candidate.loc[:, column_names.str.startswith("gen_")].sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,gen_sol_RIVA,gen_sow_KENNEDY,rep_primary_total,dem_primary_total,general_total
0,ADAMS111,11,6,2,6,2,45,2,24,1,4,109,0,0,55,17,140
1,ADAMS112,14,10,2,6,1,41,3,26,0,1,96,0,0,50,24,126
2,ADAMS113,17,11,8,8,2,59,0,56,1,12,117,0,0,77,28,186
3,ADAMS114,16,19,1,10,13,44,3,54,1,8,113,1,1,68,35,181
4,ADAMS115,18,11,4,5,11,53,1,48,1,10,123,0,0,73,29,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7110,YAKIMA5020,26,11,6,15,12,100,1,95,4,16,240,0,1,133,37,357
7111,YAKIMA5101,19,26,0,6,5,72,0,91,2,14,176,0,0,83,45,283
7112,YAKIMA5202,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7113,YAKIMA701,26,25,10,3,2,22,0,97,1,8,53,1,0,37,51,160


In [58]:
# Write the merged table to the working directory, without the row index.
output_path = "WA.csv"
combined.to_csv(output_path, index=False)
