In [1]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [2]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\WA\2016\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160209__wa__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160426__wa__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20161108__wa__general__precinct.csv


In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\WA\2016\20160524__wa__primary_president__precinct.csv


In [5]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
            
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]               
        # Filtering out only President 
            
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
pri_combined_df = pri_combined_df.drop(columns=["party"])
pri_combined_df = pri_combined_df.rename(columns={"office": "party"})
pri_combined_df


Unnamed: 0,county,precinct_code,precinct,party,district,candidate,votes
0,KING,3562,KINGADAIR,Democratic Party,,Bernie Sanders,24
1,KING,3562,KINGADAIR,Democratic Party,,Hillary Clinton,117
2,KING,3562,KINGADAIR,Democratic Party,,Registered Voters,503
3,KING,3562,KINGADAIR,Democratic Party,,Write-In,0
4,KING,3562,KINGADAIR,Republican Party,,Ben Carson,1
...,...,...,...,...,...,...,...
52753,YAKIMA,5101,YAKIMA5101.38,President Republican Party,,Donald J. Trump,72
52754,YAKIMA,5202,YAKIMA5202.473,President Republican Party,,Ben Carson,0
52755,YAKIMA,5202,YAKIMA5202.473,President Republican Party,,Ted Cruz,0
52756,YAKIMA,5202,YAKIMA5202.473,President Republican Party,,John R. Kasich,0


In [6]:
## Cleaning precinct, county name
# Fixing Cubbler
# pri_combined_df.loc[
#     ( 
#         pri_combined_df["county"].str.contains("\w*F120\w*", case=False, na=False)|
#         pri_combined_df["county"].str.contains("\w*STONE\w*", case=False, na=False)|
#         pri_combined_df["county"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "county")
# ] = "STONE"

In [7]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [8]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,KINGADAIR,Democratic Party,Bernie Sanders,24
1,KINGADAIR,Democratic Party,Hillary Clinton,117
2,KINGADAIR,Democratic Party,Registered Voters,503
3,KINGADAIR,Democratic Party,Write-In,0
4,KINGADAIR,Republican Party,Ben Carson,1
...,...,...,...,...
52753,YAKIMA5101.38,President Republican Party,Donald J. Trump,72
52754,YAKIMA5202.473,President Republican Party,Ben Carson,0
52755,YAKIMA5202.473,President Republican Party,Ted Cruz,0
52756,YAKIMA5202.473,President Republican Party,John R. Kasich,0


In [9]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Bernie Sanders       7115
Hillary Clinton      7115
Ben Carson           7115
Donald J. Trump      7115
John R. Kasich       7115
Ted Cruz             7115
Registered Voters    5034
Write-In             5034
Name: count, dtype: int64

In [10]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"Blank Votes", r"All Others",r"Total Votes Cast",r"No Preference",
    r"UNCOMMITTED","WRITE-IN"
]

pattern = "|".join(unwanted_keywords)

# # Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
#         primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
#         primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
#     ),
#     "candidate"
# ] = "LA FUENTE"


# primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
#     "Donald I. Trump": "Donald J. Trump",
# })

primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# Fixing Christie
primary_data.loc[
    primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing KASICH
primary_data.loc[
    primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
    "candidate"
] = "O'MAILEY"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
    "candidate"
] = "PATAKI"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
    "candidate"
] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

## SPLITTINNG BY ,
# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candida

candidate
SANDERS    7115
CLINTON    7115
CARSON     7115
TRUMP      7115
KASICH     7115
CRUZ       7115
Name: count, dtype: int64

In [11]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
President Republican Party    18392
Republican Party              10068
President Democratic Party     9196
Democratic Party               5034
Name: count, dtype: int64

In [12]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
President Republican Party    18392
Republican Party              10068
President Democratic Party     9196
Democratic Party               5034
Name: count, dtype: int64

In [13]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "PRESIDENT REPUBLICAN PARTY": "REP",
        "PRESIDENT DEMOCRATIC PARTY": "DEM",
        "DEMOCRATIC PARTY": "DEM",
        "REPUBLICAN PARTY": "REP",
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
REP    28460
DEM    14230
Name: count, dtype: int64

In [14]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP
0,ADAMSBATUM,2,6,5,4,0,34
1,ADAMSBENGE,3,2,0,3,0,14
2,ADAMSCUNNINGHAM,0,0,0,0,0,0
3,ADAMSFAIRVIEW,2,2,0,0,0,0
4,ADAMSHATTON CITY,0,0,0,0,0,0
...,...,...,...,...,...,...,...
7109,YAKIMA5020.5839999999998,26,11,6,15,12,100
7110,YAKIMA5101.38,19,26,0,6,5,72
7111,YAKIMA5202.473,0,0,0,0,0,0
7112,YAKIMA701.6,26,25,10,3,2,22


In [15]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"].str.upper() == "PRESIDENT"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
general_files


  df = pd.read_csv(file)


['C:\\Huy Phan\\College\\VoterTurnout\\data\\WA\\2016\\20160209__wa__special__general__precinct.csv',
 'C:\\Huy Phan\\College\\VoterTurnout\\data\\WA\\2016\\20160426__wa__special__general__precinct.csv',
 'C:\\Huy Phan\\College\\VoterTurnout\\data\\WA\\2016\\20161108__wa__general__precinct.csv']

In [16]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,KINGADAIR,Constitution,Darrell L. Castle & Scott N. Bradley,0
1,KINGADAIR,Democratic,Hillary Clinton & Tim Kaine,266
2,KINGADAIR,Green,Jill Stein & Ajamu Baraka,1
3,KINGADAIR,Libertarian,Gary Johnson & Bill Weld,14
4,KINGADAIR,,Registered Voters,519
...,...,...,...,...
54911,YAKIMA5202.473,Socialist Workers,Alyson Kennedy / Osborne Hart,0
54912,YAKIMA5202.473,Socialism & Liberation,Gloria Estela La Riva / Eugene Puryear,0
54913,YAKIMA5202.473,Green,Jill Stein / Ajamu Baraka,0
54914,YAKIMA5202.473,Constitution,Darrell L. Castle / Scott N. Bradley,0


In [17]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton / Tim Kaine               4609
Donald J. Trump / Michael R. Pence        4609
Alyson Kennedy / Osborne Hart             4609
Gloria Estela La Riva / Eugene Puryear    4609
Jill Stein / Ajamu Baraka                 4609
Darrell L. Castle / Scott N. Bradley      4609
Gary Johnson / Bill Weld                  4609
Darrell L. Castle & Scott N. Bradley      2517
Hillary Clinton & Tim Kaine               2517
Jill Stein & Ajamu Baraka                 2517
Gary Johnson & Bill Weld                  2517
Registered Voters                         2517
Write-In                                  2517
Donald J. Trump & Michael R. Pence        2517
Gloria Estela La Riva & Eugene Puryear    2517
Alyson Kennedy & Osborne Hart             2517
Name: count, dtype: int64

In [18]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"NAN",
    r"\w*WRITE[\s-]\w*",
    r"UNCERTIFIED",
    r"UNVERIFIED",
    r"NONE OF THE ABOVE",
    r"LBT",
    r"DEM",
    r"REP",
    r"GRN",
    r"NAN"

]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*Fuenta\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
pattern = "|".join(unwanted_keywords)

# # FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

# general_data.loc[
#     general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
#     "candidate"
# ] = "MATUREN"

# Fixing mixed president + vice_president by /
general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# # Fixing the , candidate
# # general_data["candidate"] = (
# #     general_data["candidate"].str.split(",")
# #     .str[0]
# #     .str.strip()
# #     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# # Fixing Kotlikoff
# general_data.loc[
#     general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
#     "candidate"
# ] = "KOTLIKOFF"

# # Fixing Valdivia
# general_data.loc[
#     general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
#     "candidate"
# ] = "VALDIVIA"

# # Fixing HOEFLING
# general_data.loc[
#     general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
#     "candidate"
# ] = "HOEFLING"


# # Fixing McMullin
# general_data.loc[
#     general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
#     "candidate"
# ] = "MCMULLIN"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"

candidate
CASTLE     7126
CLINTON    7126
STEIN      7126
JOHNSON    7126
TRUMP      7126
RIVA       7126
KENNEDY    7126
Name: count, dtype: int64

In [19]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
Constitution              7126
Democratic                7126
Green                     7126
Libertarian               7126
Republican                7126
Socialism & Liberation    7126
Socialist Workers         7126
Name: count, dtype: int64

In [20]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_data(row, general_data),
#     axis=1
# )

# # # Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
# general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
Constitution              7126
Democratic                7126
Green                     7126
Libertarian               7126
Republican                7126
Socialism & Liberation    7126
Socialist Workers         7126
Name: count, dtype: int64

In [21]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "INDEPENDENT":"IND",
        "(WRITE-IN)":"IND",
        "SOCIALISM & LIBERATION": "SOL",
        "SOCIALIST WORKERS": "SOW"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
CON    7126
DEM    7126
GRN    7126
LIB    7126
REP    7126
SOL    7126
SOW    7126
Name: count, dtype: int64

In [22]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [23]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,gen_sol_RIVA,gen_sow_KENNEDY
0,ADAMSBATUM,0,10,2,2,96,0,0
1,ADAMSBENGE,0,4,0,0,29,0,0
2,ADAMSCUNNINGHAM,0,6,0,0,8,0,1
3,ADAMSFAIRVIEW,0,5,0,1,27,0,0
4,ADAMSHATTON CITY,0,2,0,1,25,0,0
...,...,...,...,...,...,...,...,...
7120,YAKIMA5020.5839999999998,1,95,4,16,240,0,1
7121,YAKIMA5101.38,0,91,2,14,176,0,0
7122,YAKIMA5202.473,0,0,0,0,0,0,0
7123,YAKIMA701.6,0,97,1,8,53,1,0


In [24]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

rep_cols = combined.filter(like="pri_rep_").columns
combined[rep_cols] = combined[rep_cols].apply(pd.to_numeric, errors="coerce")
combined["rep_primary_total"] = combined[rep_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total

gdem_cols = combined.filter(like="gen_dem_").columns
combined[gdem_cols] = combined[gdem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_general_total"] = combined[gdem_cols].sum(axis=1)

grep_cols = combined.filter(like="gen_rep_").columns
combined[grep_cols] = combined[grep_cols].apply(pd.to_numeric, errors="coerce")
combined["rep_general_total"] = combined[grep_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts


combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,gen_sol_RIVA,gen_sow_KENNEDY,dem_primary_total,rep_primary_total,dem_general_total,rep_general_total
0,ADAMSBATUM,2,6,5,4,0,34,0,10,2,2,96,0,0,8,43,10,96
1,ADAMSBENGE,3,2,0,3,0,14,0,4,0,0,29,0,0,5,17,4,29
2,ADAMSCUNNINGHAM,0,0,0,0,0,0,0,6,0,0,8,0,1,0,0,6,8
3,ADAMSFAIRVIEW,2,2,0,0,0,0,0,5,0,1,27,0,0,4,0,5,27
4,ADAMSHATTON CITY,0,0,0,0,0,0,0,2,0,1,25,0,0,0,0,2,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7109,YAKIMA5020.5839999999998,26,11,6,15,12,100,1,95,4,16,240,0,1,37,133,95,240
7110,YAKIMA5101.38,19,26,0,6,5,72,0,91,2,14,176,0,0,45,83,91,176
7111,YAKIMA5202.473,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7112,YAKIMA701.6,26,25,10,3,2,22,0,97,1,8,53,1,0,51,37,97,53


In [25]:
combined.to_csv("WA.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("WA_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("WA_general_filtered.csv", index=False)

In [26]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 7114, general: 7125, combined: 7114


In [27]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 38, general: 39


In [28]:
pri_combined_df['precinct'].unique()

array(['KINGADAIR', 'KINGALDARRA', 'KINGALDER SPRINGS', ...,
       'YAKIMA5020.5839999999998', 'YAKIMA5101.38', 'YAKIMA5202.473'],
      dtype=object)

In [29]:
gen_combined_df['precinct'].unique()

array(['KINGADAIR', 'KINGALDARRA', 'KINGALDER SPRINGS', ...,
       'YAKIMA5020.5839999999998', 'YAKIMA5101.38', 'YAKIMA5202.473'],
      dtype=object)

In [30]:
primary_counties = set(pri_combined_df['precinct'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['precinct'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


Counties in primary but not in general: 0
set()
