In [24]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [25]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\WI\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['primary'])
]


In [26]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\WI\20160405__wi__general__ward.csv
C:\Huy Phan\College\VoterTurnout\data\WI\20161108__wi__general__ward.csv


In [27]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\WI\20160216__wi__primary__ward.csv
C:\Huy Phan\College\VoterTurnout\data\WI\20160405__wi__primary__ward.csv
C:\Huy Phan\College\VoterTurnout\data\WI\20160809__wi__primary__ward.csv


In [28]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["ward"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]               
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
pri_combined_df

Unnamed: 0,county,ward,office,district,total votes,party,candidate,votes,precinct
0,Adams,Town Of Adams Wards 1-3,President,,271,REP,Marco Rubio,3,ADAMSTOWN OF ADAMS WARDS 1-3
1,Adams,Town Of Adams Wards 1-3,President,,271,REP,Ben Carson,0,ADAMSTOWN OF ADAMS WARDS 1-3
2,Adams,Town Of Adams Wards 1-3,President,,271,REP,Rand Paul,2,ADAMSTOWN OF ADAMS WARDS 1-3
3,Adams,Town Of Adams Wards 1-3,President,,271,REP,Mike Huckabee,1,ADAMSTOWN OF ADAMS WARDS 1-3
4,Adams,Town Of Adams Wards 1-3,President,,271,REP,Jim Gilmore,0,ADAMSTOWN OF ADAMS WARDS 1-3
...,...,...,...,...,...,...,...,...,...
81475,Wood,"City Of Wisconsin Rapids Wards 16-23,25",President,,775,DEM,Martin O'Malley,3,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
81476,Wood,"City Of Wisconsin Rapids Wards 16-23,25",President,,775,DEM,Bernie Sanders,388,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
81477,Wood,"City Of Wisconsin Rapids Wards 16-23,25",President,,775,DEM,Uninstructed Delegation,1,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
81478,Wood,"City Of Wisconsin Rapids Wards 16-23,25",President,,775,DEM,Roque Rocky De La Fuente (Write-In),0,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"


In [29]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [30]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Marco Rubio,3
1,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Ben Carson,0
2,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Rand Paul,2
3,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Mike Huckabee,1
4,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Jim Gilmore,0
...,...,...,...,...
81475,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",DEM,Martin O'Malley,3
81476,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",DEM,Bernie Sanders,388
81477,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",DEM,Uninstructed Delegation,1
81478,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",DEM,Roque Rocky De La Fuente (Write-In),0


In [32]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Scattering                             7760
Uninstructed Delegation                7760
Marco Rubio                            3880
Jeb Bush                               3880
Bernie Sanders                         3880
Martin O'Malley                        3880
Hillary Clinton                        3880
Victor Williams (Write-In)             3880
Ted Cruz                               3880
John R. Kasich                         3880
Ben Carson                             3880
Carly Fiorina                          3880
Rick Santorum                          3880
Donald J. Trump                        3880
Chris Christie                         3880
Jim Gilmore                            3880
Mike Huckabee                          3880
Rand Paul                              3880
Roque Rocky De La Fuente (Write-In)    3880
Name: count, dtype: int64

In [33]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*ATTERING\w*",
    r"\w*UNINSTRUCTED\w*",
    r"UNCOMMITTED", r"OTHER",r"Total Votes Cast",r"No Preference"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
primary_data.loc[
    (
        primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
        primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
        primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
    ),
    "candidate"
] = "LA FUENTE"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("WILLIAMS", case=False, na=False),
    "candidate"
] = "WILLIAMS"

primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})

# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )


# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Christie
# primary_data.loc[
#     primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing KASICH
# primary_data.loc[
#     primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
#     "candidate"
# ] = "O'MAILEY"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
#     "candidate"
# ] = "PATAKI"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
#     "candidate"
# ] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
RUBIO       3880
KASICH      3880
SANDERS     3880
O'MALLEY    3880
CLINTON     3880
WILLIAMS    3880
CRUZ        3880
BUSH        3880
FIORINA     3880
CARSON      3880
SANTORUM    3880
TRUMP       3880
CHRISTIE    3880
GILMORE     3880
HUCKABEE    3880
PAUL        3880
FUENTE      3880
Name: count, dtype: int64

In [34]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP    50440
DEM    15520
Name: count, dtype: int64

In [35]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
REP    50440
DEM    15520
Name: count, dtype: int64

In [82]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
DEM    3400
Name: count, dtype: int64

In [36]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP,pri_rep_WILLIAMS
0,ADAMSCITY OF ADAMS WARD 1,27,0,1,35,1,1,0,27,0,0,1,4,0,1,0,41,0
1,ADAMSCITY OF ADAMS WARD 2,37,0,0,55,0,0,0,32,0,0,0,5,0,0,0,51,0
2,ADAMSCITY OF ADAMS WARD 3,31,0,0,18,1,0,0,10,0,0,0,3,0,2,0,20,0
3,ADAMSCITY OF ADAMS WARD 4,35,0,0,15,1,0,0,14,0,0,1,4,2,1,0,28,0
4,"ADAMSCITY OF WISCONSIN DELLS WARDS 5,9",1,0,0,5,0,0,0,0,0,0,0,2,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3875,WOODVILLAGE OF HEWITT WARD 1,56,0,0,86,0,2,0,114,0,0,2,22,0,2,0,87,0
3876,WOODVILLAGE OF MILLADORE WARD 1,14,0,0,22,0,0,0,31,0,0,0,0,0,1,0,27,0
3877,WOODVILLAGE OF PORT EDWARDS WARDS 1-3,136,0,0,167,2,6,0,175,0,0,0,51,0,2,0,166,0
3878,WOODVILLAGE OF RUDOLPH WARD 1,35,0,0,44,0,0,0,57,0,0,0,8,0,0,0,31,0


In [40]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["ward"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)
# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
gen_combined_df


Unnamed: 0,county,ward,office,district,total votes,party,candidate,votes,precinct
0,ADAMS,Town Of Adams Wards 1-3,President,,618,REP,Donald J. Trump & Michael R. Pence,377,ADAMSTOWN OF ADAMS WARDS 1-3
1,ADAMS,Town Of Adams Wards 1-3,President,,618,DEM,Hillary Clinton & Tim Kaine,207,ADAMSTOWN OF ADAMS WARDS 1-3
2,ADAMS,Town Of Adams Wards 1-3,President,,618,CON,Darrell L. Castle & Scott N. Bradley,3,ADAMSTOWN OF ADAMS WARDS 1-3
3,ADAMS,Town Of Adams Wards 1-3,President,,618,LIB,Gary Johnson & Bill Weld,22,ADAMSTOWN OF ADAMS WARDS 1-3
4,ADAMS,Town Of Adams Wards 1-3,President,,618,WGR,Jill Stein & Ajamu Baraka,4,ADAMSTOWN OF ADAMS WARDS 1-3
...,...,...,...,...,...,...,...,...,...
61807,WOOD,"City Of Wisconsin Rapids Wards 16-23,25",President,,2571,IND,Laurence Kotlikoff & Edward E. Leamer (Write-In),0,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
61808,WOOD,"City Of Wisconsin Rapids Wards 16-23,25",President,,2571,IND,Tom Hoefling & Steve Schulin (Write-In),0,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
61809,WOOD,"City Of Wisconsin Rapids Wards 16-23,25",President,,2571,IND,Joseph Maldonado (Write-In),0,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"
61810,WOOD,"City Of Wisconsin Rapids Wards 16-23,25",President,,2571,IND,Emidio Soltysik & Angela Nicole Walker (Write-In),0,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25"


In [41]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ADAMSTOWN OF ADAMS WARDS 1-3,REP,Donald J. Trump & Michael R. Pence,377
1,ADAMSTOWN OF ADAMS WARDS 1-3,DEM,Hillary Clinton & Tim Kaine,207
2,ADAMSTOWN OF ADAMS WARDS 1-3,CON,Darrell L. Castle & Scott N. Bradley,3
3,ADAMSTOWN OF ADAMS WARDS 1-3,LIB,Gary Johnson & Bill Weld,22
4,ADAMSTOWN OF ADAMS WARDS 1-3,WGR,Jill Stein & Ajamu Baraka,4
...,...,...,...,...
61807,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",IND,Laurence Kotlikoff & Edward E. Leamer (Write-In),0
61808,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",IND,Tom Hoefling & Steve Schulin (Write-In),0
61809,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",IND,Joseph Maldonado (Write-In),0
61810,"WOODCITY OF WISCONSIN RAPIDS WARDS 16-23,25",IND,Emidio Soltysik & Angela Nicole Walker (Write-In),0


In [42]:
# Viewing candidate data
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


candidate
DONALD J. TRUMP & MICHAEL R. PENCE                             3636
MICHAEL A. MATUREN & JUAN MUNOZ (WRITE-IN)                     3636
EMIDIO SOLTYSIK & ANGELA NICOLE WALKER (WRITE-IN)              3636
JOSEPH MALDONADO (WRITE-IN)                                    3636
TOM HOEFLING & STEVE SCHULIN (WRITE-IN)                        3636
LAURENCE KOTLIKOFF & EDWARD E. LEAMER (WRITE-IN)               3636
CHRIS KENISTON & DEACON TAYLOR (WRITE-IN)                      3636
MARSHALL SCHOENKE & JAMES CREIGHTON MITCHELL JR. (WRITE-IN)    3636
EVAN MCMULLIN & NATHAN JOHNSON (WRITE-IN)                      3636
HILLARY CLINTON & TIM KAINE                                    3636
CHERUNDA FOX & ROGER KUSHNER (WRITE-IN)                        3636
ROCKY ROQUE DE LA FUENTE & MICHAEL STEINBERG                   3636
MONICA MOOREHEAD & LAMONT LILLY                                3636
JILL STEIN & AJAMU BARAKA                                      3636
GARY JOHNSON & BILL WELD              

In [43]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"NAN",
    r"UNCERTIFIED",
    r"UNVERIFIED",
    r"NONE OF THE ABOVE",
    r"LBT",
    r"DEM",
    r"REP",
    r"GRN",
    r"NAN",
    r"\w*CANDIDATES\w*",
    "SCATTERING"
]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*FUENTE\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

general_data.loc[
    general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
    "candidate"
] = "MATUREN"

general_data.loc[
    general_data["candidate"].str.contains("\w*OLTYSIK\w*", case=False, na=False),
    "candidate"
] = "SOLTYSIK"

general_data.loc[
    general_data["candidate"].str.contains("\w*MALDONADO\w*", case=False, na=False),
    "candidate"
] = "MALDONADO"

# # Fixing mixed president + vice_president by /
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the & candidate
general_data["candidate"] = (
    general_data["candidate"].str.split("&")
    .str[0]
    .str.strip()
    )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*LIKOFF\w*", case=False, na=False),
    "candidate"
] = "KOTLIKOFF"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*KENISTON\W*", case=False, na=False),
    "candidate"
] = "KENISTON"

# Fixing HOEFLING
general_data.loc[
    general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
    "candidate"
] = "HOEFLING"

general_data.loc[
    general_data["candidate"].str.contains("\w*SCHOENKE", case=False, na=False),
    "candidate"
] = "SCHOENKE"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data.loc[
    general_data["candidate"].str.contains("FOX", case=False, na=False) ,
    "candidate"
] = "FOX"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.split().str[-1]


candidate
TRUMP        3636
CLINTON      3636
CASTLE       3636
JOHNSON      3636
STEIN        3636
MOOREHEAD    3636
FUENTE       3636
FOX          3636
MCMULLIN     3636
MATUREN      3636
SCHOENKE     3636
KENISTON     3636
KOTLIKOFF    3636
HOEFLING     3636
MALDONADO    3636
SOLTYSIK     3636
Name: count, dtype: int64

In [44]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
IND    39996
REP     3636
DEM     3636
CON     3636
LIB     3636
WGR     3636
Name: count, dtype: int64

In [45]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_data(row, general_data),
#     axis=1
# )

# # Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
# general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
IND    39996
REP     3636
DEM     3636
CON     3636
LIB     3636
WGR     3636
Name: count, dtype: int64

In [46]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "WGR":"GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LPN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "IAP":"CON",
        "CONSTITUTION": "CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "NPP":"IND",
        "(WRITE-IN)":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    39996
REP     3636
DEM     3636
CON     3636
LIB     3636
GRN     3636
Name: count, dtype: int64

In [47]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [48]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FOX,gen_ind_FUENTE,gen_ind_HOEFLING,gen_ind_KENISTON,gen_ind_KOTLIKOFF,gen_ind_MALDONADO,gen_ind_MATUREN,gen_ind_MCMULLIN,gen_ind_MOOREHEAD,gen_ind_SCHOENKE,gen_ind_SOLTYSIK,gen_lib_JOHNSON,gen_rep_TRUMP
0,ADAMSCITY OF ADAMS WARD 1-4,6,329,6,0,1,0,0,0,0,0,4,2,0,0,16,379
1,"ADAMSCITY OF WISCONSIN DELLS WARDS 5,9",0,7,0,0,0,0,0,0,0,0,0,0,0,0,1,14
2,ADAMSTOWN OF ADAMS WARDS 1-3,3,207,4,0,0,0,0,0,0,0,3,1,0,0,22,377
3,ADAMSTOWN OF BIG FLATS WARD 1-2,4,169,4,0,0,0,0,0,0,0,0,0,0,0,10,300
4,ADAMSTOWN OF COLBURN WARD 1,0,42,2,0,0,0,0,0,0,0,0,0,0,0,1,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3631,WOODVILLAGE OF HEWITT WARD 1,1,181,3,0,0,0,0,0,0,0,1,1,0,0,16,275
3632,WOODVILLAGE OF MILLADORE WARD 1,2,47,3,0,0,0,0,0,0,0,0,0,0,0,5,79
3633,WOODVILLAGE OF PORT EDWARDS WARDS 1-3,7,389,9,0,1,0,0,0,0,0,4,0,0,0,40,515
3634,WOODVILLAGE OF RUDOLPH WARD 1,2,101,2,0,1,0,0,0,0,0,0,0,0,0,5,140


In [49]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [50]:


combined.to_csv("WI.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("WI_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("WI_general_filtered.csv", index=False)

In [51]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 3880, general: 3636, combined: 3052


In [52]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 72, general: 72


In [None]:
pri_combined_df['jurisdiction'].unique()

array(['Carson City', 'Churchill', 'Clark', 'Douglas', 'Elko',
       'Esmeralda', 'Eureka', 'Humboldt', 'Lander', 'Lincoln', 'Lyon',
       'Mineral', 'Nye', 'ye', 'Pershing', 'Storey', 'Washoe',
       'White Pine'], dtype=object)

In [None]:
gen_combined_df['county'].unique()

array(['CARSON CITY', 'CHURCHILL', 'CLARK', 'DOUGLAS', 'ELKO',
       'ESMERALDA', 'EUREKA', 'HUMBOLDT', 'LANDER', 'LINCOLN', 'LYON',
       'MINERAL', 'NYE', 'PERSHING', 'STOREY', 'WASHOE', 'WHITE PINE'],
      dtype=object)

In [None]:
primary_counties = set(pri_combined_df['jurisdiction'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['county'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


Counties in primary but not in general: 1
{'YE'}
