In [6]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [7]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\ID\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [8]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\ID\20161108__id__general__precinct.csv


In [9]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\ID\20160308__id__primary__presidential__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\ID\20160517__id__primary__precinct.csv


In [10]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]       
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]          
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]  
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]    

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)


# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [11]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ADA1401,REP,Jeb Bush,0.0
1,ADA1402,REP,Jeb Bush,0.0
2,ADA1403,REP,Jeb Bush,1.0
3,ADA1404,REP,Jeb Bush,1.0
4,ADA1405,REP,Jeb Bush,1.0
...,...,...,...,...
12488,WASHINGTON07 MIDVALE,REP,Donald J. Trump,96.0
12489,WASHINGTON08 CAMBRIDGE,REP,Donald J. Trump,72.0
12490,WASHINGTON09 PIONEER,REP,Donald J. Trump,52.0
12491,WASHINGTON10 SUNNYSIDE,REP,Donald J. Trump,88.0


In [12]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush           961
Ben Carson         961
Chris Christie     961
Ted Cruz           961
Carly Fiorina      961
Lindsey Graham     961
Mike Huckabee      961
John R. Kasich     961
Peter Messina      961
Rand Paul          961
Marco Rubio        961
Rick Santorum      961
Donald J. Trump    961
Name: count, dtype: int64

In [13]:
# Cleaning Candidates

# Turning all primary data to uppercase
# primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"Blank Votes", r"All Others",r"Total Votes Cast",r"No Preference"
# ]

# pattern = "|".join(unwanted_keywords)

# # Assuming candidate column is already string and uppercase
# primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

primary_data = primary_data[~primary_data["candidate"].isin(["Blank Votes", "All Others","Total Votes Cast","No Preference"])] 
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )

# # Fixing De la Fuente
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*Fuen\w*", case=False, na=False),
#     "candidate"
# ] = "LA FUENTE"

# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

candidate
BUSH        961
CARSON      961
CHRISTIE    961
CRUZ        961
FIORINA     961
GRAHAM      961
HUCKABEE    961
KASICH      961
MESSINA     961
PAUL        961
RUBIO       961
SANTORUM    961
TRUMP       961
Name: count, dtype: int64

In [14]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP    12493
Name: count, dtype: int64

In [15]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv("data/master_primary_candidate_party.csv")
# general_data = fill_party_from_master(general_data, master_party_df)

# STEP 3: Update master file with new (candidate, party) pairs
# update_master_candidate_party(general_data, "data/master_candidate_party.csv")
primary_data["party"].value_counts(dropna=False)


party
REP    12493
Name: count, dtype: int64

In [16]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

party
REP    12493
Name: count, dtype: int64

In [17]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_MESSINA,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ADA1401,0.0,5.0,1.0,219.0,0.0,0.0,0.0,35.0,0.0,0.0,66.0,1.0,139.0
1,ADA1402,0.0,6.0,0.0,192.0,0.0,0.0,0.0,53.0,0.0,3.0,73.0,0.0,160.0
2,ADA1403,1.0,3.0,0.0,75.0,1.0,0.0,0.0,26.0,0.0,0.0,18.0,0.0,53.0
3,ADA1404,1.0,12.0,2.0,310.0,0.0,0.0,1.0,44.0,0.0,4.0,78.0,0.0,239.0
4,ADA1405,1.0,5.0,0.0,211.0,1.0,0.0,1.0,32.0,0.0,3.0,71.0,0.0,224.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
956,WASHINGTON07 MIDVALE,1.0,9.0,0.0,120.0,0.0,0.0,0.0,11.0,0.0,1.0,14.0,0.0,96.0
957,WASHINGTON08 CAMBRIDGE,1.0,9.0,2.0,121.0,0.0,0.0,0.0,9.0,0.0,1.0,42.0,0.0,72.0
958,WASHINGTON09 PIONEER,2.0,7.0,1.0,46.0,1.0,0.0,0.0,16.0,0.0,0.0,24.0,2.0,52.0
959,WASHINGTON10 SUNNYSIDE,1.0,8.0,1.0,98.0,2.0,0.0,1.0,16.0,1.0,2.0,20.0,0.0,88.0


In [18]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]          
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]       
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]  
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]    
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [19]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ADA1401,IND,Darrell L. Castle,4
1,ADA1402,IND,Darrell L. Castle,3
2,ADA1403,IND,Darrell L. Castle,2
3,ADA1404,IND,Darrell L. Castle,13
4,ADA1405,IND,Darrell L. Castle,8
...,...,...,...,...
7683,WASHINGTON07 MIDVALE,REP,Donald J. Trump,403
7684,WASHINGTON08 CAMBRIDGE,REP,Donald J. Trump,398
7685,WASHINGTON09 PIONEER,REP,Donald J. Trump,288
7686,WASHINGTON10 SUNNYSIDE,REP,Donald J. Trump,359


In [20]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
Darrell L. Castle         961
Hillary Rodham Clinton    961
Scott Copeland            961
Rocky De La Fuente        961
Gary Johnson              961
Evan McMullin             961
Jill Stein                961
Donald J. Trump           961
Name: count, dtype: int64

In [21]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"\w*WRITE[\s-]\w*",
#     r"NAN",
#     r"UNCERTIFIED",
#     r"UNVERIFIED",
#     r"NONE OF THE ABOVE",
#     r"LBT",
#     r"DEM",
#     r"REP",
#     r"GRN",
#     r"NAN"

# ]

# pattern = "|".join(unwanted_keywords)

# # FILTER OUT TRASH WORDS
# general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

# general_data.loc[
#     general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
#     "candidate"
# ] = "MATUREN"

# # Fixing mixed president + vice_president by /
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# # Fixing the , candidate
# # general_data["candidate"] = (
# #     general_data["candidate"].str.split(",")
# #     .str[0]
# #     .str.strip()
# #     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# # Fixing Kotlikoff
# general_data.loc[
#     general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
#     "candidate"
# ] = "KOTLIKOFF"

# # Fixing Valdivia
# general_data.loc[
#     general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
#     "candidate"
# ] = "VALDIVIA"

# # Fixing HOEFLING
# general_data.loc[
#     general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
#     "candidate"
# ] = "HOEFLING"


# # Fixing McMullin
# general_data.loc[
#     general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
#     "candidate"
# ] = "MCMULLIN"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.split().str[-1]


candidate
CASTLE      961
CLINTON     961
COPELAND    961
FUENTE      961
JOHNSON     961
MCMULLIN    961
STEIN       961
TRUMP       961
Name: count, dtype: int64

In [22]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
IND    3844
DEM     961
CON     961
LIB     961
REP     961
Name: count, dtype: int64

In [23]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_data(row, general_data),
#     axis=1
# )

# # Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
# general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
IND    3844
DEM     961
CON     961
LIB     961
REP     961
Name: count, dtype: int64

In [24]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "(WRITE-IN)":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    3844
DEM     961
CON     961
LIB     961
REP     961
Name: count, dtype: int64

In [25]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [26]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_con_COPELAND,gen_dem_CLINTON,gen_ind_CASTLE,gen_ind_FUENTE,gen_ind_MCMULLIN,gen_ind_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,ADA1401,0,206,4,0,63,10,35,857
1,ADA1402,2,292,3,1,86,8,49,796
2,ADA1403,2,121,2,1,41,5,18,289
3,ADA1404,2,317,13,7,148,13,79,1290
4,ADA1405,3,331,8,3,90,9,54,994
...,...,...,...,...,...,...,...,...,...
956,WASHINGTON07 MIDVALE,3,31,2,0,4,0,3,403
957,WASHINGTON08 CAMBRIDGE,0,68,2,0,29,2,8,398
958,WASHINGTON09 PIONEER,1,110,2,1,14,4,18,288
959,WASHINGTON10 SUNNYSIDE,4,48,2,1,24,1,15,359


In [27]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [28]:


combined.to_csv("ID.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("ID_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("ID_general_filtered.csv", index=False)

In [29]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 961, general: 961, combined: 936


In [30]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 44, general: 44
