In [3]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [4]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\CT\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['primary'])
]


In [5]:
print("General files:")
for f in general_files:
    print(f)
len(general_files)

General files:
C:\Huy Phan\College\VoterTurnout\data\CT\20161108__ct__general__precinct.csv


1

In [6]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\CT\20160426__ct__primary__president.csv


In [7]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # CHOOSING PRESIDENT ONLY
        # office, 
        # Combine precinct as county + precinct

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"].str.upper().str.contains("\w*PRESIDENT\w*", na=False)]

        # cleaning precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]        
            df = df[df['precinct'].str.upper() != "ELECTION TOTAL"]     

        df["precinct"] = df["town"].str.upper()
    

        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
pri_combined_df


Unnamed: 0,town,office,district,party,candidate,votes,precinct
0,Andover,President,,DEM,De La Fuente,1,ANDOVER
1,Ansonia,President,,DEM,De La Fuente,6,ANSONIA
2,Ashford,President,,DEM,De La Fuente,3,ASHFORD
3,Avon,President,,DEM,De La Fuente,5,AVON
4,Barkhamsted,President,,DEM,De La Fuente,2,BARKHAMSTED
...,...,...,...,...,...,...,...
1520,Windsor Locks,President,,REP,Uncommitted,11,WINDSOR LOCKS
1521,Wolcott,President,,REP,Uncommitted,8,WOLCOTT
1522,Woodbridge,President,,REP,Uncommitted,5,WOODBRIDGE
1523,Woodbury,President,,REP,Uncommitted,9,WOODBURY


In [53]:
# cleaning 

In [9]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDOVER,DEM,De La Fuente,1
1,ANSONIA,DEM,De La Fuente,6
2,ASHFORD,DEM,De La Fuente,3
3,AVON,DEM,De La Fuente,5
4,BARKHAMSTED,DEM,De La Fuente,2
...,...,...,...,...
1520,WINDSOR LOCKS,REP,Uncommitted,11
1521,WOLCOTT,REP,Uncommitted,8
1522,WOODBRIDGE,REP,Uncommitted,5
1523,WOODBURY,REP,Uncommitted,9


In [10]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Uncommitted     339
De La Fuente    170
Clinton         170
Sanders         170
Cruz            169
Carson          169
Trump           169
Kasich          169
Name: count, dtype: int64

In [11]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*NUMBER\w*",
    r"\w*TIME\w*",
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing the , candidate
primary_data["candidate"] = (
    primary_data["candidate"].str.split(",")
    .str[0]
    .str.strip()
    )

# Fixing De la Fuente
primary_data.loc[
    primary_data["candidate"].str.contains("\w*FUEN\w*", case=False, na=False),
    "candidate"
] = "FUENTE"

#Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# Fixing Huckabee
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
    "candidate"
] = "HUCKABEE"

# Fixing Fiorina
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
    "candidate"
] = "FIORINA"

# Fixing Gray
primary_data.loc[
    primary_data["candidate"].str.contains("GREY", case=False, na=False),
    "candidate"
] = "GRAY"

# Fixing Wilson
primary_data.loc[
    primary_data["candidate"].str.contains("WISON", case=False, na=False),
    "candidate"
] = "WILSON"

# Fixing the , candidate
primary_data["candidate"] = (
    primary_data["candidate"].str.split(",")
    .str[0]
    .str.strip()
    )
# Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]
A value is trying to be

candidate
FUENTE     170
CLINTON    170
SANDERS    170
CRUZ       169
CARSON     169
TRUMP      169
KASICH     169
Name: count, dtype: int64

In [12]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP    676
DEM    510
Name: count, dtype: int64

In [58]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv("data/master_primary_candidate_party.csv")
# general_data = fill_party_from_master(general_data, master_party_df)

# STEP 3: Update master file with new (candidate, party) pairs
# update_master_candidate_party(general_data, "data/master_candidate_party.csv")
primary_data["party"].value_counts(dropna=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(


party
REP                 5768
DEM                 2472
GRN                  824
REPUBLICAN PARTY     112
DEMOCRATIC PARTY      48
GREEN PARTY           16
Name: count, dtype: int64

In [14]:
# Cleaning Party
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# # )
# primary_data.loc[:,"party"] = (
#     primary_data["party"]
#     .replace({
#         "REPUBLICAN PARTY": "REP",
#         "R": "REP",
#         "Republican":"REP",
#         "DEMOCRATIC PARTY": "DEM",
#         "D": "DEM",
#         "Democratic": "DEM",
#         "GREEN PARTY":"GRN"
#     })
#     .fillna("IND")
# )
primary_data["party"].value_counts(dropna=False)

party
REP    676
DEM    510
Name: count, dtype: int64

In [16]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP
0,ANDOVER,145,1,259,4,42,80,163
1,ANSONIA,584,6,598,10,116,96,497
2,ASHFORD,212,3,377,12,50,70,158
3,AVON,1097,5,771,9,242,811,1027
4,BARKHAMSTED,137,2,210,9,68,127,242
...,...,...,...,...,...,...,...,...
165,WINDSOR LOCKS,442,4,552,10,81,166,422
166,WOLCOTT,384,4,514,13,152,190,1021
167,WOODBRIDGE,893,1,546,4,66,265,406
168,WOODBURY,465,1,527,14,172,397,835


In [61]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]           

        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

             
        
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
gen_combined_df

Unnamed: 0.2,county,precinct,votes,election_day,early_voting,provisional,polling_place_votes_ds200,early_votes_ds200,party,office,district,candidate,Unnamed: 0.1,Unnamed: 0,central_count,late_early_voting
0,Apache,APACHEALPINE,60,10,5,0.0,0.0,0.0,DEM,President,,"CLINTON, KAINE",,,,
1,Apache,APACHECANYON DE CHELLY,1387,674,63,7.0,0.0,0.0,DEM,President,,"CLINTON, KAINE",,,,
2,Apache,APACHECHINLE,678,300,34,3.0,0.0,0.0,DEM,President,,"CLINTON, KAINE",,,,
3,Apache,APACHECONCHO,228,40,18,0.0,0.0,0.0,DEM,President,,"CLINTON, KAINE",,,,
4,Apache,APACHECORNFIELDS,349,262,8,0.0,0.0,0.0,DEM,President,,"CLINTON, KAINE",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4328,Yuma,YUMA44,0,0,0,0.0,,,REP,President,,TRUMP / PENCE,,,,0.0
4329,Yuma,YUMA44,0,0,0,0.0,,,DEM,President,,CLINTON / KAINE,,,,0.0
4330,Yuma,YUMA44,0,0,0,0.0,,,LIB,President,,JOHNSON / WELD,,,,0.0
4331,Yuma,YUMA44,0,0,0,0.0,,,GRN,President,,STEIN / BARAKA,,,,0.0


In [62]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,APACHEALPINE,DEM,"CLINTON, KAINE",60
1,APACHECANYON DE CHELLY,DEM,"CLINTON, KAINE",1387
2,APACHECHINLE,DEM,"CLINTON, KAINE",678
3,APACHECONCHO,DEM,"CLINTON, KAINE",228
4,APACHECORNFIELDS,DEM,"CLINTON, KAINE",349
...,...,...,...,...
4328,YUMA44,REP,TRUMP / PENCE,0
4329,YUMA44,DEM,CLINTON / KAINE,0
4330,YUMA44,LIB,JOHNSON / WELD,0
4331,YUMA44,GRN,STEIN / BARAKA,0


In [63]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
WRITE-IN                 489
OVER VOTES               489
UNDER VOTES              489
STEIN                    350
JOHNSON                  350
TRUMP                    350
CLINTON                  350
CLINTON, KAINE           139
TRUMP, PENCE             139
JOHNSON, WELD            139
STEIN, BARAKA            139
STEIN / BARAKA           100
JOHNSON / WELD           100
CLINTON / KAINE          100
TRUMP / PENCE            100
Number of Under Votes     55
Times Over Voted          55
Times Blank Voted         55
Registered Voters         55
WRITE-IN 900              45
Overvote                  45
Undervote                 45
TotalVotes                45
Ballots Cast              44
Write-in 50               44
Write-in                  11
Times Counted             11
Name: count, dtype: int64

In [64]:
# Cleaning Candidates

# Turning all primary data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*WRITE[\s-]\w*",
    r"NAN",
    r"UNCERTIFIED",
    r"UNVERIFIED",
    r"NONE OF THE ABOVE",
    r"LBT",
    r"DEM",
    r"REP",
    r"GRN",
    r"NAN",
    r"\w*CAST\w*",
    r"\w*COUNT\w*"

]

pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing Darrell Castle"
general_data.loc[
    general_data["candidate"].str.contains("\w*KAIN\w*", case=False, na=False),
    "candidate"
] = "CLINTON"

general_data.loc[
    general_data["candidate"].str.contains("\w*PENCE\w*", case=False, na=False),
    "candidate"
] = "TRUMP"

# Fixing mixed president + vice_president by /
general_data["candidate"] = (
    general_data["candidate"].str.split("/")
    .str[0]
    .str.strip()
    )

#Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the , candidate
general_data["candidate"] = (
    general_data["candidate"].str.split(",")
    .str[0]
    .str.strip()
    )



# Fixing Cubbler
general_data.loc[
    ( 
        general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
        general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
        general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
    "candidate")
] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*BARAKA\w*", case=False, na=False),
    "candidate"
] = "STEIN"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*WELD\W*", case=False, na=False),
    "candidate"
] = "JOHNSON"

# Fixing HOEFLING
general_data.loc[
    general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
    "candidate"
] = "HOEFLING"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.split().str[-1]
A value is trying to be

candidate
CLINTON    589
TRUMP      589
JOHNSON    589
STEIN      589
Name: count, dtype: int64

In [65]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
DEM    589
REP    589
GRN    589
LBT    534
LIB     55
Name: count, dtype: int64

In [66]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\master_candidate_party.csv") # USE YOUR OWN ADDRESS
#  general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(


party
DEM    589
REP    589
GRN    589
LBT    534
LIB     55
Name: count, dtype: int64

In [67]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
DEM    589
REP    589
LIB    589
GRN    589
Name: count, dtype: int64

In [68]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [69]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,APACHEALPINE,60,2,13,216
1,APACHECANYON DE CHELLY,1387,32,72,143
2,APACHECHINLE,678,17,38,93
3,APACHECONCHO,228,11,44,1135
4,APACHECORNFIELDS,349,3,16,33
...,...,...,...,...,...
584,YUMA5,461,4,58,808
585,YUMA6,902,24,47,298
586,YUMA7,1351,36,39,199
587,YUMA8,267,5,24,515


In [70]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Save combined
combined.to_csv("TX.csv", index=False)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [71]:

combined.to_csv("AZ.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("AZ_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("AZ_general_filtered.csv", index=False)

In [72]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 411, general: 589, combined: 388


In [73]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 10, general: 10


In [74]:
pri_combined_df['county'].unique()

array(['APACHE', 'COCHISE', 'GREENLEE', 'LA PAZ', 'MOHAVE', 'NAVAJO',
       'PIMA', 'PINAL', 'YAVAPAI', 'YUMA'], dtype=object)

In [75]:
gen_combined_df['county'].unique()


array(['Apache', 'Cochise', 'Greenlee', 'La Paz', 'Mohave', 'Navajo',
       'Pima', 'Pinal', 'Yavapai', 'Yuma'], dtype=object)