In [1]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [2]:
#Get all CSV files in the folder
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\TX\*.csv")
all_files += glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\TX\counties\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__anderson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__andrews__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__angelina__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__aransas__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__archer__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__armstrong__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__atascosa__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__austin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__bailey__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20161108__tx__general__bandera__precinct.csv
C:\Huy Phan\College\VoterTurnout\

In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__anderson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__andrews__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__angelina__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__aransas__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__armstrong__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__atascosa__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__austin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__bandera__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__bastrop__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\TX\counties\20160301__tx__primary__baylor__precinct.csv
C:\Huy Phan\College\VoterTurnou

In [5]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)


# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [6]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDERSON1,DEM,Hillary Clinton,54
1,ANDERSON1,DEM,Willie L. Wilson,0
2,ANDERSON1,DEM,Bernie Sanders,20
3,ANDERSON1,DEM,Keith Judd,0
4,ANDERSON1,DEM,Calvis L. Hawes,0
...,...,...,...,...
143437,BREWSTERPROVISIONAL,REP,Donald J. Trump,2
143438,BREWSTERPROVISIONAL,REP,Ben Carson,0
143439,BREWSTERPROVISIONAL,REP,Marco Rubio,0
143440,BREWSTERPROVISIONAL,REP,Elizabeth Grey,0


In [7]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton       5890
Bernie Sanders        5890
Keith Judd            5886
Star Locke            5885
Martin J. O'Malley    5856
                      ... 
Fiorina, Carly           4
Cruz, Ted                4
Christie, Chris          4
Carson, Ben              4
Bush, Jeb                4
Name: count, Length: 83, dtype: int64

In [8]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing the , candidate
primary_data["candidate"] = (
    primary_data["candidate"].str.split(",")
    .str[0]
    .str.strip()
    )

# Fixing De la Fuente
primary_data.loc[
    primary_data["candidate"].str.contains("\w*Fuen\w*", case=False, na=False),
    "candidate"
] = "LA FUENTE"

#Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# Fixing Huckabee
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
    "candidate"
] = "HUCKABEE"

# Fixing Fiorina
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
    "candidate"
] = "FIORINA"

# Fixing Gray
primary_data.loc[
    primary_data["candidate"].str.contains("GREY", case=False, na=False),
    "candidate"
] = "GRAY"

# Fixing Wilson
primary_data.loc[
    primary_data["candidate"].str.contains("WISON", case=False, na=False),
    "candidate"
] = "WILSON"


# Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
CLINTON     5932
SANDERS     5932
WILSON      5928
JUDD        5928
HAWES       5928
O'MALLEY    5928
FUENTE      5928
LOCKE       5926
TRUMP       5912
CRUZ        5911
RUBIO       5911
CARSON      5910
KASICH      5907
BUSH        5907
GRAHAM      5906
CHRISTIE    5905
FIORINA     5905
GRAY        5905
PAUL        5905
HUCKABEE    5905
SANTORUM    5905
Name: count, dtype: int64

In [9]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP                 73458
DEM                 45184
REPUBLICAN PARTY     1750
DEMOCRATIC PARTY     1142
R                     819
Republican            624
D                     504
Democratic            384
NaN                   359
Name: count, dtype: int64

In [10]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv("data/master_primary_candidate_party.csv")
# general_data = fill_party_from_master(general_data, master_party_df)

# STEP 3: Update master file with new (candidate, party) pairs
# update_master_candidate_party(general_data, "data/master_candidate_party.csv")
primary_data["party"].value_counts(dropna=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(


party
REP                 73601
DEM                 45400
REPUBLICAN PARTY     1750
DEMOCRATIC PARTY     1142
R                     819
Republican            624
D                     504
Democratic            384
Name: count, dtype: int64

In [11]:
# Cleaning Party
primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)
primary_data.loc[:,"party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN PARTY": "REP",
        "R": "REP",
        "Republican":"REP",
        "DEMOCRATIC PARTY": "DEM",
        "D": "DEM",
        "Democratic": "DEM"
    })
    .fillna("IND")
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(


party
REP    76794
DEM    47430
Name: count, dtype: int64

In [12]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_HAWES,pri_dem_JUDD,pri_dem_LOCKE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,...,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GRAHAM,pri_rep_GRAY,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ANDERSON1,54,0,0,0,0,0,20,0,10,...,258,0,0,0,4,16,3,72,1,143
1,ANDERSON10,13,0,0,0,0,0,3,0,1,...,103,0,0,0,0,1,0,18,0,56
2,ANDERSON11,51,0,0,0,0,0,12,0,2,...,193,0,0,0,0,7,1,31,0,98
3,ANDERSON13,65,0,0,0,0,1,18,0,8,...,250,1,1,1,1,15,0,57,0,158
4,ANDERSON15,20,0,0,0,0,0,13,0,0,...,190,0,0,0,0,10,1,41,0,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5992,YOUNG7,6,0,0,0,0,1,4,0,1,...,28,0,0,0,1,0,0,7,0,21
5993,ZAPATA1,597,34,2,6,5,7,223,10,0,...,11,0,0,0,0,0,0,2,0,7
5994,ZAPATA2,518,29,3,6,1,3,149,9,0,...,8,0,0,0,0,0,0,4,0,4
5995,ZAPATA3,440,40,5,5,1,7,195,7,1,...,12,0,0,0,1,0,0,5,0,22


In [13]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [14]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDERSON1,Rep,Donald J. Trump / Mike Pence,742
1,ANDERSON1,Dem,Hillary Clinton / Tim Kaine,262
2,ANDERSON1,Lib,Gary Johnson / William Weld,18
3,ANDERSON1,Grn,Jill Stein / Ajamu Baraka,5
4,ANDERSON1,,Darrel L Castle,1
...,...,...,...,...
76328,ZAVALA2,Grn,Jill Stein,3
76329,ZAVALA3,Grn,Jill Stein,9
76330,ZAVALA4,Grn,Jill Stein,3
76331,ZAVALA4A,Grn,Jill Stein,0


In [15]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
Gary Johnson                          5882
Jill Stein                            5871
Hillary Clinton                       5866
Donald Trump                          3624
Under Votes                           2715
                                      ... 
Monica MooreHead/Lamont Lilly            1
Laurence Kotikoff                        1
Michael a maturen/juan a munoz (w)       1
Marco Rubio                              1
Robert morrow/todd sanders (w)           1
Name: count, Length: 209, dtype: int64

In [16]:
# Cleaning Candidates

# Turning all primary data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*WRITE[\s-]\w*",
    r"NAN",
    r"UNCERTIFIED",
    r"UNVERIFIED",
    r"NONE OF THE ABOVE",
    r"LBT",
    r"DEM",
    r"REP",
    r"GRN",
    r"NAN"

]

pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing Darrell Castle"
general_data.loc[
    general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
    "candidate"
] = "CASTLE"

general_data.loc[
    general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
    "candidate"
] = "MATUREN"

# Fixing mixed president + vice_president by /
general_data["candidate"] = (
    general_data["candidate"].str.split("/")
    .str[0]
    .str.strip()
    )

#Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the , candidate
# general_data["candidate"] = (
#     general_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )



# Fixing Cubbler
general_data.loc[
    ( 
        general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
        general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
        general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
    "candidate")
] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
    "candidate"
] = "KOTLIKOFF"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
    "candidate"
] = "VALDIVIA"

# Fixing HOEFLING
general_data.loc[
    general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
    "candidate"
] = "HOEFLING"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.split().str[-1]


candidate
TRUMP        8276
JOHNSON      8252
CLINTON      8233
STEIN        8210
MCMULLIN     3961
CASTLE       3318
MATUREN      2638
HOEFLING     2454
KOTLIKOFF    2412
VALDIVIA     2264
CUBBLER      2188
MOOREHEAD    2173
MORROW       2115
SOLTYSIK     1915
LEE          1859
STEFFES      1852
FOX          1667
HURD           17
GALLEGO        17
CORVALAN       17
BARAKA         14
SANDERS        11
CRUZ            8
SCHULIN         8
CHRISTIE        6
LEAMER          4
ERSKINE         4
LILLY           4
WALKER          4
CASE            4
BARRIERE        4
KUSHNER         4
RODRIGUEZ       4
RUBIO           1
Name: count, dtype: int64

In [17]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
NaN             31339
REP              4153
LIB              4131
DEM              4112
GRN              4064
Republican       2307
Green            2305
Libertarian      2305
Democratic       2305
Dem              1624
Rep              1624
Grn              1624
Lbt              1572
WRITE-IN          104
D                  58
R                  58
L                  58
G                  58
Lib                51
ONA                30
wri                10
Constitution        5
GEN                 5
GREEN               4
LIBERTARIAN         4
DEMOCRATIC          4
REPUBLICAN          4
Name: count, dtype: int64

In [None]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\master_candidate_party.csv") # USE YOUR OWN ADDRESS
#  general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


In [None]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    27775
REP     8452
LIB     8426
DEM     8408
GRN     8349
CON     3367
UNK       66
Name: count, dtype: int64

In [None]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [None]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (
  general_result = general_data.pivot_table(


candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_dem_GALLEGO,gen_grn_BARAKA,gen_grn_STEIN,gen_ind_CASTLE,gen_ind_CUBBLER,gen_ind_FOX,gen_ind_HOEFLING,...,gen_unk_CRUZ,gen_unk_ERSKINE,gen_unk_KUSHNER,gen_unk_LEAMER,gen_unk_LILLY,gen_unk_RODRIGUEZ,gen_unk_RUBIO,gen_unk_SANDERS,gen_unk_SCHULIN,gen_unk_WALKER
0,ANDERSON1,1,262,0,0,5,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
1,ANDERSON10,0,37,0,0,0,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
2,ANDERSON11,0,130,0,0,2,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
3,ANDERSON13-14,0,268,0,0,7,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
4,ANDERSON15,0,84,0,0,1,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8299,ZAVALA2,0,764,0,0,3,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
8300,ZAVALA3,0,737,0,0,9,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
8301,ZAVALA4,0,353,0,0,3,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0
8302,ZAVALA4A,0,237,0,0,0,0,0.0,0.0,0,...,0,0,0,0,0,0,0.0,0.0,0,0


In [None]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [None]:
# TX lo, AZ lo

combined.to_csv("TX.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("TX_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("TX_general_filtered.csv", index=False)

In [None]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 6002, general: 8304, combined: 5097


In [None]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 133, general: 246
