In [81]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [87]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['general','precinct'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['primary','precinct'])
]

In [88]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__albany__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__allegany__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__bronx__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__broome__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__cattaraugus__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__cayuga__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__chautauqua__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__chemung__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__chenango__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\NY\unfiltered\20161108__ny__general__clinton__precinct.csv
C:\Huy Phan

In [79]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:


In [52]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]               
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
pri_combined_df

Unnamed: 0,county,precinct,office,district,party,candidate,votes,assembly_district
0,Albany,ALBANY1ALBANYW1ED1,President,,DEM,Bernie Sanders,12,
1,Albany,ALBANY2ALBANYW1ED2,President,,DEM,Bernie Sanders,73,
2,Albany,ALBANY3ALBANYW1ED3,President,,DEM,Bernie Sanders,84,
3,Albany,ALBANY4ALBANYW1ED4,President,,DEM,Bernie Sanders,91,
4,Albany,ALBANY5ALBANYW1ED5,President,,DEM,Bernie Sanders,3,
...,...,...,...,...,...,...,...,...
83773,Richmond,"RICHMOND=""62/64""",President,,Republican,Affidavit,1,64.0
83774,Richmond,"RICHMOND=""62/64""",President,,Republican,Donald J. Trump,41,64.0
83775,Richmond,"RICHMOND=""62/64""",President,,Republican,John R. Kasich,5,64.0
83776,Richmond,"RICHMOND=""62/64""",President,,Republican,Ben Carson,0,64.0


In [53]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [54]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ALBANY1ALBANYW1ED1,DEM,Bernie Sanders,12
1,ALBANY2ALBANYW1ED2,DEM,Bernie Sanders,73
2,ALBANY3ALBANYW1ED3,DEM,Bernie Sanders,84
3,ALBANY4ALBANYW1ED4,DEM,Bernie Sanders,91
4,ALBANY5ALBANYW1ED5,DEM,Bernie Sanders,3
...,...,...,...,...
83773,"RICHMOND=""62/64""",Republican,Affidavit,1
83774,"RICHMOND=""62/64""",Republican,Donald J. Trump,41
83775,"RICHMOND=""62/64""",Republican,John R. Kasich,5
83776,"RICHMOND=""62/64""",Republican,Ben Carson,0


In [55]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Emergency            10912
Absentee/Military    10912
Federal              10912
Affidavit            10912
Bernie Sanders        5547
Hillary Clinton       5547
Donald J. Trump       5547
John R. Kasich        5547
Ben Carson            5547
Ted Cruz              5547
Public Counter        5456
Over Votes             634
Under Votes            634
Blanks                  62
Void                    62
Name: count, dtype: int64

In [56]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*ATTERING\w*",
    r"\w*UNINSTRUCTED\w*",
    r"UNCOMMITTED", r"OTHER",r"Total Votes Cast",r"No Preference",
    "EMERGENCY",
    r"\w*ABSENTEE\w*",
    r"\w*AFFIDAVIT\w*",
    "FEDERAL","BLANKS","VOID","PUBLIC COUNTER"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
#         primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
#         primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
#     ),
#     "candidate"
# ] = "LA FUENTE"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FARRELL", case=False, na=False),
    "candidate"
] = "FARRELL"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FIONINA", case=False, na=False),
    "candidate"
] = "FIORINA"

primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})

# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )


# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Christie
# primary_data.loc[
#     primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing KASICH
# primary_data.loc[
#     primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
#     "candidate"
# ] = "O'MAILEY"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
#     "candidate"
# ] = "PATAKI"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
#     "candidate"
# ] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
SANDERS    5547
CLINTON    5547
TRUMP      5547
KASICH     5547
CARSON     5547
CRUZ       5547
Name: count, dtype: int64

In [57]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
Republican    20796
Democratic    10398
REP            1392
DEM             696
Name: count, dtype: int64

In [58]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
Republican    20796
Democratic    10398
REP            1392
DEM             696
Name: count, dtype: int64

In [59]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
REP    22188
DEM    11094
Name: count, dtype: int64

In [60]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_TRUMP
0,ALBANY100ALBANYW12ED4,8,79,0,1,1,10
1,ALBANY101ALBANYW12ED5,90,87,1,5,7,4
2,ALBANY102ALBANYW12ED6,101,78,0,0,2,4
3,ALBANY103ALBANYW12ED7,0,0,0,0,0,0
4,ALBANY104ALBANYW12ED8,123,148,0,1,11,7
...,...,...,...,...,...,...,...
5542,"RICHMOND=""81/63""",46,51,1,7,9,76
5543,"RICHMOND=""9/61""",166,113,4,11,8,18
5544,"RICHMOND=""9/62""",28,22,1,6,7,85
5545,"RICHMOND=""9/63""",3,4,0,0,0,11


In [61]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file, on_bad_lines='skip')
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)
# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
gen_combined_df


  df = pd.read_csv(file, on_bad_lines='skip')


Unnamed: 0,county,precinct,office,district,candidate,party,votes
0,ALBANY,"ALBANY=""1ALBANYW1ED1""",President,,Hillary Clinton,DEM,39
1,ALBANY,"ALBANY=""2ALBANYW1ED2""",President,,Hillary Clinton,DEM,194
2,ALBANY,"ALBANY=""3ALBANYW1ED3""",President,,Hillary Clinton,DEM,294
3,ALBANY,"ALBANY=""4ALBANYW1ED4""",President,,Hillary Clinton,DEM,325
4,ALBANY,"ALBANY=""5ALBANYW1ED5""",President,,Hillary Clinton,DEM,6
...,...,...,...,...,...,...,...
80452,RICHMOND,"RICHMOND=""58/64""",President,NYC,Absentee/Military,,0
80453,RICHMOND,"RICHMOND=""59/64""",President,NYC,Absentee/Military,,0
80454,RICHMOND,"RICHMOND=""60/64""",President,NYC,Absentee/Military,,0
80455,RICHMOND,"RICHMOND=""61/64""",President,NYC,Absentee/Military,,0


In [62]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,"ALBANY=""1ALBANYW1ED1""",DEM,Hillary Clinton,39
1,"ALBANY=""2ALBANYW1ED2""",DEM,Hillary Clinton,194
2,"ALBANY=""3ALBANYW1ED3""",DEM,Hillary Clinton,294
3,"ALBANY=""4ALBANYW1ED4""",DEM,Hillary Clinton,325
4,"ALBANY=""5ALBANYW1ED5""",DEM,Hillary Clinton,6
...,...,...,...,...
80452,"RICHMOND=""58/64""",,Absentee/Military,0
80453,"RICHMOND=""59/64""",,Absentee/Military,0
80454,"RICHMOND=""60/64""",,Absentee/Military,0
80455,"RICHMOND=""61/64""",,Absentee/Military,0


In [63]:
# Viewing candidate data
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


candidate
MANUALLY COUNTED EMERGENCY                           5456
ABSENTEE / MILITARY                                  5456
FEDERAL                                              5456
SPECIAL PRESIDENTIAL                                 5456
AFFIDAVIT                                            5456
JILL STEIN / AJAMU BARAKA (GREEN)                    5346
GARY JOHNSON / BILL WELD (INDEPENDENCE)              5346
GARY JOHNSON / BILL WELD (LIBERTARIAN)               5346
HILLARY CLINTON / TIM KAINE (DEMOCRATIC)             5346
DONALD J. TRUMP / MICHAEL R. PENCE (REPUBLICAN)      5346
DONALD J. TRUMP / MICHAEL R. PENCE (CONSERVATIVE)    5346
HILLARY CLINTON / TIM KAINE (WOMEN'S EQUALITY)       5346
HILLARY CLINTON / TIM KAINE (WORKING FAMILIES)       5346
SCATTERED                                            4237
ABSENTEE/MILITARY                                    2978
HILLARY CLINTON                                       951
DONALD TRUMP                                          634
GARY

In [64]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    "BVS",
    "ABSENTEE",
    "MANUALLY COUNTED EMERGENCY",
    "FEDERAL",
    "SPECIAL PRESIDENTIAL",
    "AFFIDAVIT",
    "r\w*ABSENTEE\w*",
    "r\w*SCATT\w*",
    "NO BODY",
    "NEITHER"
    "NO CONFIDENCE",
    "MARTIN LUTHER KING JR.",
    "MICKEY MOUSE",
    "LITTERALLY ANYONE ELSE",
    "LORI A TREAT",
    "ABRAHAM LINCOLN",
    "ANONYMOUS",
    "GEORGE WASHINGTON",
    "DO OVER",
    "POPE FRANCIS"
    r"\w*VOTE\w*",                 # VOTE, VOTES
    r"\bUNCOM\w*\b",                 # UNCOMMITTED, UNCOM
    r"\bTOTAL(S)?\b",                # TOTAL, TOTALS
    r"\w*WRITE[- ]?IN\w*\b",         # WRITE-IN, WRITE-INS, WRITE INS
    r"\bSCATTER(ING|INGS)?\b",       # SCATTERING, SCATTERINGS
    r"\bOVER VOTE(S)?\b",            # OVER VOTE, OVER VOTES
    r"\bUNDER VOTE(S)?\b",           # UNDER VOTE, UNDER VOTES
    r"\bSPECIAL VOTE(S)?\b",         # SPECIAL VOTE, SPECIAL VOTES
    r"\bBLANK(S)?\b",                # BLANK, BLANKS
    r"\bBLANK/OVER VOTE(S)?\b",      # BLANK/OVER VOTE, BLANK/OVER VOTES
    r"\bVOID(S)?\b",                 # VOID, VOIDS
    r"\bUNQUALIFIED WRITE[- ]?IN(S)?\b",  # UNQUALIFIED WRITE-IN(S)
    r"\bBALLOT(S)? CAST\b",          # BALLOTS CAST
    r"\bNONE OF (THE )?ABOVE\b",     # NONE OF THE ABOVE, NONE OF ABOVE
    r"\bANONYMOUS\b",                # ANONYMOUS
    r"\bSCATTERED\b",                # SCATTERED
    r"\bOVER AND UNDER VOTE(S)?\b",  # OVER AND UNDER VOTES
    r"\bUNCERTIFIED\b",              # UNCERTIFIED
    r"\bUNVERIFIED\b",               # UNVERIFIED
    r"\bLBT\b",                      # LBT
    r"\bDEM\b",                      # DEM
    r"\bREP\b",                      # REP
    r"\bGRN\b",                      # GRN
    r"\bBVS\b",                      # BVS
]
pattern = "|".join(unwanted_keywords)
# then filter:
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*FUENTE\w*", case=False, na=False),
    "candidate"
] = "FUENTE"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTLEY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*BAZZARI\w*", case=False, na=False),
    "candidate"
] = "BAZZARI"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing MCCAIN
general_data.loc[
    general_data["candidate"].str.contains("\w*MCCA\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

general_data.loc[
    general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
    "candidate"
] = "MATUREN"

general_data.loc[
    general_data["candidate"].str.contains("\w*OLTYSIK\w*", case=False, na=False),
    "candidate"
] = "SOLTYSIK"

general_data.loc[
    general_data["candidate"].str.contains("\w*MALDONADO\w*", case=False, na=False),
    "candidate"
] = "MALDONADO"

# Fixing mixed president + vice_president by /
general_data["candidate"] = (
    general_data["candidate"].str.split("/")
    .str[0]
    .str.strip()
    )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the & candidate
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*LIKOFF\w*", case=False, na=False),
    "candidate"
] = "KOTLIKOFF"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*KENISTON\W*", case=False, na=False),
    "candidate"
] = "KENISTON"

# Fixing HOEFLING
general_data.loc[(
    general_data["candidate"].str.contains("\w*HOEF\w*", case=False, na=False),
    "candidate")
] = "HOEFLING"


general_data.loc[
    general_data["candidate"].str.contains("\w*SCHOENKE", case=False, na=False),
    "candidate"
] = "SCHOENKE"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data.loc[
    general_data["candidate"].str.contains("FOX", case=False, na=False) ,
    "candidate"
] = "FOX"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]


candidate
CLINTON    17082
TRUMP      11388
JOHNSON    11388
STEIN       5694
Name: count, dtype: int64

In [65]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
Democratic          5377
Republican          5377
Conservative        5377
Green               5377
Working Families    5377
Independence        5377
Women's Equality    5377
Libertarian         5377
DEM                  317
REP                  317
CON                  317
GRN                  317
WOR                  317
IND                  317
WEP                  317
LIB                  317
Name: count, dtype: int64

In [66]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_data(row, general_data),
#     axis=1
# )

# # Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
# general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
Democratic          5377
Republican          5377
Conservative        5377
Green               5377
Working Families    5377
Independence        5377
Women's Equality    5377
Libertarian         5377
DEM                  317
REP                  317
CON                  317
GRN                  317
WOR                  317
IND                  317
WEP                  317
LIB                  317
Name: count, dtype: int64

In [67]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "CONSERVATIVE":"REP",
        "WORKING FAMILIES":"DEM",
        "WOMEN'S EQUALITY":"DEM",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "WGR":"GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "MTN":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LPN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "IAP":"CON",
        "CONSTITUTION": "CON",
        "CST":"CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "INDEPENDENCE":"IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "NPP":"IND",
        "(WRITE-IN)":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

party
DEM    16448
REP    11071
GRN     5694
IND     5694
LIB     5694
CON      317
WOR      317
WEP      317
Name: count, dtype: int64

In [68]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [69]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

  general_result = general_data.pivot_table(


candidate_column,precinct,gen_con_TRUMP,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_JOHNSON,gen_lib_JOHNSON,gen_rep_TRUMP,gen_wep_CLINTON,gen_wor_CLINTON
0,"ALBANY=""100ALBANYW12ED4""",3,235,9,7,2,45,1,1
1,"ALBANY=""101ALBANYW12ED5""",8,265,11,7,10,103,10,13
2,"ALBANY=""102ALBANYW12ED6""",5,253,10,1,2,77,2,5
3,"ALBANY=""103ALBANYW12ED7""",0,2,0,0,0,0,0,0
4,"ALBANY=""104ALBANYW12ED8""",6,349,13,15,3,80,6,18
...,...,...,...,...,...,...,...,...,...
5689,"RICHMOND=""81/63""",0,223,6,5,1,310,0,0
5690,"RICHMOND=""9/61""",0,485,13,6,5,121,0,0
5691,"RICHMOND=""9/62""",0,88,4,3,3,267,0,0
5692,"RICHMOND=""9/63""",0,13,0,0,2,42,0,0


In [70]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [71]:


combined.to_csv("NY.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("NY_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("NY_general_filtered.csv", index=False)

In [72]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 5547, general: 5694, combined: 5224


In [73]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 7, general: 7


In [74]:
pri_combined_df['county'].unique()

array(['Albany', 'Bronx', 'Chenango', 'Kings', 'New York', 'Queens',
       'Richmond'], dtype=object)

In [75]:
gen_combined_df['county'].unique()

array(['ALBANY', 'BRONX', 'CHENANGO', 'KINGS', 'NEW YORK', 'QUEENS',
       'RICHMOND'], dtype=object)

In [76]:
primary_counties = set(pri_combined_df['jurisdiction'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['county'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


KeyError: 'jurisdiction'