In [1]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [2]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\OH\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['primary'])
]

In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\OH\20161108__oh__general__precinct.csv


In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\OH\20160315__oh__primary__precinct.csv


In [5]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # CHOOSING PRESIDENT ONLY
        # office, 
        # Combine precinct as county + precinct

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"].str.upper().str.contains("\w*PRESIDENT\w*", na=False)]

        # cleaning precinct

        df["precinct"] = df["county"].astype(str).str.strip() + df["precinct code"].astype(str).str.strip()
        df["precinct"] = df["precinct"].str.upper()
    

        
        # Dropping duplicates value
        df = df.drop_duplicates()
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
pri_combined_df


Unnamed: 0,county,precinct name,precinct code,office,district,party,candidate,votes,precinct
0,Adams,BRATTON TOWNSHIP,AAA,President (at-large),,D,Hillary Clinton,27.0,ADAMSAAA
1,Adams,BRATTON TOWNSHIP,AAA,President (at-large),,D,"Roque ""Rocky"" De La Fuente",1.0,ADAMSAAA
2,Adams,BRATTON TOWNSHIP,AAA,President (at-large),,D,Bernie Sanders,26.0,ADAMSAAA
3,Adams,BRUSH CREEK TOWNSHIP,AAB,President (at-large),,D,Hillary Clinton,36.0,ADAMSAAB
4,Adams,BRUSH CREEK TOWNSHIP,AAB,President (at-large),,D,"Roque ""Rocky"" De La Fuente",0.0,ADAMSAAB
...,...,...,...,...,...,...,...,...,...
280006,Wyandot,TYMOCHTEE TS,ABM,President (district),5.0,R,Carly Fiorina,0.0,WYANDOTABM
280007,Wyandot,TYMOCHTEE TS,ABM,President (district),5.0,R,John R. Kasich,81.0,WYANDOTABM
280008,Wyandot,TYMOCHTEE TS,ABM,President (district),5.0,R,Marco Rubio,8.0,WYANDOTABM
280009,Wyandot,TYMOCHTEE TS,ABM,President (district),5.0,R,Rick Santorum,0.0,WYANDOTABM


In [6]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [7]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ADAMSAAA,D,Hillary Clinton,27.0
1,ADAMSAAA,D,"Roque ""Rocky"" De La Fuente",1.0
2,ADAMSAAA,D,Bernie Sanders,26.0
3,ADAMSAAB,D,Hillary Clinton,36.0
4,ADAMSAAB,D,"Roque ""Rocky"" De La Fuente",0.0
...,...,...,...,...
280006,WYANDOTABM,R,Carly Fiorina,0.0
280007,WYANDOTABM,R,John R. Kasich,81.0
280008,WYANDOTABM,R,Marco Rubio,8.0
280009,WYANDOTABM,R,Rick Santorum,0.0


In [8]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush                      26918
Ben Carson                    26918
Chris Christie                26918
Ted Cruz                      26918
Carly Fiorina                 26918
John R. Kasich                26918
Marco Rubio                   26918
Donald J. Trump               26918
Rick Santorum                 19901
Mike Huckabee                 18105
Hillary Clinton                8887
Roque "Rocky" De La Fuente     8887
Bernie Sanders                 8887
Name: count, dtype: int64

In [9]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*ATTERING\w*",
    r"\w*UNINSTRUCTED\w*",
    r"UNCOMMITTED", r"OTHER",r"Total Votes Cast",r"No Preference",
    "EMERGENCY",
    r"\w*ABSENTEE\w*",
    r"\w*AFFIDAVIT\w*",
    "FEDERAL","BLANKS","VOID","PUBLIC COUNTER"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
#         primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
#         primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
#     ),
#     "candidate"
# ] = "LA FUENTE"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FARRELL", case=False, na=False),
    "candidate"
] = "FARRELL"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FIONINA", case=False, na=False),
    "candidate"
] = "FIORINA"

primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})

# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )


# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Christie
# primary_data.loc[
#     primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing KASICH
# primary_data.loc[
#     primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
#     "candidate"
# ] = "O'MAILEY"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
#     "candidate"
# ] = "PATAKI"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
#     "candidate"
# ] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()


candidate
BUSH        26918
CARSON      26918
CHRISTIE    26918
CRUZ        26918
FIORINA     26918
KASICH      26918
RUBIO       26918
TRUMP       26918
SANTORUM    19901
HUCKABEE    18105
CLINTON      8887
FUENTE       8887
SANDERS      8887
Name: count, dtype: int64

In [10]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
R    253350
D     26661
Name: count, dtype: int64

In [11]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
R    253350
D     26661
Name: count, dtype: int64

In [12]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

party
REP    253350
DEM     26661
Name: count, dtype: int64

In [13]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ADAMSAAA,27.0,1.0,26.0,4.0,7.0,3.0,108.0,2.0,2.0,213.0,13.0,0.0,299.0
1,ADAMSAAB,36.0,0.0,28.0,1.0,11.0,2.0,89.0,2.0,0.0,125.0,6.0,2.0,242.0
2,ADAMSAAD,39.0,0.0,34.0,1.0,3.0,4.0,66.0,2.0,0.0,142.0,6.0,2.0,320.0
3,ADAMSAAE,35.0,0.0,22.0,0.0,1.0,1.0,23.0,1.0,0.0,43.0,0.0,0.0,93.0
4,ADAMSAAG,22.0,1.0,21.0,6.0,8.0,1.0,32.0,2.0,1.0,94.0,1.0,0.0,178.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8882,WYANDOTABH,17.0,1.0,8.0,1.0,1.0,0.0,44.0,0.0,0.0,106.0,6.0,0.0,112.0
8883,WYANDOTABI,38.0,1.0,18.0,5.0,3.0,0.0,63.0,0.0,0.0,171.0,4.0,0.0,152.0
8884,WYANDOTABJ,31.0,1.0,28.0,5.0,4.0,3.0,42.0,1.0,0.0,112.0,4.0,1.0,137.0
8885,WYANDOTABL,20.0,1.0,15.0,2.0,8.0,2.0,32.0,3.0,1.0,98.0,6.0,1.0,139.0


In [14]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file, on_bad_lines='skip')
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct code"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)
# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
gen_combined_df


Unnamed: 0,county,precinct name,precinct code,office,district,party,candidate,votes,precinct
0,ADAMS,BRATTON TOWNSHIP,AAA,President,,,James Jerome Bell,0,ADAMSAAA
1,ADAMS,BRATTON TOWNSHIP,AAA,President,,,Michael Bickelmeyer,0,ADAMSAAA
2,ADAMS,BRATTON TOWNSHIP,AAA,President,,,Darrell L. Castle,0,ADAMSAAA
3,ADAMS,BRATTON TOWNSHIP,AAA,President,,D,Hillary Clinton,96,ADAMSAAA
4,ADAMS,BRATTON TOWNSHIP,AAA,President,,I,Richard Duncan,4,ADAMSAAA
...,...,...,...,...,...,...,...,...,...
204396,WYANDOT,TYMOCHTEE TS,ABM,President,,,Mike Smith,0,WYANDOTABM
204397,WYANDOT,TYMOCHTEE TS,ABM,President,,G,Jill Stein,3,WYANDOTABM
204398,WYANDOT,TYMOCHTEE TS,ABM,President,,,Josiah R. Stroh,0,WYANDOTABM
204399,WYANDOT,TYMOCHTEE TS,ABM,President,,,Douglas W. Thomson,0,WYANDOTABM


In [15]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ADAMSAAA,,James Jerome Bell,0
1,ADAMSAAA,,Michael Bickelmeyer,0
2,ADAMSAAA,,Darrell L. Castle,0
3,ADAMSAAA,D,Hillary Clinton,96
4,ADAMSAAA,I,Richard Duncan,4
...,...,...,...,...
204396,WYANDOTABM,,Mike Smith,0
204397,WYANDOTABM,G,Jill Stein,3
204398,WYANDOTABM,,Josiah R. Stroh,0
204399,WYANDOTABM,,Douglas W. Thomson,0


In [16]:
# Viewing candidate data
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


candidate
JAMES JEROME BELL         8887
LAURENCE KOTLIKOFF        8887
DOUGLAS W. THOMSON        8887
JOSIAH R. STROH           8887
JILL STEIN                8887
MIKE SMITH                8887
JOE SCHRINER              8887
MONICA MOOREHEAD          8887
EVAN MCMULLIN             8887
MICHAEL ANDREW MATUREN    8887
JOSEPH MALDONADO          8887
BARRY KIRSCHNER           8887
MICHAEL BICKELMEYER       8887
CHRIS KENISTON            8887
GARY JOHNSON              8887
BRUCE E. JAYNES           8887
TOM HOEFLING              8887
BEN HARTNELL              8887
CHERUNDA FOX              8887
RICHARD DUNCAN            8887
HILLARY CLINTON           8887
DARRELL L. CASTLE         8887
DONALD J. TRUMP           8887
Name: count, dtype: int64

In [17]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    "BVS",
    "ABSENTEE",
    "MANUALLY COUNTED EMERGENCY",
    "FEDERAL",
    "SPECIAL PRESIDENTIAL",
    "AFFIDAVIT",
    "r\w*ABSENTEE\w*",
    "r\w*SCATT\w*",
    "NO BODY",
    "NEITHER"
    "NO CONFIDENCE",
    "MARTIN LUTHER KING JR.",
    "MICKEY MOUSE",
    "LITTERALLY ANYONE ELSE",
    "LORI A TREAT",
    "ABRAHAM LINCOLN",
    "ANONYMOUS",
    "GEORGE WASHINGTON",
    "DO OVER",
    "POPE FRANCIS"
    r"\w*VOTE\w*",                 # VOTE, VOTES
    r"\bUNCOM\w*\b",                 # UNCOMMITTED, UNCOM
    r"\bTOTAL(S)?\b",                # TOTAL, TOTALS
    r"\w*WRITE[- ]?IN\w*\b",         # WRITE-IN, WRITE-INS, WRITE INS
    r"\bSCATTER(ING|INGS)?\b",       # SCATTERING, SCATTERINGS
    r"\bOVER VOTE(S)?\b",            # OVER VOTE, OVER VOTES
    r"\bUNDER VOTE(S)?\b",           # UNDER VOTE, UNDER VOTES
    r"\bSPECIAL VOTE(S)?\b",         # SPECIAL VOTE, SPECIAL VOTES
    r"\bBLANK(S)?\b",                # BLANK, BLANKS
    r"\bBLANK/OVER VOTE(S)?\b",      # BLANK/OVER VOTE, BLANK/OVER VOTES
    r"\bVOID(S)?\b",                 # VOID, VOIDS
    r"\bUNQUALIFIED WRITE[- ]?IN(S)?\b",  # UNQUALIFIED WRITE-IN(S)
    r"\bBALLOT(S)? CAST\b",          # BALLOTS CAST
    r"\bNONE OF (THE )?ABOVE\b",     # NONE OF THE ABOVE, NONE OF ABOVE
    r"\bANONYMOUS\b",                # ANONYMOUS
    r"\bSCATTERED\b",                # SCATTERED
    r"\bOVER AND UNDER VOTE(S)?\b",  # OVER AND UNDER VOTES
    r"\bUNCERTIFIED\b",              # UNCERTIFIED
    r"\bUNVERIFIED\b",               # UNVERIFIED
    r"\bLBT\b",                      # LBT
    r"\bDEM\b",                      # DEM
    r"\bREP\b",                      # REP
    r"\bGRN\b",                      # GRN
    r"\bBVS\b",                      # BVS
]
pattern = "|".join(unwanted_keywords)
# then filter:
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*FUENTE\w*", case=False, na=False),
    "candidate"
] = "FUENTE"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTLEY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*BAZZARI\w*", case=False, na=False),
    "candidate"
] = "BAZZARI"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing MCCAIN
general_data.loc[
    general_data["candidate"].str.contains("\w*MCCA\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

general_data.loc[
    general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
    "candidate"
] = "MATUREN"

general_data.loc[
    general_data["candidate"].str.contains("\w*OLTYSIK\w*", case=False, na=False),
    "candidate"
] = "SOLTYSIK"

general_data.loc[
    general_data["candidate"].str.contains("\w*MALDONADO\w*", case=False, na=False),
    "candidate"
] = "MALDONADO"

# Fixing mixed president + vice_president by /
general_data["candidate"] = (
    general_data["candidate"].str.split("/")
    .str[0]
    .str.strip()
    )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the & candidate
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*LIKOFF\w*", case=False, na=False),
    "candidate"
] = "KOTLIKOFF"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*KENISTON\W*", case=False, na=False),
    "candidate"
] = "KENISTON"

# Fixing HOEFLING
general_data.loc[(
    general_data["candidate"].str.contains("\w*HOEF\w*", case=False, na=False),
    "candidate")
] = "HOEFLING"


general_data.loc[
    general_data["candidate"].str.contains("\w*SCHOENKE", case=False, na=False),
    "candidate"
] = "SCHOENKE"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data.loc[
    general_data["candidate"].str.contains("FOX", case=False, na=False) ,
    "candidate"
] = "FOX"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]


candidate
BELL           8887
KOTLIKOFF      8887
THOMSON        8887
STROH          8887
STEIN          8887
SMITH          8887
SCHRINER       8887
MOOREHEAD      8887
MCMULLIN       8887
MATUREN        8887
MALDONADO      8887
KIRSCHNER      8887
BICKELMEYER    8887
KENISTON       8887
JOHNSON        8887
JAYNES         8887
HOEFLING       8887
HARTNELL       8887
FOX            8887
DUNCAN         8887
CLINTON        8887
CASTLE         8887
TRUMP          8887
Name: count, dtype: int64

In [18]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
NaN    159966
I       17774
D        8887
G        8887
R        8887
Name: count, dtype: int64

In [None]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
# def fill_party_from_data(row, df):
#     if pd.notna(row["party"]):
#         return row["party"]
    
#     # Try to find other rows with the same candidate and known party
#     matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
#     if not matches.empty:
#         return matches["party"].iloc[0]  # Return the first match's party
#     else:
#         return None  # Still unknown
def fill_party_from_master_fast(df, master_df):
    party_map = master_df.set_index("candidate")["party"]
    # Only fill missing values
    df["party"] = df["party"].fillna(df["candidate"].map(party_map))
    return df


#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# # Fill remaining party using general master CSV
master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


In [20]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "CONSERVATIVE":"REP",
        "WORKING FAMILIES":"DEM",
        "WOMEN'S EQUALITY":"DEM",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "WGR":"GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "MTN":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LPN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "IAP":"CON",
        "CONSTITUTION": "CON",
        "CST":"CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "INDEPENDENCE":"IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "NPP":"IND",
        "(WRITE-IN)":"IND",
        "NAN":"UNK",
        "I":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

party
UNK    159966
IND     17774
DEM      8887
GRN      8887
REP      8887
Name: count, dtype: int64

In [68]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [21]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_DUNCAN,gen_ind_JOHNSON,gen_rep_TRUMP,gen_unk_BELL,gen_unk_BICKELMEYER,gen_unk_CASTLE,gen_unk_FOX,...,gen_unk_KIRSCHNER,gen_unk_KOTLIKOFF,gen_unk_MALDONADO,gen_unk_MATUREN,gen_unk_MCMULLIN,gen_unk_MOOREHEAD,gen_unk_SCHRINER,gen_unk_SMITH,gen_unk_STROH,gen_unk_THOMSON
0,ADAMSAAA,96,4,4,12,532,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ADAMSAAB,95,0,5,9,390,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ADAMSAAD,94,4,2,5,408,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ADAMSAAE,76,1,2,2,176,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ADAMSAAG,73,2,1,8,258,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8882,WYANDOTABH,45,3,1,13,221,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8883,WYANDOTABI,103,2,3,19,368,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8884,WYANDOTABJ,100,3,1,13,293,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8885,WYANDOTABL,55,1,1,12,241,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [23]:


combined.to_csv("OH.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("OH_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("OH_general_filtered.csv", index=False)

In [24]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 8887, general: 8887, combined: 8885


In [25]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 88, general: 88


In [74]:
pri_combined_df['county'].unique()

array(['Albany', 'Bronx', 'Chenango', 'Kings', 'New York', 'Queens',
       'Richmond'], dtype=object)

In [75]:
gen_combined_df['county'].unique()

array(['ALBANY', 'BRONX', 'CHENANGO', 'KINGS', 'NEW YORK', 'QUEENS',
       'RICHMOND'], dtype=object)

In [76]:
primary_counties = set(pri_combined_df['jurisdiction'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['county'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


KeyError: 'jurisdiction'