In [91]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [92]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\PA\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['primary'])
]

In [93]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\PA\PA_General_2016.csv


In [94]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\PA\PA_Primary_2016.csv


In [95]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"].str.upper().str.contains("\w*USP\w*", na=False)]

        # cleaning precinct

        df["precinct"] = df["county_number"].astype(str).str.strip() + df["precinct"].astype(str).str.strip()
        df["precinct"] = df["precinct"].str.upper().str.strip().str.replace(r"\s+", "", regex=True)

    
        # Dropping duplicates value
        df = df.drop_duplicates()
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
pri_combined_df


  df = pd.read_csv(file)


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,county_number,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,office,party,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,county
0,2016,P,1,10,1,0,1.0,1,USP,DEM,...,,0,5,1,10,0,0,33,193,Adams
1,2016,P,1,20,1,0,1.0,1,USP,DEM,...,,0,10,1,20,0,0,33,0,Adams
2,2016,P,1,30,1,0,1.0,1,USP,DEM,...,,0,15,1,30,0,0,33,193,Adams
3,2016,P,1,40,1,0,1.0,1,USP,DEM,...,,0,20,1,40,0,0,33,193,Adams
4,2016,P,1,50,1,0,1.0,1,USP,DEM,...,,0,25,1,50,0,0,33,193,Adams
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87122,2016,P,67,1409,1,0,2.0,99,USP,REP,...,3.0,0,350,133,1409,0,0,28,93,York
87123,2016,P,67,1413,1,0,2.0,99,USP,REP,...,1.0,0,350,133,1413,0,0,28,93,York
87124,2016,P,67,1416,1,0,2.0,99,USP,REP,...,2.0,0,350,133,1416,0,0,28,93,York
87125,2016,P,67,1419,1,0,2.0,99,USP,REP,...,3.0,0,350,133,1419,0,0,28,93,York


In [96]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [97]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,1ABBOTTSTOWN,DEM,CLINTON,37
1,1ARENDTSVILLE,DEM,CLINTON,42
2,1BENDERSVILLE,DEM,CLINTON,13
3,1BERWICK,DEM,CLINTON,105
4,1BIGLERVILLE,DEM,CLINTON,35
...,...,...,...,...
87122,67YORK,REP,Scattered,1
87123,67YORK,REP,Scattered,1
87124,67YORK,REP,Scattered,1
87125,67YORK,REP,Scattered,4


In [98]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
CLINTON               9159
SANDERS               9159
TRUMP                 9158
CRUZ                  9155
KASICH                9153
DE LA FUENTE          9100
CARSON                9007
RUBIO                 8981
BUSH                  8931
Scattered             5243
SCATTERED               49
Scattered writeins      32
Name: count, dtype: int64

In [99]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*",
    r"\w*ATTER\w*",
    r"\w*UNINSTRUCTED\w*",
    r"UNCOMMITTED", r"OTHER",r"Total Votes Cast",r"No Preference",
    "EMERGENCY",
    r"\w*ABSENTEE\w*",
    r"\w*AFFIDAVIT\w*",
    "FEDERAL","BLANKS","VOID","PUBLIC COUNTER"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
#         primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
#         primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
#     ),
#     "candidate"
# ] = "LA FUENTE"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FARRELL", case=False, na=False),
    "candidate"
] = "FARRELL"

# # Fixing williams
primary_data.loc[
    primary_data["candidate"].str.contains("FIONINA", case=False, na=False),
    "candidate"
] = "FIORINA"

primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})

# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )


# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Christie
# primary_data.loc[
#     primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing KASICH
# primary_data.loc[
#     primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
#     "candidate"
# ] = "CHRISTIE"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
#     "candidate"
# ] = "O'MAILEY"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
#     "candidate"
# ] = "PATAKI"

# # Fixing O'MALLEY
# primary_data.loc[
#     primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
#     "candidate"
# ] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
CLINTON    9159
SANDERS    9159
TRUMP      9158
CRUZ       9155
KASICH     9153
FUENTE     9100
CARSON     9007
RUBIO      8981
BUSH       8931
Name: count, dtype: int64

In [100]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP    54385
DEM    27418
Name: count, dtype: int64

In [101]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
REP    54385
DEM    27418
Name: count, dtype: int64

In [102]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
REP    54385
DEM    27418
Name: count, dtype: int64

In [103]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP
0,10ADAMS,668,5,425,11,10,614,693,16,1564
1,10ALLEGHENY,16,0,19,0,1,39,17,2,64
2,10BRADY,48,2,57,1,1,43,22,0,114
3,10BRUIN,12,0,9,1,1,25,7,0,63
4,10BUFFALO,394,14,344,4,9,270,270,6,870
...,...,...,...,...,...,...,...,...,...,...
2455,9WARRINGTON,1595,34,1084,24,22,667,885,32,2003
2456,9WARWICK,1079,17,673,7,15,431,593,21,1440
2457,9WESTROCKHILL,301,3,254,5,10,221,161,14,555
2458,9WRIGHTSTOWN,221,4,167,3,3,97,173,4,366


In [104]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file, on_bad_lines='skip')
         # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]    

        # Combine precinct as county + precinct
        df["precinct"] = df["county_number"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper().str.strip().str.replace(r"\s+", "", regex=True)
        df["county"] = df["county"].str.upper()

       
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)
# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
gen_combined_df


Unnamed: 0,county,precinct,office,district,candidate,party,votes,county_number
0,ADAMS,1ABBOTTSTOWN,President,,HILLARY CLINTON,DEM,119,1
1,ADAMS,1ARENDTSVILLE,President,,HILLARY CLINTON,DEM,143,1
2,ADAMS,1BENDERSVILLE,President,,HILLARY CLINTON,DEM,83,1
3,ADAMS,1BERWICK,President,,HILLARY CLINTON,DEM,257,1
4,ADAMS,1BIGLERVILLE,President,,HILLARY CLINTON,DEM,148,1
...,...,...,...,...,...,...,...,...
45854,YORK,67YORKWARD5-1,President,,GARY E JOHNSON,LIB,34,67
45855,YORK,67YORKWARD5-2,President,,GARY E JOHNSON,LIB,25,67
45856,YORK,67YORKWARD5-3,President,,GARY E JOHNSON,LIB,39,67
45857,YORK,67YORKHAVEN,President,,GARY E JOHNSON,LIB,10,67


In [105]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,1ABBOTTSTOWN,DEM,HILLARY CLINTON,119
1,1ARENDTSVILLE,DEM,HILLARY CLINTON,143
2,1BENDERSVILLE,DEM,HILLARY CLINTON,83
3,1BERWICK,DEM,HILLARY CLINTON,257
4,1BIGLERVILLE,DEM,HILLARY CLINTON,148
...,...,...,...,...
45854,67YORKWARD5-1,LIB,GARY E JOHNSON,34
45855,67YORKWARD5-2,LIB,GARY E JOHNSON,25
45856,67YORKWARD5-3,LIB,GARY E JOHNSON,39
45857,67YORKHAVEN,LIB,GARY E JOHNSON,10


In [106]:
# Viewing candidate data
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


candidate
HILLARY CLINTON     9176
DONALD J. TRUMP     9176
GARY E JOHNSON      9173
JILL STEIN          9169
DARRELL L CASTLE    9165
Name: count, dtype: int64

In [107]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


unwanted_keywords = [
    "BVS",
    "ABSENTEE",
    "MANUALLY COUNTED EMERGENCY",
    "FEDERAL",
    "SPECIAL PRESIDENTIAL",
    "AFFIDAVIT",
    "r\w*ABSENTEE\w*",
    "r\w*SCATT\w*",
    "NO BODY",
    "NEITHER"
    "NO CONFIDENCE",
    "MARTIN LUTHER KING JR.",
    "MICKEY MOUSE",
    "LITTERALLY ANYONE ELSE",
    "LORI A TREAT",
    "ABRAHAM LINCOLN",
    "ANONYMOUS",
    "GEORGE WASHINGTON",
    "DO OVER",
    "POPE FRANCIS"
    r"\w*VOTE\w*",                 # VOTE, VOTES
    r"\bUNCOM\w*\b",                 # UNCOMMITTED, UNCOM
    r"\bTOTAL(S)?\b",                # TOTAL, TOTALS
    r"\w*WRITE[- ]?IN\w*\b",         # WRITE-IN, WRITE-INS, WRITE INS
    r"\bSCATTER(ING|INGS)?\b",       # SCATTERING, SCATTERINGS
    r"\bOVER VOTE(S)?\b",            # OVER VOTE, OVER VOTES
    r"\bUNDER VOTE(S)?\b",           # UNDER VOTE, UNDER VOTES
    r"\bSPECIAL VOTE(S)?\b",         # SPECIAL VOTE, SPECIAL VOTES
    r"\bBLANK(S)?\b",                # BLANK, BLANKS
    r"\bBLANK/OVER VOTE(S)?\b",      # BLANK/OVER VOTE, BLANK/OVER VOTES
    r"\bVOID(S)?\b",                 # VOID, VOIDS
    r"\bUNQUALIFIED WRITE[- ]?IN(S)?\b",  # UNQUALIFIED WRITE-IN(S)
    r"\bBALLOT(S)? CAST\b",          # BALLOTS CAST
    r"\bNONE OF (THE )?ABOVE\b",     # NONE OF THE ABOVE, NONE OF ABOVE
    r"\bANONYMOUS\b",                # ANONYMOUS
    r"\bSCATTERED\b",                # SCATTERED
    r"\bOVER AND UNDER VOTE(S)?\b",  # OVER AND UNDER VOTES
    r"\bUNCERTIFIED\b",              # UNCERTIFIED
    r"\bUNVERIFIED\b",               # UNVERIFIED
    r"\bLBT\b",                      # LBT
    r"\bDEM\b",                      # DEM
    r"\bREP\b",                      # REP
    r"\bGRN\b",                      # GRN
    r"\bBVS\b",                      # BVS
]
pattern = "|".join(unwanted_keywords)
# then filter:
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*FUENTE\w*", case=False, na=False),
    "candidate"
] = "FUENTE"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTLEY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*BAZZARI\w*", case=False, na=False),
    "candidate"
] = "BAZZARI"

general_data.loc[
    general_data["candidate"].str.contains("\w*CHRISTY\w*", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing MCCAIN
general_data.loc[
    general_data["candidate"].str.contains("\w*MCCA\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
pattern = "|".join(unwanted_keywords)

# FILTER OUT TRASH WORDS
general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

general_data.loc[
    general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
    "candidate"
] = "MATUREN"

general_data.loc[
    general_data["candidate"].str.contains("\w*OLTYSIK\w*", case=False, na=False),
    "candidate"
] = "SOLTYSIK"

general_data.loc[
    general_data["candidate"].str.contains("\w*MALDONADO\w*", case=False, na=False),
    "candidate"
] = "MALDONADO"

# Fixing mixed president + vice_president by /
general_data["candidate"] = (
    general_data["candidate"].str.split("/")
    .str[0]
    .str.strip()
    )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# Fixing the & candidate
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# Fixing Kotlikoff
general_data.loc[
    general_data["candidate"].str.contains("\w*LIKOFF\w*", case=False, na=False),
    "candidate"
] = "KOTLIKOFF"

# Fixing Valdivia
general_data.loc[
    general_data["candidate"].str.contains("\w*KENISTON\W*", case=False, na=False),
    "candidate"
] = "KENISTON"

# Fixing HOEFLING
general_data.loc[(
    general_data["candidate"].str.contains("\w*HOEF\w*", case=False, na=False),
    "candidate")
] = "HOEFLING"


general_data.loc[
    general_data["candidate"].str.contains("\w*SCHOENKE", case=False, na=False),
    "candidate"
] = "SCHOENKE"


# Fixing McMullin
general_data.loc[
    general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) ,
    "candidate"
] = "MCMULLIN"

general_data.loc[
    general_data["candidate"].str.contains("FOX", case=False, na=False) ,
    "candidate"
] = "FOX"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]
  general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]


candidate
CLINTON    9176
TRUMP      9176
JOHNSON    9173
STEIN      9169
CASTLE     9165
Name: count, dtype: int64

In [108]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
DEM    9176
REP    9176
LIB    9173
GRN    9169
CON    9165
Name: count, dtype: int64

In [109]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
# def fill_party_from_data(row, df):
#     if pd.notna(row["party"]):
#         return row["party"]
    
#     # Try to find other rows with the same candidate and known party
#     matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
#     if not matches.empty:
#         return matches["party"].iloc[0]  # Return the first match's party
#     else:
#         return None  # Still unknown
def fill_party_from_master_fast(df, master_df):
    party_map = master_df.set_index("candidate")["party"]
    # Only fill missing values
    df["party"] = df["party"].fillna(df["candidate"].map(party_map))
    return df


#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# # Fill remaining party using general master CSV
master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
DEM    9176
REP    9176
LIB    9173
GRN    9169
CON    9165
Name: count, dtype: int64

In [110]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "CONSERVATIVE":"REP",
        "WORKING FAMILIES":"DEM",
        "WOMEN'S EQUALITY":"DEM",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "WGR":"GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "MTN":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LPN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "IAP":"CON",
        "CONSTITUTION": "CON",
        "CST":"CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "INDEPENDENCE":"IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "NPP":"IND",
        "(WRITE-IN)":"IND",
        "NAN":"UNK",
        "I":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

party
DEM    9176
REP    9176
LIB    9173
GRN    9169
CON    9165
Name: count, dtype: int64

In [111]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [112]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,10ADAMSPRECINCT1,7,956,9,73,1893
1,10ADAMSPRECINCT2,4,668,11,61,1572
2,10ADAMSPRECINCT3,9,751,7,58,1708
3,10ALLEGHENY,6,40,0,5,205
4,10BRADY,1,138,5,21,380
...,...,...,...,...,...,...
9118,9WARWICKDISTRICT5,1,519,4,34,684
9119,9WESTROCKHILLARGUS,1,344,8,17,603
9120,9WESTROCKHILLROCKHILL,7,738,25,43,1122
9121,9WRIGHTSTOWN,4,845,19,34,1037


In [113]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

rep_cols = combined.filter(like="pri_rep_").columns
combined[rep_cols] = combined[rep_cols].apply(pd.to_numeric, errors="coerce")
combined["rep_primary_total"] = combined[rep_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [114]:


combined.to_csv("PA.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("PA_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("PA_general_filtered.csv", index=False)

In [115]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 2460, general: 9123, combined: 1471


In [116]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 67, general: 67


In [117]:
pri_combined_df['precinct'].unique()

array(['1ABBOTTSTOWN', '1ARENDTSVILLE', '1BENDERSVILLE', ..., '67YORK',
       '67YORKHAVEN', '67YORKANA'], dtype=object)

In [118]:
gen_combined_df['county'].unique()

array(['ADAMS', 'ALLEGHENY', 'ARMSTRONG', 'BEAVER', 'BEDFORD', 'BERKS',
       'BLAIR', 'BRADFORD', 'BUCKS', 'BUTLER', 'CAMBRIA', 'CAMERON',
       'CARBON', 'CENTRE', 'CHESTER', 'CLARION', 'CLEARFIELD', 'CLINTON',
       'COLUMBIA', 'CRAWFORD', 'CUMBERLAND', 'DAUPHIN', 'DELAWARE', 'ELK',
       'ERIE', 'FAYETTE', 'FOREST', 'FRANKLIN', 'FULTON', 'GREENE',
       'HUNTINGDON', 'INDIANA', 'JEFFERSON', 'JUNIATA', 'LACKAWANNA',
       'LANCASTER', 'LAWRENCE', 'LEBANON', 'LEHIGH', 'LUZERNE',
       'LYCOMING', 'MCKEAN', 'MERCER', 'MIFFLIN', 'MONROE', 'MONTGOMERY',
       'MONTOUR', 'NORTHAMPTON', 'NORTHUMBERLAND', 'PERRY',
       'PHILADELPHIA', 'PIKE', 'POTTER', 'SCHUYLKILL', 'SNYDER',
       'SOMERSET', 'SULLIVAN', 'SUSQUEHANNA', 'TIOGA', 'UNION', 'VENANGO',
       'WARREN', 'WASHINGTON', 'WAYNE', 'WESTMORELAND', 'WYOMING', 'YORK'],
      dtype=object)

In [119]:
primary_counties = set(pri_combined_df['precinct'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['precinct'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


Counties in primary but not in general: 989
{'36WESTHEMPFIELD', '60MIFFLINBURG', '54COALDALE', '21LOWERALLEN', '2CASTLESHANNON', '36MARTIC', '6CUMRU', '2PINE', '66FACTORYVILLE', '23EDGMONT', '31PENN', '2COLLIER', '49POINT', '38LEBANON', '1FRANKLIN', '38MYERSTOWN', '6COLEBROOKDALE', '3FORDCITY', '4CHIPPEWA', '9NORTHAMPTON', '35SCOTT', '22LONDONDERRY', '11CONEMAUGH', '46PERKIOMEN', '1MCSHERRYSTOWN', '46LOWERMERION', '49RALPHO', '36EPHRATA', '61OILCITY', '2GREENTREE', '40HAZLE', '15WARWICK', '40FREELAND', '2MT.OLIVER', '6EARL', '11JACKSON', '54MAHANOYCITY', '63WESTBROWNSVILLE', '54POTTSVILLE', '40LARKSVILLE', '38UNION', '14BENNER', '43SHENANGO', '6RICHMOND', '7ALLEGHENY', '64TEXAS', '35OLYPHANT', '13LOWERTOWAMENSING', '25HARBORCREEK', '65VANDERGRIFT', '2TURTLECREEK', '23SHARONHILL', '37UNION', '65MURRYSVILLE', '35JERMYN', '11RICHLAND', '65SOUTHGREENSBURG', '40PLAINS', '4HARMONY', '9EASTROCKHILL', '61CRANBERRY', '13LANSFORD', '13KIDDER', '21DICKINSON', '46EASTNORRITON', '15EASTMARLBOROUGH'