In [307]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [308]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\GA\2016\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [309]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160119__ga__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160216__ga__special__general__runoff__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160329__ga__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160426__ga__special__general__runoff__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__appling__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__atkinson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__bacon__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__baker__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__baldwin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__banks__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__gene

In [310]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__appling__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__atkinson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__bacon__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__baker__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__baldwin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__banks__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__barrow__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__bartow__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__ben_hill__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__pre

In [311]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]      
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]           
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]  
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]    

        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]
pri_combined_df

Unnamed: 0,county,precinct,office,district,party,candidate,votes,absentee_by_mail,election_day,advance_in_person,provisional,advance_in_person_1,advance_in_person_2,advance_in_person_3
0,Appling,APPLINGNAN,President,,REP,JEB BUSH,32,5,20,7.0,0,,,
1,Appling,APPLING1B,President,,REP,JEB BUSH,5,1,3,1.0,0,,,
2,Appling,APPLING1C,President,,REP,JEB BUSH,3,1,2,0.0,0,,,
3,Appling,APPLING2,President,,REP,JEB BUSH,3,0,3,0.0,0,,,
4,Appling,APPLING3A,President,,REP,JEB BUSH,0,0,0,0.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48581,Worth,WORTHSHINGLER,President,,DEM,MICHAEL STEINBERG,1,0,1,0.0,0,,,
48582,Worth,WORTHRED ROCK,President,,DEM,MICHAEL STEINBERG,0,0,0,0.0,0,,,
48583,Worth,WORTHDOLES,President,,DEM,MICHAEL STEINBERG,0,0,0,0.0,0,,,
48584,Worth,WORTHOAKFIELD,President,,DEM,MICHAEL STEINBERG,0,0,0,0.0,0,,,


In [312]:

county_name_chatham = "CHATHAM"

pri_combined_df.loc[
    pri_combined_df["precinct"].str.contains(county_name_chatham, case=False, na=False),
    "precinct"
] = pri_combined_df["precinct"].str.extract(fr"^({county_name_chatham}[^\s]*)", flags=re.IGNORECASE)[0]

county_name2 = "GWINNETT"

# Extract parts only for rows that contain GWINNETT
mask = pri_combined_df["precinct"].str.contains(county_name2, case=False, na=False)
matches = pri_combined_df.loc[mask, "precinct"].str.extract(
    fr"^({county_name2})\d*\s+(.*)", flags=re.IGNORECASE
)

# Drop only unmatched GWINNETT rows
valid_mask = matches[0].notna()
pri_combined_df.loc[mask[mask].index[valid_mask], "precinct"] = (
    matches.loc[valid_mask, 0].str.upper() + matches.loc[valid_mask, 1]
)

county_name3 = "LOWNDES"

pri_combined_df.loc[
    pri_combined_df["precinct"].str.contains(county_name3, case=False, na=False),
    "precinct"
] = pri_combined_df["precinct"].str.extract(fr"^({county_name3}[^\s]*)", flags=re.IGNORECASE)[0]


In [313]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,APPLINGNAN,REP,JEB BUSH,32
1,APPLING1B,REP,JEB BUSH,5
2,APPLING1C,REP,JEB BUSH,3
3,APPLING2,REP,JEB BUSH,3
4,APPLING3A,REP,JEB BUSH,0
...,...,...,...,...
48581,WORTHSHINGLER,DEM,MICHAEL STEINBERG,1
48582,WORTHRED ROCK,DEM,MICHAEL STEINBERG,0
48583,WORTHDOLES,DEM,MICHAEL STEINBERG,0
48584,WORTHOAKFIELD,DEM,MICHAEL STEINBERG,0


In [314]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
JEB BUSH             2858
BEN CARSON           2858
MICHAEL STEINBERG    2858
BERNIE SANDERS       2858
MARTIN O'MALLEY      2858
HILLARY CLINTON      2858
DONALD J. TRUMP      2858
RICK SANTORUM        2858
MARCO RUBIO          2858
RAND PAUL            2858
GEORGE PATAKI        2858
MIKE HUCKABEE        2858
LINDSEY GRAHAM       2858
CARLY FIORINA        2858
TED CRUZ             2858
CHRIS CHRISTIE       2858
JOHN R. KASICH       2830
JOHN R.KASICH          28
Name: count, dtype: int64

In [315]:
# Cleaning Candidates

# Turning all primary data to uppercase
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

unwanted_keywords = [
    r"\w*VOTE\w*",
    r"\w*UNCOM\w*",
    r"\w*TOTAL\w*"
]

pattern = "|".join(unwanted_keywords)

# Assuming candidate column is already string and uppercase
primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# Fixing the , candidate
primary_data["candidate"] = (
    primary_data["candidate"].str.split(",")
    .str[0]
    .str.strip()
    )

# Fixing De la Fuente
primary_data.loc[
    primary_data["candidate"].str.contains("\w*Fuen\w*", case=False, na=False),
    "candidate"
] = "LA FUENTE"

#Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# Fixing Huckabee
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
    "candidate"
] = "HUCKABEE"

# Fixing Fiorina
primary_data.loc[
    primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
    "candidate"
] = "FIORINA"

# Fixing Gray
primary_data.loc[
    primary_data["candidate"].str.contains("GREY", case=False, na=False),
    "candidate"
] = "GRAY"

# Fixing Wilson
primary_data.loc[
    primary_data["candidate"].str.contains("WISON", case=False, na=False),
    "candidate"
] = "WILSON"

# Fixing Kasich
primary_data.loc[
    primary_data["candidate"].str.contains("\w*KASICH\W*", case=False, na=False),
    "candidate"
] = "KASICH"

# Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()


candidate
BUSH         2858
PAUL         2858
SANDERS      2858
O'MALLEY     2858
CLINTON      2858
TRUMP        2858
SANTORUM     2858
RUBIO        2858
PATAKI       2858
CARSON       2858
KASICH       2858
HUCKABEE     2858
GRAHAM       2858
FIORINA      2858
CRUZ         2858
CHRISTIE     2858
STEINBERG    2858
Name: count, dtype: int64

In [316]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
REP    37154
DEM    11432
Name: count, dtype: int64

In [317]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv("data/master_primary_candidate_party.csv")
# general_data = fill_party_from_master(general_data, master_party_df)

# STEP 3: Update master file with new (candidate, party) pairs
# update_master_candidate_party(general_data, "data/master_candidate_party.csv")
primary_data["party"].value_counts(dropna=False)


party
REP    37154
DEM    11432
Name: count, dtype: int64

In [318]:
# Cleaning Party
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )
# primary_data.loc[:,"party"] = (
#     primary_data["party"]
#     .replace({
#         "REPUBLICAN PARTY": "REP",
#         "R": "REP",
#         "Republican":"REP",
#         "DEMOCRATIC PARTY": "DEM",
#         "D": "DEM",
#         "Democratic": "DEM"
#     })
#     .fillna("IND")
# )
primary_data["party"].value_counts(dropna=False)

party
REP    37154
DEM    11432
Name: count, dtype: int64

In [319]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_STEINBERG,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,APPLING1B,23,1,12,1,5,34,0,108,1,0,3,12,0,0,54,0,165
1,APPLING1C,14,1,5,0,3,28,0,88,0,1,2,9,0,2,45,0,125
2,APPLING2,284,3,41,1,3,28,0,45,0,0,0,7,0,0,27,0,96
3,APPLING3A,1,0,3,0,0,10,0,41,0,0,0,4,0,0,13,0,48
4,APPLING3A1,11,0,6,0,3,26,0,68,0,0,2,4,0,0,18,0,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2852,WORTHSHINGLER,19,0,8,1,3,12,0,50,0,0,0,4,0,1,20,0,111
2853,WORTHSUMNER,27,0,6,0,1,22,0,56,0,0,1,1,0,0,10,0,135
2854,WORTHSYLVER EAST,46,1,16,1,7,36,0,83,1,0,2,17,1,1,51,0,188
2855,WORTHSYLVESTER,314,0,38,1,6,14,1,82,1,0,2,16,1,0,26,0,154


In [320]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]   
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]              
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]  
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]    
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President of the United States"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [321]:
# cleaning 
# Fixing Canyon De Chelly
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BARROW08 FIRST BAPTIST CHURCH WINDER\w*", case=False, na=False),
    "precinct")
] = "BARROW08 HOLSENBECK ELEMENTARY SCHOOL"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BROOKSBARWICK\w*", case=False, na=False),
    "precinct")
] = "BROOKSBARWICK/DRYLAKE"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BROOKSBRIGGS\w*", case=False, na=False),
    "precinct")
] = "BROOKSE.BROOKS/BRIGGS"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BROOKSNANKIN\w*", case=False, na=False),
    "precinct")
] = "BROOKSS.BROOKS/NANKIN"


gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BRYANPUBLIC SAFETY\w*", case=False, na=False),
    "precinct")
] = "BRYANPUBLIC SAFETY COMPLEX"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BRYANRH RECREATION\w*", case=False, na=False),
    "precinct")
] = "BRYANRH REC COMPLEX"


gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*CHARLTONFOLKSTON FIRE\w*", case=False, na=False),
    "precinct")
] = "CHARLTONFOLKSTON FIRE STATION"


county_name_chatham = "CHATHAM"

gen_combined_df.loc[
    gen_combined_df["precinct"].str.contains(county_name_chatham, case=False, na=False),
    "precinct"
] = gen_combined_df["precinct"].str.extract(fr"^({county_name_chatham}[^\s]*)", flags=re.IGNORECASE)[0]


gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*BROOKSPAVO\w*", case=False, na=False),
    "precinct")
] = "BROOKSPAVO/WILLIAMS"

county_name2 = "CRAWFORD"

gen_combined_df.loc[
    gen_combined_df["precinct"].str.contains(county_name2, case=False, na=False),
    "precinct"
] = gen_combined_df["precinct"].str.extract(fr"^({county_name2}[^\s]*)", flags=re.IGNORECASE)[0]



gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*FULTONJC18\w*", case=False, na=False),
    "precinct")
] = "FULTONJC18A"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*FULTONJC18\w*", case=False, na=False),
    "precinct")
] = "FULTONJC18A"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*FULTONJC13\w*", case=False, na=False),
    "precinct")
] = "FULTONJC13A"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*FULTONJC15\w*", case=False, na=False),
    "precinct")
] = "FULTONJC15A"


gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*GWINNETT035 CATES D - 04\w*", case=False, na=False),
    "precinct")
] = "GWINNETTCATES D - 04"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*SUMTERANDERSONV\w*", case=False, na=False),
    "precinct")
] = "SUMTERANDERSONVILLE"


gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*TELFAIRLUMBER-CITY\w*", case=False, na=False),
    "precinct")
] = "TELFAIRLUMBER CITY"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*APACHETEEC NOS POS\w*", case=False, na=False),
    "precinct")
] = "APACHETEEC"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*APACHEWIDE RUINS\w*", case=False, na=False),
    "precinct")
] = "APACHEWIDE"

# APACHEWINDOW ROCK
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*APACHEWINDOW ROCK\w*", case=False, na=False),
    "precinct")
] = "APACHEWINDOW ROCK"
# LA PAZ058 QUARTZSITE TWO, THREE
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*LA PAZ058 QUARTZSITE TWO\w*", case=False, na=False),
    "precinct")
] = "LA PAZ058 QUARTZSITE 2"
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*LA PAZ058 QUARTZSITE THREE\w*", case=False, na=False),
    "precinct")
] = "LA PAZ058 QUARTZSITE 3"

# MOHAVE219-LAKE HAVASU NORTH N., S.
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*MOHAVE219-LAKE HAVASU NORTH\w*", case=False, na=False),
    "precinct")
] = "MOHAVE219-LAKE HAVASU N."

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*MOHAVE219-LAKE HAVASU SOUTH\w*", case=False, na=False),
    "precinct")
] = "MOHAVE219-LAKE HAVASU S."

# PINALAPACHE JUNCTION E. CENTRAL,32,0,4,74
# PINALAPACHE JUNCTION N. CENTRAL,56,1,5,120
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*PINALAPACHE JUNCTION E. CENTRAL\w*", case=False, na=False),
    "precinct")
] = "PINALAPACHE JUNCTION"

gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*PINALAPACHE JUNCTION N. CENTRAL\w*", case=False, na=False),
    "precinct")
] = "PINALAPACHE JUNCTION N. CENTRA"

# PINALCASA GRANDE S CENTRAL
gen_combined_df.loc[
    ( 
        gen_combined_df["precinct"].str.contains("\w*PINALCASA GRANDE S CENTRAL\w*", case=False, na=False),
    "precinct")
] = "PINALCASA GRANDE S CEN"

In [322]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,APPLINGNAN,REP,DONALD J. TRUMP,5494
1,APPLING1B,REP,DONALD J. TRUMP,685
2,APPLING1C,REP,DONALD J. TRUMP,496
3,APPLING2,REP,DONALD J. TRUMP,427
4,APPLING3A,REP,DONALD J. TRUMP,209
...,...,...,...,...
8551,WORTHSHINGLER,,GARY JOHNSON,4
8552,WORTHRED ROCK,,GARY JOHNSON,8
8553,WORTHDOLES,,GARY JOHNSON,4
8554,WORTHOAKFIELD,,GARY JOHNSON,1


In [323]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
DONALD J. TRUMP    2852
HILLARY CLINTON    2852
GARY JOHNSON       2852
Name: count, dtype: int64

In [324]:
# Cleaning Candidates

# Turning all primary data to uppercase
# general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"\w*WRITE[\s-]\w*",
#     r"NAN",
#     r"UNCERTIFIED",
#     r"UNVERIFIED",
#     r"NONE OF THE ABOVE",
#     r"LBT",
#     r"DEM",
#     r"REP",
#     r"GRN",
#     r"NAN"

# ]

# pattern = "|".join(unwanted_keywords)

# # FILTER OUT TRASH WORDS
# general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

# general_data.loc[
#     general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
#     "candidate"
# ] = "MATUREN"

# # Fixing mixed president + vice_president by /
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# # Fixing the , candidate
# # general_data["candidate"] = (
# #     general_data["candidate"].str.split(",")
# #     .str[0]
# #     .str.strip()
# #     )



# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# # Fixing Kotlikoff
# general_data.loc[
#     general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
#     "candidate"
# ] = "KOTLIKOFF"

# # Fixing Valdivia
# general_data.loc[
#     general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
#     "candidate"
# ] = "VALDIVIA"

# # Fixing HOEFLING
# general_data.loc[
#     general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
#     "candidate"
# ] = "HOEFLING"


# # Fixing McMullin
# general_data.loc[
#     general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
#     "candidate"
# ] = "MCMULLIN"

general_data["candidate"].value_counts(dropna=False)

candidate
DONALD J. TRUMP    2852
HILLARY CLINTON    2852
GARY JOHNSON       2852
Name: count, dtype: int64

In [325]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
NaN    8511
REP      15
DEM      15
LIB      15
Name: count, dtype: int64

In [326]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\master_candidate_party.csv") # USE YOUR OWN ADDRESS
#  general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(


party
REP    2852
DEM    2852
LIB    2852
Name: count, dtype: int64

In [327]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
REP    2852
DEM    2852
LIB    2852
Name: count, dtype: int64

In [328]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [329]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_dem_CLINTON,gen_lib_JOHNSON,gen_rep_TRUMP
0,APPLING1B,92,9,685
1,APPLING1C,42,5,496
2,APPLING2,742,10,427
3,APPLING3A,4,4,209
4,APPLING3A1,17,1,319
...,...,...,...,...
2841,WORTHSHINGLER,47,4,337
2842,WORTHSUMNER,91,3,418
2843,WORTHSYLVER EAST,162,25,664
2844,WORTHSYLVESTER,905,16,562


In [330]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Save combined
combined.to_csv("TX.csv", index=False)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [331]:
combined.to_csv("GA.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("GA_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("GA_general_filtered.csv", index=False)

In [332]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 2857, general: 2846, combined: 2821


In [333]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 159, general: 159
