In [1]:
import pandas as pd
import glob
import os
from pprint import pprint

In [2]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\SC\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\SC\20161108__sc__general__precinct.csv


In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\SC\20160227__sc__primary__president__precinct.csv


In [6]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,candidate,party,votes
0,Saluda,SALUDACENTENNIAL,PRESIDENT,,Hillary Clinton,DEM,15
1,Saluda,SALUDACLYDE,PRESIDENT,,Hillary Clinton,DEM,0
2,Saluda,SALUDADELMAR,PRESIDENT,,Hillary Clinton,DEM,8
3,Saluda,SALUDAFRUIT HILL,PRESIDENT,,Hillary Clinton,DEM,130
4,Saluda,SALUDAHIGGINS-ZOAR,PRESIDENT,,Hillary Clinton,DEM,58
...,...,...,...,...,...,...,...
9566,Beaufort,BEAUFORTABSENTEE,PRESIDENT,,Willie Wilson,DEM,7
9567,Beaufort,BEAUFORTEMERGENCY,PRESIDENT,,Willie Wilson,DEM,0
9568,Beaufort,BEAUFORTFAILSAFE,PRESIDENT,,Willie Wilson,DEM,0
9569,Beaufort,BEAUFORTPROVISIONAL,PRESIDENT,,Willie Wilson,DEM,1


In [7]:
pri_combined_df["party"].value_counts(dropna=False)

party
DEM    9571
Name: count, dtype: int64

In [8]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
# primary_data.loc[:, "party"] = primary_data["party"].replace({
#     "President Republican Party": "REP",
#     "President Democratic Party": "DEM",
#     "Democratic Party": "DEM",
#     "Republican Party": "REP"
# })

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,SALUDACENTENNIAL,DEM,Hillary Clinton,15
1,SALUDACLYDE,DEM,Hillary Clinton,0
2,SALUDADELMAR,DEM,Hillary Clinton,8
3,SALUDAFRUIT HILL,DEM,Hillary Clinton,130
4,SALUDAHIGGINS-ZOAR,DEM,Hillary Clinton,58
...,...,...,...,...
9566,BEAUFORTABSENTEE,DEM,Willie Wilson,7
9567,BEAUFORTEMERGENCY,DEM,Willie Wilson,0
9568,BEAUFORTFAILSAFE,DEM,Willie Wilson,0
9569,BEAUFORTPROVISIONAL,DEM,Willie Wilson,1


In [9]:
primary_data["party"].value_counts(dropna=False)

party
DEM    9571
Name: count, dtype: int64

In [10]:
#Remove Write-in and Registered Voters
primary_data = primary_data[~primary_data["candidate"].isin(["Registered Voters", "Write-In"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton    2455
Bernie Sanders     2454
Willie Wilson      2337
Martin O'Malley    2325
Name: count, dtype: int64

In [11]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'Hillary Clinton': 'DEM', "Martin O'Malley": 'DEM', 'Bernie Sanders': 'DEM', 'Willie Wilson': 'DEM'}


In [12]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON
0,ABBEVILLEABBEVILLE NO. 1,104,1,28,1
1,ABBEVILLEABBEVILLE NO. 2,193,0,24,1
2,ABBEVILLEABBEVILLE NO. 3,105,1,25,0
3,ABBEVILLEABBEVILLE NO. 4,53,0,5,0
4,ABBEVILLEABSENTEE,354,0,52,4
...,...,...,...,...,...
2450,YORKWATERSTONE,61,0,54,0
2451,YORKWINDJAMMER,79,1,87,0
2452,YORKWYLIE,37,0,29,0
2453,YORKYORK NO. 1,139,0,41,1


In [14]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Abbeville,ABBEVILLEABBEVILLE NO. 1,President,,DEM,Hillary Rodham Clinton / Timothy Michael Kaine,245
1,Abbeville,ABBEVILLEABBEVILLE NO. 2,President,,DEM,Hillary Rodham Clinton / Timothy Michael Kaine,373
2,Abbeville,ABBEVILLEABBEVILLE NO. 3,President,,DEM,Hillary Rodham Clinton / Timothy Michael Kaine,214
3,Abbeville,ABBEVILLEABBEVILLE NO. 4,President,,DEM,Hillary Rodham Clinton / Timothy Michael Kaine,131
4,Abbeville,ABBEVILLEANTREVILLE,President,,DEM,Hillary Rodham Clinton / Timothy Michael Kaine,105
...,...,...,...,...,...,...,...
14682,York,YORKFAILSAFE 2,President,,LIB,Gary Johnson / Bill Weld,1
14683,York,YORKPROVISIONAL 1,President,,LIB,Gary Johnson / Bill Weld,3
14684,York,YORKPROVISIONAL 2,President,,LIB,Gary Johnson / Bill Weld,4
14685,York,YORKPROVISIONAL 3,President,,LIB,Gary Johnson / Bill Weld,1


In [15]:

general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
REP    2456
DEM    2455
LIB    2310
IND    2161
GRN    2108
CON    1736
AMR    1178
AME     150
AMC     133
Name: count, dtype: int64

In [16]:
general_data["candidate"].value_counts(dropna=False)

candidate
Donald J Trump / Michael R Pence                  2456
Hillary Rodham Clinton / Timothy Michael Kaine    2455
Gary Johnson / Bill Weld                          2310
Evan McMullin / Nathan Johnson                    2161
Jill Stein / Ajamu Baraka                         2108
Darrell Castle / Scott Bradley                    1736
Peter Skewes / Michael Lacy                       1461
Name: count, dtype: int64

In [17]:
general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()
general_data = general_data[~general_data["candidate"].isin(["Registered Voters", "Write-In"])] 
general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()


candidate
Donald J Trump            2456
Hillary Rodham Clinton    2455
Gary Johnson              2310
Evan McMullin             2161
Jill Stein                2108
Darrell Castle            1736
Peter Skewes              1461
Name: count, dtype: int64

In [18]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
# def fill_party_from_general_data(row, df):
#     if pd.notna(row["party"]):
#         return row["party"]
    
#     # Try to find other rows with the same candidate and known party
#     matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
#     if not matches.empty:
#         return matches["party"].iloc[0]  # Return the first match's party
#     else:
#         return None  # Still unknown

# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_general_data(row, general_data),
#     axis=1
# )
# general_data["party"] = (
#     general_data["party"]
#     .replace({
#         "Democratic": "DEM",
#         "Republican": "REP",
#         "Libertarian": "LIB",
#         "Green": "GRN",
#         "Constitution": "CON",
#         "American Delta": "AMD",
#         "Prohibition": "PRO",
#         "Socialism & Liberation": "SOL",
#         "Socialist Workers": "SOW"
#     })
#     .fillna("IND")
# )

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ABBEVILLEABBEVILLE NO. 1,DEM,Hillary Rodham Clinton,245
1,ABBEVILLEABBEVILLE NO. 2,DEM,Hillary Rodham Clinton,373
2,ABBEVILLEABBEVILLE NO. 3,DEM,Hillary Rodham Clinton,214
3,ABBEVILLEABBEVILLE NO. 4,DEM,Hillary Rodham Clinton,131
4,ABBEVILLEANTREVILLE,DEM,Hillary Rodham Clinton,105
...,...,...,...,...
14682,YORKFAILSAFE 2,LIB,Gary Johnson,1
14683,YORKPROVISIONAL 1,LIB,Gary Johnson,3
14684,YORKPROVISIONAL 2,LIB,Gary Johnson,4
14685,YORKPROVISIONAL 3,LIB,Gary Johnson,1


In [19]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_amc_SKEWES,gen_ame_SKEWES,gen_amr_SKEWES,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_MCMULLIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,ABBEVILLEABBEVILLE NO. 1,0,0,2,2,245,1,4,21,610
1,ABBEVILLEABBEVILLE NO. 2,0,0,1,1,373,0,2,7,303
2,ABBEVILLEABBEVILLE NO. 3,0,0,2,3,214,2,3,5,284
3,ABBEVILLEABBEVILLE NO. 4,0,0,0,2,131,1,2,5,279
4,ABBEVILLEABSENTEE,0,0,2,12,1270,7,8,29,1439
...,...,...,...,...,...,...,...,...,...,...
2459,YORKWATERSTONE,2,0,0,4,419,11,16,44,602
2460,YORKWINDJAMMER,2,0,0,5,382,8,12,41,699
2461,YORKWYLIE,1,0,0,4,168,2,5,25,411
2462,YORKYORK NO. 1,1,0,0,0,405,7,7,14,309


In [21]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,gen_amc_SKEWES,gen_ame_SKEWES,gen_amr_SKEWES,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_MCMULLIN,gen_lib_JOHNSON,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,ABBEVILLEABBEVILLE NO. 1,104,1,28,1,0,0,2,2,245,1,4,21,610,0,134,885
1,ABBEVILLEABBEVILLE NO. 2,193,0,24,1,0,0,1,1,373,0,2,7,303,0,218,687
2,ABBEVILLEABBEVILLE NO. 3,105,1,25,0,0,0,2,3,214,2,3,5,284,0,131,513
3,ABBEVILLEABBEVILLE NO. 4,53,0,5,0,0,0,0,2,131,1,2,5,279,0,58,420
4,ABBEVILLEABSENTEE,354,0,52,4,0,0,2,12,1270,7,8,29,1439,0,410,2767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2291,YORKWATERSTONE,61,0,54,0,2,0,0,4,419,11,16,44,602,0,115,1098
2292,YORKWINDJAMMER,79,1,87,0,2,0,0,5,382,8,12,41,699,0,167,1149
2293,YORKWYLIE,37,0,29,0,1,0,0,4,168,2,5,25,411,0,66,616
2294,YORKYORK NO. 1,139,0,41,1,1,0,0,0,405,7,7,14,309,0,181,743


In [22]:
combined.to_csv("SC.csv", index=False)
