In [32]:
import pandas as pd
import glob
import os
from pprint import pprint

In [33]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\DE\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [34]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\DE\20161108__de__general__precinct.csv


In [35]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\DE\20160426__de__primary__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\DE\20160913__de__primary__precinct.csv


In [36]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["election_district"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,election_district,office,district,party,candidate,election_day,absentee,votes,precinct
0,New Castle,01-01,President,,DEMOCRATIC,Clinton H,603,22,625,NEW CASTLE01-01
1,New Castle,01-01,President,,DEMOCRATIC,De La Fuen,3,0,3,NEW CASTLE01-01
2,New Castle,01-01,President,,DEMOCRATIC,Sanders B,286,2,288,NEW CASTLE01-01
3,New Castle,01-01,President,,REPUBLICAN,Bush J,1,0,1,NEW CASTLE01-01
4,New Castle,02-01,President,,DEMOCRATIC,Clinton H,437,16,453,NEW CASTLE02-01
...,...,...,...,...,...,...,...,...,...,...
2821,Sussex,Total,President,,REPUBLICAN,Carson B,865,20,885,SUSSEXTOTAL
2822,Sussex,Total,President,,REPUBLICAN,Cruz R,10621,489,11110,SUSSEXTOTAL
2823,Sussex,Total,President,,REPUBLICAN,Kasich J,13747,478,14225,SUSSEXTOTAL
2824,Sussex,Total,President,,REPUBLICAN,Rubio M,590,32,622,SUSSEXTOTAL


In [37]:
pri_combined_df["party"].value_counts(dropna=False)

party
REPUBLICAN    1884
DEMOCRATIC     942
Name: count, dtype: int64

In [38]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,NEW CASTLE01-01,DEMOCRATIC,Clinton H,625
1,NEW CASTLE01-01,DEMOCRATIC,De La Fuen,3
2,NEW CASTLE01-01,DEMOCRATIC,Sanders B,288
3,NEW CASTLE01-01,REPUBLICAN,Bush J,1
4,NEW CASTLE02-01,DEMOCRATIC,Clinton H,453
...,...,...,...,...
2821,SUSSEXTOTAL,REPUBLICAN,Carson B,885
2822,SUSSEXTOTAL,REPUBLICAN,Cruz R,11110
2823,SUSSEXTOTAL,REPUBLICAN,Kasich J,14225
2824,SUSSEXTOTAL,REPUBLICAN,Rubio M,622


In [39]:
primary_data.loc[:,"party"] = (
    primary_data["party"]
    .replace({
        "DEMOCRATIC": "DEM",
        "REPUBLICAN": "REP"
    })
    .fillna("IND")
)

primary_data["party"].value_counts(dropna=False)

party
REP    1884
DEM     942
Name: count, dtype: int64

In [40]:
primary_data["candidate"].value_counts(dropna=False)

candidate
Clinton H     314
De La Fuen    314
Sanders B     314
Bush J        314
Carson B      314
Cruz R        314
Kasich J      314
Rubio M       314
Trump D       314
Name: count, dtype: int64

In [41]:
# Cleaning Candidates

primary_data.loc[
    primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
    "candidate"
] = "FUENTE D"

primary_data.loc[:,"candidate"] = (
    primary_data["candidate"].str.split().str[0].str.upper()
)

# Fixing De La Fuente





primary_data["candidate"].value_counts(dropna=False)

candidate
CLINTON    314
FUENTE     314
SANDERS    314
BUSH       314
CARSON     314
CRUZ       314
KASICH     314
RUBIO      314
TRUMP      314
Name: count, dtype: int64

In [42]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'CLINTON': 'DEM', 'FUENTE': 'DEM', 'SANDERS': 'DEM', 'BUSH': 'REP', 'CARSON': 'REP', 'CRUZ': 'REP', 'KASICH': 'REP', 'RUBIO': 'REP', 'TRUMP': 'REP'}


In [43]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP
0,KENT01-28,119,4,72,2,2,25,9,1,78
1,KENT01-29,273,7,156,0,2,53,33,1,162
2,KENT01-30,32,9,26,1,4,22,9,1,104
3,KENT01-31,33,0,27,0,0,5,2,0,21
4,KENT01-32,281,5,205,3,4,58,30,2,147
...,...,...,...,...,...,...,...,...,...,...
309,SUSSEX10-20,34,0,20,0,1,4,8,0,61
310,SUSSEX10-36,10,0,6,0,0,0,3,0,22
311,SUSSEX11-36,39,0,11,0,2,8,11,0,41
312,SUSSEX16-41,10,0,7,0,0,0,0,0,1


In [45]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["election_district"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,election_district,office,district,party,candidate,election_day,absentee,votes,precinct
0,New Castle,01-01,President,,DEMOCRATIC,Clinton H,636,43,679,NEW CASTLE01-01
1,New Castle,01-01,President,,REPUBLICAN,Trump D,64,4,68,NEW CASTLE01-01
2,New Castle,01-01,President,,GREEN,Stein J,8,0,8,NEW CASTLE01-01
3,New Castle,01-01,President,,LIBERTARIN,Johnson G,11,1,12,NEW CASTLE01-01
4,New Castle,02-01,President,,DEMOCRATIC,Clinton H,1125,52,1177,NEW CASTLE02-01
...,...,...,...,...,...,...,...,...,...,...
1727,Sussex,16-41,President,,LIBERTARIN,Johnson G,0,0,0,SUSSEX16-41
1728,Sussex,Total,President,,DEMOCRATIC,Clinton H,221608,13995,235603,SUSSEXTOTAL
1729,Sussex,Total,President,,REPUBLICAN,Trump D,175162,9965,185127,SUSSEXTOTAL
1730,Sussex,Total,President,,GREEN,Stein J,5868,235,6103,SUSSEXTOTAL


In [46]:

general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data


Unnamed: 0,precinct,party,candidate,votes
0,NEW CASTLE01-01,DEMOCRATIC,Clinton H,679
1,NEW CASTLE01-01,REPUBLICAN,Trump D,68
2,NEW CASTLE01-01,GREEN,Stein J,8
3,NEW CASTLE01-01,LIBERTARIN,Johnson G,12
4,NEW CASTLE02-01,DEMOCRATIC,Clinton H,1177
...,...,...,...,...
1727,SUSSEX16-41,LIBERTARIN,Johnson G,0
1728,SUSSEXTOTAL,DEMOCRATIC,Clinton H,235603
1729,SUSSEXTOTAL,REPUBLICAN,Trump D,185127
1730,SUSSEXTOTAL,GREEN,Stein J,6103


In [47]:
general_data["candidate"].value_counts(dropna=False)

candidate
Clinton H    433
Trump D      433
Stein J      433
Johnson G    433
Name: count, dtype: int64

In [48]:
# Cleaning general candidate

general_data.loc[:,"candidate"] = (
    general_data["candidate"].str.split().str[0].str.upper()
)


general_data["candidate"].value_counts(dropna=False)

candidate
CLINTON    433
TRUMP      433
STEIN      433
JOHNSON    433
Name: count, dtype: int64

In [49]:
general_data["party"].value_counts(dropna=False)

party
DEMOCRATIC    433
REPUBLICAN    433
GREEN         433
LIBERTARIN    433
Name: count, dtype: int64

In [50]:

general_data["party"] = (
    general_data["party"]
    .replace({
        "LIBERTARIN": "LIB",
        "DEMOCRATIC": "DEM",
        "REPUBLICAN":"REP",
        "GREEN": "GRN"
    })
    .fillna("IND")
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
DEM    433
REP    433
GRN    433
LIB    433
Name: count, dtype: int64

In [51]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,KENT01-28,513,18,55,435
1,KENT01-29,1190,22,74,857
2,KENT01-30,149,10,39,487
3,KENT01-31,183,6,19,69
4,KENT01-32,616,9,33,248
...,...,...,...,...,...
428,SUSSEX10-20,78,5,6,187
429,SUSSEX10-36,38,0,4,85
430,SUSSEX11-36,110,6,12,153
431,SUSSEX16-41,46,3,0,11


In [52]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,dem_primary_total,general_total
0,KENT01-28,119,4,72,2,2,25,9,1,78,513,18,55,435,195,1021
1,KENT01-29,273,7,156,0,2,53,33,1,162,1190,22,74,857,436,2143
2,KENT01-30,32,9,26,1,4,22,9,1,104,149,10,39,487,67,685
3,KENT01-31,33,0,27,0,0,5,2,0,21,183,6,19,69,60,277
4,KENT01-32,281,5,205,3,4,58,30,2,147,616,9,33,248,491,906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,SUSSEX10-20,34,0,20,0,1,4,8,0,61,78,5,6,187,54,276
310,SUSSEX10-36,10,0,6,0,0,0,3,0,22,38,0,4,85,16,127
311,SUSSEX11-36,39,0,11,0,2,8,11,0,41,110,6,12,153,50,281
312,SUSSEX16-41,10,0,7,0,0,0,0,0,1,46,3,0,11,17,60


In [54]:
combined.to_csv("DE.csv", index=False)
