In [2]:
import pandas as pd
import glob
import os
from pprint import pprint

In [3]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\GA\2016\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [4]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160119__ga__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160216__ga__special__general__runoff__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160329__ga__special__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160426__ga__special__general__runoff__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__appling__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__atkinson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__bacon__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__baker__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__baldwin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__general__banks__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20161108__ga__gene

In [5]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__appling__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__atkinson__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__bacon__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__baker__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__baldwin__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__banks__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__barrow__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__bartow__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__president__ben_hill__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\GA\2016\20160301__ga__primary__pre

In [6]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)



In [22]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data = primary_data[~primary_data["candidate"].isin(["YES", "NO"])]
primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,Appling1B,REP,JEB BUSH,5
1,Appling1C,REP,JEB BUSH,3
2,Appling2,REP,JEB BUSH,3
3,Appling3A,REP,JEB BUSH,0
4,Appling3A1,REP,JEB BUSH,3
...,...,...,...,...
47555,WorthShingler,DEM,MICHAEL STEINBERG,1
47556,WorthRed Rock,DEM,MICHAEL STEINBERG,0
47557,WorthDoles,DEM,MICHAEL STEINBERG,0
47558,WorthOakfield,DEM,MICHAEL STEINBERG,0


In [23]:
primary_data["party"].value_counts(dropna=False)

party
REP    35100
DEM    10800
Name: count, dtype: int64

In [9]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'JEB BUSH': 'REP', 'BEN CARSON': 'REP', 'CHRIS CHRISTIE': 'REP', 'TED CRUZ': 'REP', 'CARLY FIORINA': 'REP', 'LINDSEY GRAHAM': 'REP', 'MIKE HUCKABEE': 'REP', 'JOHN R. KASICH': 'REP', 'GEORGE PATAKI': 'REP', 'RAND PAUL': 'REP', 'MARCO RUBIO': 'REP', 'RICK SANTORUM': 'REP', 'DONALD J. TRUMP': 'REP', 'HILLARY CLINTON': 'DEM', "MARTIN O'MALLEY": 'DEM', 'BERNIE SANDERS': 'DEM', 'MICHAEL STEINBERG': 'DEM', 'JOHN R.KASICH': 'REP'}


In [19]:
primary_data["candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_STEINBERG,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_R.KASICH,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,Appling1B,23,1,12,1,5,34,0,108,1,0,3,12,0,0,0,54,0,165
1,Appling1C,14,1,5,0,3,28,0,88,0,1,2,9,0,2,0,45,0,125
2,Appling2,284,3,41,1,3,28,0,45,0,0,0,7,0,0,0,27,0,96
3,Appling3A,1,0,3,0,0,10,0,41,0,0,0,4,0,0,0,13,0,48
4,Appling3A1,11,0,6,0,3,26,0,68,0,0,2,4,0,0,0,18,0,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,WorthShingler,19,0,8,1,3,12,0,50,0,0,0,4,0,1,0,20,0,111
2696,WorthSumner,27,0,6,0,1,22,0,56,0,0,1,1,0,0,0,10,0,135
2697,WorthSylver East,46,1,16,1,7,36,0,83,1,0,2,17,1,1,0,51,0,188
2698,WorthSylvester,314,0,38,1,6,14,1,82,1,0,2,16,1,0,0,26,0,154


In [11]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]

        if 'office' in df.columns:
            df = df[df["office"] == "President of the United States"]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes,election_day,absentee,early_voting1,early_voting2,provisional,early_voting,absentee_by_mail,advance_in_person,advance_in_person_1,advance_in_person_2,advance_in_person_3
0,Appling,Appling1B,President of the United States,,REP,DONALD J. TRUMP,685,354,,,,1,,54.0,276.0,,,
1,Appling,Appling1C,President of the United States,,REP,DONALD J. TRUMP,496,282,,,,0,,40.0,174.0,,,
2,Appling,Appling2,President of the United States,,REP,DONALD J. TRUMP,427,219,,,,0,,37.0,171.0,,,
3,Appling,Appling3A,President of the United States,,REP,DONALD J. TRUMP,209,156,,,,1,,5.0,47.0,,,
4,Appling,Appling3A1,President of the United States,,REP,DONALD J. TRUMP,319,196,,,,0,,25.0,98.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8074,Worth,WorthShingler,President of the United States,,,GARY JOHNSON,4,1,,,,0,,1.0,2.0,,,
8075,Worth,WorthRed Rock,President of the United States,,,GARY JOHNSON,8,5,,,,0,,1.0,2.0,,,
8076,Worth,WorthDoles,President of the United States,,,GARY JOHNSON,4,2,,,,0,,0.0,2.0,,,
8077,Worth,WorthOakfield,President of the United States,,,GARY JOHNSON,1,1,,,,0,,0.0,0.0,,,


In [18]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
general_data["party"] = general_data.apply(
    lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
    axis=1
)
general_data = general_data[general_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

general_data

Unnamed: 0,precinct,party,candidate,votes
0,Appling1B,REP,DONALD J. TRUMP,685
1,Appling1C,REP,DONALD J. TRUMP,496
2,Appling2,REP,DONALD J. TRUMP,427
3,Appling3A,REP,DONALD J. TRUMP,209
4,Appling3A1,REP,DONALD J. TRUMP,319
...,...,...,...,...
8059,WorthShingler,DEM,HILLARY CLINTON,47
8060,WorthRed Rock,DEM,HILLARY CLINTON,135
8061,WorthDoles,DEM,HILLARY CLINTON,26
8062,WorthOakfield,DEM,HILLARY CLINTON,45


In [13]:
general_data["party"].value_counts(dropna=False)


party
REP    2693
DEM    2693
Name: count, dtype: int64

In [14]:
general_data

Unnamed: 0,precinct,party,candidate,votes
0,Appling1B,REP,DONALD J. TRUMP,685
1,Appling1C,REP,DONALD J. TRUMP,496
2,Appling2,REP,DONALD J. TRUMP,427
3,Appling3A,REP,DONALD J. TRUMP,209
4,Appling3A1,REP,DONALD J. TRUMP,319
...,...,...,...,...
8059,WorthShingler,DEM,HILLARY CLINTON,47
8060,WorthRed Rock,DEM,HILLARY CLINTON,135
8061,WorthDoles,DEM,HILLARY CLINTON,26
8062,WorthOakfield,DEM,HILLARY CLINTON,45


In [15]:
general_data["candidate_column"] = (
    "gen_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CLINTON,gen_dem_O'MALLEY,gen_dem_SANDERS,gen_dem_STEINBERG,gen_rep_BUSH,gen_rep_CARSON,gen_rep_CHRISTIE,gen_rep_CRUZ,gen_rep_FIORINA,gen_rep_GRAHAM,gen_rep_HUCKABEE,gen_rep_KASICH,gen_rep_PATAKI,gen_rep_PAUL,gen_rep_RUBIO,gen_rep_SANTORUM,gen_rep_TRUMP
0,Appling1B,0,0,0,0,685,92,0,0,0,0,0,0,0,0,0,0,0
1,Appling1C,0,0,0,0,496,42,0,0,0,0,0,0,0,0,0,0,0
2,Appling2,0,0,0,0,427,742,0,0,0,0,0,0,0,0,0,0,0
3,Appling3A,0,0,0,0,209,4,0,0,0,0,0,0,0,0,0,0,0
4,Appling3A1,0,0,0,0,319,17,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2548,WorthShingler,0,0,0,0,384,0,0,0,0,0,0,0,0,0,0,0,0
2549,WorthSumner,0,0,0,0,509,0,0,0,0,0,0,0,0,0,0,0,0
2550,WorthSylver East,0,0,0,0,826,0,0,0,0,0,0,0,0,0,0,0,0
2551,WorthSylvester,0,0,0,0,1467,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
combined = pd.merge(result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_STEINBERG,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,...,gen_rep_HUCKABEE,gen_rep_KASICH,gen_rep_PATAKI,gen_rep_PAUL,gen_rep_RUBIO,gen_rep_SANTORUM,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,Appling1B,23,1,12,1,5,34,0,108,1,...,0,0,0,0,0,0,0,382,37,777
1,Appling1C,14,1,5,0,3,28,0,88,0,...,0,0,0,0,0,0,0,303,20,538
2,Appling2,284,3,41,1,3,28,0,45,0,...,0,0,0,0,0,0,0,206,329,1169
3,Appling3A,1,0,3,0,0,10,0,41,0,...,0,0,0,0,0,0,0,116,4,213
4,Appling3A1,11,0,6,0,3,26,0,68,0,...,0,0,0,0,0,0,0,180,17,336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,WorthShingler,19,0,8,1,3,12,0,50,0,...,0,0,0,0,0,0,0,201,28,384
2287,WorthSumner,27,0,6,0,1,22,0,56,0,...,0,0,0,0,0,0,0,226,33,509
2288,WorthSylver East,46,1,16,1,7,36,0,83,1,...,0,0,0,0,0,0,0,387,64,826
2289,WorthSylvester,314,0,38,1,6,14,1,82,1,...,0,0,0,0,0,0,0,303,353,1467


In [17]:
combined.to_csv("GA.csv", index=False)
