In [2]:
import pandas as pd
import glob
import os
from pprint import pprint

In [5]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\TN\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [6]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\TN\20161108__tn__general__precinct.csv


In [7]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\TN\20160301__tn__primary__president__precinct.csv


In [10]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,ANDERSON,ANDERSONAndersonville,Presidential Preference,,Republican,Jeb Bush,2
1,ANDERSON,ANDERSONAndersonville,Presidential Preference,,Republican,Rand Paul,1
2,ANDERSON,ANDERSONAndersonville,Presidential Preference,,Democratic,Hillary Clinton,64
3,ANDERSON,ANDERSONBriceville,Presidential Preference,,Republican,Jeb Bush,0
4,ANDERSON,ANDERSONBriceville,Presidential Preference,,Republican,Rand Paul,0
...,...,...,...,...,...,...,...
38161,COFFEE,COFFEE19 CD Stamps Center,Public Defender,14.0,Democratic,No Candidate Qualified,0
38162,COFFEE,COFFEE20 Sr Citizens Center,Public Defender,14.0,Republican,John E. Nicoll,191
38163,COFFEE,COFFEE20 Sr Citizens Center,Public Defender,14.0,Democratic,No Candidate Qualified,0
38164,COFFEE,COFFEE21 Wilson Ave,Public Defender,14.0,Republican,John E. Nicoll,202


In [11]:
pri_combined_df["party"].value_counts(dropna=False)

party
Republican    30143
Democratic     8023
Name: count, dtype: int64

In [13]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})
primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDERSONAndersonville,REP,Jeb Bush,2
1,ANDERSONAndersonville,REP,Rand Paul,1
2,ANDERSONAndersonville,DEM,Hillary Clinton,64
3,ANDERSONBriceville,REP,Jeb Bush,0
4,ANDERSONBriceville,REP,Rand Paul,0
...,...,...,...,...
38161,COFFEE19 CD Stamps Center,DEM,No Candidate Qualified,0
38162,COFFEE20 Sr Citizens Center,REP,John E. Nicoll,191
38163,COFFEE20 Sr Citizens Center,DEM,No Candidate Qualified,0
38164,COFFEE21 Wilson Ave,REP,John E. Nicoll,202


In [14]:
primary_data["party"].value_counts(dropna=False)

party
REP    30143
DEM     8023
Name: count, dtype: int64

In [18]:
#Remove Write-in and Registered Voters
primary_data = primary_data[~primary_data["candidate"].isin(["No Candidate Qualified", "Uncommitted"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush              2008
Rand Paul             2008
George Pataki         2008
John R. Kasich        2008
Mike Huckabee         2008
Lindsey O. Graham     2008
Jim Gilmore           2008
Carly Fiorina         2008
Donald J. Trump       2008
Ted Cruz              2008
Rick Santorum         2008
Chris Christie        2008
Marco Rubio           2008
Ben Carson            2008
Bernie Sanders        2000
Martin J. O'Malley    2000
Hillary Clinton       2000
John E. Nicoll          23
Name: count, dtype: int64

In [None]:
# primary_data["candidate"].unique()
# candidate_party_map = (
#     primary_data.dropna(subset=["candidate", "party"])
#                 .set_index("candidate")["party"]
#                 .to_dict()
# )
# print(candidate_party_map)

{'Jeb Bush': 'REP', 'Rand Paul': 'REP', 'Hillary Clinton': 'DEM', 'Ben Carson': 'REP', 'Marco Rubio': 'REP', "Martin J. O'Malley": 'DEM', 'Chris Christie': 'REP', 'Rick Santorum': 'REP', 'Bernie Sanders': 'DEM', 'Ted Cruz': 'REP', 'Donald J. Trump': 'REP', 'Carly Fiorina': 'REP', 'Jim Gilmore': 'REP', 'Lindsey O. Graham': 'REP', 'Mike Huckabee': 'REP', 'John R. Kasich': 'REP', 'George Pataki': 'REP', 'John E. Nicoll': 'REP'}


In [20]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_NICOLL,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ANDERSONAndersonville,64,1,39,2,55,3,170,0,0,0,4,24,0,0,1,120,0,304
1,ANDERSONBriceville,11,1,10,0,4,0,44,0,0,0,0,2,0,0,0,10,0,58
2,ANDERSONBull Run,106,5,102,7,79,1,196,1,1,0,1,31,0,0,3,162,1,316
3,ANDERSONClaxton,94,3,58,5,40,2,187,2,0,0,2,23,0,0,1,118,1,282
4,ANDERSONClinton,159,2,94,10,77,0,154,2,0,0,4,44,0,0,3,201,1,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,WILSON24-1,129,1,88,8,42,0,135,1,0,0,3,31,0,1,1,123,1,203
2007,WILSON25-1,86,0,55,6,28,0,192,0,0,0,1,19,0,0,1,122,0,219
2008,WILSON25-2,63,1,31,3,20,1,182,0,0,0,0,23,0,0,2,81,0,192
2009,WILSONAbsentee,99,4,41,27,10,7,60,2,1,1,0,13,0,0,2,44,0,108


In [23]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Anderson,AndersonAndersonville,President,,Republican,Donald J. Trump,1222.0
1,Anderson,AndersonBriceville,President,,Republican,Donald J. Trump,243.0
2,Anderson,AndersonBull Run,President,,Republican,Donald J. Trump,1397.0
3,Anderson,AndersonClinton,President,,Republican,Donald J. Trump,1330.0
4,Anderson,AndersonClinton High,President,,Republican,Donald J. Trump,792.0
...,...,...,...,...,...,...,...
16179,Wilson,Wilson1-1,President,,Independent,Write-In - Tom Hoefling,1.0
16180,Wilson,Wilson11-1,President,,Independent,Write-In - Tom Hoefling,2.0
16181,Wilson,Wilson14-1,President,,Independent,Write-In - Tom Hoefling,1.0
16182,Wilson,Wilson16-1,President,,Independent,Write-In - Tom Hoefling,1.0


In [31]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})
general_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic
general_data["party"].value_counts(dropna=False)


party
REP    28135
DEM     6000
Name: count, dtype: int64

In [33]:
general_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush              2008
Rand Paul             2008
George Pataki         2008
John R. Kasich        2008
Mike Huckabee         2008
Lindsey O. Graham     2008
Jim Gilmore           2008
Carly Fiorina         2008
Donald J. Trump       2008
Ted Cruz              2008
Rick Santorum         2008
Chris Christie        2008
Marco Rubio           2008
Ben Carson            2008
Bernie Sanders        2000
Martin J. O'Malley    2000
Hillary Clinton       2000
John E. Nicoll          23
Name: count, dtype: int64

In [34]:
general_data = general_data[~general_data["candidate"].isin(["Registered Voters", "Write-In"])] 
general_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush              2008
Rand Paul             2008
George Pataki         2008
John R. Kasich        2008
Mike Huckabee         2008
Lindsey O. Graham     2008
Jim Gilmore           2008
Carly Fiorina         2008
Donald J. Trump       2008
Ted Cruz              2008
Rick Santorum         2008
Chris Christie        2008
Marco Rubio           2008
Ben Carson            2008
Bernie Sanders        2000
Martin J. O'Malley    2000
Hillary Clinton       2000
John E. Nicoll          23
Name: count, dtype: int64

In [35]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
# general_data = general_data[general_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic
general_data

Unnamed: 0,precinct,party,candidate,votes,candidate_column
0,ANDERSONAndersonville,REP,Jeb Bush,2,pri_rep_BUSH
1,ANDERSONAndersonville,REP,Rand Paul,1,pri_rep_PAUL
2,ANDERSONAndersonville,DEM,Hillary Clinton,64,pri_dem_CLINTON
3,ANDERSONBriceville,REP,Jeb Bush,0,pri_rep_BUSH
4,ANDERSONBriceville,REP,Rand Paul,0,pri_rep_PAUL
...,...,...,...,...,...
38156,COFFEE17 Hands On Science,REP,John E. Nicoll,61,pri_rep_NICOLL
38158,COFFEE18 First Christian,REP,John E. Nicoll,177,pri_rep_NICOLL
38160,COFFEE19 CD Stamps Center,REP,John E. Nicoll,63,pri_rep_NICOLL
38162,COFFEE20 Sr Citizens Center,REP,John E. Nicoll,191,pri_rep_NICOLL


In [36]:
general_data["candidate_column"] = (
    "gen_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CLINTON,gen_dem_O'MALLEY,gen_dem_SANDERS,gen_rep_BUSH,gen_rep_CARSON,gen_rep_CHRISTIE,gen_rep_CRUZ,gen_rep_FIORINA,gen_rep_GILMORE,gen_rep_GRAHAM,gen_rep_HUCKABEE,gen_rep_KASICH,gen_rep_NICOLL,gen_rep_PATAKI,gen_rep_PAUL,gen_rep_RUBIO,gen_rep_SANTORUM,gen_rep_TRUMP
0,ANDERSONAndersonville,64,1,39,2,55,3,170,0,0,0,4,24,0,0,1,120,0,304
1,ANDERSONBriceville,11,1,10,0,4,0,44,0,0,0,0,2,0,0,0,10,0,58
2,ANDERSONBull Run,106,5,102,7,79,1,196,1,1,0,1,31,0,0,3,162,1,316
3,ANDERSONClaxton,94,3,58,5,40,2,187,2,0,0,2,23,0,0,1,118,1,282
4,ANDERSONClinton,159,2,94,10,77,0,154,2,0,0,4,44,0,0,3,201,1,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,WILSON24-1,129,1,88,8,42,0,135,1,0,0,3,31,0,1,1,123,1,203
2007,WILSON25-1,86,0,55,6,28,0,192,0,0,0,1,19,0,0,1,122,0,219
2008,WILSON25-2,63,1,31,3,20,1,182,0,0,0,0,23,0,0,2,81,0,192
2009,WILSONAbsentee,99,4,41,27,10,7,60,2,1,1,0,13,0,0,2,44,0,108


In [37]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,...,gen_rep_KASICH,gen_rep_NICOLL,gen_rep_PATAKI,gen_rep_PAUL,gen_rep_RUBIO,gen_rep_SANTORUM,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,ANDERSONAndersonville,64,1,39,2,55,3,170,0,0,...,24,0,0,1,120,0,304,683,104,787
1,ANDERSONBriceville,11,1,10,0,4,0,44,0,0,...,2,0,0,0,10,0,58,118,22,140
2,ANDERSONBull Run,106,5,102,7,79,1,196,1,1,...,31,0,0,3,162,1,316,799,213,1012
3,ANDERSONClaxton,94,3,58,5,40,2,187,2,0,...,23,0,0,1,118,1,282,663,155,818
4,ANDERSONClinton,159,2,94,10,77,0,154,2,0,...,44,0,0,3,201,1,326,822,255,1077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,WILSON24-1,129,1,88,8,42,0,135,1,0,...,31,0,1,1,123,1,203,549,218,767
2007,WILSON25-1,86,0,55,6,28,0,192,0,0,...,19,0,0,1,122,0,219,588,141,729
2008,WILSON25-2,63,1,31,3,20,1,182,0,0,...,23,0,0,2,81,0,192,504,95,599
2009,WILSONAbsentee,99,4,41,27,10,7,60,2,1,...,13,0,0,2,44,0,108,275,144,419


In [38]:
combined.to_csv("TN.csv", index=False)
