In [1]:
import pandas as pd
import glob
import os
from pprint import pprint

In [2]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\TN\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\TN\20161108__tn__general__precinct.csv


In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\TN\20160301__tn__primary__president__precinct.csv


In [5]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,ANDERSON,ANDERSONANDERSONVILLE,Presidential Preference,,Republican,Jeb Bush,2
1,ANDERSON,ANDERSONANDERSONVILLE,Presidential Preference,,Republican,Rand Paul,1
2,ANDERSON,ANDERSONANDERSONVILLE,Presidential Preference,,Democratic,Hillary Clinton,64
3,ANDERSON,ANDERSONBRICEVILLE,Presidential Preference,,Republican,Jeb Bush,0
4,ANDERSON,ANDERSONBRICEVILLE,Presidential Preference,,Republican,Rand Paul,0
...,...,...,...,...,...,...,...
38161,COFFEE,COFFEE19 CD STAMPS CENTER,Public Defender,14.0,Democratic,No Candidate Qualified,0
38162,COFFEE,COFFEE20 SR CITIZENS CENTER,Public Defender,14.0,Republican,John E. Nicoll,191
38163,COFFEE,COFFEE20 SR CITIZENS CENTER,Public Defender,14.0,Democratic,No Candidate Qualified,0
38164,COFFEE,COFFEE21 WILSON AVE,Public Defender,14.0,Republican,John E. Nicoll,202


In [6]:
pri_combined_df["party"].value_counts(dropna=False)

party
Republican    30143
Democratic     8023
Name: count, dtype: int64

In [7]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDERSONANDERSONVILLE,REP,Jeb Bush,2
1,ANDERSONANDERSONVILLE,REP,Rand Paul,1
2,ANDERSONANDERSONVILLE,DEM,Hillary Clinton,64
3,ANDERSONBRICEVILLE,REP,Jeb Bush,0
4,ANDERSONBRICEVILLE,REP,Rand Paul,0
...,...,...,...,...
38161,COFFEE19 CD STAMPS CENTER,DEM,No Candidate Qualified,0
38162,COFFEE20 SR CITIZENS CENTER,REP,John E. Nicoll,191
38163,COFFEE20 SR CITIZENS CENTER,DEM,No Candidate Qualified,0
38164,COFFEE21 WILSON AVE,REP,John E. Nicoll,202


In [8]:
primary_data["party"].value_counts(dropna=False)

party
REP    30143
DEM     8023
Name: count, dtype: int64

In [9]:
#Remove Write-in and Registered Voters
primary_data = primary_data[~primary_data["candidate"].isin(["No Candidate Qualified", "Uncommitted"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush              2008
Rand Paul             2008
George Pataki         2008
John R. Kasich        2008
Mike Huckabee         2008
Lindsey O. Graham     2008
Jim Gilmore           2008
Carly Fiorina         2008
Donald J. Trump       2008
Ted Cruz              2008
Rick Santorum         2008
Chris Christie        2008
Marco Rubio           2008
Ben Carson            2008
Bernie Sanders        2000
Martin J. O'Malley    2000
Hillary Clinton       2000
John E. Nicoll          23
Name: count, dtype: int64

In [10]:
# primary_data["candidate"].unique()
# candidate_party_map = (
#     primary_data.dropna(subset=["candidate", "party"])
#                 .set_index("candidate")["party"]
#                 .to_dict()
# )
# print(candidate_party_map)

In [11]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_NICOLL,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ANDERSONANDERSONVILLE,64,1,39,2,55,3,170,0,0,0,4,24,0,0,1,120,0,304
1,ANDERSONBRICEVILLE,11,1,10,0,4,0,44,0,0,0,0,2,0,0,0,10,0,58
2,ANDERSONBULL RUN,106,5,102,7,79,1,196,1,1,0,1,31,0,0,3,162,1,316
3,ANDERSONCLAXTON,94,3,58,5,40,2,187,2,0,0,2,23,0,0,1,118,1,282
4,ANDERSONCLINTON,159,2,94,10,77,0,154,2,0,0,4,44,0,0,3,201,1,326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2006,WILSON24-1,129,1,88,8,42,0,135,1,0,0,3,31,0,1,1,123,1,203
2007,WILSON25-1,86,0,55,6,28,0,192,0,0,0,1,19,0,0,1,122,0,219
2008,WILSON25-2,63,1,31,3,20,1,182,0,0,0,0,23,0,0,2,81,0,192
2009,WILSONABSENTEE,99,4,41,27,10,7,60,2,1,1,0,13,0,0,2,44,0,108


In [12]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
            
        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Anderson,ANDERSONANDERSONVILLE,President,,Republican,Donald J. Trump,1222.0
1,Anderson,ANDERSONBRICEVILLE,President,,Republican,Donald J. Trump,243.0
2,Anderson,ANDERSONBULL RUN,President,,Republican,Donald J. Trump,1397.0
3,Anderson,ANDERSONCLINTON,President,,Republican,Donald J. Trump,1330.0
4,Anderson,ANDERSONCLINTON HIGH,President,,Republican,Donald J. Trump,792.0
...,...,...,...,...,...,...,...
16179,Wilson,WILSON1-1,President,,Independent,Write-In - Tom Hoefling,1.0
16180,Wilson,WILSON11-1,President,,Independent,Write-In - Tom Hoefling,2.0
16181,Wilson,WILSON14-1,President,,Independent,Write-In - Tom Hoefling,1.0
16182,Wilson,WILSON16-1,President,,Independent,Write-In - Tom Hoefling,1.0


In [13]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})
general_data["party"].value_counts(dropna=False)


party
REP    10806
DEM     5378
Name: count, dtype: int64

In [14]:
general_data["candidate"].value_counts(dropna=False)

candidate
Donald J. Trump                  2022
Hillary Clinton                  2022
"Rocky" Roque De La Fuente       2022
Gary Johnson                     2022
Alyson Kennedy                   2022
Mike Smith                       2022
Jill Stein                       2022
Write-In - Evan McMullin         1276
Write-In - Darrell L. Castle      623
Write-In - Tom Hoefling            74
Write-In - David Limbaugh          31
Write-In - Laurence Kotlikoff      18
Write-In - Cherunda Fox             6
Write-In - Marshall Schoenke        2
Name: count, dtype: int64

In [15]:
general_data = general_data[~general_data["candidate"].isin(["Registered Voters"])] 
general_data["candidate"].value_counts(dropna=False)

candidate
Donald J. Trump                  2022
Hillary Clinton                  2022
"Rocky" Roque De La Fuente       2022
Gary Johnson                     2022
Alyson Kennedy                   2022
Mike Smith                       2022
Jill Stein                       2022
Write-In - Evan McMullin         1276
Write-In - Darrell L. Castle      623
Write-In - Tom Hoefling            74
Write-In - David Limbaugh          31
Write-In - Laurence Kotlikoff      18
Write-In - Cherunda Fox             6
Write-In - Marshall Schoenke        2
Name: count, dtype: int64

In [16]:

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ANDERSONANDERSONVILLE,REP,Donald J. Trump,1222.0
1,ANDERSONBRICEVILLE,REP,Donald J. Trump,243.0
2,ANDERSONBULL RUN,DEM,Donald J. Trump,1397.0
3,ANDERSONCLINTON,REP,Donald J. Trump,1330.0
4,ANDERSONCLINTON HIGH,REP,Donald J. Trump,792.0
...,...,...,...,...
16179,WILSON1-1,REP,Write-In - Tom Hoefling,1.0
16180,WILSON11-1,REP,Write-In - Tom Hoefling,2.0
16181,WILSON14-1,DEM,Write-In - Tom Hoefling,1.0
16182,WILSON16-1,REP,Write-In - Tom Hoefling,1.0


In [17]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CASTLE,gen_dem_CLINTON,gen_dem_FOX,gen_dem_FUENTE,gen_dem_HOEFLING,gen_dem_JOHNSON,gen_dem_KENNEDY,gen_dem_KOTLIKOFF,gen_dem_LIMBAUGH,...,gen_rep_HOEFLING,gen_rep_JOHNSON,gen_rep_KENNEDY,gen_rep_KOTLIKOFF,gen_rep_LIMBAUGH,gen_rep_MCMULLIN,gen_rep_SCHOENKE,gen_rep_SMITH,gen_rep_STEIN,gen_rep_TRUMP
0,ANDERSONANDERSONVILLE,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,32.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1222.0
1,ANDERSONBRICEVILLE,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,243.0
2,ANDERSONBULL RUN,0.0,424.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,76.0,2.0,0.0,0.0,2.0,0.0,13.0,6.0,0.0
3,ANDERSONCLAXTON,3.0,387.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,53.0,0.0,0.0,0.0,5.0,0.0,8.0,5.0,0.0
4,ANDERSONCLINTON,0.0,0.0,0.0,5.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,1330.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017,WILSON8-1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1512.0
2018,WILSON9-1,0.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,36.0,1.0,0.0,0.0,1.0,0.0,9.0,22.0,852.0
2019,WILSON9-2,0.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,9.0,0.0
2020,WILSONABSENTEE,0.0,0.0,0.0,4.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,14.0,0.0,9.0,7.0,0.0


In [18]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,...,gen_rep_KOTLIKOFF,gen_rep_LIMBAUGH,gen_rep_MCMULLIN,gen_rep_SCHOENKE,gen_rep_SMITH,gen_rep_STEIN,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,ANDERSONANDERSONVILLE,64,1,39,2,55,3,170,0,0,...,0.0,0.0,3.0,0.0,0.0,0.0,1222.0,683,104,1500.0
1,ANDERSONBRICEVILLE,11,1,10,0,4,0,44,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,243.0,118,22,294.0
2,ANDERSONBULL RUN,106,5,102,7,79,1,196,1,1,...,0.0,0.0,2.0,0.0,13.0,6.0,0.0,799,213,1930.0
3,ANDERSONCLAXTON,94,3,58,5,40,2,187,2,0,...,0.0,0.0,5.0,0.0,8.0,5.0,0.0,663,155,1782.0
4,ANDERSONCLINTON,159,2,94,10,77,0,154,2,0,...,0.0,0.0,4.0,0.0,0.0,0.0,1330.0,822,255,1971.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698,WILSON24-1,129,1,88,8,42,0,135,1,0,...,0.0,0.0,4.0,0.0,7.0,14.0,1097.0,549,218,1601.0
1699,WILSON25-1,86,0,55,6,28,0,192,0,0,...,0.0,0.0,13.0,0.0,1.0,13.0,0.0,588,141,1583.0
1700,WILSON25-2,63,1,31,3,20,1,182,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,809.0,504,95,1164.0
1701,WILSONABSENTEE,99,4,41,27,10,7,60,2,1,...,0.0,0.0,14.0,0.0,9.0,7.0,0.0,275,144,1142.0


In [19]:
combined.to_csv("TN.csv", index=False)
