In [1]:
import pandas as pd
import glob
import os
from pprint import pprint

In [4]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MS\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [5]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__special__general__state_senate__25__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160607__ms__special__general__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160628__ms__special__general__runoff__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160823__ms__special__general__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160913__ms__special__general__runoff__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161108__ms__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__106__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__89__precinct.csv


In [6]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__primary__precinct.csv


In [18]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
            
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# pri_combined_df["votes"] = pri_combined_df["votes"].astype(int)
pri_combined_df


Unnamed: 0,county,precinct,office,district,candidate,party,votes
0,Adams,"Dist. 1, Bellemont",President,,Jeb Bush,Republican,6.0
1,Adams,"Dist. 1, Bellemont",President,,Ben Carson,Republican,4.0
2,Adams,"Dist. 1, Bellemont",President,,Chris Christie,Republican,0.0
3,Adams,"Dist. 1, Bellemont",President,,Ted Cruz,Republican,224.0
4,Adams,"Dist. 1, Bellemont",President,,Carly Florina,Republican,0.0
...,...,...,...,...,...,...,...
33331,Yazoo,Total,President,,Hillary Clinton,Democrat,2360.0
33332,Yazoo,Total,President,,"Roque ""Rocky"" De Le Fuente",Democrat,6.0
33333,Yazoo,Total,President,,Martin O'Mailey,Democrat,4.0
33334,Yazoo,Total,President,,Bernie Sanders,Democrat,208.0


In [19]:
pri_combined_df["party"].value_counts(dropna=False)

party
Republican    24076
Democrat       8785
Democratic      475
Name: count, dtype: int64

In [28]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Democrat":"DEM",
    "Republican": "REP"
})
# primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,"Dist. 1, Bellemont",REP,Jeb Bush,6.0
1,"Dist. 1, Bellemont",REP,Ben Carson,4.0
2,"Dist. 1, Bellemont",REP,Chris Christie,0.0
3,"Dist. 1, Bellemont",REP,Ted Cruz,224.0
4,"Dist. 1, Bellemont",REP,Carly Florina,0.0
...,...,...,...,...
33331,Total,DEM,Hillary Clinton,2360.0
33332,Total,DEM,"Roque ""Rocky"" De Le Fuente",6.0
33333,Total,DEM,Martin O'Mailey,4.0
33334,Total,DEM,Bernie Sanders,208.0


In [29]:
primary_data["party"].value_counts(dropna=False)

party
REP    24076
DEM     9260
Name: count, dtype: int64

In [30]:
#Remove Uncommited, Overvotes, Undervotes
primary_data = primary_data[~primary_data["candidate"].isin(["Uncommitted", "Overvotes", "Undervotes"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush                      1852
Carly Florina                 1852
Lindsey Graham                1852
Mike Huckabee                 1852
Bernie Sanders                1852
Ben Carson                    1852
Rand Paul                     1852
Hillary Clinton               1852
Rick Santorum                 1852
Donald J. Trump               1840
Ted Cruz                      1840
Willie Wilson                 1814
Marco Rubio                   1814
George Pataki                 1814
Roque "Rocky" De Le Fuente    1719
Martin O'Mailey               1719
John R. Kasich                1655
Chris Christie                1153
Chris Christle                 699
John R. Kaisch                 159
Martin O'Malley                133
Roque "Rocky" De La Fuente      62
John R Kasich                   38
George Patakl                   38
Macro Rublo                     38
Roque 'Rocky' De La Fuente      38
Wille Wilson                    38
Roque "Rocky" De La Fuque       33
Yed Cruz  

In [31]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'Jeb Bush': 'REP', 'Ben Carson': 'REP', 'Chris Christie': 'REP', 'Ted Cruz': 'REP', 'Carly Florina': 'REP', 'Lindsey Graham': 'REP', 'Mike Huckabee': 'REP', 'John R. Kasich': 'REP', 'George Pataki': 'REP', 'Rand Paul': 'REP', 'Marco Rubio': 'REP', 'Rick Santorum': 'REP', 'Donald J. Trump': 'REP', 'Hillary Clinton': 'DEM', 'Roque "Rocky" De La Fuente': 'DEM', "Martin O'Malley": 'DEM', 'Bernie Sanders': 'DEM', 'Willie Wilson': 'DEM', 'Roque "Rocky" De Le Fuente': 'DEM', "Martin O'Mailey": 'DEM', 'Chris Christle': 'REP', 'Roque "Rocky" De La Fuque': 'DEM', 'John R. Kaisch': 'REP', 'John R Kasich': 'REP', 'George Patakl': 'REP', 'Macro Rublo': 'REP', "Roque 'Rocky' De La Fuente": 'DEM', 'Wille Wilson': 'DEM', 'Yed Cruz': 'REP', 'Donald I. Trump': 'REP'}


In [32]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_FUQUE,pri_dem_O'MAILEY,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,pri_rep_CARSON,...,pri_rep_HUCKABEE,pri_rep_KAISCH,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PATAKL,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_RUBLO,pri_rep_SANTORUM,pri_rep_TRUMP
0,(01) New Hope Baptist Church,210.0,1.0,0.0,1.0,0.0,26.0,1.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
1,(02) S MCCOMB BAPTIST CHURCH,84.0,1.0,0.0,0.0,0.0,10.0,0.0,0.0,1.0,...,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
2,"(03) First Bapt. Church, Summit",112.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0,16.0
3,(04) Mlk Center,256.0,0.0,0.0,1.0,0.0,22.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,(05) American Leg. Hut,44.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,2.0,...,2.0,0.0,47.0,1.0,0.0,0.0,14.0,0.0,0.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1688,Zion Hill,35.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,0.0,85.0
1689,Zion Hill-New Haven,47.0,1.0,0.0,1.0,0.0,14.0,1.0,1.0,5.0,...,0.0,0.0,5.0,0.0,0.0,0.0,10.0,0.0,0.0,171.0
1690,Zion Ridge,197.0,1.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,18.0
1691,Zion Z,27.0,0.0,0.0,1.0,0.0,11.0,0.0,2.0,7.0,...,1.0,0.0,7.0,0.0,0.0,0.0,7.0,0.0,0.0,74.0


In [33]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Adams,"Dist. 1, Bellemont",President,,Democrat,Hillary Clinton,442
1,Adams,"Dist. 1, Bellemont",President,,Republican,Donald J. Trump,1090
2,Adams,"Dist. 1, Bellemont",President,,Constitution,Darrell Castle,6
3,Adams,"Dist. 1, Bellemont",President,,American Delta,Rocky' Roque De La Fuente,1
4,Adams,"Dist. 1, Bellemont",President,,Prohibition,Jim Hedges,1
...,...,...,...,...,...,...,...
12959,Yazoo,Zion,President,,Constitution,Darrell Castle,3
12960,Yazoo,Zion,President,,American Delta,Roque 'Rocky' De La Fuente,0
12961,Yazoo,Zion,President,,Prohibition,Jim Hedges,0
12962,Yazoo,Zion,President,,Libertarian,Gary Johnson,1


In [34]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
Democrat          1852
Republican        1852
Constitution      1852
American Delta    1852
Prohibition       1852
Libertarian       1852
Green             1852
Name: count, dtype: int64

In [35]:
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton               1852
Donald J. Trump               1852
Darrell Castle                1852
Jim Hedges                    1852
Gary Johnson                  1852
Jill Stein                    1852
Rocky' Roque De La Fuente     1256
Roque 'Rocky' De La Fuente     596
Name: count, dtype: int64

In [36]:
# remove WRITE-IN
general_data = general_data[~general_data["candidate"].isin(["WRITE-IN"])] 
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton               1852
Donald J. Trump               1852
Darrell Castle                1852
Jim Hedges                    1852
Gary Johnson                  1852
Jill Stein                    1852
Rocky' Roque De La Fuente     1256
Roque 'Rocky' De La Fuente     596
Name: count, dtype: int64

In [37]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
general_data.loc[:, "party"] = general_data["party"].replace({
    "Democrat": "DEM",
    "Republican": "REP"
})
general_data = general_data[general_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic
general_data

Unnamed: 0,precinct,party,candidate,votes
0,"Dist. 1, Bellemont",DEM,Hillary Clinton,442
1,"Dist. 1, Bellemont",REP,Donald J. Trump,1090
7,"Dist. 1, By-Pass Fire",DEM,Hillary Clinton,504
8,"Dist. 1, By-Pass Fire",REP,Donald J. Trump,241
14,"Dist. 1, Courthouse",DEM,Hillary Clinton,171
...,...,...,...,...
12944,West Bentonia,REP,Donald J. Trump,221
12950,West Midway,DEM,Hillary Clinton,25
12951,West Midway,REP,Donald J. Trump,151
12957,Zion,DEM,Hillary Clinton,54


In [38]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CLINTON,gen_rep_TRUMP
0,(01) New Hope Baptist Church,580,45
1,(02) S MCCOMB BAPTIST CHURCH,223,65
2,"(03)First Bapt. Church, Summit",301,63
3,(04) Mlk Center,647,13
4,(05) American Leg. Hut,164,325
...,...,...,...
1714,Zion Hill,61,303
1715,Zion Hill-New Haven,132,516
1716,Zion Ridge,404,45
1717,Zion Z,107,299


In [39]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_FUQUE,pri_dem_O'MAILEY,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,pri_rep_CARSON,...,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_RUBLO,pri_rep_SANTORUM,pri_rep_TRUMP,gen_dem_CLINTON,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,(01) New Hope Baptist Church,210.0,1.0,0.0,1.0,0.0,26.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,12.0,580,45,21.0,239.0,625
1,(02) S MCCOMB BAPTIST CHURCH,84.0,1.0,0.0,0.0,0.0,10.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,30.0,223,65,43.0,95.0,288
2,(04) Mlk Center,256.0,0.0,0.0,1.0,0.0,22.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,647,13,4.0,279.0,660
3,(05) American Leg. Hut,44.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,2.0,...,0.0,14.0,0.0,0.0,96.0,164,325,221.0,62.0,489
4,(06) S. Pike Comm. Ctr.,115.0,1.0,0.0,0.0,0.0,11.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,2.0,270,15,5.0,127.0,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463,Zion,25.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,...,0.0,5.0,0.0,0.0,56.0,54,198,125.0,29.0,252
1464,Zion Hill,35.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,85.0,61,303,173.0,42.0,364
1465,Zion Hill-New Haven,47.0,1.0,0.0,1.0,0.0,14.0,1.0,1.0,5.0,...,0.0,10.0,0.0,0.0,171.0,132,516,273.0,64.0,648
1466,Zion Ridge,197.0,1.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,18.0,404,45,26.0,218.0,449


In [40]:
combined.to_csv("MS.csv", index=False)
