In [14]:
import pandas as pd
import glob
import os
from pprint import pprint

In [15]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MS\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [16]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__special__general__state_senate__25__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160607__ms__special__general__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160628__ms__special__general__runoff__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160823__ms__special__general__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160913__ms__special__general__runoff__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161108__ms__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__106__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__89__precinct.csv


In [17]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__primary__precinct.csv


In [18]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
            
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# pri_combined_df["votes"] = pri_combined_df["votes"].astype(int)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,candidate,party,votes
0,Adams,"DIST. 1, BELLEMONT",President,,Jeb Bush,Republican,6.0
1,Adams,"DIST. 1, BELLEMONT",President,,Ben Carson,Republican,4.0
2,Adams,"DIST. 1, BELLEMONT",President,,Chris Christie,Republican,0.0
3,Adams,"DIST. 1, BELLEMONT",President,,Ted Cruz,Republican,224.0
4,Adams,"DIST. 1, BELLEMONT",President,,Carly Florina,Republican,0.0
...,...,...,...,...,...,...,...
33331,Yazoo,TOTAL,President,,Hillary Clinton,Democrat,2360.0
33332,Yazoo,TOTAL,President,,"Roque ""Rocky"" De Le Fuente",Democrat,6.0
33333,Yazoo,TOTAL,President,,Martin O'Mailey,Democrat,4.0
33334,Yazoo,TOTAL,President,,Bernie Sanders,Democrat,208.0


In [19]:
pri_combined_df["party"].value_counts(dropna=False)

party
Republican    24076
Democrat       8785
Democratic      475
Name: count, dtype: int64

In [20]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Democrat":"DEM",
    "Republican": "REP"
})
# primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,"DIST. 1, BELLEMONT",REP,Jeb Bush,6.0
1,"DIST. 1, BELLEMONT",REP,Ben Carson,4.0
2,"DIST. 1, BELLEMONT",REP,Chris Christie,0.0
3,"DIST. 1, BELLEMONT",REP,Ted Cruz,224.0
4,"DIST. 1, BELLEMONT",REP,Carly Florina,0.0
...,...,...,...,...
33331,TOTAL,DEM,Hillary Clinton,2360.0
33332,TOTAL,DEM,"Roque ""Rocky"" De Le Fuente",6.0
33333,TOTAL,DEM,Martin O'Mailey,4.0
33334,TOTAL,DEM,Bernie Sanders,208.0


In [21]:
primary_data["party"].value_counts(dropna=False)

party
REP    24076
DEM     9260
Name: count, dtype: int64

In [22]:
#Remove Uncommited, Overvotes, Undervotes
primary_data = primary_data[~primary_data["candidate"].isin(["Uncommitted", "Overvotes", "Undervotes"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush                      1852
Carly Florina                 1852
Lindsey Graham                1852
Mike Huckabee                 1852
Bernie Sanders                1852
Ben Carson                    1852
Rand Paul                     1852
Hillary Clinton               1852
Rick Santorum                 1852
Donald J. Trump               1840
Ted Cruz                      1840
Willie Wilson                 1814
Marco Rubio                   1814
George Pataki                 1814
Roque "Rocky" De Le Fuente    1719
Martin O'Mailey               1719
John R. Kasich                1655
Chris Christie                1153
Chris Christle                 699
John R. Kaisch                 159
Martin O'Malley                133
Roque "Rocky" De La Fuente      62
John R Kasich                   38
George Patakl                   38
Macro Rublo                     38
Roque 'Rocky' De La Fuente      38
Wille Wilson                    38
Roque "Rocky" De La Fuque       33
Yed Cruz  

In [23]:
# Fixing the Rocky La Fuente Error, Donald I. Trump
primary_data.loc[
    (
        primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
        primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False)
    ),
    "candidate"
] = "La Fuente"


primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush                     1852
Rand Paul                    1852
Bernie Sanders               1852
Ben Carson                   1852
Donald J. Trump              1852
Rick Santorum                1852
Hillary Clinton              1852
Mike Huckabee                1852
Lindsey Graham               1852
Carly Florina                1852
Ted Cruz                     1840
La Fuente                    1819
Willie Wilson                1814
George Pataki                1814
Marco Rubio                  1814
Martin O'Mailey              1719
John R. Kasich               1655
Chris Christie               1153
Chris Christle                699
John R. Kaisch                159
Martin O'Malley               133
John R Kasich                  38
George Patakl                  38
Macro Rublo                    38
Wille Wilson                   38
Roque "Rocky" De La Fuque      33
Yed Cruz                       12
Name: count, dtype: int64

In [24]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'Jeb Bush': 'REP', 'Ben Carson': 'REP', 'Chris Christie': 'REP', 'Ted Cruz': 'REP', 'Carly Florina': 'REP', 'Lindsey Graham': 'REP', 'Mike Huckabee': 'REP', 'John R. Kasich': 'REP', 'George Pataki': 'REP', 'Rand Paul': 'REP', 'Marco Rubio': 'REP', 'Rick Santorum': 'REP', 'Donald J. Trump': 'REP', 'Hillary Clinton': 'DEM', 'La Fuente': 'DEM', "Martin O'Malley": 'DEM', 'Bernie Sanders': 'DEM', 'Willie Wilson': 'DEM', "Martin O'Mailey": 'DEM', 'Chris Christle': 'REP', 'Roque "Rocky" De La Fuque': 'DEM', 'John R. Kaisch': 'REP', 'John R Kasich': 'REP', 'George Patakl': 'REP', 'Macro Rublo': 'REP', 'Wille Wilson': 'DEM', 'Yed Cruz': 'REP'}


In [25]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_FUQUE,pri_dem_O'MAILEY,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,pri_rep_CARSON,...,pri_rep_HUCKABEE,pri_rep_KAISCH,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PATAKL,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_RUBLO,pri_rep_SANTORUM,pri_rep_TRUMP
0,(01) NEW HOPE BAPTIST CHURCH,210.0,1.0,0.0,1.0,0.0,26.0,1.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0
1,(02) S MCCOMB BAPTIST CHURCH,84.0,1.0,0.0,0.0,0.0,10.0,0.0,0.0,1.0,...,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
2,"(03) FIRST BAPT. CHURCH, SUMMIT",112.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0,16.0
3,(04) MLK CENTER,256.0,0.0,0.0,1.0,0.0,22.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,(05) AMERICAN LEG. HUT,44.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,2.0,...,2.0,0.0,47.0,1.0,0.0,0.0,14.0,0.0,0.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,ZION,25.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,...,1.0,0.0,8.0,0.0,0.0,0.0,5.0,0.0,0.0,56.0
1679,ZION HILL,35.0,0.0,0.0,2.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,0.0,85.0
1680,ZION HILL-NEW HAVEN,47.0,1.0,0.0,1.0,0.0,14.0,1.0,1.0,5.0,...,0.0,0.0,5.0,0.0,0.0,0.0,10.0,0.0,0.0,171.0
1681,ZION RIDGE,197.0,1.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,18.0


In [26]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
            
        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Adams,"DIST. 1, BELLEMONT",President,,Democrat,Hillary Clinton,442
1,Adams,"DIST. 1, BELLEMONT",President,,Republican,Donald J. Trump,1090
2,Adams,"DIST. 1, BELLEMONT",President,,Constitution,Darrell Castle,6
3,Adams,"DIST. 1, BELLEMONT",President,,American Delta,Rocky' Roque De La Fuente,1
4,Adams,"DIST. 1, BELLEMONT",President,,Prohibition,Jim Hedges,1
...,...,...,...,...,...,...,...
12595,Yazoo,ZION,President,,Constitution,Darrell Castle,3
12596,Yazoo,ZION,President,,American Delta,Roque 'Rocky' De La Fuente,0
12597,Yazoo,ZION,President,,Prohibition,Jim Hedges,0
12598,Yazoo,ZION,President,,Libertarian,Gary Johnson,1


In [27]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
Democrat          1800
Republican        1800
Constitution      1800
American Delta    1800
Prohibition       1800
Libertarian       1800
Green             1800
Name: count, dtype: int64

In [28]:
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton               1800
Donald J. Trump               1800
Darrell Castle                1800
Jim Hedges                    1800
Gary Johnson                  1800
Jill Stein                    1800
Rocky' Roque De La Fuente     1204
Roque 'Rocky' De La Fuente     596
Name: count, dtype: int64

In [29]:
# remove WRITE-IN, Fixing the rocky roque
# Fixing the Rocky La Fuente Error, Donald I. Trump
general_data.loc[
    (
        general_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
        general_data["candidate"].str.contains("LA FUENTE", case=False, na=False)
    ),
    "candidate"
] = "La Fuente"


general_data = general_data[~general_data["candidate"].isin(["WRITE-IN"])] 
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton    1800
Donald J. Trump    1800
Darrell Castle     1800
La Fuente          1800
Jim Hedges         1800
Gary Johnson       1800
Jill Stein         1800
Name: count, dtype: int64

In [30]:
general_data["party"].value_counts(dropna=False)

party
Democrat          1800
Republican        1800
Constitution      1800
American Delta    1800
Prohibition       1800
Libertarian       1800
Green             1800
Name: count, dtype: int64

In [31]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )
general_data.loc[:, "party"] = general_data["party"].replace({
    "Democrat": "DEM",
    "Republican": "REP",
    "Libertarian": "LIB",
    "Green": "GRN",
    "Constitution": "CON",
    "American Delta": "AMD",
    "Prohibition": "PRO"
})

# general_data = general_data[general_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic
general_data

Unnamed: 0,precinct,party,candidate,votes
0,"DIST. 1, BELLEMONT",DEM,Hillary Clinton,442
1,"DIST. 1, BELLEMONT",REP,Donald J. Trump,1090
2,"DIST. 1, BELLEMONT",CON,Darrell Castle,6
3,"DIST. 1, BELLEMONT",AMD,La Fuente,1
4,"DIST. 1, BELLEMONT",PRO,Jim Hedges,1
...,...,...,...,...
12595,ZION,CON,Darrell Castle,3
12596,ZION,AMD,La Fuente,0
12597,ZION,PRO,Jim Hedges,0
12598,ZION,LIB,Gary Johnson,1


In [32]:
general_data["party"].value_counts(dropna=False)

party
DEM    1800
REP    1800
CON    1800
AMD    1800
PRO    1800
LIB    1800
GRN    1800
Name: count, dtype: int64

In [33]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_amd_FUENTE,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_pro_HEDGES,gen_rep_TRUMP
0,(01) NEW HOPE BAPTIST CHURCH,1,1,580,3,1,1,45
1,(02) S MCCOMB BAPTIST CHURCH,1,1,223,1,5,0,65
2,"(03)FIRST BAPT. CHURCH, SUMMIT",0,0,301,0,1,0,63
3,(04) MLK CENTER,1,2,647,3,1,1,13
4,(05) AMERICAN LEG. HUT,0,0,164,2,9,0,325
...,...,...,...,...,...,...,...,...
1704,ZION,0,3,54,0,1,0,198
1705,ZION HILL,0,1,61,0,2,0,303
1706,ZION HILL-NEW HAVEN,0,1,132,2,10,0,516
1707,ZION RIDGE,2,2,404,2,0,0,45


In [34]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
# Convert all numeric columns (excluding 'precinct') to integers
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_FUQUE,pri_dem_O'MAILEY,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,pri_rep_CARSON,...,gen_amd_FUENTE,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_pro_HEDGES,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,(01) NEW HOPE BAPTIST CHURCH,210,1,0,1,0,26,1,0,0,...,1,1,580,3,1,1,45,21,239,632
1,(02) S MCCOMB BAPTIST CHURCH,84,1,0,0,0,10,0,0,1,...,1,1,223,1,5,0,65,43,95,296
2,(04) MLK CENTER,256,0,0,1,0,22,0,0,0,...,1,2,647,3,1,1,13,4,279,668
3,(05) AMERICAN LEG. HUT,44,0,0,0,0,18,0,0,2,...,0,0,164,2,9,0,325,221,62,500
4,(06) S. PIKE COMM. CTR.,115,1,0,0,0,11,0,0,1,...,0,1,270,1,0,0,15,5,127,287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1486,ZION,25,0,0,0,0,4,0,0,2,...,0,3,54,0,1,0,198,125,29,256
1487,ZION HILL,35,0,0,2,0,5,0,0,0,...,0,1,61,0,2,0,303,173,42,367
1488,ZION HILL-NEW HAVEN,47,1,0,1,0,14,1,1,5,...,0,1,132,2,10,0,516,273,64,661
1489,ZION RIDGE,197,1,0,0,0,20,0,0,0,...,2,2,404,2,0,0,45,26,218,455


In [35]:
combined.to_csv("MS.csv", index=False)
