In [201]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [202]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MS\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [203]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__special__general__state_senate__25__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160607__ms__special__general__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160628__ms__special__general__runoff__state_house__29__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160823__ms__special__general__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20160913__ms__special__general__runoff__state__house__72__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161108__ms__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__106__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MS\20161129__ms__special__general__runoff__state__house__89__precinct.csv


In [204]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MS\20160308__ms__primary__precinct.csv


In [205]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
            
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
            df = df[~df['precinct'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]   

        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]               
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)



In [206]:
## Cleaning precinct, county name
# Fixing Cubbler
pri_combined_df.loc[
    ( 
        pri_combined_df["county"].str.contains("\w*F120\w*", case=False, na=False)|
        pri_combined_df["county"].str.contains("\w*STONE\w*", case=False, na=False)|
        pri_combined_df["county"].str.contains("\w*OBBLER\w*", case=False, na=False),
    "county")
] = "STONE"

In [207]:
# Checking any suspicious precinct, county
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].isna() |
    pri_combined_df["precinct"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["precinct"].str.strip().str.upper().isin(["NONE", "NAN"]) |
    pri_combined_df["county"].isna() |
    pri_combined_df["county"].str.strip().str.upper().str.contains(r"\w*TOTAL\w*", na=False) |
    pri_combined_df["county"].str.strip().str.upper().isin(["NONE", "NAN"])
]

In [208]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,"ADAMSDIST. 1, BELLEMONT",Republican,Jeb Bush,6.0
1,"ADAMSDIST. 1, BELLEMONT",Republican,Ben Carson,4.0
2,"ADAMSDIST. 1, BELLEMONT",Republican,Chris Christie,0.0
3,"ADAMSDIST. 1, BELLEMONT",Republican,Ted Cruz,224.0
4,"ADAMSDIST. 1, BELLEMONT",Republican,Carly Florina,0.0
...,...,...,...,...
32125,YAZOOZION,Democrat,Hillary Clinton,25.0
32126,YAZOOZION,Democrat,"Roque ""Rocky"" De Le Fuente",0.0
32127,YAZOOZION,Democrat,Martin O'Mailey,0.0
32128,YAZOOZION,Democrat,Bernie Sanders,4.0


In [209]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
Jeb Bush                      1785
Carly Florina                 1785
Lindsey Graham                1785
Mike Huckabee                 1785
Bernie Sanders                1785
Ben Carson                    1785
Rand Paul                     1785
Hillary Clinton               1785
Rick Santorum                 1785
Donald J. Trump               1774
Ted Cruz                      1774
Willie Wilson                 1750
Marco Rubio                   1750
George Pataki                 1750
Roque "Rocky" De Le Fuente    1649
Martin O'Mailey               1649
John R. Kasich                1593
Chris Christie                1117
Chris Christle                 668
John R. Kaisch                 157
Martin O'Malley                136
Roque "Rocky" De La Fuente      66
Roque "Rocky" De La Fuque       35
John R Kasich                   35
George Patakl                   35
Macro Rublo                     35
Roque 'Rocky' De La Fuente      35
Wille Wilson                    35
Yed Cruz  

In [210]:
# Cleaning Candidates

# Turning all primary data to uppercase
# primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"Blank Votes", r"All Others",r"Total Votes Cast",r"No Preference"
# ]

# pattern = "|".join(unwanted_keywords)

# # Assuming candidate column is already string and uppercase
# primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

#fIXING DE LA FUENTE
primary_data.loc[
    (
        primary_data["candidate"].str.contains("rocky.*fuente", case=False, na=False) |
        primary_data["candidate"].str.contains("LA FUENTE", case=False, na=False) |
        primary_data["candidate"].str.contains("\w*Fuque\w*", case=False, na=False)
    ),
    "candidate"
] = "LA FUENTE"


primary_data.loc[:, "candidate"] = primary_data["candidate"].replace({
    "Donald I. Trump": "Donald J. Trump",
})

primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )


# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# Fixing Christie
primary_data.loc[
    primary_data["candidate"].str.contains("CHRISTLE", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing KASICH
primary_data.loc[
    primary_data["candidate"].str.contains("KAISCH", case=False, na=False),
    "candidate"
] = "CHRISTIE"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("O'MALLEY", case=False, na=False),
    "candidate"
] = "O'MAILEY"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("PATAKL", case=False, na=False),
    "candidate"
] = "PATAKI"

# Fixing O'MALLEY
primary_data.loc[
    primary_data["candidate"].str.contains("RUBLO", case=False, na=False),
    "candidate"
] = "RUBIO"

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
CHRISTIE    1942
BUSH        1785
RUBIO       1785
SANDERS     1785
O'MAILEY    1785
FUENTE      1785
CLINTON     1785
TRUMP       1785
SANTORUM    1785
PAUL        1785
CARSON      1785
PATAKI      1785
HUCKABEE    1785
GRAHAM      1785
FLORINA     1785
CRUZ        1785
WILSON      1785
KASICH      1628
Name: count, dtype: int64

In [211]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
Republican    23205
Democrat       8420
Democratic      505
Name: count, dtype: int64

In [212]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

primary_data["party"].value_counts(dropna=False)


party
Republican    23205
Democrat       8420
Democratic      505
Name: count, dtype: int64

In [213]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
REP    23205
DEM     8925
Name: count, dtype: int64

In [214]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MAILEY,pri_dem_SANDERS,pri_dem_WILSON,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CHRISTIE,pri_rep_CRUZ,pri_rep_FLORINA,pri_rep_GRAHAM,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,"ADAMSDIST. 1, BELLEMONT",163.0,1.0,0.0,49.0,2.0,6.0,4.0,0.0,224.0,0.0,0.0,2.0,88.0,0.0,0.0,52.0,0.0,281.0
1,"ADAMSDIST. 1, BY-PASS FIRE",205.0,0.0,2.0,29.0,1.0,1.0,1.0,0.0,38.0,0.0,0.0,0.0,11.0,0.0,0.0,4.0,0.0,75.0
2,"ADAMSDIST. 1, COURTHOUSE",61.0,0.0,0.0,31.0,0.0,3.0,2.0,1.0,47.0,0.0,0.0,1.0,55.0,0.0,0.0,37.0,0.0,99.0
3,"ADAMSDIST. 2, BEAU PRE",106.0,0.0,0.0,14.0,0.0,1.0,0.0,0.0,80.0,0.0,0.0,0.0,20.0,0.0,0.0,15.0,0.0,118.0
4,"ADAMSDIST. 2, DUNCAN PARK",112.0,0.0,1.0,30.0,0.0,3.0,7.0,1.0,77.0,1.0,0.0,0.0,38.0,0.0,1.0,21.0,0.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1798,YAZOOWARD 4,382.0,1.0,1.0,31.0,0.0,1.0,1.0,0.0,50.0,0.0,0.0,0.0,29.0,0.0,0.0,21.0,0.0,101.0
1799,YAZOOWARD 5,507.0,0.0,2.0,23.0,1.0,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1800,YAZOOWEST BENTONIA,139.0,0.0,0.0,17.0,1.0,1.0,1.0,0.0,46.0,0.0,0.0,0.0,7.0,0.0,0.0,5.0,0.0,84.0
1801,YAZOOWEST MIDWAY,11.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,1.0,3.0,0.0,0.0,3.0,0.0,49.0


In [215]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["county"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()
        df["county"] = df["county"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'county' in df.columns:
            df['county'] = df['county'].astype(str)                  
            df = df[df['county'].str.upper() != "TOTAL"]               
            df = df[df['county'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['county'].str.upper() != "NAN"]       
            df = df[~df['county'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [216]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,"ADAMSDIST. 1, BELLEMONT",Democrat,Hillary Clinton,442
1,"ADAMSDIST. 1, BELLEMONT",Republican,Donald J. Trump,1090
2,"ADAMSDIST. 1, BELLEMONT",Constitution,Darrell Castle,6
3,"ADAMSDIST. 1, BELLEMONT",American Delta,Rocky' Roque De La Fuente,1
4,"ADAMSDIST. 1, BELLEMONT",Prohibition,Jim Hedges,1
...,...,...,...,...
12959,YAZOOZION,Constitution,Darrell Castle,3
12960,YAZOOZION,American Delta,Roque 'Rocky' De La Fuente,0
12961,YAZOOZION,Prohibition,Jim Hedges,0
12962,YAZOOZION,Libertarian,Gary Johnson,1


In [217]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton               1852
Donald J. Trump               1852
Darrell Castle                1852
Jim Hedges                    1852
Gary Johnson                  1852
Jill Stein                    1852
Rocky' Roque De La Fuente     1256
Roque 'Rocky' De La Fuente     596
Name: count, dtype: int64

In [218]:
# Cleaning Candidates

# Turning all general data to uppercase
general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"\w*WRITE[\s-]\w*",
#     r"NAN",
#     r"UNCERTIFIED",
#     r"UNVERIFIED",
#     r"NONE OF THE ABOVE",
#     r"LBT",
#     r"DEM",
#     r"REP",
#     r"GRN",
#     r"NAN"

# ]

# Fixing FUENTE
general_data.loc[
    general_data["candidate"].str.contains("\w*Fuente\w*", case=False, na=False),
    "candidate"
] = "FUENTE"
# pattern = "|".join(unwanted_keywords)

# # FILTER OUT TRASH WORDS
# general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

# general_data.loc[
#     general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
#     "candidate"
# ] = "MATUREN"

# # Fixing mixed president + vice_president by /
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# # Fixing the , candidate
# # general_data["candidate"] = (
# #     general_data["candidate"].str.split(",")
# #     .str[0]
# #     .str.strip()
# #     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# # Fixing Kotlikoff
# general_data.loc[
#     general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
#     "candidate"
# ] = "KOTLIKOFF"

# # Fixing Valdivia
# general_data.loc[
#     general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
#     "candidate"
# ] = "VALDIVIA"

# # Fixing HOEFLING
# general_data.loc[
#     general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
#     "candidate"
# ] = "HOEFLING"


# # Fixing McMullin
# general_data.loc[
#     general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
#     "candidate"
# ] = "MCMULLIN"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )


# #Selecting only last name
general_data["candidate"] = general_data["candidate"].str.split().str[-1]

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = general_data["candidate"].str.split().str[-1]


candidate
CLINTON    1852
TRUMP      1852
CASTLE     1852
FUENTE     1852
HEDGES     1852
JOHNSON    1852
STEIN      1852
Name: count, dtype: int64

In [219]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
Democrat          1852
Republican        1852
Constitution      1852
American Delta    1852
Prohibition       1852
Libertarian       1852
Green             1852
Name: count, dtype: int64

In [220]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_data(row, general_data),
#     axis=1
# )

# # Fill remaining party using general master CSV
# master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
# general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


party
Democrat          1852
Republican        1852
Constitution      1852
American Delta    1852
Prohibition       1852
Libertarian       1852
Green             1852
Name: count, dtype: int64

In [221]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEMOCRAT":"DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "AMERICAN DELTA":"AMD",
        "PROHIBITION":"PRO",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "NPA":"IND",
        "UST":"CON",
        "(WRITE-IN)":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
DEM    1852
REP    1852
CON    1852
AMD    1852
PRO    1852
LIB    1852
GRN    1852
Name: count, dtype: int64

In [222]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [223]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_amd_FUENTE,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_pro_HEDGES,gen_rep_TRUMP
0,"ADAMSDIST. 1, BELLEMONT",1,6,442,4,20,1,1090
1,"ADAMSDIST. 1, BY-PASS FIRE",3,2,504,1,4,2,241
2,"ADAMSDIST. 1, COURTHOUSE",0,1,171,2,18,0,402
3,"ADAMSDIST. 2, BEAU PRE",0,3,285,1,9,0,468
4,"ADAMSDIST. 2, DUNCAN PARK",0,2,338,3,21,0,461
...,...,...,...,...,...,...,...,...
1847,YAZOOWASHINGTON ST. FIRE STATION,0,0,180,0,0,0,185
1848,YAZOOWELFARE OFFICE,0,1,209,0,1,1,287
1849,YAZOOWEST BENTONIA,0,4,280,0,2,1,221
1850,YAZOOWEST MIDWAY,0,1,25,0,0,0,151


In [224]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [225]:


combined.to_csv("MS.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("MS_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("MS_general_filtered.csv", index=False)

In [226]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 1803, general: 1852, combined: 1572


In [227]:
print(f"primary: {len(pri_combined_df['county'].unique())}, general: {len(gen_combined_df['county'].unique())}")


primary: 94, general: 82


In [228]:
pri_combined_df['county'].unique()

array(['ADAMS', 'ALCORN', 'AMITE', 'ATTALA', 'BENTON', 'BOLIVAR',
       'CALHOUN', 'CARROLL', 'CHICKASAW', 'CHOCTAW', 'CLAIBORNE',
       'CLARKE', 'CLAY', 'COAHOMA', 'COPIAH', 'COVINGTON', 'DESOTO',
       'FORREST', 'FRANKLIN', 'GEORGE', 'GREENE', 'GRENADA', 'HANCOCK',
       'HARRISON', 'HINDS', 'HOLMES', 'ISSAQUENA', 'ITAWAMBA', 'JACKSON',
       'JASPER', 'JEFFERSON', 'JEFFERSON DAVIS', 'JONES', 'BAND BUILDING',
       'COURTHOUSE', 'FARMERS MARKET', 'FORT STEVENS', 'KELLIS STORE',
       'KEMPER SPRINGS', 'LYNVILLE', 'LYNWOOD', 'MOUNT NEBO',
       'PORTERVILLE', 'PRESTON', 'SATELLITE', 'SCOOBA A-L', 'SCOOBA M-Z',
       'LAFAYETTE', 'LAMAR', 'LAUDERDALE', 'LAWRENCE', 'LEAKE', 'LEE',
       'LEFLORE', 'LINCOLN', 'LOWNDES', 'MADISON', 'MARION', 'MARSHALL',
       'MONROE', 'MONTGOMERY', 'NESHOBA', 'NEWTON', 'NOXUBEE',
       'OKTIBBEHA', 'PANOLA', 'PEARL RIVER', 'PERRY', 'PIKE', 'PONTOTOC',
       'PRENTISS', 'QUITMAN', 'RANKIN', 'SCOTT', 'SHARKEY', 'SIMPSON',
       'SMITH', 'ST

In [229]:
gen_combined_df['county'].unique()

array(['ADAMS', 'ALCORN', 'AMITE', 'ATTALA', 'BENTON', 'BOLIVAR',
       'CALHOUN', 'CARROLL', 'CHICKASAW', 'CHOCTAW', 'CLAIBORNE',
       'CLARKE', 'CLAY', 'COAHOMA', 'COPIAH', 'COVINGTON', 'DESOTO',
       'FORREST', 'FRANKLIN', 'GEORGE', 'GREENE', 'GRENADA', 'HANCOCK',
       'HARRISON', 'HINDS', 'HOLMES', 'HUMPHREYS', 'ISSAQUENA',
       'ITAWAMBA', 'JACKSON', 'JASPER', 'JEFFERSON', 'JEFFERSON DAVIS',
       'JONES', 'KEMPER', 'LAFAYETTE', 'LAMAR', 'LAUDERDALE', 'LAWRENCE',
       'LEAKE', 'LEE', 'LEFLORE', 'LINCOLN', 'LOWNDES', 'MADISON',
       'MARION', 'MARSHALL', 'MONROE', 'MONTGOMERY', 'NESHOBA', 'NEWTON',
       'NOXUBEE', 'OKTIBBEHA', 'PANOLA', 'PEARL RIVER', 'PERRY', 'PIKE',
       'PONTOTOC', 'PRENTISS', 'QUITMAN', 'RANKIN', 'SCOTT', 'SHARKEY',
       'SIMPSON', 'SMITH', 'STONE', 'SUNFLOWER', 'TALLAHATCHIE', 'TATE',
       'TIPPAH', 'TISHOMINGO', 'TUNICA', 'UNION', 'WALTHALL', 'WARREN',
       'WASHINGTON', 'WAYNE', 'WEBSTER', 'WILKINSON', 'WINSTON',
       'YALOBUSHA', '

In [230]:
primary_counties = set(pri_combined_df['county'].dropna().str.strip().str.upper())
general_counties = set(gen_combined_df['county'].dropna().str.strip().str.upper())

diff = primary_counties - general_counties
print(f"Counties in primary but not in general: {len(diff)}")
print(diff)


Counties in primary but not in general: 15
{'KELLIS STORE', 'LYNWOOD', 'KEMPER SPRINGS', 'COURTHOUSE', 'LYNVILLE', 'FARMERS MARKET', 'FORT STEVENS', 'SATELLITE', 'PORTERVILLE', 'PRESTON', 'SCOOBA M-Z', 'WINTSON', 'BAND BUILDING', 'MOUNT NEBO', 'SCOOBA A-L'}
