In [121]:
# Import all the libraries
import pandas as pd
import glob
import os
import re
from pprint import pprint

In [122]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MA\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [123]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__acton__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__auburn__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__barnstable__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__belmont__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__brookline__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__chelmsford__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__dedham__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__falmouth__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__freetown__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20161108__ma__general__somerset__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\

In [124]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MA\20160301__ma__primary__president__precinct.csv
C:\Huy Phan\College\VoterTurnout\data\MA\20160908__ma__primary__precinct.csv


In [125]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["town"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[~df['precinct'].str.upper().str.contains(r"\bTOTAL\b", na=False)] 
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'town' in df.columns:
            df['town'] = df['town'].astype(str)                  
            df = df[~df['town'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]            
            df = df[df['town'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['town'].str.upper() != "NAN"]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)


# Checking any suspicious precinct
suspicious_precincts = pri_combined_df[
    pri_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [126]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,MATTAPOISETT1,Green-rainbow,Darryl Cherney,0
1,AQUINNAH1,Republican,Jeb Bush,0
2,LAWRENCE1,Republican,Carly Fiorina,0
3,BOSTON8,Republican,Total Votes Cast,51
4,SCITUATE1,Republican,Ben Carson,10
...,...,...,...,...
73227,IPSWICH1,Green-rainbow,Blank Votes,0
73228,LYNN3,Republican,Rick Santorum,0
73229,SWAMPSCOTT3,Green-rainbow,Blank Votes,0
73230,FAIRHAVEN5,Democratic,No Preference,5


In [127]:
# Viewing candidate data
primary_data["candidate"].value_counts(dropna=False)

candidate
All Others            6446
Total Votes Cast      6446
Blank Votes           6446
No Preference         6446
Rand Paul             2174
Jim Gilmore           2174
Rick Santorum         2174
Marco Rubio           2174
Mike Huckabee         2174
Roque De La Fuente    2174
Hillary Clinton       2174
Martin O'Malley       2174
George Pataki         2174
John R. Kasich        2174
Jeb Bush              2174
Chris Christie        2174
Donald J. Trump       2174
Ted Cruz              2174
Bernie Sanders        2174
Ben Carson            2174
Carly Fiorina         2174
William P. Kreml      2098
Jill E. Stein         2098
Skcm Curry            2098
Kent Mesplay          2098
Darryl Cherney        2098
Name: count, dtype: int64

In [128]:
# Cleaning Candidates

# Turning all primary data to uppercase
# primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()

# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"Blank Votes", r"All Others",r"Total Votes Cast",r"No Preference"
# ]

# pattern = "|".join(unwanted_keywords)

# # Assuming candidate column is already string and uppercase
# primary_data = primary_data[~primary_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

primary_data = primary_data[~primary_data["candidate"].isin(["Blank Votes", "All Others","Total Votes Cast","No Preference"])] 
primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
# # Fixing the , candidate
# primary_data["candidate"] = (
#     primary_data["candidate"].str.split(",")
#     .str[0]
#     .str.strip()
#     )

# # Fixing De la Fuente
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*Fuen\w*", case=False, na=False),
#     "candidate"
# ] = "LA FUENTE"

# #Selecting only last name
primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]

# # Fixing Huckabee
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ABEE\w*", case=False, na=False),
#     "candidate"
# ] = "HUCKABEE"

# # Fixing Fiorina
# primary_data.loc[
#     primary_data["candidate"].str.contains("\w*ORINA\W*", case=False, na=False),
#     "candidate"
# ] = "FIORINA"

# # Fixing Gray
# primary_data.loc[
#     primary_data["candidate"].str.contains("GREY", case=False, na=False),
#     "candidate"
# ] = "GRAY"

# # Fixing Wilson
# primary_data.loc[
#     primary_data["candidate"].str.contains("WISON", case=False, na=False),
#     "candidate"
# ] = "WILSON"


# # Fixing separator 
# primary_data["candidate"] = (
#     primary_data["candidate"]
#     .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
#     .str.strip()
#     .str.upper()
# )

# # Fixing McMullin

# primary_data.loc[
#     (
#         primary_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
#         primary_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
#     ),
#     "candidate"
# ] = "MCMULLIN"

# primary_data.loc[
#     primary_data["candidate"].str.contains("De La Fuen", case=False, na=False),
#     "candidate"
# ] = "FUENTE D"

# primary_data.loc[:,"candidate"] = (
#     primary_data["candidate"].str.split().str[0].str.upper()
# )

primary_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["candidate"] = primary_data["candidate"].str.split().str[-1]


candidate
PATAKI      2174
PAUL        2174
GILMORE     2174
SANTORUM    2174
RUBIO       2174
HUCKABEE    2174
FUENTE      2174
CLINTON     2174
O'MALLEY    2174
BUSH        2174
KASICH      2174
CHRISTIE    2174
TRUMP       2174
CRUZ        2174
SANDERS     2174
CARSON      2174
FIORINA     2174
KREML       2098
STEIN       2098
CURRY       2098
MESPLAY     2098
CHERNEY     2098
Name: count, dtype: int64

In [129]:
# Viewing Party
primary_data["party"].value_counts(dropna=False)

party
Republican       28262
Green-rainbow    10490
Democratic        8696
Name: count, dtype: int64

In [130]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)
    
    # Combine unique pairs
    new_data = df[["candidate", "party"]].dropna().drop_duplicates()
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
# primary_data["party"] = primary_data.apply(
#     lambda row: fill_party_from_data(row, primary_data),
#     axis=1
# )

# Fill remaining party using general master CSV
# master_party_df = pd.read_csv("data/master_primary_candidate_party.csv")
# general_data = fill_party_from_master(general_data, master_party_df)

# STEP 3: Update master file with new (candidate, party) pairs
# update_master_candidate_party(general_data, "data/master_candidate_party.csv")
primary_data["party"].value_counts(dropna=False)


party
Republican       28262
Green-rainbow    10490
Democratic        8696
Name: count, dtype: int64

In [131]:
# Cleaning Party
# Turning all general data party to uppercase
primary_data["party"] = primary_data["party"].astype(str).str.upper()

primary_data["party"] = primary_data.apply(
    lambda row: fill_party_from_data(row, primary_data),
    axis=1
)
primary_data["party"] = (
    primary_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GREEN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK"
    })
)
primary_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = primary_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data["party"] = (


party
REP    28262
GRN    10490
DEM     8696
Name: count, dtype: int64

In [132]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"]
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_O'MALLEY,pri_dem_SANDERS,pri_grn_CHERNEY,pri_grn_CURRY,pri_grn_KREML,pri_grn_MESPLAY,pri_grn_STEIN,...,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_GILMORE,pri_rep_HUCKABEE,pri_rep_KASICH,pri_rep_PATAKI,pri_rep_PAUL,pri_rep_RUBIO,pri_rep_SANTORUM,pri_rep_TRUMP
0,ABINGTON1,193,0,3,268,0,0,0,0,0,...,36,3,1,0,61,0,2,45,0,235
1,ABINGTON2,195,0,3,271,0,0,0,0,1,...,34,0,0,0,46,0,1,48,0,257
2,ABINGTON3,246,2,5,273,0,0,0,0,0,...,48,0,0,1,56,0,2,52,0,264
3,ABINGTON4,265,1,2,267,0,0,0,0,0,...,56,0,0,2,68,0,0,62,0,267
4,ABINGTON5,260,1,2,311,0,0,0,0,1,...,37,0,0,1,72,0,1,61,1,245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1285,YARMOUTH3,276,0,2,217,0,0,0,0,0,...,27,2,0,0,55,1,1,54,0,206
1286,YARMOUTH4,343,1,3,325,0,0,0,0,0,...,43,2,0,0,71,0,1,74,0,319
1287,YARMOUTH5,222,0,1,260,0,0,0,0,0,...,33,2,0,2,45,0,0,58,0,239
1288,YARMOUTH6,251,2,0,236,0,0,0,0,0,...,31,1,1,0,47,0,0,64,1,244


In [133]:
# Process general files
general_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)
        
        # Combine precinct as county + precinct
        df["precinct"] = df["town"].astype(str) + df["precinct"].astype(str)
        df["precinct"] = df["precinct"].str.upper()

        # Cleaning maybe total precinct
        if 'precinct' in df.columns:
            df['precinct'] = df['precinct'].astype(str)                  
            df = df[df['precinct'].str.upper() != "TOTAL"]               
            df = df[df['precinct'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['precinct'].str.upper() != "NAN"]                
        
        if 'town' in df.columns:
            df['town'] = df['town'].astype(str)                  
            df = df[~df['town'].str.upper().str.contains(r"\w*TOTAL\w*", na=False)]            
            df = df[df['town'].str.strip().str.upper() != "TOTAL"]   
            df = df[df['town'].str.upper() != "NAN"]      
            
        # Filtering out only President 
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        # Dropping duplicates value
        df = df.drop_duplicates()
        
        general_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(general_df_list, ignore_index=True)

# Checking any suspicious precinct
suspicious_precincts = gen_combined_df[
    gen_combined_df["precinct"].str.strip().str.upper().isin(["TOTAL", "NAN"])
]


In [134]:
# Select only the relevant columns
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]

general_data

Unnamed: 0,precinct,party,candidate,votes
0,ACTON1,,BLANK,37
1,ACTON2,,BLANK,33
2,ACTON3,,BLANK,29
3,ACTON4,,BLANK,18
4,ACTON5,,BLANK,36
...,...,...,...,...
28818,SPENCERTOTAL,Libertarian,Johnson and Weld,296
28819,SPENCERTOTAL,Green-Rainbow,Stein and Baraka,83
28820,SPENCERTOTAL,Republican,Trump and Pence,3045
28821,SPENCERTOTAL,,McMullin and Johnson,10


In [135]:
# Viewing candidate data
general_data["candidate"].value_counts(dropna=False)

candidate
Stein and Baraka            2193
Johnson and Weld            2193
Clinton and Kaine           2193
Trump and Pence             2193
All Others                  2184
Kotlikoff and Leamer        2174
No Preference               2174
Moorehead and Lilly         2174
Blank Votes                 2174
Schoenke and Mitchel        2174
Feegbeh and O'Brien         2174
Total Votes Cast            2174
Mcmullin and Johnson        2174
Blanks                        35
JOHNSON and WELD              24
STEIN and BARAKA              24
TRUMP and PENCE               24
CLINTON and KAINE             24
Blank                         23
STEIN & BARAKA                21
JOHNSON & WELD                21
TRUMP & PENCE                 21
McMullin and Johnson          15
CLINTON & KAINE               15
MCMULLIN & JOHNSON            15
Write-In                      14
TRUMP-PENCE                   14
BLANKS                        14
JOHNSTON-WELD                 14
CLINTON-KAINE                 14


In [136]:
# Cleaning Candidates

# Turning all primary data to uppercase
# general_data["candidate"] = general_data["candidate"].astype(str).str.upper()


# unwanted_keywords = [
#     r"\w*VOTE\w*",
#     r"\w*UNCOM\w*",
#     r"\w*TOTAL\w*",
#     r"\w*WRITE[\s-]\w*",
#     r"NAN",
#     r"UNCERTIFIED",
#     r"UNVERIFIED",
#     r"NONE OF THE ABOVE",
#     r"LBT",
#     r"DEM",
#     r"REP",
#     r"GRN",
#     r"NAN"

# ]

# pattern = "|".join(unwanted_keywords)

# # FILTER OUT TRASH WORDS
# general_data = general_data[~general_data["candidate"].str.upper().str.contains(pattern, regex=True, na=False)]

# # Fixing Darrell Castle"
# general_data.loc[
#     general_data["candidate"].str.contains("\w*ASTLE\w*", case=False, na=False),
#     "candidate"
# ] = "CASTLE"

# general_data.loc[
#     general_data["candidate"].str.contains("\w*ATUREN\w*", case=False, na=False),
#     "candidate"
# ] = "MATUREN"

# # Fixing mixed president + vice_president by /
# general_data["candidate"] = (
#     general_data["candidate"].str.split("/")
#     .str[0]
#     .str.strip()
#     )

# #Selecting only last name
# general_data["candidate"] = general_data["candidate"].str.split().str[-1]

# # Fixing the , candidate
# # general_data["candidate"] = (
# #     general_data["candidate"].str.split(",")
# #     .str[0]
# #     .str.strip()
# #     )




# # Fixing Cubbler
# general_data.loc[
#     ( 
#         general_data["candidate"].str.contains("\w*UBBIER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*UBLER\w*", case=False, na=False)|
#         general_data["candidate"].str.contains("\w*OBBLER\w*", case=False, na=False),
#     "candidate")
# ] = "CUBBLER"


# # Fixing Kotlikoff
# general_data.loc[
#     general_data["candidate"].str.contains("\w*TIKOFF\w*", case=False, na=False),
#     "candidate"
# ] = "KOTLIKOFF"

# # Fixing Valdivia
# general_data.loc[
#     general_data["candidate"].str.contains("\w*VALDIVA\W*", case=False, na=False),
#     "candidate"
# ] = "VALDIVIA"

# # Fixing HOEFLING
# general_data.loc[
#     general_data["candidate"].str.contains("\w*HOEFFLING", case=False, na=False),
#     "candidate"
# ] = "HOEFLING"


# # Fixing McMullin
# general_data.loc[
#     general_data["candidate"].str.contains("MCMULLEN", case=False, na=False) ,
#     "candidate"
# ] = "MCMULLIN"

# Fixing LASTNAME + First name Initial
# general_data.loc[:,"candidate"] = (
#     general_data["candidate"].str.split().str[0].str.upper()
# )
general_data = general_data[~general_data["candidate"].isin(["All Others", "No Preference","Blank Votes","Total Votes Cast","Blanks","Blank","Write-In","BLANKS","Other Write-in","All Other Write-in Votes","Write-in votes","Write in","BLANK","Scattering Write-ins","SCATTERED","All Other Write Ins","Write-in"])] 
general_data["candidate"] = (
    general_data["candidate"]
    .str.split(r"\s*(?:and|/|&|–|-|\+)\s*", n=1, expand=True)[0]
    .str.strip()
    .str.upper()
)

# Fixing McMullin

general_data.loc[
    (
        general_data["candidate"].str.contains("MCMULLIN", case=False, na=False) |
        general_data["candidate"].str.contains("EVAN MCMULLEN", case=False, na=False)
    ),
    "candidate"
] = "MCMULLIN"

# Fixing clinton and Kaine
general_data.loc[
    general_data["candidate"].str.contains("CLINTON AND KAINE", case=False, na=False) ,
    "candidate"
] = "CLINTON"

# Fixing johnson
general_data.loc[
    general_data["candidate"].str.contains("JOHNSTON", case=False, na=False) ,
    "candidate"
] = "JOHNSON"

# Fixing sanders
general_data.loc[
    (
        general_data["candidate"].str.contains("SANDERS", case=False, na=False) |
        general_data["candidate"].str.contains("BERNIE S", case=False, na=False)
    ),
    "candidate"
] = "SANDERS"

general_data["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate"] = (


candidate
CLINTON      2252
JOHNSON      2252
STEIN        2252
TRUMP        2252
MCMULLIN     2216
MOOREHEAD    2174
SCHOENKE     2174
KOTLIKOFF    2174
FEEGBEH      2174
SANDERS        15
ROMNEY          9
KASICH          9
BLUMBERG        9
RYAN            9
Name: count, dtype: int64

In [137]:
# Viewing Party
general_data["party"].value_counts(dropna=False)

party
(Write-In)           10870
Democratic            2252
Libertarian           2252
Republican            2252
Green-rainbow         2174
NaN                     93
Green-Rainbow           72
Green and Rainbow        6
Name: count, dtype: int64

In [138]:
#=====================================
# This function is used to look up party
# for the candidate in the same df
#=====================================
def fill_party_from_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown
    

#=====================================
# Function to fill remaining missing party from master lookup
#=====================================
def fill_party_from_master(df, master_df):
    party_map = master_df.set_index("candidate")["party"].to_dict()
    df["party"] = df.apply(
        lambda row: party_map.get(row["candidate"], row["party"])
        if pd.isna(row["party"]) else row["party"],
        axis=1
    )
    return df


#=====================================
# Function to update the master candidate-party CSV
#=====================================
def update_master_candidate_party(df, master_path):
    # Read the existing master file
    master_df = pd.read_csv(master_path)

    # Filter and combine only known party entries (exclude 'UNK')
    new_data = (
        df[["candidate", "party"]]
        .dropna()
        .query('party != "UNK"')
        .drop_duplicates()
    )

    # Merge with master and remove duplicates by candidate
    updated_master = pd.concat([master_df, new_data]).drop_duplicates(subset="candidate")

    # Save updated version
    updated_master.to_csv(master_path, index=False)


#=====================================
# Example usage (uncomment and modify for your workflow)
#=====================================
# Fill party using internal data
general_data["party"] = general_data.apply(
    lambda row: fill_party_from_data(row, general_data),
    axis=1
)

# Fill remaining party using general master CSV
master_party_df = pd.read_csv(r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv") # USE YOUR OWN ADDRESS
general_data = fill_party_from_master(general_data, master_party_df)
general_data["party"].value_counts(dropna=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["party"] = df.apply(


party
(Write-In)           10912
Democratic            2252
Libertarian           2252
Republican            2252
Green-rainbow         2174
Green-Rainbow           72
None                    36
IND                     15
Green and Rainbow        6
Name: count, dtype: int64

In [139]:
# Cleaning Party
# Turning all general data party to uppercase
general_data["party"] = general_data["party"].astype(str).str.upper()

general_data["party"] = (
    general_data["party"]
    .replace({
        "REPUBLICAN": "REP",
        "R": "REP",
        "DEMOCRATIC": "DEM",
        "DEM": "DEM",
        "D": "DEM",
        "GREEN": "GRN",
        "G": "GRN",
        "GREEN AND RAINBOW":"GRN",
        "GREEN-RAINBOW":"GRN",
        "LIBERTARIAN": "LIB",
        "LIBERTARIN":"LIB",
        "LBT": "LIB",
        "L": "LIB",
        "CONSTITUTION": "CON",
        "NP": "IND",
        "NON": "IND",
        "WRI": "IND",
        "WRITE-IN": "IND",
        "ONA": "IND",
        "GEN": "IND",
        "NONE":"UNK",
        "(WRITE-IN)":"IND"
    })
)

general_data["party"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data["party"].astype(str).str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = (


party
IND    10927
DEM     2252
LIB     2252
GRN     2252
REP     2252
UNK       36
Name: count, dtype: int64

In [140]:
# UPDATE MASTER FILE, CAREFUL
update_master_candidate_party(general_data, r"C:\Huy Phan\College\VoterTurnout\data\cleaned_data\gen_can_party.csv")

In [141]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FEEGBEH,gen_ind_KOTLIKOFF,gen_ind_MCMULLIN,gen_ind_MOOREHEAD,gen_ind_SANDERS,gen_ind_SCHOENKE,gen_lib_JOHNSON,gen_rep_TRUMP,gen_unk_BLUMBERG,gen_unk_KASICH,gen_unk_ROMNEY,gen_unk_RYAN
0,ABINGTON1,818,25,0,0,0,0,0,0,97,717,0,0,0,0
1,ABINGTON2,739,27,0,0,0,0,0,0,67,785,0,0,0,0
2,ABINGTON3,773,18,0,0,0,0,0,0,80,808,0,0,0,0
3,ABINGTON4,877,16,0,0,0,0,0,0,87,878,0,0,0,0
4,ABINGTON5,908,16,0,0,0,0,0,0,86,829,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,YARMOUTH3,904,18,0,0,0,0,0,0,63,663,0,0,0,0
1304,YARMOUTH4,1002,31,0,0,0,0,0,0,79,860,0,0,0,0
1305,YARMOUTH5,881,26,0,0,0,0,0,0,61,797,0,0,0,0
1306,YARMOUTH6,907,30,0,0,0,0,0,0,65,753,0,0,0,0


In [142]:
# Merge
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")

# Convert DEM primary columns to numeric and calculate total
dem_cols = combined.filter(like="pri_dem_").columns
combined[dem_cols] = combined[dem_cols].apply(pd.to_numeric, errors="coerce")
combined["dem_primary_total"] = combined[dem_cols].sum(axis=1)

# Convert general election columns to numeric and calculate total
gen_cols = combined.filter(like="gen_").columns
combined[gen_cols] = combined[gen_cols].apply(pd.to_numeric, errors="coerce")
combined["general_total"] = combined[gen_cols].sum(axis=1)

# Convert all numeric columns to int
for col in combined.columns[1:]:
    combined[col] = pd.to_numeric(combined[col], errors='coerce').fillna(0).astype(int)

# Identify filtered-out precincts
primary_precincts = set(primary_result["precinct"])
general_precincts = set(general_result["precinct"])
combined_precincts = set(combined["precinct"])

primary_filtered_out = primary_precincts - combined_precincts
general_filtered_out = general_precincts - combined_precincts




In [143]:


combined.to_csv("MA.csv", index=False)
# Step 3: Identify unmatched precincts and retrieve full rows
combined_precincts = set(combined["precinct"])

# Rows in primary_result but not in combined
primary_filtered_out = primary_result[~primary_result["precinct"].isin(combined_precincts)]
primary_filtered_out.to_csv("MA_primary_filtered.csv", index=False)

# Rows in general_result but not in combined
general_filtered_out = general_result[~general_result["precinct"].isin(combined_precincts)]
general_filtered_out.to_csv("MA_general_filtered.csv", index=False)

In [144]:
print(f"primary: {len(primary_precincts)}, general: {len(general_precincts)}, combined: {len(combined)}")

primary: 1290, general: 1308, combined: 1290


In [145]:
print(f"primary: {len(pri_combined_df['town'].unique())}, general: {len(gen_combined_df['town'].unique())}")


primary: 351, general: 351


In [146]:
pri_combined_df['town'].unique()

array(['Mattapoisett', 'Aquinnah', 'Lawrence', 'Boston', 'Scituate',
       'Hardwick', 'Lynn', 'W. Bridgewater', 'Ludlow', 'Lowell', 'Dover',
       'Springfield', 'Walpole', 'Plymouth', 'Revere', 'Worcester',
       'Seekonk', 'Saugus', 'Hawley', 'Shrewsbury', 'Fall River',
       'Lunenburg', 'Dartmouth', 'Peabody', 'Milton', 'Stoneham',
       'Foxborough', 'Lakeville', 'New Bedford', 'Arlington',
       'Framingham', 'Melrose', 'Gloucester', 'Burlington', 'Marshfield',
       'W. Boylston', 'Fitchburg', 'Malden', 'Fairhaven', 'Cambridge',
       'Marlborough', 'Barnstable', 'Southwick', 'Medford', 'Chicopee',
       'Norwood', 'Gardner', 'Mansfield', 'Wendell', 'Mashpee', 'Belmont',
       'Braintree', 'Holden', 'Leominster', 'Williamstown', 'Waltham',
       'Plainville', 'Franklin', 'Quincy', 'Wareham', 'Stoughton',
       'N. Andover', 'Weymouth', 'Dighton', 'Everett', 'Newton',
       'Georgetown', 'Taunton', 'Merrimac', 'Lexington', 'Winthrop',
       'N. Reading', 'Salem', '