In [44]:
import pandas as pd
import glob
import os
from pprint import pprint

In [45]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\NM\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [46]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\NM\nm__general__precinct.csv


In [47]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\NM\nm__primary__precinct.csv


In [48]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]
        
        if 'office' in df.columns:
            df = df[df["office"] == "President"]

        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()

In [49]:
pri_combined_df

Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Bernalillo,BERNALILLO1,President,,Democratic,HILLARY RODHAM CLINTON,60
1,Bernalillo,BERNALILLO1,President,,Democratic,BERNARD SANDERS,93
2,Bernalillo,BERNALILLO1,President,,Republican,DONALD J TRUMP,64
3,Bernalillo,BERNALILLO1,President,,Republican,JEB BUSH,3
4,Bernalillo,BERNALILLO1,President,,Republican,BENJAMIN S CARSON,6
...,...,...,...,...,...,...,...
11931,Valencia,VALENCIA41,President,,Republican,JEB BUSH,1
11932,Valencia,VALENCIA41,President,,Republican,BENJAMIN S CARSON,1
11933,Valencia,VALENCIA41,President,,Republican,TED CRUZ,25
11934,Valencia,VALENCIA41,President,,Republican,CARLY FIORINA,0


In [50]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data["party"].value_counts(dropna=False)

party
Republican    8952
Democratic    2984
Name: count, dtype: int64

In [51]:
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})

# primary_data = primary_data[~primary_data["candidate"].isin(["YES", "NO"])]
# primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,BERNALILLO1,DEM,HILLARY RODHAM CLINTON,60
1,BERNALILLO1,DEM,BERNARD SANDERS,93
2,BERNALILLO1,REP,DONALD J TRUMP,64
3,BERNALILLO1,REP,JEB BUSH,3
4,BERNALILLO1,REP,BENJAMIN S CARSON,6
...,...,...,...,...
11931,VALENCIA41,REP,JEB BUSH,1
11932,VALENCIA41,REP,BENJAMIN S CARSON,1
11933,VALENCIA41,REP,TED CRUZ,25
11934,VALENCIA41,REP,CARLY FIORINA,0


In [52]:
primary_data["party"].value_counts(dropna=False)

party
REP    8952
DEM    2984
Name: count, dtype: int64

In [53]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'HILLARY RODHAM CLINTON': 'DEM', 'BERNARD SANDERS': 'DEM', 'DONALD J TRUMP': 'REP', 'JEB BUSH': 'REP', 'BENJAMIN S CARSON': 'REP', 'TED CRUZ': 'REP', 'CARLY FIORINA': 'REP', 'JOHN R KASICH': 'REP'}


In [54]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_KASICH,pri_rep_TRUMP
0,BERNALILLO1,60,93,3,6,15,0,9,64
1,BERNALILLO10,65,65,1,5,9,2,7,35
2,BERNALILLO101,35,186,2,1,3,0,2,10
3,BERNALILLO102,27,105,0,0,2,1,1,6
4,BERNALILLO103,91,96,1,1,5,0,1,15
...,...,...,...,...,...,...,...,...,...
1487,VALENCIA5,111,111,6,5,26,1,7,127
1488,VALENCIA6,105,111,3,4,24,4,10,87
1489,VALENCIA7,142,123,5,5,26,3,10,132
1490,VALENCIA8,130,144,5,9,32,3,12,106


In [55]:
def fill_party_from_general_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_general_data(row, general_data),
#     axis=1
# )


In [56]:
# Process general files
gendf_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]

        if 'office' in df.columns:
            df = df[df["office"] == "President"]

        gendf_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gendf_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Bernalillo,BERNALILLO1,President,,Libertarian,GARY JOHNSON,61
1,Bernalillo,BERNALILLO1,President,,Constitution,DARRELL CASTLE,0
2,Bernalillo,BERNALILLO1,President,,Republican,DONALD J TRUMP,311
3,Bernalillo,BERNALILLO1,President,,Democratic,HILLARY RODHAM CLINTON,346
4,Bernalillo,BERNALILLO1,President,,Green,JILL STEIN,9
...,...,...,...,...,...,...,...
11931,Valencia,VALENCIA41,President,,Democratic,HILLARY RODHAM CLINTON,379
11932,Valencia,VALENCIA41,President,,Green,JILL STEIN,6
11933,Valencia,VALENCIA41,President,,,GLORIA LA RIVA,0
11934,Valencia,VALENCIA41,President,,,EVAN MCMULLIN,17


In [57]:
# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_general_data(row, general_data),
#     axis=1
# )
gen_combined_df["party"].value_counts(dropna=False)

party
NaN             4476
Libertarian     1492
Constitution    1492
Republican      1492
Democratic      1492
Green           1492
Name: count, dtype: int64

In [58]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
general_data.loc[:,"party"] = general_data.apply(
    lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
    axis=1
)

general_data["party"] = general_data.apply(
    lambda row: fill_party_from_general_data(row, general_data),
    axis=1
)
general_data.loc[:,"party"] = (
    general_data["party"]
    .replace({
        "Democratic": "DEM",
        "Republican": "REP",
        "Libertarian": "LIB",
        "Green": "GRN",
        "Constitution": "CON",
        "American Delta": "AMD",
        "Prohibition": "PRO"
    })
    .fillna("IND")
)


general_data["party"] = general_data.apply(
    lambda row: fill_party_from_general_data(row, general_data),
    axis=1
)

# general_data = general_data[general_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

gen_combined_df["candidate"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["party"] = general_data.apply(


candidate
GARY JOHNSON                  1492
DARRELL CASTLE                1492
DONALD J TRUMP                1492
HILLARY RODHAM CLINTON        1492
JILL STEIN                    1492
GLORIA LA RIVA                1492
EVAN MCMULLIN                 1492
"ROCKY" ROQUE DE LA FUENTE    1492
Name: count, dtype: int64

In [59]:
general_data["party"].value_counts(dropna=False)


party
IND    4476
LIB    1492
CON    1492
REP    1492
DEM    1492
GRN    1492
Name: count, dtype: int64

In [60]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data["candidate_column"] = (


candidate_column,precinct,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FUENTE,gen_ind_MCMULLIN,gen_ind_RIVA,gen_lib_JOHNSON,gen_rep_TRUMP
0,BERNALILLO1,0,346,9,0,6,0,61,311
1,BERNALILLO10,1,287,9,0,1,1,78,223
2,BERNALILLO101,0,341,28,1,2,6,59,57
3,BERNALILLO102,1,231,14,0,1,1,28,35
4,BERNALILLO103,0,490,10,0,2,0,48,125
...,...,...,...,...,...,...,...,...,...
1487,VALENCIA5,2,360,6,1,2,2,120,502
1488,VALENCIA6,1,404,6,1,5,0,117,472
1489,VALENCIA7,2,483,15,0,9,0,133,591
1490,VALENCIA8,3,507,15,2,7,0,148,716


In [61]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CARSON,pri_rep_CRUZ,pri_rep_FIORINA,pri_rep_KASICH,pri_rep_TRUMP,gen_con_CASTLE,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FUENTE,gen_ind_MCMULLIN,gen_ind_RIVA,gen_lib_JOHNSON,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,BERNALILLO1,60,93,3,6,15,0,9,64,0,346,9,0,6,0,61,311,97,153,733
1,BERNALILLO10,65,65,1,5,9,2,7,35,1,287,9,0,1,1,78,223,59,130,600
2,BERNALILLO101,35,186,2,1,3,0,2,10,0,341,28,1,2,6,59,57,18,221,494
3,BERNALILLO102,27,105,0,0,2,1,1,6,1,231,14,0,1,1,28,35,10,132,311
4,BERNALILLO103,91,96,1,1,5,0,1,15,0,490,10,0,2,0,48,125,23,187,675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487,VALENCIA5,111,111,6,5,26,1,7,127,2,360,6,1,2,2,120,502,172,222,995
1488,VALENCIA6,105,111,3,4,24,4,10,87,1,404,6,1,5,0,117,472,132,216,1006
1489,VALENCIA7,142,123,5,5,26,3,10,132,2,483,15,0,9,0,133,591,181,265,1233
1490,VALENCIA8,130,144,5,9,32,3,12,106,3,507,15,2,7,0,148,716,167,274,1398


In [62]:
combined.to_csv("NM.csv", index=False)
