In [22]:
import pandas as pd
import glob
import os
from pprint import pprint

In [23]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\MT\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]


# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary'])
]


In [24]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\MT\20161108__mt__general__precinct.csv


In [25]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\MT\20160607__mt__primary__precinct.csv


In [26]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'precinct' in df.columns:
            df = df[df["precinct"].notna()]

        # Drop rows where it is not president
        if 'office' in df.columns:
            df = df[df["office"] == "PRESIDENT"]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["county"].astype(str) + pri_combined_df["precinct"].astype(str)
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes,house_district,senate_district
0,Beaverhead,BEAVERHEADPRECINCT 01,PRESIDENT,,REP,JEB BUSH,10,72,36
1,Beaverhead,BEAVERHEADPRECINCT 12,PRESIDENT,,REP,JEB BUSH,16,72,36
2,Beaverhead,BEAVERHEADPRECINCT 14,PRESIDENT,,REP,JEB BUSH,3,72,36
3,Beaverhead,BEAVERHEADPRECINCT 17,PRESIDENT,,REP,JEB BUSH,3,72,36
4,Beaverhead,BEAVERHEADPRECINCT 19,PRESIDENT,,REP,JEB BUSH,0,72,36
...,...,...,...,...,...,...,...,...,...
6169,Yellowstone,YELLOWSTONE55.1,PRESIDENT,,DEM,NO PREFERENCE,2,55,28
6170,Yellowstone,YELLOWSTONE40-45,PRESIDENT,,DEM,NO PREFERENCE,6,40,20
6171,Yellowstone,YELLOWSTONE56.2,PRESIDENT,,DEM,NO PREFERENCE,50,56,28
6172,Yellowstone,YELLOWSTONE56.1,PRESIDENT,,DEM,NO PREFERENCE,5,56,28


In [27]:
pri_combined_df["party"].value_counts(dropna=False)

party
REP    4116
DEM    2058
Name: count, dtype: int64

In [28]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
# primary_data.loc[:, "party"] = primary_data["party"].replace({
#     "President Republican Party": "REP",
#     "President Democratic Party": "DEM",
#     "Democratic Party": "DEM",
#     "Republican Party": "REP"
# })
# primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,BEAVERHEADPRECINCT 01,REP,JEB BUSH,10
1,BEAVERHEADPRECINCT 12,REP,JEB BUSH,16
2,BEAVERHEADPRECINCT 14,REP,JEB BUSH,3
3,BEAVERHEADPRECINCT 17,REP,JEB BUSH,3
4,BEAVERHEADPRECINCT 19,REP,JEB BUSH,0
...,...,...,...,...
6169,YELLOWSTONE55.1,DEM,NO PREFERENCE,2
6170,YELLOWSTONE40-45,DEM,NO PREFERENCE,6
6171,YELLOWSTONE56.2,DEM,NO PREFERENCE,50
6172,YELLOWSTONE56.1,DEM,NO PREFERENCE,5


In [29]:
primary_data["party"].value_counts(dropna=False)

party
REP    4116
DEM    2058
Name: count, dtype: int64

In [30]:
#Remove Write-in and Registered Voters
primary_data = primary_data[~primary_data["candidate"].isin(["Registered Voters", "Write-In"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
NO PREFERENCE      1372
JEB BUSH            686
HILLARY CLINTON     686
TED CRUZ            686
JOHN R. KASICH      686
MARCO RUBIO         686
BERNIE SANDERS      686
DONALD J. TRUMP     686
Name: count, dtype: int64

In [31]:
primary_data = primary_data[~primary_data["candidate"].isin(["NO PREFERENCE"])] 
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'JEB BUSH': 'REP', 'HILLARY CLINTON': 'DEM', 'TED CRUZ': 'REP', 'JOHN R. KASICH': 'REP', 'MARCO RUBIO': 'REP', 'BERNIE SANDERS': 'DEM', 'DONALD J. TRUMP': 'REP'}


In [32]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP
0,BEAVERHEADPRECINCT 01,27,44,10,18,30,8,78
1,BEAVERHEADPRECINCT 02,22,50,9,38,23,13,252
2,BEAVERHEADPRECINCT 03,36,33,11,35,35,18,269
3,BEAVERHEADPRECINCT 04,10,7,5,13,7,5,76
4,BEAVERHEADPRECINCT 05,1,1,2,7,5,2,56
...,...,...,...,...,...,...,...,...
687,YELLOWSTONE55.4,131,134,6,33,12,9,250
688,YELLOWSTONE55.5,199,164,11,82,47,15,421
689,YELLOWSTONE56.1,22,34,2,15,8,7,106
690,YELLOWSTONE56.2,274,318,21,138,63,30,882


In [33]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
gen_combined_df["precinct"] = gen_combined_df["county"].astype(str) + gen_combined_df["precinct"].astype(str)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,county,precinct,district,office,party,candidate,votes
0,Beaverhead,BEAVERHEADPRECINCT 01,,President,Democrat,Hillary Clinton,175
1,Beaverhead,BEAVERHEADPRECINCT 12,,President,Democrat,Hillary Clinton,157
2,Beaverhead,BEAVERHEADPRECINCT 14,,President,Democrat,Hillary Clinton,125
3,Beaverhead,BEAVERHEADPRECINCT 17,,President,Democrat,Hillary Clinton,18
4,Beaverhead,BEAVERHEADPRECINCT 19,,President,Democrat,Hillary Clinton,14
...,...,...,...,...,...,...,...
3425,Yellowstone,YELLOWSTONE55.1,,President,Republican,Donald Trump,385
3426,Yellowstone,YELLOWSTONE40-45,,President,Republican,Donald Trump,626
3427,Yellowstone,YELLOWSTONE56.2,,President,Republican,Donald Trump,2144
3428,Yellowstone,YELLOWSTONE56.1,,President,Republican,Donald Trump,235


In [34]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
Democrat          686
American Delta    686
Libertarian       686
Green             686
Republican        686
Name: count, dtype: int64

In [35]:
general_data["candidate"].value_counts(dropna=False)

candidate
Hillary Clinton             686
Rocky Roque de la Fuenta    686
Gary Johnson                686
Jill Stein                  686
Donald Trump                686
Name: count, dtype: int64

In [36]:
# general_data["candidate"] = general_data["candidate"].str.extract(r"^([^/&]+)").iloc[:, 0].str.strip()
# general_data = general_data[~general_data["candidate"].isin(["Registered Voters", "Write-In"])] 
general_data.loc[:, "party"] = general_data["party"].replace({
    "Democrat": "DEM",
    "Republican": "REP",
    "Libertarian": "LIB",
    "Green": "GRN",
    "Constitution": "CON",
    "American Delta": "AMD",
    "Prohibition": "PRO"
})
general_data["party"].value_counts(dropna=False)

party
DEM    686
AMD    686
LIB    686
GRN    686
REP    686
Name: count, dtype: int64

In [37]:

general_data

Unnamed: 0,precinct,party,candidate,votes
0,BEAVERHEADPRECINCT 01,DEM,Hillary Clinton,175
1,BEAVERHEADPRECINCT 12,DEM,Hillary Clinton,157
2,BEAVERHEADPRECINCT 14,DEM,Hillary Clinton,125
3,BEAVERHEADPRECINCT 17,DEM,Hillary Clinton,18
4,BEAVERHEADPRECINCT 19,DEM,Hillary Clinton,14
...,...,...,...,...
3425,YELLOWSTONE55.1,REP,Donald Trump,385
3426,YELLOWSTONE40-45,REP,Donald Trump,626
3427,YELLOWSTONE56.2,REP,Donald Trump,2144
3428,YELLOWSTONE56.1,REP,Donald Trump,235


In [38]:
general_data.loc[:,"candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  general_data.loc[:,"candidate_column"] = (


candidate_column,precinct,gen_amd_FUENTA,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP
0,BEAVERHEADPRECINCT 01,1,175,15,48,203
1,BEAVERHEADPRECINCT 02,4,127,16,35,551
2,BEAVERHEADPRECINCT 03,0,158,5,32,507
3,BEAVERHEADPRECINCT 04,0,34,1,6,131
4,BEAVERHEADPRECINCT 05,0,9,1,4,99
...,...,...,...,...,...,...
681,YELLOWSTONE55.4,4,344,16,87,731
682,YELLOWSTONE55.5,5,501,11,115,1027
683,YELLOWSTONE56.1,0,64,5,20,235
684,YELLOWSTONE56.2,14,849,51,150,2144


In [39]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_SANDERS,pri_rep_BUSH,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP,gen_amd_FUENTA,gen_dem_CLINTON,gen_grn_STEIN,gen_lib_JOHNSON,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,BEAVERHEADPRECINCT 01,27,44,10,18,30,8,78,1,175,15,48,203,144,71,442
1,BEAVERHEADPRECINCT 02,22,50,9,38,23,13,252,4,127,16,35,551,335,72,733
2,BEAVERHEADPRECINCT 03,36,33,11,35,35,18,269,0,158,5,32,507,368,69,702
3,BEAVERHEADPRECINCT 04,10,7,5,13,7,5,76,0,34,1,6,131,106,17,172
4,BEAVERHEADPRECINCT 05,1,1,2,7,5,2,56,0,9,1,4,99,72,2,113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,YELLOWSTONE55.4,131,134,6,33,12,9,250,4,344,16,87,731,310,265,1182
658,YELLOWSTONE55.5,199,164,11,82,47,15,421,5,501,11,115,1027,576,363,1659
659,YELLOWSTONE56.1,22,34,2,15,8,7,106,0,64,5,20,235,138,56,324
660,YELLOWSTONE56.2,274,318,21,138,63,30,882,14,849,51,150,2144,1134,592,3208


In [40]:
combined.to_csv("MT.csv", index=False)
