In [1]:
import pandas as pd
import glob
import os
from pprint import pprint

In [2]:
#Get all CSV files in the folder of GA
all_files = glob.glob(r"C:\Huy Phan\College\VoterTurnout\data\RI\*.csv")

# Files that contain both 'precinct' and 'general' in the filename
general_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'general'])
]

# Files that contain both 'precinct' and 'primary' in the filename
primary_files = [
    f for f in all_files
    if all(word in f.lower() for word in ['precinct', 'primary', 'president'])
]


In [3]:
print("General files:")
for f in general_files:
    print(f)


General files:
C:\Huy Phan\College\VoterTurnout\data\RI\20161108__ri__general__precinct.csv


In [4]:
print("\nPrimary files:")
for f in primary_files:
    print(f)


Primary files:
C:\Huy Phan\College\VoterTurnout\data\RI\20160426__ri__primary__president__precinct.csv


In [5]:
# Process primary files
primary_df_list = []

for file in primary_files:
    try:
        df = pd.read_csv(file)

        # Drop rows where 'precinct' is NaN
        if 'office' in df.columns:
            df = df[df["office"] == "President"]
        
        df = df.drop_duplicates()

        primary_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
pri_combined_df = pd.concat(primary_df_list, ignore_index=True)
# Create a new column "precinct" by combining county and precinct
pri_combined_df["precinct"] = pri_combined_df["precinct"].str.upper()
pri_combined_df


Unnamed: 0,county,precinct,office,district,party,candidate,votes
0,Bristol,BARRINGTON 0101,President,,Democratic,Mark Stewart,0
1,Bristol,BARRINGTON 0101,President,,Democratic,Bernie Sanders,251
2,Bristol,BARRINGTON 0101,President,,Democratic,Uncommitted,5
3,Bristol,BARRINGTON 0101,President,,Democratic,"Roque ""Rocky"" De La Fuente",0
4,Bristol,BARRINGTON 0101,President,,Democratic,Hillary Clinton,214
...,...,...,...,...,...,...,...
2039,,FEDERAL PRECINCT #2,President,,Republican,John R. Kasich,249
2040,,FEDERAL PRECINCT #2,President,,Republican,Marco Rubio,11
2041,,FEDERAL PRECINCT #2,President,,Republican,Uncommitted,25
2042,,FEDERAL PRECINCT #2,President,,Republican,Overvotes,2


In [6]:
pri_combined_df["party"].value_counts(dropna=False)

party
Democratic    1022
Republican    1022
Name: count, dtype: int64

In [7]:
# Select only the relevant columns
primary_data = pri_combined_df[["precinct", "party", "candidate", "votes"]]
primary_data.loc[:, "party"] = primary_data["party"].replace({
    "Democratic": "DEM",
    "Republican": "REP"
})
# primary_data = primary_data[primary_data["party"].isin(["DEM", "REP"])] # Analyzing only republican and democratic

primary_data

Unnamed: 0,precinct,party,candidate,votes
0,BARRINGTON 0101,DEM,Mark Stewart,0
1,BARRINGTON 0101,DEM,Bernie Sanders,251
2,BARRINGTON 0101,DEM,Uncommitted,5
3,BARRINGTON 0101,DEM,"Roque ""Rocky"" De La Fuente",0
4,BARRINGTON 0101,DEM,Hillary Clinton,214
...,...,...,...,...
2039,FEDERAL PRECINCT #2,REP,John R. Kasich,249
2040,FEDERAL PRECINCT #2,REP,Marco Rubio,11
2041,FEDERAL PRECINCT #2,REP,Uncommitted,25
2042,FEDERAL PRECINCT #2,REP,Overvotes,2


In [8]:
primary_data["party"].value_counts(dropna=False)

party
DEM    1022
REP    1022
Name: count, dtype: int64

In [9]:
#Remove Uncommited, Overvotes, Undervotes
primary_data = primary_data[~primary_data["candidate"].isin(["Uncommitted", "Overvotes", "Undervotes"])] 
primary_data["candidate"].value_counts(dropna=False)

candidate
Mark Stewart                  146
Bernie Sanders                146
Roque "Rocky" De La Fuente    146
Hillary Clinton               146
Donald J. Trump               146
Ted Cruz                      146
John R. Kasich                146
Marco Rubio                   146
Name: count, dtype: int64

In [10]:
primary_data["candidate"].unique()
candidate_party_map = (
    primary_data.dropna(subset=["candidate", "party"])
                .set_index("candidate")["party"]
                .to_dict()
)
print(candidate_party_map)

{'Mark Stewart': 'DEM', 'Bernie Sanders': 'DEM', 'Roque "Rocky" De La Fuente': 'DEM', 'Hillary Clinton': 'DEM', 'Donald J. Trump': 'REP', 'Ted Cruz': 'REP', 'John R. Kasich': 'REP', 'Marco Rubio': 'REP'}


In [11]:
primary_data.loc[:,"candidate_column"] = (
    "pri_" +
    primary_data["party"].str.lower() + "_" +
    primary_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
primary_result = primary_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

primary_result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  primary_data.loc[:,"candidate_column"] = (


candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_dem_STEWART,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP
0,BARRINGTON 0101,214,0,251,0,34,92,2,115
1,BARRINGTON 0102,336,0,347,0,29,119,2,138
2,BARRINGTON 0103,313,0,216,0,25,153,4,124
3,BARRINGTON 0104,316,1,299,0,23,158,3,173
4,BARRINGTON 0105,311,1,286,1,33,178,3,137
...,...,...,...,...,...,...,...,...,...
141,WOONSOCKET 3902,159,0,105,1,5,7,4,62
142,WOONSOCKET 3903,337,1,394,2,39,31,4,181
143,WOONSOCKET 3905,124,0,164,1,14,10,0,67
144,WOONSOCKET 3907,339,2,488,0,49,62,2,313


In [12]:
# Process general files
gen_df_list = []

for file in general_files:
    try:
        df = pd.read_csv(file)

        # Select only president
        if 'office' in df.columns:
            df = df[df["office"] == "President" ]

        gen_df_list.append(df)

    except Exception as e:
        print(f"Error in {file}: {e}")

# Combine all cleaned files
gen_combined_df = pd.concat(gen_df_list, ignore_index=True)
gen_combined_df["precinct"] = gen_combined_df["precinct"].str.upper()
gen_combined_df


Unnamed: 0,office,district,candidate,party,precinct,votes,absentee_votes
0,President,,Donald J. Trump,R,BARRINGTON 0101,522,36
1,President,,Donald J. Trump,R,BARRINGTON 0102,683,67
2,President,,Donald J. Trump,R,BARRINGTON 0103,469,39
3,President,,Donald J. Trump,R,BARRINGTON 0104,611,38
4,President,,Donald J. Trump,R,BARRINGTON 0105,590,68
...,...,...,...,...,...,...,...
3001,President,,WRITE-IN,,WOONSOCKET 3912,17,0
3002,President,,WRITE-IN,,WOONSOCKET LIMITED,0,0
3003,President,,WRITE-IN,,WOONSOCKET PRESIDENTIAL,4,0
3004,President,,WRITE-IN,,FEDERAL PRECINCT #1,1,1


In [13]:
general_data = gen_combined_df[["precinct", "party", "candidate", "votes"]]
general_data["party"].value_counts(dropna=False)


party
NaN    1002
R       501
D       501
L       501
G       501
Name: count, dtype: int64

In [14]:
general_data["candidate"].value_counts(dropna=False)

candidate
Donald J. Trump               501
Hillary Clinton               501
Gary Johnson                  501
Jill Stein                    501
"Rocky" Roque De La Fuente    501
WRITE-IN                      501
Name: count, dtype: int64

In [15]:
# remove WRITE-IN
general_data = general_data[~general_data["candidate"].isin(["WRITE-IN"])] 
general_data["candidate"].value_counts(dropna=False)

candidate
Donald J. Trump               501
Hillary Clinton               501
Gary Johnson                  501
Jill Stein                    501
"Rocky" Roque De La Fuente    501
Name: count, dtype: int64

In [16]:
def fill_party_from_general_data(row, df):
    if pd.notna(row["party"]):
        return row["party"]
    
    # Try to find other rows with the same candidate and known party
    matches = df[(df["candidate"] == row["candidate"]) & (df["party"].notna())]
    if not matches.empty:
        return matches["party"].iloc[0]  # Return the first match's party
    else:
        return None  # Still unknown

# general_data["party"] = general_data.apply(
#     lambda row: fill_party_from_general_data(row, general_data),
#     axis=1
# )


In [17]:
# general_data = general_data[~general_data["candidate"].isin(["YES", "NO"])]
# general_data["party"] = general_data.apply(
#     lambda row: candidate_party_map.get(row["candidate"], row["party"]) if pd.isna(row["party"]) else row["party"],
#     axis=1
# )

general_data["party"] = general_data.apply(
    lambda row: fill_party_from_general_data(row, general_data),
    axis=1
)

general_data.loc[:,"party"] = (
    general_data["party"]
    .replace({
        "D": "DEM",
        "R": "REP",
        "L": "LIB",
        "G": "GRN",
    })
    .fillna("IND")
)
general_data["party"].value_counts(dropna=False)


party
REP    501
DEM    501
LIB    501
GRN    501
IND    501
Name: count, dtype: int64

In [18]:
general_data["candidate_column"] = (
    "gen_" +
    general_data["party"].str.lower() + "_" +
    general_data["candidate"].str.split().str[-1].str.upper()
)

# pivot the table
general_result = general_data.pivot_table(
    index="precinct",
    columns="candidate_column",
    values="votes",
    aggfunc="sum",  
    fill_value=0
).reset_index()

general_result

candidate_column,precinct,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FUENTE,gen_lib_JOHNSON,gen_rep_TRUMP
0,BARRINGTON 0101,1027,23,2,60,522
1,BARRINGTON 0102,1331,21,3,68,683
2,BARRINGTON 0103,1136,17,0,59,469
3,BARRINGTON 0104,1328,25,2,78,611
4,BARRINGTON 0105,1290,24,4,69,590
...,...,...,...,...,...,...
496,WOONSOCKET 3910,686,18,2,68,759
497,WOONSOCKET 3911,677,20,4,46,465
498,WOONSOCKET 3912,506,18,2,37,448
499,WOONSOCKET LIMITED,0,0,0,0,0


In [19]:
combined = pd.merge(primary_result, general_result, on="precinct", how="inner")
combined["rep_primary_total"] = combined.filter(like="pri_rep_").sum(axis=1)
combined["dem_primary_total"] = combined.filter(like="pri_dem_").sum(axis=1)
combined["general_total"] = combined.filter(like="gen_").sum(axis=1)
combined

candidate_column,precinct,pri_dem_CLINTON,pri_dem_FUENTE,pri_dem_SANDERS,pri_dem_STEWART,pri_rep_CRUZ,pri_rep_KASICH,pri_rep_RUBIO,pri_rep_TRUMP,gen_dem_CLINTON,gen_grn_STEIN,gen_ind_FUENTE,gen_lib_JOHNSON,gen_rep_TRUMP,rep_primary_total,dem_primary_total,general_total
0,BARRINGTON 0101,214,0,251,0,34,92,2,115,1027,23,2,60,522,243,465,1634
1,BARRINGTON 0102,336,0,347,0,29,119,2,138,1331,21,3,68,683,288,683,2106
2,BARRINGTON 0103,313,0,216,0,25,153,4,124,1136,17,0,59,469,306,529,1681
3,BARRINGTON 0104,316,1,299,0,23,158,3,173,1328,25,2,78,611,357,616,2044
4,BARRINGTON 0105,311,1,286,1,33,178,3,137,1290,24,4,69,590,351,599,1977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,WOONSOCKET 3902,159,0,105,1,5,7,4,62,400,10,2,17,230,78,265,659
142,WOONSOCKET 3903,337,1,394,2,39,31,4,181,300,12,0,9,162,255,734,483
143,WOONSOCKET 3905,124,0,164,1,14,10,0,67,506,18,1,37,425,91,289,987
144,WOONSOCKET 3907,339,2,488,0,49,62,2,313,563,25,0,53,628,426,829,1269


In [20]:
combined.to_csv("RI.csv", index=False)
