In [1]:
import geopandas as gpd

## Arizona: Cleaning the Precinct Data

In [2]:
# Read the aggregated Arizona precinct layer and reproject it to EPSG:4326
# in one chained expression.
file_path = "./clean_data/arizona_data/ArizonaAggPrecinct.geojson"
gdf = gpd.read_file(file_path).to_crs(epsg=4326)

# Sanity check: the first rows, followed by the CRS now attached to the frame.
print(gdf.head(), gdf.crs, sep="\n")

             UNIQUE_ID COUNTYFP  PCTNUM        PRECINCTNA CDE_COUNTY  \
0            02 ALPINE      001  AP0002            ALPINE         AP   
1  03 CANYON DE CHELLY      001  AP0003  CANYON DE CHELLY         AP   
2            05 CHINLE      001  AP0005            CHINLE         AP   
3            09 CONCHO      001  AP0009            CONCHO         AP   
4        11 COTTONWOOD      001  AP0011        COTTONWOOD         AP   

  COUNTY_NAM CON_DIST SLDL_DIST SLDU_DIST  G20PREDBID  ...  GSL30DTER  \
0     Apache       01        07        07        94.0  ...        0.0   
1     Apache       01        07        07      1982.0  ...        0.0   
2     Apache       01        07        07       989.0  ...        0.0   
3     Apache       01        07        07       333.0  ...        0.0   
4     Apache       01        07        07       748.0  ...        0.0   

   P0010001  P0010003  P0010004  P0020002  P0010006  P0010005  P0010007  \
0       664       594         2        77         2  

In [3]:
# Source column -> cleaned column name for the precinct layer.
# The key order matters: the next cell uses the keys both as the list of
# columns to keep and as the column order of the exported file.
column_name_mappings = dict(
    # Precinct identifier.
    UNIQUE_ID="precinct",
    # Party vote columns (G20PRE... source fields).
    G20PRERTRU="republican",
    G20PREDBID="democrat",
    # Population columns.
    # NOTE(review): "hispanic" comes from a P002... field while the other
    # groups come from P001... fields, so the groups may overlap -- confirm
    # before summing them against "population".
    P0010001="population",
    P0010003="white",
    P0010004="black",
    P0020002="hispanic",
    P0010006="asian",
    P0010007="pacific",
    P0010005="native",
    P0010008="other",
    # Identity entry so the geometry column survives the column subset.
    geometry="geometry",
)

In [4]:
# The mapping's keys double as the list of source columns worth keeping;
# dict order is preserved, so this also fixes the output column order.
columns_to_keep = [*column_name_mappings]

# Subset and rename in one pass, then normalise every header to lower case.
gdf_filtered = gdf[columns_to_keep].rename(columns=column_name_mappings)
gdf_filtered.columns = [name.lower() for name in gdf_filtered.columns]

gdf_filtered.head()

Unnamed: 0,precinct,republican,democrat,population,white,black,hispanic,asian,pacific,native,other,geometry
0,02 ALPINE,283.0,94.0,664,594,2,77,2,0,10,2,"MULTIPOLYGON (((-109.49567 33.6528, -109.49576..."
1,03 CANYON DE CHELLY,273.0,1982.0,4666,63,13,40,20,0,4526,7,"MULTIPOLYGON (((-109.71666 36.26151, -109.7165..."
2,05 CHINLE,148.0,989.0,2949,183,23,62,74,0,2581,14,"MULTIPOLYGON (((-109.81183 36.27512, -109.8081..."
3,09 CONCHO,1486.0,333.0,2945,2494,6,279,11,3,48,89,"MULTIPOLYGON (((-109.53982 34.44871, -109.5392..."
4,11 COTTONWOOD,87.0,748.0,1529,2,0,15,0,0,1506,1,"MULTIPOLYGON (((-109.81768 36.1476, -109.81822..."


In [5]:
# Show the first five rows of the cleaned precinct frame again before export.
gdf_filtered.head(n=5)

Unnamed: 0,precinct,republican,democrat,population,white,black,hispanic,asian,pacific,native,other,geometry
0,02 ALPINE,283.0,94.0,664,594,2,77,2,0,10,2,"MULTIPOLYGON (((-109.49567 33.6528, -109.49576..."
1,03 CANYON DE CHELLY,273.0,1982.0,4666,63,13,40,20,0,4526,7,"MULTIPOLYGON (((-109.71666 36.26151, -109.7165..."
2,05 CHINLE,148.0,989.0,2949,183,23,62,74,0,2581,14,"MULTIPOLYGON (((-109.81183 36.27512, -109.8081..."
3,09 CONCHO,1486.0,333.0,2945,2494,6,279,11,3,48,89,"MULTIPOLYGON (((-109.53982 34.44871, -109.5392..."
4,11 COTTONWOOD,87.0,748.0,1529,2,0,15,0,0,1506,1,"MULTIPOLYGON (((-109.81768 36.1476, -109.81822..."


In [6]:
# Export the cleaned precinct layer as GeoJSON into the working directory.
gdf_filtered.to_file(
    "arizonaPrecinctData.geojson",
    driver="GeoJSON",
)

In [8]:
# Build a lighter-weight copy of the precinct layer by thinning the polygon
# vertices; the attribute columns are left untouched.
gdf_filtered2 = gdf_filtered.copy()  # copy() is deep by default

# The tolerance is expressed in the units of the frame's CRS.
# NOTE(review): presumably degrees here, since the frame was reprojected to
# EPSG:4326 when it was loaded -- confirm if the load cell changes.
simplified_shapes = gdf_filtered2["geometry"].simplify(
    tolerance=0.001,
    preserve_topology=True,
)
gdf_filtered2["geometry"] = simplified_shapes

# Write the simplified layer to its own GeoJSON file.
gdf_filtered2.to_file(
    "arizonaSimplifiedPrecinctData.geojson",
    driver="GeoJSON",
)

## Arizona: Cleaning the District Data

In [2]:
# Read the aggregated Arizona district layer and reproject it to EPSG:4326
# in one chained expression.
file_path = "./clean_data/arizona_data/ArizonaAggDistrict.geojson"
gdf = gpd.read_file(file_path).to_crs(epsg=4326)

# Sanity check: the first rows, followed by the CRS now attached to the frame.
print(gdf.head(), gdf.crs, sep="\n")

     LONGNAME SHORTNAME  DISTRICT       COLOR   TOTAL  TARGET_DEV  \
0  District 1        D1         1   -16777088  794611           0   
1  District 2        D2         2     7405440  794612           1   
2  District 3        D3         3   950534272  794612           1   
3  District 4        D4         4  1275097984  794611           0   
4  District 5        D5         5    -5635968  794612           1   

   TARGET_DEV_1  CompDemVot  CompRepVot  Pres2020_D  ...  P0010004  P0020002  \
0           0.0       48.70       51.30       50.76  ...     26286    130423   
1           0.0       46.40       53.60       45.95  ...     18200    139426   
2           0.0       76.46       23.54       75.72  ...     87524    509139   
3           0.0       53.51       46.49       55.25  ...     43927    210756   
4           0.0       40.97       59.02       41.74  ...     29640    139644   

   P0010006  P0010005  P0010007  P0010008  G20PRERTRU  G20PREDBID  G20PRELJOR  \
0     36146     16079  

In [3]:
# Party and name of the representative for each district, keyed by district
# number: district -> (party, representative name).
# Note that the "incumbent" column written below holds the party label.
representatives_by_district = {
    1: ("Republican", "David Schweikert"),
    2: ("Republican", "Eli Crane"),
    3: ("Democrat", "Ruben Gallego"),
    4: ("Democrat", "Greg Stanton"),
    5: ("Republican", "Andy Biggs"),
    6: ("Republican", "Juan Ciscomani"),
    7: ("Democrat", "Raúl Grijalva"),
    8: ("Republican", "Debbie Lesko"),
    9: ("Republican", "Paul Gosar"),
}

# Kept as plain lists (in district order) for any cell that still reads them.
incumbents = [party for party, _ in representatives_by_district.values()]
representative_name = [name for _, name in representatives_by_district.values()]

# Join on the DISTRICT value rather than assigning the lists positionally:
# a positional assignment silently mislabels districts whenever the features
# in the file are not stored in district order 1..9.
district_numbers = gdf["DISTRICT"].astype(int)
unknown_districts = sorted(set(district_numbers) - set(representatives_by_district))
assert not unknown_districts, (
    f"no representative listed for district(s): {unknown_districts}"
)

gdf["incumbent"] = district_numbers.map(
    lambda district: representatives_by_district[district][0]
)
gdf["representative"] = district_numbers.map(
    lambda district: representatives_by_district[district][1]
)

# Show only the two freshly added columns to verify the assignment.
display_columns = gdf[["incumbent", "representative"]]
print(display_columns)

    incumbent    representative
0  Republican  David Schweikert
1  Republican         Eli Crane
2    Democrat     Ruben Gallego
3    Democrat      Greg Stanton
4  Republican        Andy Biggs
5  Republican    Juan Ciscomani
6    Democrat     Raúl Grijalva
7  Republican      Debbie Lesko
8  Republican        Paul Gosar


In [4]:
# Source column -> cleaned column name for the district layer.
# The pair order matters: the next cell uses the keys both as the list of
# columns to keep and as the column order of the exported file.
columnNameMapping = dict(
    [
        # District identifier plus the two columns added in the previous cell
        # (identity entries, so they survive the column subset).
        ("DISTRICT", "district"),
        ("representative", "representative"),
        ("incumbent", "incumbent"),
        # Total population, then the party vote columns (G20PRE... fields).
        ("P0010001", "population"),
        ("G20PRERTRU", "republican"),
        ("G20PREDBID", "democrat"),
        # Population group columns.
        # NOTE(review): "hispanic" comes from a P002... field while the other
        # groups come from P001... fields, so the groups may overlap --
        # confirm before summing them against "population".
        ("P0010003", "white"),
        ("P0010004", "black"),
        ("P0020002", "hispanic"),
        ("P0010006", "asian"),
        ("P0010007", "pacific"),
        ("P0010005", "native"),
        ("P0010008", "other"),
        # Identity entry so the geometry column survives the column subset.
        ("geometry", "geometry"),
    ]
)

In [5]:
# The mapping's keys double as the list of source columns worth keeping;
# dict order is preserved, so this also fixes the output column order.
columns_to_keep = [*columnNameMapping]

# Subset and rename in one pass, then normalise every header to lower case.
gdf_filtered = gdf[columns_to_keep].rename(columns=columnNameMapping)
gdf_filtered.columns = [name.lower() for name in gdf_filtered.columns]

gdf_filtered.head()

Unnamed: 0,district,representative,incumbent,population,republican,democrat,white,black,hispanic,asian,pacific,native,other,geometry
0,1,David Schweikert,Republican,784751,221372.0,227872.0,573505,26286,130423,36146,1117,16079,53267,"MULTIPOLYGON (((-111.97848 33.46567, -111.9789..."
1,2,Eli Crane,Republican,814634,221734.0,187006.0,489001,18200,139426,10205,1251,169384,51578,"MULTIPOLYGON (((-111.03999 33.46602, -111.0399..."
2,3,Ruben Gallego,Democrat,817597,56732.0,176494.0,254733,87524,509139,22713,2226,28669,262738,"MULTIPOLYGON (((-112.09965 33.5314, -112.09966..."
3,4,Greg Stanton,Democrat,783453,159822.0,198150.0,477992,43927,210756,44109,3001,23603,88874,"MULTIPOLYGON (((-111.97879 33.43652, -111.9788..."
4,5,Andy Biggs,Republican,795790,240188.0,173132.0,572235,29640,139644,48281,1926,8830,43234,"MULTIPOLYGON (((-111.89263 33.29224, -111.8913..."


In [6]:
# Export the cleaned district layer as GeoJSON into the working directory.
gdf_filtered.to_file(
    "modifiedArizonaDistrictData.geojson",
    driver="GeoJSON",
)

## Make District Data Smaller