# How to Preprocess Florida Avocado Farm Data

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# File names shared by every step of this notebook
original_geojson_file = "Florida_Counties.geojson"        # input: read in Step 1
csv_file = "FloridaCounties.csv"                          # intermediate: written in Step 1, hand-edited in Step 2
new_geojson_file = f"Farm_Info_{original_geojson_file}"   # output: written in Step 3

## Step 1: Take the GeoJSON, extract the counties, and save them to a CSV. 

In [3]:
# Read the original GeoJSON into a dataframe.
# NOTE(review): later cells fetch each feature with df.iloc[i, 1], i.e. they rely on
# the feature dicts landing in the SECOND column -- confirm this matches the input file.
df = pd.read_json(original_geojson_file)

In [4]:
# Collect every county name from the GeoJSON features, normalised to
# title case with surrounding whitespace removed
county_list = [
    df.iloc[row, 1]['properties']['COUNTYNAME'].title().strip()
    for row in range(len(df))
]

print(f"{len(county_list)} counties found.")

67 counties found.


In [5]:
# Start a fresh dataframe with one row per county. The farm flag is a
# placeholder here: every county begins at 0 ("no avocado farm").
initial_columns = {
    "County": county_list,
    "HasAvocadoFarm": 0,   # scalar, broadcast to every row
}
county_df = pd.DataFrame(initial_columns)

In [6]:
# Sort the counties alphabetically and renumber the rows 0..n-1.
# reset_index returns a NEW dataframe, so its result has to be assigned back;
# otherwise county_df silently keeps the shuffled pre-sort index.
county_df = county_df.sort_values(['County']).reset_index(drop=True)
county_df

Unnamed: 0,County,HasAvocadoFarm
0,Alachua,0
1,Baker,0
2,Bay,0
3,Bradford,0
4,Brevard,0
...,...,...
62,Union,0
63,Volusia,0
64,Wakulla,0
65,Walton,0


In [7]:
# Save the dataframe to a CSV -- but only if the file does not exist yet.
# Step 2 edits this file by hand, so re-running this cell (e.g. "Run All")
# must not silently overwrite those edits with the all-zero placeholder flags.
try:
    # mode="x" is exclusive creation: it fails if the file already exists
    county_df.to_csv(csv_file, index=False, header=True, mode="x")
    print(f"Wrote {csv_file}.")
except FileExistsError:
    print(f"{csv_file} already exists; leaving it untouched. "
          "Delete it and re-run this cell to regenerate it from scratch.")

## Step 2: Manually update the new CSV to set the HasAvocadoFarm field appropriately for each county.

### NOTE: Use 1 to indicate the county has an avocado farm, and 0 if it doesn't. This is IMPORTANT!

In [8]:
# Stop here!

# Update the CSV using your favorite editor (Excel works fine) and save it with the same filename. 
# Be sure to CLOSE the file after editing! 

# Once the CSV file is updated and saved, proceed with the next steps. 

## Step 3: Read the updated CSV into a new dataframe and use it to help modify the contents of the original GeoJSON. The resulting GeoJSON will have a field indicating whether the county in question has an avocado farm. This new field can be used to color the counties as desired. 

In [9]:
# Load the hand-edited CSV and index it by county name, so a county's
# farm flag can be looked up directly by name
county_df = pd.read_csv(csv_file).set_index('County')
county_df.head()

Unnamed: 0_level_0,HasAvocadoFarm
County,Unnamed: 1_level_1
Alachua,1
Baker,1
Bay,0
Bradford,0
Brevard,1


In [10]:
# Copy each county's farm flag from the CSV-backed dataframe into the
# matching GeoJSON feature, under a new 'HasAvocadoFarm' property.
for row in range(len(df)):
    # The properties dict is modified in place, so the change is visible through df
    properties = df.iloc[row, 1]['properties']

    # Normalise the name the same way the CSV's County column was built
    # (title case, whitespace stripped) so it matches the index of county_df
    county_name = properties['COUNTYNAME'].title().strip()
    properties['HasAvocadoFarm'] = county_df.loc[county_name, 'HasAvocadoFarm']
    

In [11]:
# Helper: rebuild a GeoJSON FeatureCollection from the dataframe
def dfGeoJSON(df):
    """Return a GeoJSON-style dict built from the second column of ``df``.

    Each value in column position 1 is appended, unchanged and in row order,
    to the ``'features'`` list of a ``{'type': 'FeatureCollection', ...}`` dict.
    """
    features = [df.iloc[row, 1] for row in range(len(df))]
    return {'type': 'FeatureCollection', 'features': features}

In [12]:
# Convert the updated dataframe back to a GeoJSON-style dict
# (a FeatureCollection whose features now carry the HasAvocadoFarm property)
geojson = dfGeoJSON(df)

In [13]:
# Define a helper function to deal with numpy datatypes
def convert(o):
    """Fallback serializer for ``json.dumps(..., default=convert)``.

    The json module calls this only for objects it cannot serialize itself.
    NumPy scalars and arrays are turned into their plain-Python equivalents.

    Raises:
        TypeError: if ``o`` is not a supported NumPy type (the signal the json
            module expects from a ``default`` hook).
    """
    # np.integer covers every integer width (int32, int64, uint8, ...), not just int64
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

In [14]:
# Write the enriched FeatureCollection to the new GeoJSON file;
# `convert` handles the numpy values that json cannot serialize natively
with open(new_geojson_file, 'w') as out_file:
    json.dump(geojson, out_file, default=convert)