# Analyze observed climatology from NCLIMGRID and LIVNEH

This notebook contains example calculations to help you get started on developing your own custom analysis. \
First, set your parameters and retrieve (a subset of) the pre-calculated zonal statistics from the feature service. \
Then, calculate the mean and change over time for your retrieved dataset. \
Last, develop your own analysis.

# Import libraries

In [3]:
import numpy as np
import pandas as pd
import geopandas as gpd
import requests

# Set feature service variables

In [4]:
# URL of the feature service layer to read (layer 3 of the CRIS_Zonal_Statistics_by_County service)
feat_srvc = 'https://services3.arcgis.com/0Fs3HcaFfvzXvm7w/ArcGIS/rest/services/CRIS_Zonal_Statistics_by_County/FeatureServer/3'
# Path appended to feat_srvc when requesting rows from the layer
query = '/query'

# Use Geographic Identifiers (Field Name: GEOID) to pick which counties to calculate zonal statistics over.
# This field name is inserted into the where clause that selects the counties.
uniqueID = 'GEOID'

# Pick counties and years to return

In [5]:
# County selection: list each county's GEOID as a quoted string, separated by commas,
# e.g. ids = ('20201', '20117'). For a single county use a plain string, e.g. ids = '20201'.
# Use ids = 'all' to select every county in the feature layer.
ids = ('20201', '20203', '20117')

In [6]:
# Pick start and end year of the period you want to process.
# Both years are included in the selection (the where clause uses YEAR >= start AND YEAR <= end).
# If you only want to process 1 year, put your desired year for both the start and end.
year_start = 1981
year_end = 2020

In [7]:
# Define the where clause for counties and years to return

# The year filter is the same for every selection and includes both end years
year_clause = f"YEAR >= {year_start} AND YEAR <= {year_end}"

if np.ndim(ids) == 0:

    # ids is a single value: either the keyword 'all' or one GEOID
    if ids == 'all':

        where_clause_id = year_clause

    else:

        where_clause_id = f"{uniqueID} = '{ids}' AND {year_clause}"

else:

    # ids is a sequence of GEOIDs (tuple, list, array, ...)
    if len(ids) == 0:

        raise ValueError("ids is empty; provide at least one GEOID or set ids = 'all'")

    # Build the IN list explicitly instead of relying on the tuple's printed form,
    # so that lists and one-element tuples also give a valid clause
    id_list = ", ".join(f"'{county_id}'" for county_id in ids)

    where_clause_id = f"{uniqueID} IN ({id_list}) AND {year_clause}"

# Pick variables to return

### Pick one of the three options below, run the appropriate cells, then move to "Retrieve data"

##### Option 1: return all variables

In [14]:
# Run this cell if you want all variables returned
# Later cells test for '*' to remove the OBJECTID and BATCH_ID columns before calculating
variables = '*'

##### Option 2: return manually selected subset of variables

In [None]:
# Show available variables
# Request the layer description (JSON) from the feature service
response_temp = requests.get(feat_srvc + '?f=pjson')

# Stop with a clear HTTP error if the request failed, rather than failing later on the missing 'fields' key
response_temp.raise_for_status()

data = response_temp.json()

# Each entry in 'fields' describes one variable; its 'name' and 'alias' are printed below
fields = data['fields']
for field in fields:
    print('Variable:', field['name'], '\nDescription:', field['alias'], '\n')
    print()

In [None]:
# If you want 1 or more variables from the above list, select them here
# Put each variable between apostrophes and separate by commas,
# e.g. variables = 'TMAX_NCLIMGRID_MIN', 'TMAX_NCLIMGRID_MAX', 'TMAX_NCLIMGRID_MEAN'
variables = 'TMAX_NCLIMGRID_MEAN'

##### Option 3: select all NCLIMGRID or all LIVNEH variables

In [29]:
# Download the layer's field list here so this cell also works when the Option 2 cells were not run
layer_info = requests.get(feat_srvc + '?f=pjson')
layer_info.raise_for_status()
fields = layer_info.json()['fields']

# Keep the line below active for all NCLIMGRID variables
variables = [field['name'] for field in fields if "NCLIMGRID" in field['name']]

# Uncomment the line below (and comment out the line above) for all LIVNEH variables
# variables = [field['name'] for field in fields if "LIVNEH" in field['name']]

# Retrieve data

In [46]:
# Add counties ('GEOID') and 'YEAR' to the variables-to-return list
return_variables = ['GEOID', 'YEAR']

# variables is either a single string (e.g. '*' or one variable name) or a sequence of names.
# Test the number of dimensions rather than the size, so a list holding one name is
# still treated as a list and not appended as a nested list.
if np.ndim(variables) == 0:

    selected_variables = [variables]

else:

    selected_variables = list(variables)

for var in selected_variables:

    # Skip names that are already listed (e.g. if GEOID or YEAR was selected manually)
    if var not in return_variables:

        return_variables.append(var)

In [None]:
# Maximum number of variables that can be processed in each request: 50
# Maximum number of rows that can be processed in each request: 1000
# This code requests the variables in subsets of 50 and, for each subset, the rows in pages of 1000,
# then combines everything into the final table (gdf)

# Number of variables to request in each call
loop_size = 50

# Number of rows in a full page; a page of exactly this size means more rows may follow
page_size = 1000


def fetch_rows(out_fields):
    """Download all rows matching where_clause_id for one subset of fields.

    Rows are requested ordered by YEAR and then GEOID, one page at a time,
    until a page holds a number of rows other than page_size.

    Parameters
    ----------
    out_fields : list of str
        Field names to request. If the list contains '*', all fields are requested.

    Returns
    -------
    Table with one column per returned field and the geometry column removed.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    RuntimeError
        If the response body contains an 'error' entry.
    ValueError
        If the query returns no rows at all.
    """
    # The field list is sent as one comma-separated string; '*' is sent on its own
    if '*' in out_fields:
        out_fields_param = '*'
    else:
        out_fields_param = ','.join(out_fields)

    pages = []
    offset = 0

    while True:

        # Set parameters for requests.get()
        params = {
            'where': where_clause_id,
            'outFields': out_fields_param,
            'orderByFields': 'YEAR,GEOID',
            'f': 'pgeojson',
        }

        # The offset says how many rows to skip from the start; the first page needs none
        if offset > 0:
            params['resultOffset'] = offset

        # Request response from feature service
        response = requests.get(feat_srvc + query, params=params)
        response.raise_for_status()

        # The service may describe a failed query in the response body instead of the HTTP status
        payload = response.json()
        if 'error' in payload:
            raise RuntimeError(f"Feature service query failed: {payload['error']}")

        n_rows = len(payload.get('features', []))

        if n_rows > 0:

            # Translate the response text into a temporary GeoDataFrame and drop its geometry field
            page = gpd.read_file(response.text)
            pages.append(page.drop(columns=['geometry'], errors='ignore'))

        # Anything other than a full page means there are no further rows to request
        if n_rows != page_size:
            break

        offset += page_size

    if not pages:
        raise ValueError("The query returned no rows; check ids, year_start and year_end")

    return pd.concat(pages, ignore_index=True)


# Request the variables in subsets of loop_size
variable_subsets = [
    fetch_rows(return_variables[first:first + loop_size])
    for first in range(0, len(return_variables), loop_size)
]

# The subsets are joined side by side by row position, which relies on every request
# returning the same rows in the same order (same where clause and ordering)
if len({len(subset) for subset in variable_subsets}) > 1:
    raise ValueError("The variable subsets returned different numbers of rows and cannot be combined")

gdf = pd.concat(variable_subsets, axis=1)

# Sort GeoDataFrame by GEOID and YEAR
gdf = gdf.sort_values(by=['GEOID', 'YEAR'])

gdf

# Analysis examples

### Calculation 1: mean over time period

In [26]:
# Years (both included) over which the mean is calculated.
# The mean calculation reads gdf1, so run this cell before it.
year_start_subset = 1990
year_end_subset = 1995

# Keep only the rows of gdf whose YEAR lies inside the chosen period
in_subset = gdf['YEAR'].ge(year_start_subset) & gdf['YEAR'].lt(year_end_subset + 1)
gdf1 = gdf[in_subset]

##### Run this cell to return the mean of each variable in the GeoDataFrame

In [27]:
# Group by county (GEOID) and calculate the mean over the rows in gdf1
# (period 'year_start_subset - year_end_subset').
# numeric_only=True leaves out text columns, which would otherwise raise an error in recent pandas versions.
variables_mean = gdf1.groupby('GEOID').mean(numeric_only=True)

# Columns that are not climate variables: YEAR always, and OBJECTID and BATCH_ID when all variables were returned
columns_to_drop = ['YEAR']
if isinstance(variables, str) and variables == '*':
    columns_to_drop += ['OBJECTID', 'BATCH_ID']

# errors='ignore' skips any of these columns that are not present
variables_mean = variables_mean.drop(columns=columns_to_drop, errors='ignore')

# Mark every remaining column as a mean over time.
# This is done while GEOID is still the index, so GEOID itself keeps its name.
variables_mean = variables_mean.add_suffix(' (mean)')

# Add index numbers to rows (GEOID becomes a regular column again)
variables_mean = variables_mean.reset_index()

# Show means of variables
variables_mean

Unnamed: 0,GEOID,TMAX_NCLIMGRID_MIN (mean),TMAX_NCLIMGRID_MAX (mean),TMAX_NCLIMGRID_MEAN (mean)
0,20117,62.525172,64.834242,63.742053
1,20201,63.295928,64.979055,64.146538
2,20203,65.892865,66.977767,66.478455


### Calculation 2: change over time

In [28]:
# Set years over which to calculate the change
year_start_change = 1985
year_end_change = 1995

# Create two new GeoDataFrames which contain only the start and end years, respectively.
# .copy() makes them independent of gdf, so the in-place edits applied to them later
# do not act on a slice of gdf (which triggers pandas' SettingWithCopyWarning).
gdf_start_change = gdf[gdf['YEAR'] == year_start_change].copy()
gdf_end_change = gdf[gdf['YEAR'] == year_end_change].copy()

In [None]:
# Columns that are not climate variables and must not enter the change calculation:
# YEAR always, and OBJECTID and BATCH_ID when all variables were returned
columns_to_drop = ['YEAR']
if isinstance(variables, str) and variables == '*':
    columns_to_drop += ['OBJECTID', 'BATCH_ID']


def index_by_county(frame):
    """Return frame indexed by county (GEOID) without the columns in columns_to_drop.

    The function returns a new table instead of editing frame in place, and it can be
    applied to a table that was already prepared: the index is only set when GEOID is
    not the index yet, and columns that are already gone are skipped.
    """
    if frame.index.name != 'GEOID':
        frame = frame.set_index('GEOID')
    return frame.drop(columns=columns_to_drop, errors='ignore')


# Set the indices of both GeoDataFrames to the counties (GEOIDs) and drop the helper columns
gdf_start_change = index_by_county(gdf_start_change)
gdf_end_change = index_by_county(gdf_end_change)

In [32]:
# Calculate the difference (end year minus start year) per county and variable.
# The two tables are matched on their index (GEOID); the result keeps its sign,
# so a negative value means the variable decreased.
variables_difference_absolute = gdf_end_change - gdf_start_change

# Rename columns to reflect that they represent the difference between the start and end years.
# This is done while GEOID is still the index, so every variable is renamed regardless of column order.
variables_difference_absolute = variables_difference_absolute.add_suffix(' (difference)')

# Add index numbers to rows (GEOID becomes a regular column again)
variables_difference_absolute = variables_difference_absolute.reset_index()

# Show difference of each county and each variable between the start and end years
variables_difference_absolute

Unnamed: 0,GEOID,TMAX_NCLIMGRID_MIN (difference),TMAX_NCLIMGRID_MAX (difference),TMAX_NCLIMGRID_MEAN (difference)
0,20117,1.54364,1.57245,1.71683
1,20201,2.77005,1.92838,2.27677
2,20203,2.22816,2.26542,2.24628


In [33]:
# Calculate percentage difference between start and end years, relative to the start year.
# The two tables are matched on their index (GEOID).
variables_difference_percentage = ((gdf_end_change - gdf_start_change) / gdf_start_change) * 100

# Rename columns to reflect that they represent the percentage difference between the start and end years.
# This is done while GEOID is still the index, so every variable is renamed regardless of column order.
variables_difference_percentage = variables_difference_percentage.add_suffix(' (% difference)')

# Add index numbers to rows (GEOID becomes a regular column again)
variables_difference_percentage = variables_difference_percentage.reset_index()

# Show percentage difference of each county and each variable between the start and end years
variables_difference_percentage

Unnamed: 0,GEOID,TMAX_NCLIMGRID_MIN (% difference),TMAX_NCLIMGRID_MAX (% difference),TMAX_NCLIMGRID_MEAN (% difference)
0,20117,2.561817,2.515316,2.797385
1,20201,4.602177,3.09239,3.712668
2,20203,3.484224,3.482593,3.481557
