# Validation Analysis of the SSP3 and SSP5 Population Data

This notebook tests the population data that goes into GCAM-USA and TELL to confirm that the two datasets add up to the same nationwide and state-level total populations for SSP3 and SSP5. The raw state-level data is taken directly from the Jiang et al. population dataset (https://zenodo.org/records/3956412) and aggregated using the functions below.

In [None]:
# Start by importing the packages we need:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from glob import glob


## Set the Directory Structure

In [None]:
# Identify the top-level directory that holds the input population data and the
# directory where the validation plots will be written:
data_input_dir = '/Users/burl878/Documents/IMMM/Data/TELL/Production_Runs/tell_data/population_data/'
image_output_dir = '/Users/burl878/Documents/Code/code_repos/exp_group_b/analyses/plots/population_validation/'

# Create the "image_output_dir" subdirectory (and any missing parents) if it doesn't exist.
# "exist_ok=True" replaces the separate os.path.exists() check, which could race with
# another process creating the directory between the check and the call:
os.makedirs(image_output_dir, exist_ok=True)


## Process the Raw SSP3 and SSP5 Population Data


In [None]:
# Define a function to aggregate the raw state-level population data for a given scenario:
def process_state_level_population_data(data_input_dir: str, scenario_to_process: str) -> pd.DataFrame:
    """
    Aggregate the raw state-level population projections for one scenario.

    Each subdirectory of '<data_input_dir>/Jiang_et_al_Population/<scenario_to_process>/' is
    treated as one state. The subdirectory name must look like '<FIPS>-<abbreviation>' and
    contain a '<FIPS>-<abbreviation>_proj_pop.csv' file. The 'state', 'age', 'female', and
    'urban' columns of that file are discarded and every remaining column is summed over
    all rows to give one total per column (year).

    :param data_input_dir:          Directory that contains the 'Jiang_et_al_Population' folder
                                    (with or without a trailing path separator)
    :param scenario_to_process:     Name of the scenario subdirectory to process (e.g., 'SSP3')
    :return:                        DataFrame with the columns 'State', 'State_FIPS', 'Scenario',
                                    'Year', and 'Nat_Res_Pop', sorted by 'State_FIPS' and 'Year'
                                    and restricted to the decadal years 2020 through 2100
    :raises FileNotFoundError:      If no state subdirectories are found for the scenario
    """

    # Build the path to the folder holding the state subdirectories for this scenario:
    scenario_dir = os.path.join(data_input_dir, 'Jiang_et_al_Population', scenario_to_process)

    # Make a list of all the state subdirectories in the folder. The trailing empty path
    # component makes the pattern end in a separator so that only directories match, and
    # sorting makes the processing order independent of the file system:
    state_dirs = sorted(glob(os.path.join(scenario_dir, '*', '')))

    # Fail with a clear message instead of an unbound-variable error further down:
    if not state_dirs:
        raise FileNotFoundError("No state subdirectories found in '{}'".format(scenario_dir))

    # Loop over the directories and process the data for each state:
    state_frames = []
    for state_dir in state_dirs:

        # Take the directory name from the last path component. Splitting the full path on
        # the scenario name would break if that name also appears higher up in the path:
        dir_name = os.path.basename(os.path.normpath(state_dir))

        # Extract the state FIPS code and state abbreviation from the directory name.
        # The FIPS code is scaled by 1000 to match the 'State_FIPS' values used elsewhere:
        name_parts = dir_name.split('-')
        state_fips = int(name_parts[0]) * 1000
        state_abbreviation = name_parts[1]

        # Read in the '*_proj_pop.csv' file for that state:
        pop_df = pd.read_csv(os.path.join(state_dir, dir_name + '_proj_pop.csv'))

        # Remove the four descriptor columns and sum over all the rows to get the total by year:
        yearly_totals = pop_df.drop(columns=['state', 'age', 'female', 'urban']).sum(axis=0).round(2)

        # Convert the totals into a two-column dataframe; the years are the (string) column labels:
        sum_pop_df = yearly_totals.rename('Nat_Res_Pop').rename_axis('Year').reset_index()

        # Add in the scenario, state abbreviation, and FIPS codes:
        sum_pop_df['Scenario'] = scenario_to_process
        sum_pop_df['State_FIPS'] = state_fips
        sum_pop_df['State'] = state_abbreviation

        state_frames.append(sum_pop_df)

    # Concatenate once at the end rather than growing a dataframe inside the loop:
    output_df = pd.concat(state_frames)

    # Reorder the columns and sort by state FIPS code and year:
    output_df = output_df[['State', 'State_FIPS', 'Scenario', 'Year', 'Nat_Res_Pop']]
    output_df = output_df.sort_values(['State_FIPS', 'Year']).reset_index(drop=True)

    # Subset to just the future years that match the county-level data:
    years_to_keep = [str(year) for year in range(2020, 2101, 10)]
    output_df = output_df[output_df['Year'].isin(years_to_keep)]

    # Return the final dataframe:
    return output_df


In [None]:
# Run the state-level aggregation for SSP3 as a quick check and display the result:
natural_df = process_state_level_population_data(
    data_input_dir=data_input_dir,
    scenario_to_process='SSP3',
)

natural_df


## Process the County-Level Data to the Same Resolution and Format


In [None]:
# Define a function to aggregate the raw county-level population data for a given scenario:
def process_county_level_population_data(data_input_dir: str, scenario_to_process: str) -> pd.DataFrame:
    """
    Aggregate the county-level population data for one scenario up to state totals.

    Reads 'ssp3_county_population.csv' or 'ssp5_county_population.csv' from "data_input_dir",
    derives a state FIPS code from each county 'FIPS' code, and sums the decadal year
    columns ('2020' through '2100') by state.

    :param data_input_dir:          Directory that contains the county-level .csv files
                                    (with or without a trailing path separator)
    :param scenario_to_process:     Scenario to process; must be 'SSP3' or 'SSP5'
    :return:                        DataFrame with the columns 'State_FIPS', 'Scenario', 'Year',
                                    and 'County_Pop', sorted by 'State_FIPS' and 'Year'
    :raises ValueError:             If "scenario_to_process" is not a supported scenario
    """

    # Map each supported scenario to its input file and reject anything else up front,
    # rather than failing later on an unassigned variable:
    county_files = {'SSP3': 'ssp3_county_population.csv',
                    'SSP5': 'ssp5_county_population.csv'}
    if scenario_to_process not in county_files:
        raise ValueError("Unsupported scenario '{}'; expected one of {}".format(
            scenario_to_process, sorted(county_files)))

    # Read in the raw county-level population data for that scenario:
    pop_df = pd.read_csv(os.path.join(data_input_dir, county_files[scenario_to_process]))

    # Convert the county FIPS code to a state FIPS code by zeroing out the last three digits:
    pop_df['State_FIPS'] = ((pop_df['FIPS'] // 1000) * 1000).astype(int)

    # Write out the dataframe to a .csv file:
    # NOTE(review): this looks like a leftover debugging artifact that writes into the input
    # data directory on every call - confirm whether anything reads 'Test.csv' before removing.
    pop_df.to_csv(os.path.join(data_input_dir, 'Test.csv'), sep=',', index=False)

    # Sum the population by state for all of the decadal years in a single pass:
    year_columns = [str(year) for year in range(2020, 2101, 10)]
    state_totals = pop_df.groupby('State_FIPS')[year_columns].sum().round(2)

    # Reshape from one column per year to one row per state and year:
    output_df = state_totals.reset_index().melt(id_vars='State_FIPS',
                                                var_name='Year',
                                                value_name='County_Pop')

    # Add in the scenario and clean up the output_df:
    output_df['Scenario'] = scenario_to_process
    output_df = output_df[['State_FIPS', 'Scenario', 'Year', 'County_Pop']]
    output_df = output_df.sort_values(['State_FIPS', 'Year']).reset_index(drop=True)

    # Return the final dataframe:
    return output_df


In [None]:
# Run the county-level aggregation for SSP3 as a quick check and display the result:
county_df = process_county_level_population_data(
    data_input_dir=data_input_dir,
    scenario_to_process='SSP3',
)

county_df


## Merge the Two Datastreams Together


In [None]:
# Define a function to merge the state-level and county-level datastreams for a given scenario:
def process_merged_data(data_input_dir: str, scenario_to_process: str):
    """
    Combine the state-level and aggregated county-level populations for one scenario.

    The state-level dataframe is left-merged with the county-level dataframe on
    'State_FIPS', 'Scenario', and 'Year'. Two columns are then added: 'Pop_Diff'
    ('Nat_Res_Pop' minus 'County_Pop') and 'Pop_Diff_Rel' (that difference as a
    percentage of 'Nat_Res_Pop').

    :param data_input_dir:          Directory passed through to both processing functions
    :param scenario_to_process:     Scenario passed through to both processing functions
    :return:                        The merged dataframe with the two difference columns added
    """
    # State-level totals first, then the county-level totals aggregated to states:
    state_totals = process_state_level_population_data(data_input_dir=data_input_dir,
                                                       scenario_to_process=scenario_to_process)
    county_totals = process_county_level_population_data(data_input_dir=data_input_dir,
                                                         scenario_to_process=scenario_to_process)

    # Keep every state-level row and attach the matching county-level total:
    merged_df = pd.merge(state_totals, county_totals,
                         how='left',
                         on=['State_FIPS', 'Scenario', 'Year'])

    # Absolute difference, and the difference relative to the state-level value in percent:
    difference = merged_df['Nat_Res_Pop'] - merged_df['County_Pop']
    merged_df['Pop_Diff'] = difference
    merged_df['Pop_Diff_Rel'] = difference.div(merged_df['Nat_Res_Pop']).mul(100)

    return merged_df
    

In [None]:
# Test the function created above:
output_df = process_merged_data(data_input_dir = data_input_dir, 
                                scenario_to_process = 'SSP3')

output_df


## Visualize the Results


In [None]:
def plot_population_differences(data_input_dir: str, scenario_to_process: str,
                                image_output_dir: str, image_resolution: int, save_images=False):
    """
    Plot the state-level versus aggregated county-level populations for one scenario.

    Two figures are created: a scatter plot of 'County_Pop' against 'Nat_Res_Pop' with a
    1:1 line and +/- 2% reference lines, and a histogram of the 'Pop_Diff' values.

    :param data_input_dir:          Directory passed through to process_merged_data()
    :param scenario_to_process:     Scenario to process and to use in the titles and file names
    :param image_output_dir:        Directory where the .png files are written
    :param image_resolution:        Resolution (dpi) of the saved images
    :param save_images:             If True, save both figures to "image_output_dir"
    """

    # Process the data:
    output_df = process_merged_data(data_input_dir = data_input_dir,
                                    scenario_to_process = scenario_to_process)

    # Make sure the output directory exists before either figure is saved:
    if save_images:
        os.makedirs(image_output_dir, exist_ok=True)

    # Create a 1:1 vector. Two end points are enough to draw a straight line and they
    # make the reference lines span the full axis range:
    max_pop = output_df['Nat_Res_Pop'].max()
    one_to_one = np.array([0, max_pop])

    # Make the scatter plot. The y-axis is the county-level value, so the line at 1.02x
    # is the "+ 2%" bound and the line at 0.98x is the "- 2%" bound:
    plt.figure(figsize=(20, 10))
    plt.plot(one_to_one, one_to_one, 'k', linewidth=1, label='1:1')
    plt.plot(one_to_one, (one_to_one*1.02), 'k', linewidth=1, linestyle='--', label='1:1 + 2%')
    plt.plot(one_to_one, (one_to_one*0.98), 'k', linewidth=1, linestyle='--', label='1:1 - 2%')
    plt.scatter(output_df['Nat_Res_Pop'], output_df['County_Pop'], s=50, c='blue', label='Single Year + State Value')
    plt.legend()
    plt.xlim(0, max_pop)
    plt.ylim(0, max_pop)
    plt.xlabel('Jiang et al. Annual State-Level Population')
    plt.ylabel('Aggregated Annual County-Level Population')
    plt.title('State-Level Annual Population Comparison: ' + scenario_to_process)

    # If the "save_images" flag is set to true then save the plot to a .png file.
    # The figure is left open afterwards (presumably so the notebook still renders it inline):
    if save_images:
        plt.savefig(os.path.join(image_output_dir, ('Population_Scatter_Plot_' + scenario_to_process + '.png')),
                    dpi=image_resolution, bbox_inches='tight', facecolor='white')

    # Drop missing differences before plotting. The left merge leaves NaN wherever a state has
    # no county-level match and the histogram cannot bin NaN values:
    pop_diff = output_df['Pop_Diff'].dropna()
    max_abs_diff = pop_diff.abs().max()

    # Make the histogram plot:
    plt.figure(figsize=(20, 10))
    plt.hist(pop_diff, bins=25, density=True, histtype='step', edgecolor='r', label='Difference', linewidth=3)
    # Center the x-axis on zero; skip this when the limits would be identical or undefined:
    if max_abs_diff > 0:
        plt.xlim((-1*max_abs_diff), max_abs_diff)
    plt.xlabel('Annual State-Level Population Difference')
    # "density=True" normalizes the histogram to a probability density, not a probability:
    plt.ylabel('Probability Density')
    plt.title('State-Level Annual Population Differences: ' + scenario_to_process)

    # If the "save_images" flag is set to true then save the plot to a .png file:
    if save_images:
        plt.savefig(os.path.join(image_output_dir, ('Population_Histograms_' + scenario_to_process + '.png')),
                    dpi=image_resolution, bbox_inches='tight', facecolor='white')


In [None]:
# Run the full comparison for SSP3 and save both figures at 300 dpi:
plot_population_differences(
    data_input_dir=data_input_dir,
    scenario_to_process='SSP3',
    image_output_dir=image_output_dir,
    image_resolution=300,
    save_images=True,
)
