###### Imports and Settings

In [1]:
import pandas as pd
import numpy as np
import requests
from collections import deque
from functools import reduce
import pickle
import matplotlib.pyplot as plt
#pd.options.display.float_format = '{:,.0f}'.format
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 150)
pd.options.mode.chained_assignment = None  # default='warn'
import sys
sys.path.append("..") # Adds higher directory to python modules path
import geodict
namestocommon = geodict.namestocommon
geotogeoid = geodict.geotogeoid
GNRC = geodict.GNRC
KY = geodict.KY
censusplaces = geodict.censusplaces
shorttnplaces = geodict.shorttnplaces
import sqlite3 as sq
#functions
def percentchange(x, y):
    try:
        return ((x - y)*100/y)
    except ZeroDivisionError:
        return 0
def realchange(x, y):
    return x-y
#calculate real and percent change between all columns for all possible time frames
def calculate_changes(df, columns, time_frames, years):
    for column in columns:
        for time_frame in time_frames:
            start_year, end_year = time_frame.split('-')
            df[f'{column} % Change', 'None', f'{time_frame}'] = percentchange(df[(column, int(end_year), 'None')], df[(column, int(start_year), 'None')])
            df[f'{column} Change', 'None', f'{time_frame}'] = (df[(column, int(end_year), 'None')] - df[(column, int(start_year), 'None')])

    return df
#generate all possible time frames from a list of years
def generate_time_frames(years):
    time_frames = []
    for i in range(len(years)-1):
        for j in range(i+1, len(years)):
            time_frames.append(f"{years[i]}-{years[j]}")
    return time_frames

In [2]:
#to read in... rb is read bite
with open('api_keys.pkl', 'rb') as keys_file:
        keys_dict_2 = pickle.load(keys_file)
#create a variable that contains your api key
api_key = keys_dict_2['CENSUS']

# Get 2020 PL population #

In [3]:
#2020 PL for households and population #s
#counties
url_str= 'https://api.census.gov/data/2020/dec/pl?key='+api_key
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "county:*"
predicates["in"]= "state:47" 
data = requests.get(url_str, params= predicates)
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS', 'GeoFIPS']
df = pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
df = df.loc[df['GeoFIPS'].isin(GNRC)]
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "county:*"
predicates["in"]= "state:21" 
data = requests.get(url_str, params= predicates)                                                              
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS', 'GeoFIPS']
kycos = pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
kycos = kycos.loc[kycos['GeoFIPS'].isin(KY)]
df = pd.concat([df, kycos], axis = 0)
#ky places call
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "place:*"
predicates["in"]= "state:21" 
data = requests.get(url_str, params= predicates)
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS', 'GeoFIPS']
places=pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
places=places.loc[places['GeoFIPS'].isin(shorttnplaces)]
df = pd.concat([df, places], axis = 0)
#places
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "place:*"
predicates["in"]= "state:47" 
data = requests.get(url_str, params= predicates)
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS', 'GeoFIPS']
places=pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
places=places.loc[places['GeoFIPS'].isin(shorttnplaces)]
df = pd.concat([df, places], axis = 0)
#state call
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "state:47"
data= requests.get(url_str, params= predicates)
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS']
state=pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
state['GeoFIPS'] = '0'
df = pd.concat([df, state], axis = 0)
#national call
predicates= {}
get_vars= ["NAME", 'GEO_ID', 'P1_001N', 'H1_001N']
predicates["get"]= ",". join(get_vars)
predicates["for"]= "us:*"
data= requests.get(url_str, params= predicates)
col_names = ['NAME', 'GEO_ID', 'Population', 'Households', 'StateFIPS']
national=pd.DataFrame(columns=col_names, data=data.json()[1:], dtype=str)
national['GeoFIPS'] = '0'
df = pd.concat([df, national], axis = 0)
savename = df
print('Okay Finished')

Okay Finished


In [4]:
pl2020 = savename

In [5]:
pl2020 = pl2020.drop(columns = ['StateFIPS', 'GeoFIPS', 'GEO_ID']).reset_index(drop = True)

In [6]:
pl2020.tail()

Unnamed: 0,NAME,Population,Households
68,"Westmoreland town, Tennessee",2718,1070
69,"White Bluff town, Tennessee",3862,1597
70,"White House city, Tennessee",12982,5039
71,Tennessee,6910840,3031605
72,United States,331449281,140498736


In [7]:
#set the index to the geography and transpose to aggregate regions, set the rest of the columns to float datatype
data = pl2020.set_index('NAME').transpose()
cols = data.columns
data[cols] = data[cols].astype(float)

In [8]:
GNRCCounties = [data['Stewart County, Tennessee'],data['Montgomery County, Tennessee'],
                data['Houston County, Tennessee'],data['Humphreys County, Tennessee'],
                data['Dickson County, Tennessee'],data['Cheatham County, Tennessee'],
                data['Robertson County, Tennessee'],data['Sumner County, Tennessee'],
                data['Davidson County, Tennessee'],data['Wilson County, Tennessee'],
                data['Trousdale County, Tennessee'],data['Williamson County, Tennessee'],
                data['Rutherford County, Tennessee']]
data['GNRC'] = sum(GNRCCounties)
GNRCCountiesAll = [data['Stewart County, Tennessee'],data['Montgomery County, Tennessee'],
                   data['Houston County, Tennessee'],data['Humphreys County, Tennessee'],
                   data['Dickson County, Tennessee'],data['Cheatham County, Tennessee'],
                   data['Robertson County, Tennessee'],data['Sumner County, Tennessee'],
                   data['Davidson County, Tennessee'],data['Wilson County, Tennessee'],
                   data['Trousdale County, Tennessee'],data['Williamson County, Tennessee'],
                   data['Rutherford County, Tennessee'],data['Maury County, Tennessee']]
data['GNRC Region'] = sum(GNRCCountiesAll)
MPOCounties = [data['Robertson County, Tennessee'],data['Sumner County, Tennessee'],
               data['Davidson County, Tennessee'],data['Wilson County, Tennessee'],
               data['Williamson County, Tennessee'],data['Rutherford County, Tennessee'],
               data['Maury County, Tennessee']]
data['MPO'] = sum(MPOCounties)
RuthInc = [data['Eagleville city, Tennessee'],data['La Vergne city, Tennessee'],
           data['Murfreesboro city, Tennessee'],data['Smyrna town, Tennessee']]
data['Rutherford Incorporated'] = sum(RuthInc)
data['Rutherford Unincorporated'] = data['Rutherford County, Tennessee'] - data['Rutherford Incorporated']
WilsonInc = [data['Lebanon city, Tennessee'],data['Mount Juliet city, Tennessee'],
             data['Watertown city, Tennessee']]
data['Wilson Incorporated'] = sum(WilsonInc)
data['Wilson Unincorporated'] = data['Wilson County, Tennessee'] - data['Wilson Incorporated']
CheathInc = [data['Ashland City town, Tennessee'],data['Kingston Springs town, Tennessee'],
             data['Pegram town, Tennessee'],data['Pleasant View city, Tennessee']]
data['Cheatham Incorporated'] = sum(CheathInc)
data['Cheatham Unincorporated'] = data['Cheatham County, Tennessee'] - data['Cheatham Incorporated']
DicksInc = [data['Burns town, Tennessee'],data['Charlotte town, Tennessee'],
            data['Dickson city, Tennessee'],data['Slayden town, Tennessee'],
            data['Vanleer town, Tennessee'],data['White Bluff town, Tennessee']]
data['Dickson Incorporated'] = sum(DicksInc)
data['Dickson Unincorporated'] = data['Dickson County, Tennessee'] - data['Dickson Incorporated']
HumphInc = [data['McEwen city, Tennessee'],data['New Johnsonville city, Tennessee'],
            data['Waverly city, Tennessee']]
data['Humphreys Incorporated'] = sum(HumphInc)
data['Humphreys Unincorporated'] = data['Humphreys County, Tennessee'] - data['Humphreys Incorporated']
data['Montgomery Incorporated'] = data['Clarksville city, Tennessee']
data['Montgomery Unincorporated'] = data['Montgomery County, Tennessee'] - data['Montgomery Incorporated']

In [9]:
#transpose back and reset the index
pl2020 = data.transpose().reset_index()

In [10]:
#check that data is ready to join
pl2020.head(3)

Unnamed: 0,NAME,Population,Households
0,"Montgomery County, Tennessee",220069.0,85714.0
1,"Rutherford County, Tennessee",341486.0,131216.0
2,"Sumner County, Tennessee",196281.0,78995.0


### Households

In [22]:
#households
data = pd.read_csv('../../Data Downloads/WoodsPooleUrbanSIM_HouseholdsProjections_2017Base.csv')

In [23]:
#merge pl2020 and replace 2020 WP # with Census #
real2020 = pl2020.drop(columns = 'Population')
data = data.merge(real2020, on = 'NAME')
data['2020'] = data['Households']
data = data.drop(columns = ['Households', 'GEO_ID'])

In [24]:
data.head()

Unnamed: 0,NAME,2017,2020,2025,2035,2045
0,"Cheatham County, Tennessee",14671,16785.0,17101,19514,22019
1,"Davidson County, Tennessee",273497,328309.0,297798,315192,332379
2,"Dickson County, Tennessee",19032,22551.0,21596,23868,26141
3,"Houston County, Tennessee",2999,3936.0,3444,3840,4230
4,"Humphreys County, Tennessee",7064,8849.0,7622,7949,8247


In [25]:
hhsuff = data.set_index('NAME').add_prefix('Households ')

In [26]:
hhsuff.head()

Unnamed: 0_level_0,Households 2017,Households 2020,Households 2025,Households 2035,Households 2045
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Cheatham County, Tennessee",14671,16785.0,17101,19514,22019
"Davidson County, Tennessee",273497,328309.0,297798,315192,332379
"Dickson County, Tennessee",19032,22551.0,21596,23868,26141
"Houston County, Tennessee",2999,3936.0,3444,3840,4230
"Humphreys County, Tennessee",7064,8849.0,7622,7949,8247


In [27]:
cols = data.columns
#hhproj = data.reset_index(drop = False)
hhproj = data.melt(id_vars = 'NAME', var_name = 'Year', value_name = 'Households')#, values = cols)
hhproj.head()

Unnamed: 0,NAME,Year,Households
0,"Cheatham County, Tennessee",2017,14671.0
1,"Davidson County, Tennessee",2017,273497.0
2,"Dickson County, Tennessee",2017,19032.0
3,"Houston County, Tennessee",2017,2999.0
4,"Humphreys County, Tennessee",2017,7064.0


In [28]:
#populations
data = pd.read_csv('../../Data Downloads/WoodsPooleUrbanSIM_PopulationProjections_2017Base.csv')
data.head(2)

Unnamed: 0,NAME,GEO_ID,2017,2020,2025,2035,2045
0,"Cheatham County, Tennessee",0500000US47021,38688,41240,45070,51373,58015
1,"Davidson County, Tennessee",0500000US47037,638443,663737,695114,735884,776083


In [29]:
#merge pl2020 and replace 2020 WP # with Census #
real2020 = pl2020.drop(columns = 'Households')
data = data.merge(real2020, on = 'NAME')
data['2020'] = data['Population']
data = data.drop(columns = ['Population', 'GEO_ID'])

In [30]:
popsuff = data.set_index('NAME').add_prefix('Population ')

In [31]:
cols = data.columns
#hhproj = data.reset_index(drop = False)
popproj = data.melt(id_vars = 'NAME', var_name = 'Year', value_name = 'Population')#, values = cols)
popproj.head(3)

Unnamed: 0,NAME,Year,Population
0,"Cheatham County, Tennessee",2017,38688.0
1,"Davidson County, Tennessee",2017,638443.0
2,"Dickson County, Tennessee",2017,49003.0


In [32]:
data = popproj.merge(hhproj, on = ['NAME', 'Year'])

In [33]:
#make sure year is formatted as an integer and generate list of years and all possible time frames
data['Year'] = data['Year'].astype(int)
#create a list of years from the dataframe to pass through our "generate time frames" function to create a list of all possible time frames - need this here for later
years = list(data['Year'].unique().astype(int))
time_frames = generate_time_frames(years)

In [34]:
#create a multilevel column header with year and placeholder for time frames
#pivot the table and create a multiindex of year and column header
cols = list(data.columns)
cols.remove('NAME')
cols.remove('Year')
df_pivot = data.pivot_table(index = ['NAME'], columns = ['Year'], values = cols)
df_pivot.head(2)

Unnamed: 0_level_0,Households,Households,Households,Households,Households,Population,Population,Population,Population,Population
Year,2017,2020,2025,2035,2045,2017,2020,2025,2035,2045
NAME,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
"Adams city, Tennessee",228.0,250.0,249.0,274.0,309.0,728.0,624.0,801.0,876.0,995.0
"Ashland City town, Tennessee",2070.0,2311.0,2737.0,3428.0,4101.0,4701.0,5193.0,6315.0,7850.0,9431.0


In [35]:
#add a level to the multiindex to accomodate the time period metrics
df_pivot.columns = pd.MultiIndex.from_tuples([(col[0], col[1], 'None') for col in df_pivot.columns])
df_pivot.head(3)

Unnamed: 0_level_0,Households,Households,Households,Households,Households,Population,Population,Population,Population,Population
Unnamed: 0_level_1,2017,2020,2025,2035,2045,2017,2020,2025,2035,2045
Unnamed: 0_level_2,None,None,None,None,None,None,None,None,None,None
NAME,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
"Adams city, Tennessee",228.0,250.0,249.0,274.0,309.0,728.0,624.0,801.0,876.0,995.0
"Ashland City town, Tennessee",2070.0,2311.0,2737.0,3428.0,4101.0,4701.0,5193.0,6315.0,7850.0,9431.0
"Belle Meade city, Tennessee",1021.0,1130.0,1081.0,1114.0,1146.0,2615.0,2901.0,2790.0,2875.0,2959.0


In [36]:
#get a list of the varaibles to loop through by indexing into the first level only of the column headers
first_level = df_pivot.columns.get_level_values(0).unique().tolist()
# first_level.remove('NAME')
# first_level.remove('Year')
#remove percentages - don't want change metrics on them
first_level = [item for item in first_level if '%' not in item]

In [37]:
#pass the dataframe, the list of variables, time frames, and years through the "calculate change" function
data = calculate_changes(df_pivot, first_level, time_frames = time_frames, years = years)

In [38]:
#reformat and rename columns
data = data.stack([1, 1])
data = data.reset_index(drop = False)
data = data.rename(columns = {'level_1':'Year', 'level_2':'Time Frame'})

In [39]:
data.head()

Unnamed: 0,NAME,Year,Time Frame,Households,Households % Change,Households Change,Population,Population % Change,Population Change
0,"Adams city, Tennessee",2017,,228.0,,,728.0,,
1,"Adams city, Tennessee",2020,,250.0,,,624.0,,
2,"Adams city, Tennessee",2025,,249.0,,,801.0,,
3,"Adams city, Tennessee",2035,,274.0,,,876.0,,
4,"Adams city, Tennessee",2045,,309.0,,,995.0,,


In [40]:
#map to geoid dictionary from module and add source
data['GEO_ID'] = data['NAME'].map(geotogeoid)
data['Source'] = 'Woods & Poole 2017 Base'

In [41]:
#final check
data.head()

Unnamed: 0,NAME,Year,Time Frame,Households,Households % Change,Households Change,Population,Population % Change,Population Change,GEO_ID,Source
0,"Adams city, Tennessee",2017,,228.0,,,728.0,,,1600000US4700200,Woods & Poole 2017 Base
1,"Adams city, Tennessee",2020,,250.0,,,624.0,,,1600000US4700200,Woods & Poole 2017 Base
2,"Adams city, Tennessee",2025,,249.0,,,801.0,,,1600000US4700200,Woods & Poole 2017 Base
3,"Adams city, Tennessee",2035,,274.0,,,876.0,,,1600000US4700200,Woods & Poole 2017 Base
4,"Adams city, Tennessee",2045,,309.0,,,995.0,,,1600000US4700200,Woods & Poole 2017 Base


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1215 entries, 0 to 1214
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   NAME                 1215 non-null   object 
 1   Year                 1215 non-null   object 
 2   Time Frame           1215 non-null   object 
 3   Households           405 non-null    float64
 4   Households % Change  810 non-null    float64
 5   Households Change    810 non-null    float64
 6   Population           405 non-null    float64
 7   Population % Change  810 non-null    float64
 8   Population Change    810 non-null    float64
 9   GEO_ID               1005 non-null   object 
 10  Source               1215 non-null   object 
dtypes: float64(6), object(5)
memory usage: 104.5+ KB


In [61]:
hhsuff = hhsuff.reset_index(drop = False)
popsuff = popsuff.reset_index(drop = False)
suff = hhsuff.merge(popsuff)
suff['GEO_ID'] = suff['NAME'].map(geotogeoid)
suff['Source'] = 'Woods & Poole 2017 Base'

In [62]:
suff.tail()

Unnamed: 0,NAME,Households 2017,Households 2020,Households 2025,Households 2035,Households 2045,Population 2017,Population 2020,Population 2025,Population 2035,Population 2045,GEO_ID,Source
78,"White Bluff town, Tennessee",1426,1597.0,1596,1725,1852,3733,3862.0,4163,4490,4816,1600000US4779980,Woods & Poole 2017 Base
79,"White House city, Tennessee",4305,5039.0,5394,6413,7427,12392,12982.0,15449,18177,20873,1600000US4780200,Woods & Poole 2017 Base
80,GNRC,711137,864517.0,845208,996443,1171266,1817546,2089918.0,2171887,2577014,3047760,,Woods & Poole 2017 Base
81,MPO,623540,759407.0,737550,865792,1014756,1586142,1822891.0,1887724,2231736,2633573,,Woods & Poole 2017 Base
82,GNRC Region,744469,907660.0,882842,1038007,1216853,1903165,2190892.0,2268607,2683897,3164912,,Woods & Poole 2017 Base


In [63]:
#export to the SQLite database
conn = sq.connect('../../Outputs/Dem_Transpo_Housing_Collection.db')
data.to_sql('WPURBANSIM_PopulationHousehold_CurrentandProjected_Annual_Change', conn, if_exists = 'replace', index = False)
suff.to_sql('WPURBANSIM_PopulationHousehold_CurrentandProjected_Annual_WideFormat', conn, if_exists = 'replace', index = False)

83