# Create the mapping between PUMA and EIA Number in EIA Form 861

**Contributors:** Anthony D. Fontanini

**Date Created:** January 6th, 2019
    
This notebook creates the mapping file from PUMA to the EIA number for the utilities in EIA Form 861. The mapping file helps apportion the ResStock results to utilities for the EIA Form 861 comparisons.

# Import modules

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Load data
- Mapping of census tract to NSRDB grid cell
    - The census tract identifier (stored in a non-standard GISJOIN-style format, so the GEOID must be rebuilt)
    - The NSRDB grid cell
    - The portion of the gridcell in the tract
- Mapping of NSRDB grid cell to utility
    - The NSRDB grid cell
    - Utility ID (EIA Form 861 - 2012)
    - hu_cust_combined_wt: Combined housing unit/customer weight (multiplied by the ACS housing unit counts to get the weight per utility)
- The spatial lookup table (created in the spatial refactor tsv_maker class)
    - ACS based housing unit counts
    - County long name
    - Census Tract GEOID

In [2]:
# All spatial crosswalk inputs live in the same folder
spatial_data_dir = os.path.join('various_datasets','spatial_data')

# Census tract -> NSRDB grid cell crosswalk, including the allocation weights
tract_to_nsrdb_df = pd.read_csv(
    os.path.join(spatial_data_dir, 'tract_to_nsrdb_grid_crosswalk.csv')
)

# NSRDB grid cell -> EIA Form 861 utility number
nsrdb_to_utility_df = pd.read_csv(
    os.path.join(spatial_data_dir, 'nsrdb_utility_weights_with_oglala_lakota_lookups.csv')
)

# Spatial lookup table (includes the census tract GEOID and the ACS housing unit counts)
tract_spatial_lookup_df = pd.read_csv(
    os.path.join(spatial_data_dir, 'spatial_tract_lookup_table.csv')
)

# Rewrite the GEOID as 'G' followed by the integer id zero-padded to 11 digits
numeric_geoid = tract_spatial_lookup_df['geoid'].astype(int).astype(str)
tract_spatial_lookup_df['geoid'] = 'G' + numeric_geoid.str.zfill(11)

# Join Data into a single dataframe

In [3]:
# Attach the utility id to every (NSRDB grid cell, census tract) pairing
tract_to_utility_df = nsrdb_to_utility_df.merge(tract_to_nsrdb_df, on='nsrdb_gid')

## Zero-padded pieces shared by the geoid and fips columns
state_code = tract_to_utility_df['statefp'].astype(str).str.zfill(2)
county_code = tract_to_utility_df['countyfp'].astype(str).str.zfill(3)
tract_code = tract_to_utility_df['tract_gisjoin'].astype(str).str[-6:]

## Modify geoid: 'G' + state (2) + county (3) + last six characters of the GISJOIN tract id
tract_to_utility_df['geoid'] = 'G' + state_code + county_code + tract_code
## Create fips column: 'G' + state (2) + county (3)
tract_to_utility_df['fips'] = 'G' + state_code + county_code

## Replace Shannon County, SD with Oglala Lakota County, SD
## (only the geoid prefix is rewritten; the fips column keeps the old code)
shannon_rows = tract_to_utility_df['fips'] == 'G46113'
tract_to_utility_df.loc[shannon_rows, 'geoid'] = (
    'G46102' + tract_to_utility_df.loc[shannon_rows, 'geoid'].astype(str).str[-6:]
)

# MERGE ACS census tract counts to utility and tract weight ratios
utility_columns = ['geoid','tract2nsrdb_alloc_ratio','utility_id','hu_cust_combined_wt']
tract_utility_counts = tract_to_utility_df[utility_columns].merge(
    tract_spatial_lookup_df, on='geoid'
)

# Weight = ACS count x tract-to-grid allocation ratio x combined utility weight
tract_utility_counts['weight'] = (
    tract_utility_counts['acs_count']
    * tract_utility_counts['tract2nsrdb_alloc_ratio']
    * tract_utility_counts['hu_cust_combined_wt']
)

display(tract_utility_counts.tail(10))

# List every column available after the merge
for column_name in tract_utility_counts.columns:
    print(column_name)

Unnamed: 0,geoid,tract2nsrdb_alloc_ratio,utility_id,hu_cust_combined_wt,tractce,puma5ce,fips,county_name,state_abbr,state_name,...,cz_moisture_regime,ba_climate_zone,climate_zone,iso_zone,custom_region,acs_count,puma7ce,puma_tsv,location,weight
852252,G09001043700,1.0,17569,0.012143,43700.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,1088.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,13.211282
852253,G09001043700,1.0,4176,0.63896,43700.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,1088.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,695.188818
852254,G09001043800,1.0,19497,0.341909,43800.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,3233.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,1105.391635
852255,G09001043800,1.0,13825,0.006988,43800.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,3233.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,22.592259
852256,G09001043800,1.0,17569,0.012143,43800.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,3233.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,39.25742
852257,G09001043800,1.0,4176,0.63896,43800.0,103.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,3233.0,9103.0,"CT, 00103",CT_Bridgeport-Sikorsky.Mem.AP.725040,2065.758685
852258,G09001045300,1.0,19497,0.341909,45300.0,101.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,795.0,9101.0,"CT, 00101",CT_Bridgeport-Sikorsky.Mem.AP.725040,271.817615
852259,G09001045300,1.0,13825,0.006988,45300.0,101.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,795.0,9101.0,"CT, 00101",CT_Bridgeport-Sikorsky.Mem.AP.725040,5.555474
852260,G09001045300,1.0,17569,0.012143,45300.0,101.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,795.0,9101.0,"CT, 00101",CT_Bridgeport-Sikorsky.Mem.AP.725040,9.653464
852261,G09001045300,1.0,4176,0.63896,45300.0,101.0,9001.0,Fairfield County,CT,Connecticut,...,A,Cold,5A,NEISO,CR03,795.0,9101.0,"CT, 00101",CT_Bridgeport-Sikorsky.Mem.AP.725040,507.973447


geoid
tract2nsrdb_alloc_ratio
utility_id
hu_cust_combined_wt
tractce
puma5ce
fips
county_name
state_abbr
state_name
long_name
region
division
state
county
region_name
division_name
cz_number
cz_moisture_regime
ba_climate_zone
climate_zone
iso_zone
custom_region
acs_count
puma7ce
puma_tsv
location
weight


# Create weight table
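After normalization, the weights within each county sum to 1.0, so the grand total of the normalized weights equals the number of counties (3108.0 in the output below).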

In [4]:
# Create table: total weight for every (county, utility) pair
df = (
    tract_utility_counts[['long_name','utility_id','weight']]
    .groupby(['long_name','utility_id'])
    .sum()
    .reset_index()
)

# Save the table with unit weights (before normalization)
print(df['weight'].sum())
df.to_csv('eiaid_units.csv',index=False)

# Normalize by county (county weight sums to 1.0).
# One grouped transform replaces the per-county loop: it computes every county
# total in a single pass and aligns on the row index, so it does not depend on
# positional indices happening to match the index labels.
county_totals = df.groupby('long_name')['weight'].transform('sum')
df['weight'] = df['weight'] / county_totals

# Save File
df.to_csv('eiaid.csv',index=False)
display(df.head(20))

# Each county sums to 1.0, so this prints the number of counties
print(df['weight'].sum())

120102793.14653802


Unnamed: 0,long_name,utility_id,weight
0,"AL, Autauga County",195,0.673479
1,"AL, Autauga County",3222,0.217334
2,"AL, Autauga County",30517,0.109187
3,"AL, Baldwin County",195,0.397553
4,"AL, Baldwin County",1149,0.313909
5,"AL, Baldwin County",6145,0.030433
6,"AL, Baldwin County",6491,0.189198
7,"AL, Baldwin County",17646,0.068907
8,"AL, Barbour County",195,0.603841
9,"AL, Barbour County",5204,0.070994


3108.0
