**Overview Stage 1**

**1. Import data**   
Two different datasets are imported and formatted to be used as inputs to calculate the hourly demand profile of each state.   
Data is taken for the year 2019.
    1. yearly customer retail sales data (EIA-861)
    2. hourly demand data (EIA-930) (*Here, we neglect subregional demand data*)

**3. Calculate fraction of retail sales sold to each state** 

The output of this section will be required to move to Stage 2.

**4. Calculate hourly demand profiles for each state**   


# Import data

In [1]:
#import required packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

## Import Customer retail sales data from EIA

The data (EIA-861) can be found here: https://www.eia.gov/electricity/data/eia861/    
The required file is called *Sales to Ultimate Customers*.
Enter the path to this file in the code below.

In [24]:
#read the "States" sheet of the EIA-861 sales file (adapt path to your location)
#nrows=4485 caps the number of rows read - presumably to leave out the footer rows of the sheet; verify for other years
retail_sales = pd.read_excel(
    r'C:\Users\Muriel Hauser\Documents\Carnegie Science\State demand data\Sales_Ult_Cust_2019.xlsx',
    sheet_name="States",
    header=2,
    nrows=4485,
)
retail_sales = (
    retail_sales
    #keep only the columns required for this study
    .loc[:, ['Megawatthours.4', 'BA Code', 'State', 'Part']]
    #give the kept columns descriptive names
    .rename(columns={"BA Code": "BA_code", "Megawatthours.4": "Retail sales (MWh)"})
    #drop all rows where Part==C: only Parts (A, B & D) should be counted for Sales and Customers
    #for a state or US total (see bottom row in the file for explanations)
    .loc[lambda sales: sales['Part'] != 'C']
)
retail_sales.head()

Unnamed: 0,Retail sales (MWh),BA_code,State,Part
0,58000,DUK,SC,A
1,204261,TVA,MS,A
2,127579,MISO,LA,A
3,2623,PJM,MD,A
4,701387,PJM,VA,A


## Import demand data

Import hourly demand data (EIA-930) for 2019 from here: https://github.com/truggles/EIA_Cleaned_Hourly_Electricity_Demand_Data, as described here: https://www.nature.com/articles/s41597-020-0483-x

##### import raw demand data

In [3]:
#function to import demand data and save it as a dictionary (each key = one BA)
def importfiles(path, dicname, first_row=2209, last_row=10969):
    """Read hourly demand csv files into ``dicname``, one entry per file.

    Parameters
    ----------
    path : iterable of str
        Paths of the csv files to read (e.g. the list returned by ``glob.glob``).
    dicname : dict
        Filled in place. The key is the file name up to its first '.',
        the value is the row-sliced DataFrame indexed by 'date_time'.
    first_row, last_row : int, optional
        Positional bounds of the rows that are kept (``iloc[first_row:last_row]``).
        The defaults are the window the original author used for the year 2019
        in MEM format; adjust them if other years are preferred.

    Returns
    -------
    None
    """
    for f in path:
        #take the file name whether the path uses Windows ('\\') or POSIX ('/') separators;
        #splitting on '\\' alone turns the whole path into the key on non-Windows systems
        key = f.replace('\\', '/').split('/')[-1].split('.')[0]
        #on_bad_lines='warn' skips malformed lines with a warning; it replaces
        #error_bad_lines=False, which is no longer accepted by current pandas versions
        BA_demand = pd.read_csv(f, on_bad_lines='warn')
        BA_demand = BA_demand.iloc[first_row:last_row]
        BA_demand = BA_demand.set_index('date_time')
        dicname[key] = BA_demand



In [4]:
#collect the paths of all demand csv files (adjust path to your location)
path = glob.glob(
    r"C:\Users\Muriel Hauser\Documents\Carnegie Science\State demand data\BA demand data\release_2020_Oct_include_subregions\subregions_and_balancing_authorities\*.csv"
)
#importfiles fills this dictionary in place: one entry per csv file
demand_data = {}
importfiles(path, demand_data)
#example: show the imported data stored under the key 'AEC'
demand_data['AEC']

Unnamed: 0_level_0,raw demand (MW),category,cleaned demand (MW),forecast demand (MW)
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01 01:00:00,EMPTY,MISSING,483,1048
2019-01-01 02:00:00,EMPTY,MISSING,460,1018
2019-01-01 03:00:00,EMPTY,MISSING,434,975
2019-01-01 04:00:00,EMPTY,MISSING,416,890
2019-01-01 05:00:00,EMPTY,MISSING,398,809
...,...,...,...,...
2019-12-31 20:00:00,423,OKAY,423,866
2019-12-31 21:00:00,419,OKAY,419,845
2019-12-31 22:00:00,414,OKAY,414,872
2019-12-31 23:00:00,449,OKAY,449,976


##### sum up subregional demand to BA level

In Stage 1, the subregions in MISO, ISNE, PJM, SWPP, NYIS, and CISO are not considered. Therefore, the available subregional demand data needs to be summed up to the BA level. This concerns MISO, ISNE, PJM, SWPP, NYIS, and CISO.

In [5]:
### functions to help sum up subregion demand data to BA level ###
#collect the dictionary keys that belong to one BA
def search_subregions(dictionary, BA):
    """Return the keys of ``dictionary`` that contain ``BA`` as a substring.

    Parameters
    ----------
    dictionary : dict
        Mapping whose keys are searched.
    BA : str
        Text that a key must contain to be selected.

    Returns
    -------
    list
        Matching keys, in the iteration order of ``dictionary.keys()``.
    """
    return [key for key in dictionary.keys() if BA in key]

#add up the cleaned demand of several subregions
def sum_subregions(names_of_subregions, dictionary):
    """Build a table of the subregions' cleaned demand plus their row-wise sum.

    Parameters
    ----------
    names_of_subregions : iterable
        Keys of ``dictionary`` to combine.
    dictionary : dict
        Maps each key to a DataFrame that has a 'cleaned demand (MW)' column.

    Returns
    -------
    pandas.DataFrame
        One column per subregion (named after its key) and a final
        'cleaned demand (MW)' column holding the sum over those columns.
    """
    demand_column = 'cleaned demand (MW)'
    combined = pd.DataFrame()
    #copy the demand of every subregion into its own column
    for region in names_of_subregions:
        region_table = dictionary[region]
        combined[region] = region_table[demand_column]
    #row-wise total over all subregion columns
    row_totals = combined.sum(axis=1)
    combined[demand_column] = row_totals
    return combined

In [6]:
#sum up subregion demand to BA level
subregions_to_BA_list=['MISO','ISNE','PJM','SWPP','NYIS','CISO']

#for each BA, sum subregions using the above functions
for BA in subregions_to_BA_list:
    #the key search is a substring match, so it also returns the BA's own key once that key exists
    #(e.g. when this cell is run a second time); leave it out, otherwise the BA total would be
    #summed into itself and then deleted below
    names = [name for name in search_subregions(demand_data, BA) if name != BA]
    if not names:
        #no subregions left to sum up: keep whatever is stored for this BA
        continue
    demand_data[BA] = sum_subregions(names, demand_data)
    #the subregional entries are not needed once they are part of the BA total
    for i in names:
        del demand_data[i]

##### Add the demand data of CPLW to the demand data of CPLE

In [7]:
#CPLE = Duke Energy Progress East; CPLW = Duke Energy Progress West
#demand data exists for CPLW, but CPLW is not covered in the utility customers data,
#therefore the CPLW demand is added to the CPLE demand
demand_data['CPLE']['cleaned demand (MW)'] = (
    demand_data['CPLE']['cleaned demand (MW)'].add(demand_data['CPLW']['cleaned demand (MW)'])
)

# Calculate fraction of retail sales sold from each BA to each state

In [12]:
#using the retail sales data, calculate the sum of sales by BA and State
#the sales column is selected explicitly so that the text column 'Part' is never aggregated:
#newer pandas versions no longer drop non-numeric columns silently in groupby sums
Sales_BA_State = retail_sales.groupby(["BA_code", "State"])[["Retail sales (MWh)"]].sum()
Sales_BA_State

Unnamed: 0_level_0,Unnamed: 1_level_0,Retail sales (MWh)
BA_code,State,Unnamed: 2_level_1
AEC,AL,6540808.0
AEC,FL,1954233.0
AECI,AR,365486.0
AECI,IA,677468.0
AECI,MO,14922016.0
...,...,...
WALC,NM,579223.0
WALC,NV,200939.0
WALC,UT,50181.0
WAUW,MT,990441.0


Calculate what fraction of sales of each BA goes to each state: (sales [BA&state] / sales [BA])

In [13]:
#first, get unique values (of BA's) of multiindex
bas = Sales_BA_State.index.get_level_values(0).unique()
#total sales of each BA, repeated on every (BA, State) row that belongs to that BA
ba_total_sales = Sales_BA_State.groupby(level=0)["Retail sales (MWh)"].transform("sum")
#new column "fraction": sales by state and BA divided by sales of entire BA
Sales_BA_State["fraction"] = Sales_BA_State["Retail sales (MWh)"] / ba_total_sales
Sales_BA_State

Unnamed: 0_level_0,Unnamed: 1_level_0,Retail sales (MWh),fraction
BA_code,State,Unnamed: 2_level_1,Unnamed: 3_level_1
AEC,AL,6540808.0,0.769956
AEC,FL,1954233.0,0.230044
AECI,AR,365486.0,0.019356
AECI,IA,677468.0,0.035879
AECI,MO,14922016.0,0.790274
...,...,...,...
WALC,NM,579223.0,0.051220
WALC,NV,200939.0,0.017769
WALC,UT,50181.0,0.004437
WAUW,MT,990441.0,0.752520


In [None]:
#write the sales and fraction table to an Excel file (adapt path to your location)
Sales_BA_State.to_excel(
    excel_writer=r"C:\Users\Muriel Hauser\Documents\Carnegie Science\State demand data\fraction_By_state_BAcode.xlsx",
)

**To continue to Stage 2, only these fractions are required.**

# Calculate the hourly demand profile for each state

The fraction calculated above shows how much electricity each BA sells to each state. Multiplying the fraction with the hourly demand profile of the BA provides the amount of electricity demand that is supplied from the BA to a state. For each state, adding up the fractions of the demand of all BAs that contribute to the state results in the total hourly demand profile.

In [32]:
#list every state found in the retail sales data
#(the original notes 51 entries, Washington DC counted as a separate state)
States = retail_sales.State.unique().tolist()
#drop the states that are not in CONUS (AK, HI) and the states that are supplied by ISNE
#(MA, ME, NH, VT, CT, RI), as these demand profiles are calculated separately in section 3
for excluded_state in ('AK', 'HI', 'MA', 'ME', 'NH', 'VT', 'CT', 'RI'):
    States.remove(excluded_state)

#for some BA's there is no demand data, hence those are skipped
#(the notes next to the codes are the original author's annotations)
bas_without_demand_data = {
    'OVEC',  #Ohio Valley Electric Corporation
    'SEC',   #Florida
    'SEPA',  #Georgia
    'AMPL',  #Arkansas
    'CEA',   #Canadian Electricity Association
    'NBSO',  #New Brunswick System Operator, Canada
    'HECO',  #Hawaiian electric company
}

#the two index levels of the fraction table: BA code (level 0) and state (level 1)
ba_level = Sales_BA_State.index.get_level_values(0)
state_level = Sales_BA_State.index.get_level_values(1)

#create new dictionary, where each key will be one State's hourly demand data
final_state_demand = {}
for state in States:
    rows_of_state = state_level == state
    state_demand = 0
    #loop over all BA's that are "active" in the particular state
    for ba_code in ba_level[rows_of_state]:
        if ba_code in bas_without_demand_data:
            continue
        #share of this BA's sales that goes to the state
        ba_fraction = Sales_BA_State.loc[rows_of_state & (ba_level == ba_code)]['fraction'].values
        #the BA's hourly demand scaled by that share is its contribution to the state
        state_demand += demand_data[ba_code]['cleaned demand (MW)'] * ba_fraction
    final_state_demand[state] = state_demand

In [37]:
#example: display the hourly demand profile stored for the state 'IA'
final_state_demand['IA']

date_time
2019-01-01 01:00:00    5990.331389
2019-01-01 02:00:00    5844.391683
2019-01-01 03:00:00    5690.278765
2019-01-01 04:00:00    5548.918563
2019-01-01 05:00:00    5384.130850
                          ...     
2019-12-31 20:00:00    5858.457504
2019-12-31 21:00:00    5776.460321
2019-12-31 22:00:00    5748.083604
2019-12-31 23:00:00    5879.125937
2020-01-01 00:00:00    6115.343814
Name: cleaned demand (MW), Length: 8760, dtype: float64

In [33]:
#save the hourly demand data of each state to its own csv file (';' separated, named after the state)
#all state demand data are stored in the dictionary final_state_demand; adapt path to your location
for state_code, state_profile in final_state_demand.items():
    state_profile.to_csv(
        r'C:\\Users\\Muriel Hauser\\Documents\\Carnegie Science\\State demand data\\State demand data\\2019_MEM_format\\' + state_code + '.csv',
        sep=';',
        index=True,
    )

SC
MS
LA
MD
VA
MN
IL
WI
NY
PA
OH
GA
IA
IN
AZ
NV
AL
FL
NC
CA
MO
ID
KS
OK
MI
NE
WA
TX
TN
KY
WV
AR
SD
OR
CO
NJ
NM
ND
WY
MT
UT
DE
DC
