In [None]:
import pandas as pd
import decimal 

# EIA Data #

## Format EIA Data ##

### Variables

In [None]:
# Each EIA sheet has note rows above and below the data table; these rows are
# skipped on read (passed as skiprows; the footer is handled with skipfooter).
eiaDropHead = [0, 1]

# Allowed technology codes (everything else is dropped from the final table)
allowedTechCodes = ['ST', 'GT', 'CC']

# Plant IDs whose combined cycle units are handled separately from the bulk merge.
# Original note: "Combined cycle is not operable but keep all units".
CT = [
    10789, 52132, 55372, 55470,
    10745, 57953, 54096, 50118,
]

# Columns we don't need for GGPT (dropped after the tabs are combined; listed here for visibility)
eiaDropCols = [
    'Sector',
    'Net Summer Capacity (MW)',
    'Net Winter Capacity (MW)',
    'Technology',
    'Operating Month',
    'Planned Retirement Month',
    'Retirement Month',
    'Planned Operation Month',
    'Google Map',
    'Bing Map',
    'Balancing Authority Code',
    'Planned Derate Year',
    'Planned Derate Month',
    'Planned Derate of Summer Capacity (MW)',
    'Planned Uprate Year',
    'Planned Uprate Month',
    'Planned Uprate of Summer Capacity (MW)',
]

# EIA workbook tabs to read and merge
eiaTabs = [
    'Operating',
    'Planned',
    'Retired',
    'Canceled or Postponed',
    'Operating_PR',
    'Planned_PR',
    'Retired_PR',
]

# Accumulators filled while reading the tabs
eiaColumns = []  # union of the column names seen across all tabs
eiaList = []     # one dataframe per tab, merged in a single step later (this is a large table)
eiaLenQC = 0     # running total of rows read, for QC against the Excel workbook

### Read data

In [None]:
# Loop over each sheet to read in and collect for merging.
#
# The accumulators are reset here so that re-running this cell does not
# append every tab a second time or double the QC row count.
eiaColumns = []
eiaList = []
eiaLenQC = 0

eiaWorkbook = r'C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\july_generator2021.xlsx'

# Open the workbook once and reuse the handle for every tab instead of
# re-opening the file from disk on each iteration.
with pd.ExcelFile(eiaWorkbook) as xlsx:
    for i in eiaTabs:
        print("Processing tab: ", i)

        # keep_default_na=False: blank cells are read as strings rather than NaN
        eia_ = pd.read_excel(xlsx, sheet_name=i, skiprows=eiaDropHead, skipfooter=2, keep_default_na=False)
        eiaColumns += [x for x in eia_.columns if x not in eiaColumns]

        eiaLenQC += len(eia_.index) # Can compare this number later to the sum of all rows in the EIA Excel sheet

        # Keep only gas-fired plants. Take an explicit copy so the column
        # assignment below writes to an independent frame rather than a
        # slice of the sheet (avoids pandas' chained-assignment warning).
        eia_ = eia_.loc[eia_['Energy Source Code'] == 'NG'].copy()

        # Add status column based on the tab's name, EIA Status column inconsistent
        eia_['Status_Tab'] = i.lower()

        # Store this tab's data to merge
        eiaList += [eia_]

### Backfill columns that are missing in each tab

Columns that are missing from a tab would be backfilled as NaN when the tabs are combined. Worse, integer columns
(e.g. the Retirement Year column) would be backfilled as float NaN, because NaN cannot be stored as an integer.
We can't blanket-apply .fillna() because some columns are strings and others are floats, which throws an error.
And if the float NaN values are left in place, the Sets created later on for merging rows don't reduce to a single value
(NaN never compares equal to itself, so separate NaN values are not treated as duplicates inside a Set).
The cell below therefore fills missing columns with the string 'not found' instead.

In [None]:
# Give every tab the full union of columns seen across the workbook.
# Columns a tab never had are filled with the marker string 'not found'.
for tabFrame in eiaList:
    absentCols = [name for name in eiaColumns if name not in tabFrame.columns]
    for name in absentCols:
        tabFrame[name] = 'not found'

# QC: after backfilling, every tab should have the same number of columns.
colLen = {len(tabFrame.columns) for tabFrame in eiaList}
if len(colLen) == 1:
    print('Columns complete.')
else:
    print('Error in columns.')

### Combine tabs

In [None]:
# Combine tabs.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
# Each tab was read with its own default row index, so without this the same
# label appears in several tabs and every later label-based drop
# (e.g. eia.drop(index=ccToRemove)) would also remove unrelated rows from
# other tabs that happen to share a label.
eia = pd.concat(eiaList, ignore_index=True)

# Drop columns not needed for GGPT
eia = eia.drop(columns=eiaDropCols)

# Total rows
print('\n')
print("Total row count of all tabs: ", eiaLenQC, '\n')
print("Total row count, only NG-fired units: ", len(eia.index), '\n')

## Merge individual combined cycle units into single sets ##

### Groupby "Unit Code", the identifier for common components of a combined cycle set

In [None]:
# Merge units into proper sets.
# 'ST', 'GT', and 'CS' stand alone. 'CA' (steam) and 'CT' (gas turbine) rows are
# the parts of a combined cycle set and are merged into one 'CC' row.
isCyclePart = eia['Prime Mover Code'].isin(['CA', 'CT'])
ccSets = eia.loc[isCyclePart]

# Plants listed in CT are dealt with separately, so leave them out here
heldBackIdx = ccSets.loc[ccSets['Plant ID'].isin(CT)].index
ccSets = ccSets.drop(index=heldBackIdx)
ccToRemove = ccSets.index

# Aggregation per (Plant ID, Unit Code, Status_Tab) group:
#   - capacities are summed
#   - generator IDs are collected into a list
#   - fields that may differ between the units of one set are kept as sets of values
ccAgg = {
    'Nameplate Capacity (MW)': 'sum',
    'Generator ID': lambda ids: list(ids),
}
ccAgg.update({
    field: (lambda vals: set(vals))
    for field in [
        'Status',
        'Operating Year',
        'Planned Retirement Year',
        'Planned Operation Year',
        'Retirement Year',
        'Entity ID',
        'Entity Name',
    ]
})
ccGroup = ccSets.groupby(['Plant ID', 'Unit Code', 'Status_Tab'], as_index=False).agg(ccAgg)

# Set the correct tech code
ccGroup['Prime Mover Code'] = 'CC'

### Deal with conflicting values from merged rows, such as start year

In [None]:
# Check Sets for multiple values and turn into a range if necessary
# (e.g. different start years for units of a CC generator).
#
# For each column:
#   * report how many merged rows carry more than one distinct value,
#   * flag rows that mix 'not found' with real values,
#   * Year columns: collapse multiple values into a "min-max" range string,
#   * other columns: print the conflicting rows for manual review
#     (their cells stay as sets),
#   * finally unwrap single-valued sets into the bare value.
checkSets = ['Status', 'Operating Year', 'Planned Retirement Year', 'Planned Operation Year', 'Retirement Year', 'Entity ID','Entity Name']
for i in checkSets:
    # Cell sizes are computed once up front; at this point every cell is a set
    setSizes = ccGroup[i].apply(len)
    multiValued = setSizes != 1
    singleValued = setSizes == 1
    multiCount = int(multiValued.sum())

    # Wherever there are multiple values print them to screen
    print('\n', i)
    print("Cells with multiple values: ", multiCount)

    # Possible issues combining 'not found' and real values
    mixedCount = int((ccGroup[i].apply(lambda vals: 'not found' in vals) & (setSizes > 1)).sum())
    if mixedCount > 0:
        print('ERROR Values and not founds: ', mixedCount)

    # Only act when there is actually something to resolve; previously the
    # "Manually fix" message (and an empty frame) was printed for every
    # column without conflicts as well.
    if multiCount > 0:
        if 'Year' in i:
            # For multiple years per new CC value, create year range.
            # NOTE(review): min()/max() assume all values in the set are numeric;
            # a set mixing a year with a string (e.g. 'not found') raises TypeError.
            ccGroup.loc[multiValued, i] = ccGroup.loc[multiValued, i].apply(
                lambda years: str(int(min(years))) + '-' + str(int(max(years)))
            )
        else:
            print("Manually fix: ", i)
            print(ccGroup.loc[multiValued])
            # ccGroup.loc[ccGroup[i].apply(len) != 1].to_excel('./ERRORS.xlsx', index=False)

    # Wherever there's just one value, replace the set with the value
    ccGroup.loc[singleValued, i] = ccGroup.loc[singleValued, i].apply(lambda vals: list(vals)[0])

In [None]:
# Join the plant-level columns back onto the merged CC sets.
# Unit-level fields are removed from the lookup because ccGroup already
# carries its own (aggregated) version of each of them.
unitLevelCols = [
    'Unit Code', 'Nameplate Capacity (MW)', 'Generator ID', 'Status',
    'Operating Year', 'Planned Retirement Year', 'Planned Operation Year',
    'Retirement Year', 'Entity ID', 'Entity Name', 'Prime Mover Code', 'Status_Tab',
]

# One row per plant: the first row found for each Plant ID
onePerPlant = eia.drop_duplicates(subset='Plant ID', ignore_index=True)
eiaBackfill = onePerPlant.drop(columns=unitLevelCols)

ccGroupJoin = pd.merge(ccGroup, eiaBackfill, how='left', on='Plant ID')

In [None]:
# Remove the individual CA/CT unit rows from the eia dataset, then add back
# the merged combined cycle sets.
eiaFormatted = eia.drop(index=ccToRemove)

# Check to make sure columns match first.
# Raise rather than call exit(): an exception stops the run at this cell and
# reports which columns differ, whereas exit() asks the interpreter session
# to shut down and gives no detail.
if sorted(list(eiaFormatted.columns)) != sorted(list(ccGroupJoin.columns)):
    mismatched = set(eiaFormatted.columns) ^ set(ccGroupJoin.columns)
    raise ValueError("Error, missing columns: " + ", ".join(sorted(str(c) for c in mismatched)))

# Append CC rows.
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True is the equivalent call.
eiaFormatted = pd.concat([eiaFormatted, ccGroupJoin], ignore_index=True)

### Combined cycle fixes

In [None]:
# Manual fixes
# For single units in standby within CC unit
# convert tech to just GT or ST and make mothballed

# Row labels of every unit belonging to the separately-handled plants
CT_U = list(eiaFormatted.loc[eiaFormatted['Plant ID'].isin(CT)].index)

# Deal with mothballed generators.
# Raw strings keep the regex escapes literal ('\(' in a normal string is an
# invalid escape sequence and triggers a warning on recent Python versions).
sepCode = [r'\(SB\)', r'\(OS\)', r'\(OA\)']

# na=False: the Status column can hold non-string values (unresolved sets from
# the merged CC rows); those are treated as "no match" instead of producing
# NaN inside the boolean mask.
hasSepStatus = eiaFormatted['Status'].str.contains('|'.join(sepCode), na=False)
ccSetsSkip = eiaFormatted.loc[hasSepStatus & (eiaFormatted['Plant ID'].isin(CT)) & eiaFormatted['Prime Mover Code'].isin(['CT','CA'])]

# Remaining units of those plants, still candidates for merging into CC sets
skipLabels = set(ccSetsSkip.index)
CT_2 = [x for x in CT_U if x not in skipLabels]

# Relabel the standby / out-of-service units as standalone mothballed units:
# 'CT' (gas turbine part) -> 'GT', 'CA' (steam part) -> 'ST'.
# ccSetsSkip only contains 'CT' and 'CA' rows (see the filter above).
if len(ccSetsSkip.index) > 0:
    eiaFormatted.loc[ccSetsSkip.index, 'Status'] = 'mothballed'
    eiaFormatted.loc[ccSetsSkip.index, 'Prime Mover Code'] = ccSetsSkip['Prime Mover Code'].map({'CT': 'GT', 'CA': 'ST'})

In [None]:
# Second pass, restricted to the separately-handled plants: whatever CA/CT
# units remain after the mothball fix (row labels in CT_2) are grouped into sets.
# 'ST', 'GT', and 'CS' are individual. 'CA' (steam) and 'CT' (gas turbine) need to be combined to 'CC'
stillCyclePart = eiaFormatted['Prime Mover Code'].isin(['CA', 'CT'])
ccSetsM = eiaFormatted.loc[stillCyclePart]
ccSetsM = ccSetsM.loc[ccSetsM.index.isin(CT_2)]
ccToRemoveM = ccSetsM.index

# Aggregation per (Plant ID, Unit Code, Status_Tab) group:
# capacities summed, generator IDs listed, possibly-varying fields kept as sets.
# The prime mover codes are kept as a set too, to tell whether the group still
# contains both parts of a combined cycle.
aggM = {
    'Prime Mover Code': lambda codes: set(codes),
    'Nameplate Capacity (MW)': 'sum',
    'Generator ID': lambda ids: list(ids),
}
aggM.update({
    field: (lambda vals: set(vals))
    for field in [
        'Status',
        'Operating Year',
        'Planned Retirement Year',
        'Planned Operation Year',
        'Retirement Year',
        'Entity ID',
        'Entity Name',
    ]
})
ccGroupM = ccSetsM.groupby(['Plant ID', 'Unit Code', 'Status_Tab'], as_index=False).agg(aggM)

# Check for multiple statuses
multiStatusRows = ccGroupM.loc[ccGroupM['Status'].apply(len) != 1]
print("Cells with multiple Status values: ", multiStatusRows)

# Check is a CC unit still: more than one code in the group -> 'CC';
# only gas turbine parts -> 'GT'; only steam parts -> 'ST'
moverCount = ccGroupM['Prime Mover Code'].apply(len)
ccGroupM.loc[moverCount > 1, 'Prime Mover Code'] = 'CC'
for loneCode, standaloneCode in (('CT', 'GT'), ('CA', 'ST')):
    ccGroupM.loc[ccGroupM['Prime Mover Code'] == {loneCode}, 'Prime Mover Code'] = standaloneCode

# Check Sets for multiple values and turn into a range if necessary (e.g. different start years for units of a CC generator)
yearCols = ['Operating Year', 'Planned Retirement Year', 'Planned Operation Year', 'Retirement Year']
for yearCol in yearCols:
    print('\n', yearCol)

    # Possible issues combining 'not found' and a year
    hasNotFound = ccGroupM[yearCol].apply(lambda vals: 'not found' in list(vals))
    mixedRows = ccGroupM.loc[hasNotFound & (ccGroupM[yearCol].apply(len) > 1)]
    if len(mixedRows.index) > 0:
        print('ERROR Values and not founds: ', len(mixedRows.index))

    # For multiple years per new CC value, create year range
    multiValued = ccGroupM[yearCol].apply(len) != 1
    ccGroupM.loc[multiValued, yearCol] = ccGroupM.loc[multiValued, yearCol].apply(
        lambda years: str(int(min(list(years)))) + '-' +  str(int(max(list(years))))
    )

    # Wherever there's just one value, replace the set with the value
    singleValued = ccGroupM[yearCol].apply(len) == 1
    ccGroupM.loc[singleValued, yearCol] = ccGroupM.loc[singleValued, yearCol].apply(lambda vals: list(vals)[0])


In [None]:
# Join back relevant plant-level columns
ccGroupJoinM = ccGroupM.merge(eiaBackfill, on='Plant ID', how='left')

# Check to make sure columns match first, before any rows are removed, so a
# mismatch leaves eiaFormatted untouched.
# Raise rather than call exit(): an exception stops the run at this cell and
# reports which columns differ, whereas exit() asks the interpreter session
# to shut down and gives no detail.
if sorted(list(eiaFormatted.columns)) != sorted(list(ccGroupJoinM.columns)):
    mismatched = set(eiaFormatted.columns) ^ set(ccGroupJoinM.columns)
    raise ValueError("Error, missing columns: " + ", ".join(sorted(str(c) for c in mismatched)))

# Remove units from eia dataset, add back-in the new sets
eiaFormatted = eiaFormatted.drop(index=ccToRemoveM)

# Append CC rows.
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True is the equivalent call.
eiaFormatted = pd.concat([eiaFormatted, ccGroupJoinM], ignore_index=True)

## Formatting

### Capacity cut-off

In [None]:
# Remove units without usable capacity data, round capacities to whole MW,
# then remove units <50 MW.

# Anything that cannot be parsed as a number becomes NaN and is reported/dropped
capNumeric = pd.to_numeric(eiaFormatted['Nameplate Capacity (MW)'], errors='coerce')
capUnusable = capNumeric.isna()
print("Units missing capacity data or other errors: ", '\n', eiaFormatted.loc[capUnusable, ['Plant Name','Status_Tab']])
eiaFormatted.drop(index=eiaFormatted.loc[capUnusable].index, inplace=True)

# Round half up to a whole number (Python 3's built-in round() uses round-half-even, overriding)
wholeUnit = decimal.Decimal('1')
eiaFormatted['Nameplate Capacity (MW)'] = eiaFormatted['Nameplate Capacity (MW)'].apply(
    lambda mw: decimal.Decimal(mw).quantize(wholeUnit, rounding=decimal.ROUND_HALF_UP)
)

# Capacity cut-off
belowCutoff = eiaFormatted['Nameplate Capacity (MW)'] < 50
eiaFormatted.drop(index=eiaFormatted.loc[belowCutoff].index, inplace=True)

### Technology codes

In [None]:
# Fold the remaining EIA prime mover codes into the GGPT technology codes:
# 'CS' -> 'CC', 'CT' -> 'GT', 'CA' -> 'ST'
for eiaCode, ggptCode in (('CS', 'CC'), ('CT', 'GT'), ('CA', 'ST')):
    eiaFormatted.loc[eiaFormatted['Prime Mover Code'] == eiaCode, 'Prime Mover Code'] = ggptCode

# Remove IC and other tech (anything not in allowedTechCodes), reporting row counts before and after
print("Current tech codes: ", set(eiaFormatted['Prime Mover Code']))
print("Old row count: ", len(eiaFormatted.index))
isAllowedTech = eiaFormatted['Prime Mover Code'].isin(allowedTechCodes)
eiaFormatted.drop(eiaFormatted.loc[~isAllowedTech].index, inplace=True)
print("New row count: ", len(eiaFormatted.index))


### EIA Status

In [None]:
# Standardize status values: map each EIA status description to a GGPT status.
# Debug helper kept from the original cell (lists the distinct non-set statuses):
# statuses = list(eiaFormatted['Status'])
# s = []
# s += [x for x in statuses if type(x) != set]
# print(set(s))
st = {
    '(U) Under construction, less than or equal to 50 percent complete': 'construction',
    '(OP) Operating': 'operating', 
    '(OS) Out of service and NOT expected to return to service in next calendar year': 'mothballed', 
    '(OA) Out of service but expected to return to service in next calendar year': 'mothballed', 
    '(P) Planned for installation, but regulatory approvals not initiated': 'proposed', 
    '(TS) Construction complete, but not yet in commercial operation': 'construction', 
    '(T) Regulatory approvals received. Not under construction': 'proposed', 
    '(V) Under construction, more than 50 percent complete': 'construction', 
    '(SB) Standby/Backup: available for service but not normally used': 'mothballed', 
    '(L) Regulatory approvals pending. Not under construction': 'proposed'
}

# Assign to the 'Status' column only. Without the column selector, .loc
# overwrites every column of the matching rows with the status string, which
# would wipe out Plant ID, Unit Code, capacity, etc.
for eiaStatus, ggptStatus in st.items():
    eiaFormatted.loc[eiaFormatted['Status'] == eiaStatus, 'Status'] = ggptStatus

# Reset these statuses to construction
setFixPlant = [613, 10554]
setFixUnit = ['DBCC','']

eiaFormatted.loc[(eiaFormatted['Plant ID'].isin(setFixPlant)) & eiaFormatted['Unit Code'].isin(setFixUnit), 'Status'] = 'construction'

### Column names to match GGPT

### Other ID field values for GGPT

In [None]:
# Create 'Other ID' fields to match GGPT

In [None]:
# ccGroup.loc[ccGroup['Status'].apply(len) != 1].to_excel(r'C:\Users\Sarah\Documents\GEM\code\GGPT-US-EIA\scratch\eiaFormatted.xlsx', index=False)

# Compare new EIA data to existing GGPT values


In [None]:
# Match IDs to GGPT and compare field values (status, capacity, year)

# Locate any new unit IDs

# Check capacity for any new units

# Join CHP

In [None]:
# Check for missing dual-fuel plants in EIA-860


In [None]:
# Add in ownership information from separate EIA table