In [None]:
import pandas as pd
import decimal
import numpy as np
from datetime import datetime

# EIA Data #

## Initialize

### Variables

In [None]:
# Timestamp appended to every exported file name so repeated runs do not overwrite each other
stamp = datetime.now().strftime("%m %d %Y %H-%M-%S")

# The EIA data table is formatted with notes at the beginning and end; drop them from the dataframe
# (see skiprows and skipfooter in the read_excel calls)
eiaDropHead = [0, 1]

# Allowed technology codes
allowedTechCodes = ['ST', 'GT', 'CC']

# Combined cycle is not operable but keep all units.
# Plant IDs are stored as strings: eia['Plant ID'] is cast to str before these lists are
# used with .isin(), and an integer never compares equal to its string form, so integer
# IDs would silently match nothing.
CT = ['10789', '52132', '55372', '55470', '10745', '57953', '54096', '50118']

# Reset these statuses to construction (plant IDs are strings for the same reason as CT)
setFixPlant = ['613', '10554']
setFixUnit = ['DBCC', 'CC_set']

# Columns we don't need for GGPT (will drop later, putting here for visibility)
eiaDropCols = [
    'Sector',
    'Net Summer Capacity (MW)',
    'Net Winter Capacity (MW)',
    'Technology',
    'Operating Month',
    'Planned Retirement Month',
    'Planned Operation Month',
    'Google Map',
    'Bing Map',
    'Balancing Authority Code',
    'Planned Derate Year',
    'Planned Derate Month',
    'Planned Derate of Summer Capacity (MW)',
    'Planned Uprate Year',
    'Planned Uprate Month',
    'Planned Uprate of Summer Capacity (MW)',
]

# EIA list of tabs to merge
# eiaTabs = ['Operating', 'Planned', 'Retired', 'Canceled or Postponed', 'Operating_PR', 'Planned_PR', 'Retired_PR']
eiaTabs = ['Operating', 'Planned', 'Operating_PR', 'Planned_PR']

# Union of the column names seen across the 860M tabs (filled while reading)
eiaColumns = []

# One dataframe per tab, concatenated once after the loop (cheaper than growing a large table)
eiaList = []

# Total number of rows read across all tabs, before any filtering
eiaLenQC = 0

# Running QC tally of rows added/removed while formatting; reported in the final QC section
runningRowCount = 0

# Sheets of the EIA-860 generator workbook to read
generatorTabs = ['Operable', 'Proposed', 'Retired and Canceled']

### Read 860M data

In [None]:
# Loop over each sheet to read in and merge.
# Reset the accumulators first so that re-running this cell does not append every tab a second time.
eiaColumns = []
eiaList = []
eiaLenQC = 0

for i in eiaTabs:
    print("Processing tab: ", i)

    eia_ = pd.read_excel(r'C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\july_generator2021.xlsx', sheet_name=i, skiprows=eiaDropHead, skipfooter=2, keep_default_na=False)
    eiaColumns += [x for x in eia_.columns if x not in eiaColumns]

    eiaLenQC += len(eia_.index) # Can compare this number later to the sum of all rows in the EIA Excel sheet

    # Keep only gas-fired plants.
    # Take an explicit copy: columns are added to this frame below and again when missing
    # columns are backfilled, and assigning to a filtered slice raises SettingWithCopyWarning.
    eia_ = eia_.loc[eia_['Energy Source Code'] == 'NG'].copy()

    # Add status column based on the tab's name, EIA Status column inconsistent
    eia_['Status_Tab'] = i.lower()

    # Store this tab's data to merge
    eiaList.append(eia_)

### Store 860M retirement data separately

In [None]:
# Mason didn't put retired or canceled units in the original push, so adding them now creates a lot of errors in table comparison

### Backfill columns that are missing in each tab

Columns that are missing from a tab would backfill as NaN when the tabs are combined. Worse, integer columns
(e.g. the Retired Year column) would become float, because NaN cannot be stored as an integer.
A blanket .fillna() doesn't work because some columns are strings and others are floats, which throws an error.
And if the float NaN values are left in place, the sets built later on when merging rows don't reduce
(NaN does not compare equal to itself, so NaN values are not reliably treated as duplicates).
Missing columns are therefore filled with the string 'not found' instead.

In [None]:
# Give every tab the full set of columns seen across all tabs, using a string placeholder
for tab in eiaList:
    absent = [name for name in eiaColumns if name not in tab.columns]
    for name in absent:
        tab[name] = 'not found'

# QC: after backfilling, every tab should report the same number of columns
colLen = {len(list(tab.columns)) for tab in eiaList}
print('Columns complete.' if len(colLen) == 1 else 'Error in columns.')

### Combine tabs

In [None]:
# Stack all tabs into one table and discard the columns GGPT does not use
eia = pd.concat(eiaList).drop(columns=eiaDropCols)

# QC: rows read from every tab vs. rows kept after the NG filter
ngRowCount = len(eia.index)
print('\n')
print("Total row count of all tabs: ", eiaLenQC, '\n')
print("Total row count, only NG-fired units: ", ngRowCount, '\n')

### Read 860 Data

In [None]:
# 860 spreadsheets, single tab
plantData = pd.read_excel(
    r'C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\eia8602020\2___Plant_Y2020.xlsx',
    skiprows=[0],
    keep_default_na=False,
)
ownerData = pd.read_excel(
    r'C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\eia8602020\4___Owner_Y2020.xlsx',
    skiprows=[0],
    keep_default_na=False,
)

# Generator workbook: several tabs, read one at a time and stacked afterwards
generatorWorkbook = r"C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\eia8602020\3_1_Generator_Y2020.xlsx"

# The only generator columns carried forward
generatorKeep = ['Generator ID', 'Plant Code', 'Energy Source 1', 'Energy Source 2', 'Energy Source 3',
                 'Energy Source 4', 'Energy Source 5', 'Energy Source 6',
                 'Associated with Combined Heat and Power System']

g_ = []
for sheet in generatorTabs:
    print("Processing tab: ", sheet)

    # Only the 'Operable' and 'Proposed' tabs are read with one footer row skipped
    footerRows = 1 if sheet in ('Operable', 'Proposed') else 0
    sheetData = pd.read_excel(generatorWorkbook, sheet_name=sheet, skiprows=[0], skipfooter=footerRows, keep_default_na=False)

    unwanted = [name for name in list(sheetData.columns) if name not in generatorKeep]
    sheetData = sheetData.drop(columns=unwanted)

    # Keep ~only gas-fired plants
    # gIndex = gData.loc[~gData['Plant Code'].isin(list(eia['Plant ID']))].index
    # gData.drop(index=gIndex, inplace=True)

    # Store this tab's data to merge
    g_.append(sheetData)

# Combine tabs
generatorData = pd.concat(g_)

## Add 860 data

### 860 Fuels + CHP
### (fuel has to be before Generator IDs become lists or else can't merge easily)

In [None]:
# EIA fuel columns
fuelCols = ['Energy Source ' + str(n) for n in range(1, 7)]

# Dictionary of some EIA fuel codes to GGPT domain
fuelMix = {
    'OBS': 'B', 'WDS': 'B',
    'BFG': 'BFG',
    'OBL': 'BL', 'WDL': 'BL',
    'ANT': 'C', 'BIT': 'C', 'LIG': 'C', 'SGC': 'C', 'SUB': 'C', 'WC': 'C', 'RC': 'C',
    'DFO': 'FO', 'RFO': 'FO',
    'JF': 'J',
    'SUN': 'S',
    'SGP': 'SG',
}

# Redo fuel codes: translate the codes listed above, leave every other value untouched
for fuelCol in fuelCols:
    generatorData[fuelCol] = generatorData[fuelCol].apply(lambda code: fuelMix.get(code, code))

In [None]:
def _joinFuels(row):
    """Join the non-empty fuel codes of one generator row with '/'."""
    return '/'.join([code for code in row if code != ''])


# Merge fuels into one column
generatorData['fuelMerge'] = generatorData[fuelCols].apply(_joinFuels, axis=1)

# multi-column matching issue, so made single index; can't '+' if there's hidden strings or ints in the column
generatorData = generatorData.astype({'Plant Code': 'str', 'Generator ID': 'str'})
# Assigned as a plain list (positionally) rather than as an index-aligned Series
generatorData['gMix'] = (generatorData['Plant Code'] + "_" + generatorData['Generator ID']).tolist()

eia = eia.astype({'Plant ID': 'str', 'Generator ID': 'str'})
eia['gMix'] = (eia['Plant ID'] + "_" + eia['Generator ID']).tolist()

# Keep only the merge key plus the new fuel / CHP fields from the generator table
rmGenCols = [*fuelCols, 'Plant Code', 'Generator ID']
g2 = generatorData.drop(columns=rmGenCols).rename(
    columns={"Associated with Combined Heat and Power System": 'CHP'}
)

# merge
eia = eia.merge(g2, how='left', on='gMix')

In [None]:
# Fix unmatched CHP values, if any
chpMissing = eia['CHP'].isna()
print("This many CHP errors: ", int(chpMissing.sum()))
eia.loc[chpMissing, "CHP"] = 'not found'

# Compare Energy Code column to fuelMerge and replace where needed
# (only rows that actually found a generator match, i.e. fuelMerge is not NaN)
fuelDiffers = (eia['Energy Source Code'] != eia['fuelMerge']) & eia['fuelMerge'].notna()
print("This many Fuel errors: ", int(fuelDiffers.sum()))
eia.loc[fuelDiffers, 'Energy Source Code'] = eia.loc[fuelDiffers, 'fuelMerge']

# Drop extra columns after merge
eia = eia.drop(columns=['gMix', 'fuelMerge'])

### Join in 860 data, ownership

In [None]:
# Load the 860 ownership sheet (first row skipped, blanks kept as strings instead of NaN)
ownerData = pd.read_excel(
    r'C:\Users\Sarah\Documents\GEM\research\North America\US\EIA\eia8602020\4___Owner_Y2020.xlsx',
    skiprows=[0],
    keep_default_na=False,
)


In [None]:
# Format
# Find plants missing values (returns as type str which causes issues).
# Any whitespace-only (or empty) string counts as missing, not just a single space.
pctMissing = ownerData['Percent Owned'].apply(lambda x: isinstance(x, str) and x.strip() == '')
ownerNotFound = ownerData.loc[pctMissing].index
ownerData.loc[pctMissing, 'Percent Owned'] = 0

# Fraction -> whole-number percent.
# Round before casting to int: astype(int) truncates, so floating-point artefacts such as
# 0.29 * 100 == 28.999999999999996 would otherwise be reported as 28%.
ownerData['Percent Owned'] = (ownerData['Percent Owned'].astype(float) * 100).round().astype(int)
ownerData['Percent Owned'] = ownerData['Percent Owned'].astype(str)

# Make into GGPT format
ownerData['Owner Name'] = ownerData['Owner Name'].astype(str)
ownerData['oID'] = ownerData['Owner Name'] + ' [' + ownerData['Percent Owned'] + '%]'

# Turn 0 from the blanks into 'not found'
# NOTE(review): this overwrites the whole label, so the owner name is dropped for these rows - confirm that is intended
ownerData.loc[ownerData.index.isin(ownerNotFound), 'oID'] = '[% not found]'

# Group by Generator ID: one '; '-separated owner string per (plant, generator)
ownerDataGrouped = ownerData.groupby(['Plant Code', 'Generator ID'])['oID'].apply('; '.join).reset_index()

# Global ID
ownerDataGrouped['Plant Code'] = ownerDataGrouped['Plant Code'].astype(str)
ownerDataGrouped['Generator ID'] = ownerDataGrouped['Generator ID'].astype(str)
ownerDataGrouped['gid'] = ownerDataGrouped['Plant Code'] + '_' + ownerDataGrouped['Generator ID']

eia['Plant ID'] = eia['Plant ID'].astype(str)
eia['Generator ID'] = eia['Generator ID'].astype(str)
eia['gid'] = eia['Plant ID'] + '_' + eia['Generator ID']

# QC: total rows vs. distinct gid values (equal when every unit has a unique gid)
print(len(eia.index))
print(len(eia[['gid']].drop_duplicates().index))

# [can't, eia data only puts first entitiy code, not a list to match] Merge on Utility ID <--> Entity ID
eiaMultiOwner = eia.merge(right=ownerDataGrouped[['gid', 'oID']], how='left', on='gid')
print(eiaMultiOwner.columns.values)
# Drop columns
eia = eia.drop(columns=['gid'])

In [None]:
# QC on the ownership merge: units without / with an owner match, and their total.
# Uses eiaMultiOwner (the merged frame holding 'oID' and 'gid'); the previous name `tmp` is not
# defined until the capacity cut-off section, so this cell failed on a fresh top-to-bottom run.
# print(eiaMultiOwner.loc[eiaMultiOwner['oID'].isna(), ['Plant ID', 'Generator ID', 'gid']])
# print(ownerDataGrouped.loc[(ownerDataGrouped['Plant Code'] == '3')])
# print(ownerDataGrouped.loc[(ownerDataGrouped['Plant Code'] == '3') & (ownerDataGrouped['Generator ID'] =='1')])
ownerMissing = eiaMultiOwner['oID'].isna()
print(len(eiaMultiOwner.loc[ownerMissing]))
print(len(eiaMultiOwner.loc[~ownerMissing]))

print(len(eiaMultiOwner.loc[ownerMissing]) +
len(eiaMultiOwner.loc[~ownerMissing]))

In [None]:
# If not matched in Ownership table, the entity from EIA 860M is the owner
sameOwner = eiaMultiOwner.loc[eiaMultiOwner['oID'].isna()].drop(columns=['oID'])
sameOwner['oID'] = sameOwner['Entity Name'] + ' [100%]'

# Swap the unmatched rows for their filled-in versions.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dSO = eiaMultiOwner.loc[eiaMultiOwner['gid'].isin(sameOwner['gid'].tolist())].index.tolist()
newEIA = pd.concat([eiaMultiOwner.drop(index=dSO), sameOwner], ignore_index=True)
newEIA = newEIA.drop(columns=['gid', 'Entity ID', 'Entity Name'])

# NOTE(review): the cells that follow in this notebook keep working from `eia`, not `newEIA` - confirm whether that is intended

# QC: row count, resulting columns, and owners still missing (expected 0)
print(len(newEIA.index))
print(newEIA.columns.values)
print(len(newEIA.loc[newEIA['oID'].isna()].index))

In [None]:
# TODO: deal with new fields (CHP, ownership) in CC code

## Merge individual combined cycle units into single sets ##

### Groupby "Unit Code", the identifier for common components of a combined cycle set

In [None]:
def _collect(values):
    """Return every value of the group as a list (duplicates kept)."""
    return list(values)


def _distinct(values):
    """Return the distinct values of the group as a set."""
    return set(values)


# Merge units into proper sets
# 'ST', 'GT', and 'CS' are individual. 'CA' (steam) and 'CT' (gas turbine) need to be combined to 'CC'
isCcComponent = eia['Prime Mover Code'].isin(['CA', 'CT'])

# Plants listed in CT are dealt with separately further down
inSpecialPlant = eia['Plant ID'].isin(CT)
ccSets = eia.loc[isCcComponent & ~inSpecialPlant]
ccToRemove = ccSets.index

runningRowCount = 0 - len(ccToRemove)
print('Running row count: ', runningRowCount)

# Group by plant ID and Unit Code fields
# Sum capacities, group Generator IDs, keep sets of values that could possibly vary between units
ccAggregations = {
    'Nameplate Capacity (MW)': 'sum',
    'Generator ID': _collect,
    'Status': _distinct,
    'Operating Year': _distinct,
    'Planned Retirement Year': _distinct,
    'Planned Operation Year': _distinct,
}
ccGroup = ccSets.groupby(['Plant ID', 'Unit Code', 'Status_Tab'], as_index=False).agg(ccAggregations)

# Set the correct tech code
ccGroup['Prime Mover Code'] = 'CC'

# IF not unit code, mason called it 'cc_set'
noUnitCode = ccGroup['Unit Code'] == ''
ccGroup.loc[noUnitCode, 'Unit Code'] = 'CC_set'

### Deal with conflicting values from merged rows, such as start year

In [None]:
# Check Sets for multiple values and turn into a range if necessary (e.g. different start years for units of a CC generator)
checkSets = ['Status', 'Operating Year', 'Planned Retirement Year', 'Planned Operation Year']
for i in checkSets:
    # Wherever there are multiple values print them to screen
    print('\n', i)
    valueCounts = ccGroup[i].apply(len)
    multiValued = valueCounts != 1
    print("Cells with multiple values: ", len(ccGroup.loc[multiValued].index))

    # Possible issues combining NaN and year
    mixedWithNotFound = ccGroup[i].apply(lambda x: 'not found' in list(x)) & (valueCounts > 1)
    if len(ccGroup.loc[mixedWithNotFound].index) > 0:
        print('ERROR Values and not founds: ', len(ccGroup.loc[mixedWithNotFound].index))

    # Only act (or ask for a manual fix) when something is actually multi-valued;
    # previously "Manually fix" was printed for every column that had nothing to fix.
    if len(ccGroup.loc[multiValued].index) > 0:
        if 'Year' in i:
            # For multiple years per new CC value, create year range "min-max"
            ccGroup.loc[multiValued, i] = ccGroup.loc[multiValued, i].apply(
                lambda x: str(int(min(list(x)))) + '-' + str(int(max(list(x))))
            )
        else:
            # Non-year columns cannot be turned into a range; their sets are left as-is
            print("Manually fix: ", i)
            print(ccGroup.loc[multiValued])

    # Wherever there's just one value, replace the set with the value
    # (recomputed here because year ranges written above are now strings, not sets)
    singleValued = ccGroup[i].apply(len) == 1
    ccGroup.loc[singleValued, i] = ccGroup.loc[singleValued, i].apply(lambda x: list(x)[0])

In [None]:
# Join back relevant plant-level columns
# Columns already carried by the grouped CC rows; everything else is taken from the first row of each plant
unitLevelCols = ['Unit Code', 'Nameplate Capacity (MW)', 'Generator ID', 'Status', 'Operating Year',
                 'Planned Retirement Year', 'Planned Operation Year', 'Prime Mover Code', 'Status_Tab']
eiaBackfill = eia.drop_duplicates(subset='Plant ID', ignore_index=True).drop(columns=unitLevelCols)

ccGroupJoin = ccGroup.merge(eiaBackfill, on='Plant ID', how='left')

In [None]:
# Remove units from eia dataset, add back-in the new sets
eiaFormatted = eia.drop(index=ccToRemove)

# Check to make sure columns match first.
# Raise instead of calling exit(): in a notebook exit() shuts the kernel down and loses all state.
if sorted(list(eiaFormatted.columns)) != sorted(list(ccGroupJoin.columns)):
    mismatched = set(eiaFormatted.columns) ^ set(ccGroupJoin.columns)
    raise ValueError("Error, missing columns: " + ", ".join(sorted(str(c) for c in mismatched)))

# Append CC rows (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
eiaFormatted = pd.concat([eiaFormatted, ccGroupJoin], ignore_index=True)

# QC
runningRowCount = runningRowCount + len(ccGroupJoin.index)
print('Running row count: ', runningRowCount)

### Combined cycle fixes

In [None]:
# Manual fixes
# For single units in standby within CC unit
# convert tech to just GT or ST and make mothballed

# Unit ids: CT / CA components of the plants listed in CT
inCtPlant = (eiaFormatted['Plant ID'].isin(CT)) & eiaFormatted['Prime Mover Code'].isin(['CT', 'CA'])
CT_U = list(eiaFormatted.loc[inCtPlant].index)

# Deal with mothballed generators
# Raw strings: '\(' is an invalid escape sequence in a normal string literal (a warning in recent Python versions)
sepCode = [r'\(SB\)', r'\(OS\)', r'\(OA\)']
# na=False: Status can hold non-string values (sets left over from merged CC rows), for which
# str.contains returns NaN; treat those rows as "no match" explicitly
isStandby = eiaFormatted['Status'].str.contains('|'.join(sepCode), na=False)
ccSetsSkip = eiaFormatted.loc[isStandby & inCtPlant]

# Remaining CT-plant components, still candidates for a combined set
skippedIdx = set(ccSetsSkip.index)
CT_2 = [x for x in CT_U if x not in skippedIdx]

for x in list(ccSetsSkip.index):
    eiaFormatted.loc[x, 'Status'] = 'mothballed'
    if eiaFormatted.loc[x, 'Prime Mover Code'] == 'CT':
        eiaFormatted.loc[x, 'Prime Mover Code'] = 'GT'
    else:
        eiaFormatted.loc[x, 'Prime Mover Code'] = 'ST'

In [None]:
# Now see if there's still a CC unit with one status to combine
# Merge units into proper sets
# 'ST', 'GT', and 'CS' are individual. 'CA' (steam) and 'CT' (gas turbine) need to be combined to 'CC'
ccSetsM = eiaFormatted.loc[eiaFormatted.index.isin(CT_2)]
ccToRemoveM = ccSetsM.index

# QC
runningRowCount = runningRowCount - len(ccToRemoveM)
print('Running row count: ', runningRowCount)

# Group by plant ID and Unit Code fields
# Sum capacities, group Generator IDs, keep sets of values that could possibly vary between units
ccGroupM = ccSetsM.groupby(['Plant ID', 'Unit Code', 'Status_Tab'], as_index=False).agg({'Prime Mover Code': lambda x: set(x), 'Nameplate Capacity (MW)': 'sum', 'Generator ID': lambda x: list(x), 'Status': lambda x: set(x),
'Operating Year': lambda x: set(x), 'Planned Retirement Year': lambda x: set(x), 'Planned Operation Year': lambda x: set(x)})

## QC ##
# Check for multiple statuses
multiStatus = ccGroupM['Status'].apply(len) != 1
if len(ccGroupM.loc[multiStatus].index) > 0:
    print("Cells with multiple Status values: ", ccGroupM.loc[multiStatus])

## Sort out what is still CC and what's individual turbines now ##
# A group is still a CC unit only if it holds more than one prime mover type;
# groups with a single type are removed here and re-inserted below as individual units
stillCombined = ccGroupM['Prime Mover Code'].apply(len) > 1
ccGroupM = ccGroupM.loc[stillCombined].copy()
ccGroupM['Prime Mover Code'] = 'CC'
# NOTE(review): unlike the first CC pass, an empty 'Unit Code' is not renamed to 'CC_set' here - confirm whether it should be

# Units that stay inside a combined set, keyed by plant AND generator.
# Matching on Generator ID alone would also drop same-named generators at other plants,
# and np.concatenate() on an empty list raises when no combined set is left.
groupedUnits = {
    (plant, gen)
    for plant, gens in zip(ccGroupM['Plant ID'], ccGroupM['Generator ID'])
    for gen in (gens if type(gens) == list else [gens])
}
inGroupedSet = pd.Series(
    [key in groupedUnits for key in zip(ccSetsM['Plant ID'], ccSetsM['Generator ID'])],
    index=ccSetsM.index,
    dtype=bool,
)
# drop
ccSetsM2 = ccSetsM.loc[~inGroupedSet].copy()
# Rename codes
ccSetsM2.loc[ccSetsM2['Prime Mover Code'] == 'CT', 'Prime Mover Code'] = 'GT'
ccSetsM2.loc[ccSetsM2['Prime Mover Code'] == 'CA', 'Prime Mover Code'] = 'ST'


# Check Sets for multiple values and turn into a range if necessary (e.g. different start years for units of a CC generator)
for i in checkSets: # <-- previously defined
    print('\n', i)
    valueCounts = ccGroupM[i].apply(len)

    # Possible issues combining NaN and year
    mixedWithNotFound = ccGroupM[i].apply(lambda x: 'not found' in list(x)).astype(bool) & (valueCounts > 1)
    if len(ccGroupM.loc[mixedWithNotFound].index) > 0:
        print('ERROR Values and not founds: ', len(ccGroupM.loc[mixedWithNotFound].index))

    multiValued = valueCounts != 1
    if len(ccGroupM.loc[multiValued].index) > 0:
        if 'Year' in i:
            # For multiple years per new CC value, create year range
            ccGroupM.loc[multiValued, i] = ccGroupM.loc[multiValued, i].apply(
                lambda x: str(int(min(list(x)))) + '-' + str(int(max(list(x))))
            )
        else:
            # A min-max range only makes sense for years; int() on a status string would raise.
            # Same handling as the first CC pass: leave the set and flag it.
            print("Manually fix: ", i)
            print(ccGroupM.loc[multiValued])

    # Wherever there's just one value, replace the set with the value
    singleValued = ccGroupM[i].apply(len) == 1
    ccGroupM.loc[singleValued, i] = ccGroupM.loc[singleValued, i].apply(lambda x: list(x)[0])

## Merge all CC-type units after editing ##
# Join back relevant plant-level columns before append
ccGroupM = ccGroupM.merge(eiaBackfill, on='Plant ID', how='left')
# Check to make sure columns match first (raise rather than exit(), which would shut the kernel down)
if sorted(list(ccGroupM.columns)) != sorted(list(ccSetsM2.columns)):
    mismatched = set(ccGroupM.columns) ^ set(ccSetsM2.columns)
    raise ValueError("Error, missing columns: " + ", ".join(sorted(str(c) for c in mismatched)))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
ccGroupM = pd.concat([ccGroupM, ccSetsM2], ignore_index=True)

In [None]:
# Remove units from eia dataset, add back-in the new sets
eiaFormatted = eiaFormatted.drop(index=ccToRemoveM)

# Check to make sure columns match first.
# Raise instead of calling exit(): in a notebook exit() shuts the kernel down and loses all state.
if sorted(list(eiaFormatted.columns)) != sorted(list(ccGroupM.columns)):
    mismatched = set(eiaFormatted.columns) ^ set(ccGroupM.columns)
    raise ValueError("Error, missing columns: " + ", ".join(sorted(str(c) for c in mismatched)))

# Append CC rows (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
eiaFormatted = pd.concat([eiaFormatted, ccGroupM], ignore_index=True)

#QC
runningRowCount = runningRowCount + len(ccGroupM.index)
print('Running row count: ', runningRowCount)

## Formatting

### Capacity cut-off

In [None]:
def _roundHalfUp(value):
    """Round a capacity to a whole number with ROUND_HALF_UP (Python 3's built-in round() is half-even)."""
    return decimal.Decimal(value).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)


# remove units with capacity issues if not operating/planned
tmp = pd.to_numeric(eiaFormatted['Nameplate Capacity (MW)'], errors='coerce')
unusable = tmp.isna()
print("Units missing capacity data or other errors: ", '\n', eiaFormatted.loc[unusable, ['Plant Name','Status_Tab']])

#QC
runningRowCount = runningRowCount - int(unusable.sum())
print('Running row count: ', runningRowCount)

eiaFormatted = eiaFormatted.drop(index=unusable[unusable].index)

# remove units <50 MW (after rounding to whole MW)
eiaFormatted['Nameplate Capacity (MW)'] = eiaFormatted['Nameplate Capacity (MW)'].apply(_roundHalfUp)
tooSmall = eiaFormatted['Nameplate Capacity (MW)'] < 50

#QC
runningRowCount = runningRowCount - int(tooSmall.sum())
print('Running row count: ', runningRowCount)

eiaFormatted = eiaFormatted.drop(index=tooSmall[tooSmall].index)

### Technology codes

In [None]:
# Change 'CS' to 'CC', etc
primeMover = eiaFormatted['Prime Mover Code']
eiaFormatted['Prime Mover Code'] = primeMover.mask(primeMover == 'CS', 'CC')

# Remove IC and other tech
print("Current tech codes: ", set(eiaFormatted['Prime Mover Code']))
disallowed = ~eiaFormatted['Prime Mover Code'].isin(allowedTechCodes)

#QC
runningRowCount = runningRowCount - int(disallowed.sum())
print('Running row count: ', runningRowCount)

eiaFormatted = eiaFormatted.drop(index=disallowed[disallowed].index)

### Join in 860 data, city

In [None]:
# Use City column from plant data spreadsheet
pDrop = [x for x in plantData.columns.values if x not in ['Plant Code', 'City']]
plantData = plantData.drop(columns=pDrop)

# Merge on a string version of the plant identifier on both sides: eia 'Plant ID' has been cast
# to str earlier in the notebook, and pandas refuses to merge a string key against a numeric one.
cityLookup = plantData.rename(columns={'Plant Code': '_plantKey'})
cityLookup['_plantKey'] = cityLookup['_plantKey'].astype(str)

eiaFormatted = (
    eiaFormatted
    .assign(_plantKey=eiaFormatted['Plant ID'].astype(str))
    .merge(cityLookup, how='left', on='_plantKey')
    .drop(columns=['_plantKey'])  # temporary key only; 'City' is the one column added
)

### Generator ID lists to string

In [None]:
def _idsToText(ids):
    """Turn a list of generator IDs into a sorted, comma-separated string; pass other values through."""
    if type(ids) is list:
        return ', '.join(sorted(ids))
    return ids


# Backfill single units with a unit name
noUnitCode = eiaFormatted['Unit Code'] == ''
eiaFormatted.loc[noUnitCode, 'Unit Code'] = eiaFormatted.loc[noUnitCode, 'Generator ID']

# Make string for CC units and Other IDs
eiaFormatted['Generator ID'] = eiaFormatted['Generator ID'].apply(_idsToText)

### EIA Status

In [None]:
# Standardize status values
# statuses = list(eiaFormatted['Status'])
# s = []
# s += [x for x in statuses if type(x) != set]
# print(set(s))

# EIA status text -> GGPT status
st = {
    '(U) Under construction, less than or equal to 50 percent complete': 'construction',
    '(OP) Operating': 'operating',
    '(OS) Out of service and NOT expected to return to service in next calendar year': 'mothballed',
    '(OA) Out of service but expected to return to service in next calendar year': 'mothballed',
    '(P) Planned for installation, but regulatory approvals not initiated': 'proposed',
    '(TS) Construction complete, but not yet in commercial operation': 'construction',
    '(T) Regulatory approvals received. Not under construction': 'proposed',
    '(V) Under construction, more than 50 percent complete': 'construction',
    '(SB) Standby/Backup: available for service but not normally used': 'mothballed',
    '(L) Regulatory approvals pending. Not under construction': 'proposed'
}

for eiaStatus, ggptStatus in st.items():
    eiaFormatted.loc[eiaFormatted['Status'] == eiaStatus, 'Status'] = ggptStatus

# Reset these statuses to construction
needsReset = eiaFormatted['Plant ID'].isin(setFixPlant) & eiaFormatted['Unit Code'].isin(setFixUnit)
eiaFormatted.loc[needsReset, 'Status'] = 'construction'

# Pre-process: strip the '_pr' suffix from the tab names
eiaFormatted['Status_Tab'] = eiaFormatted['Status_Tab'].map(lambda tab: tab.replace('_pr', ''))

# Move "planned" tab's start year column
fromPlannedTab = eiaFormatted['Status_Tab'].isin(['planned'])
eiaFormatted.loc[fromPlannedTab, 'Operating Year'] = eiaFormatted.loc[fromPlannedTab, 'Planned Operation Year']
eiaFormatted = eiaFormatted.drop(columns=['Planned Operation Year'])

# Deal with tabs without status column: fall back to the tab name
statusUnknown = eiaFormatted['Status'] == 'not found'
eiaFormatted.loc[statusUnknown, 'Status'] = eiaFormatted.loc[statusUnknown, 'Status_Tab']
eiaFormatted.loc[eiaFormatted['Status'] == 'canceled or postponed', 'Status'] = 'cancelled'
eiaFormatted = eiaFormatted.drop(columns=['Status_Tab'])

### Other ID field values for GGPT

In [None]:
def _eiaTag(value):
    """Wrap an identifier in the '{EIA: ...}' form."""
    return '{EIA: ' + str(value) + '}'


# Create 'Other ID' fields to match GGPT
eiaFormatted['Other IDs (location)'] = eiaFormatted['Plant ID'].apply(_eiaTag)
eiaFormatted['Other IDs (unit)'] = eiaFormatted['Generator ID'].apply(_eiaTag)

### Columns where prefer blank to 'not found' for comparison later

In [None]:
# eiaFormatted.loc[eiaFormatted['Retirement Year'] == 'not found','Retirement Year'] = ''
# Swap the 'not found' placeholder for an empty string in the planned retirement year
retirementUnknown = eiaFormatted['Planned Retirement Year'] == 'not found'
eiaFormatted.loc[retirementUnknown, 'Planned Retirement Year'] = ''

### Find any sets left over, or won't sort values later

In [None]:
# for i in eiaFormatted.columns:
#     print(eiaFormatted.loc[eiaFormatted[i].apply(lambda x: True if (type(x)==set) else False)])

# eiaFormatted.loc[eiaFormatted['Plant ID']==10554]

## Final QC

In [None]:
# Row counts before and after formatting; the last line should equal the starting count
startRows = len(eia.index)
endRows = len(eiaFormatted.index)

print("Starting row count: ", startRows)
print("Ending row count: ", endRows)
print("Total removed rows: ", runningRowCount)
print("Adding removed and end count: ", endRows - runningRowCount)

In [None]:
# Write the formatted table to a timestamped workbook in the scratch folder
formattedPath = r'C:\Users\Sarah\Documents\GEM\code\GGPT-US-EIA\scratch\eiaFormatted_' + stamp + '.xlsx'
eiaFormatted.to_excel(formattedPath, index=False)

In [None]:
# TODO: pygsheets
ggpt = pd.read_excel(r'C:\Users\Sarah\Documents\GEM\code\GGPT-Compile\2021_10_06_compiled\Global Gas Plant Tracker (GGPT) completed 2021-10-06.xlsx', sheet_name='GGPT - Gas Units',
 keep_default_na=False)
# Keep US units only. Take an explicit copy: later cells add a column to this frame and drop
# columns from it in place, which raises SettingWithCopyWarning on a filtered slice.
ggpt = ggpt.loc[ggpt['Country'] == 'United States'].copy()

In [None]:
def _stripEiaWrapper(ids):
    """Remove the literal "{'EIA': '" prefix and "'}" suffix from a string column."""
    # regex=False: both pieces are literal text, not patterns
    return ids.str.replace("{'EIA': '", '', regex=False).str.replace("'}", '', regex=False)


eiaQC = eiaFormatted.copy(deep=True)
eiaQC['labelCheck'] = eiaQC['Plant ID'].apply(str) + " " + eiaQC['Generator ID'] # if same as GGPT field drops in compare function, hard to find plants
eiaDuplicates = eiaQC.loc[eiaQC.duplicated(subset='labelCheck')]
if len(eiaDuplicates.index) > 0:
    print('EIA ID ERROR')
    # A bare expression inside an `if` block is never displayed, so print the offending rows
    print(eiaDuplicates)

# GGPT
# {'EIA': '56249'}	{'EIA': '5'}
gl1 = _stripEiaWrapper(ggpt['Other IDs (location)'])

gl2 = _stripEiaWrapper(ggpt['Other IDs (unit)'])
# Sort multi-unit ID lists so they compare equal to the sorted EIA generator ID strings
gl2c = gl2.apply(lambda x: ', '.join(sorted(x.split(', '))))

ggpt['labelCheck'] = gl1 + " " + gl2c
ggptDuplicates = ggpt.loc[ggpt.duplicated(subset='labelCheck')]
if len(ggptDuplicates.index) > 0:
    print('GGPT ID ERROR')
    print(ggptDuplicates)

### Column names to match GGPT

In [None]:
# drop columns to compare to EIA
# Plant name has slight variations and states are in a different format, so have to drop for now as well
refCols = [name for name in list(ggpt.columns) if '[ref]' in name]
notCompared = [
    'Fuel', 'Location accuracy', 'Major area (prefecture, district)',
    'Sponsor', 'Parent', 'Sponsor LEI', 'Parent LEI', 'Parent HQ country',
    'CHP', 'Last Updated', 'Country', 'GEM location ID', 'GEM unit ID', 'WEPP location ID', 'WEPP unit ID',
    'Plant name (local script)', 'Other plant names', 'Wiki URL', 'Other IDs (location)', 'Other IDs (unit)', 'Region', 'City',
    'Plant name', 'Subnational unit (province, state)',
]
ggpt = ggpt.drop(columns=refCols).drop(columns=notCompared).reset_index(drop=True)

In [None]:

# EIA column name -> GGPT column name
repCols = {
    'Unit Code': 'Unit name',
    'Nameplate Capacity (MW)': 'Capacity elec. (MW)',
    'Prime Mover Code': 'Technology',
    'Operating Year': 'Start year',
    'Planned Retirement Year': 'Planned retire',
    'County': 'Local area (taluk, county)',
}

# Drop the identifier / name columns that are not compared, renumber rows, then rename
notComparedEia = ['Plant ID', 'Generator ID', 'Energy Source Code', 'Other IDs (location)', 'Other IDs (unit)',
                  'Plant State', 'Plant Name']
eiaQC = eiaQC.drop(columns=notComparedEia).reset_index(drop=True).rename(columns=repCols)

# Added as an empty column
eiaQC['Retired year'] = ''


In [None]:
# Put both tables' columns in alphabetical order
eiaQC, ggpt = (frame.reindex(columns=sorted(frame.columns)) for frame in (eiaQC, ggpt))

In [None]:
# Order both tables by the shared label and renumber the rows
eiaQC = eiaQC.sort_values(by='labelCheck', ignore_index=True)
ggpt = ggpt.sort_values(by='labelCheck', ignore_index=True)

In [None]:
# eiaQC.merge(ggpt, on=list(eiaQC['labelCheck']), how='outer', indicator=True)
# Sets give constant-time membership tests (the list-in-list version was quadratic)
ggptLabels = set(ggpt['labelCheck'])
eiaLabels = set(eiaQC['labelCheck'])

# EIA units with no counterpart in GGPT
missingInGGPT = [x for x in list(eiaQC['labelCheck']) if x not in ggptLabels]
print(len(missingInGGPT))
print(len(list(eiaQC['labelCheck'])))

eiaQC.loc[eiaQC['labelCheck'].isin(missingInGGPT)].to_excel(r'C:\Users\Sarah\Documents\GEM\code\GGPT-US-EIA\scratch\eiaMissingFromGGPT_' + stamp + '.xlsx', index=False)

# GGPT units with no counterpart in EIA.
# The condition must be `not in`: with `in` this list held the units present in BOTH tables,
# so the "missing from EIA" export contained exactly the wrong rows.
missingInEIA = [x for x in list(ggpt['labelCheck']) if x not in eiaLabels]
ggpt.loc[ggpt['labelCheck'].isin(missingInEIA)].to_excel(r'C:\Users\Sarah\Documents\GEM\code\GGPT-US-EIA\scratch\ggptMissingFromEIA_' + stamp + '.xlsx', index=False)


In [None]:
# For shared IDs, compare
# EIA rows whose label also exists in GGPT, indexed by that label (label column kept)
sharedWithGGPT = eiaQC['labelCheck'].isin(ggpt['labelCheck'])
eiaQC2 = eiaQC.loc[sharedWithGGPT].copy().set_index(keys='labelCheck', drop=False)

# GGPT rows whose label exists in that EIA subset, indexed the same way
sharedWithEIA = ggpt['labelCheck'].isin(eiaQC2['labelCheck'])
ggpt2 = ggpt.loc[sharedWithEIA].copy().set_index(keys='labelCheck', drop=False)

# QC: row counts of the two subsets
for sharedFrame in (eiaQC2, ggpt2):
    print(len(sharedFrame.index))

In [None]:
# Spot check: the same unit (label '10 1') as it appears in each table
for sharedFrame in (eiaQC2, ggpt2):
    print(sharedFrame.loc['10 1'])

In [None]:
# Coerce all Year values into strings or else dtype is compared and returns False on int/str matches
yearAsText = {'Start year': 'str', 'Retired year': 'str', 'Planned retire': 'str'}
eiaQC2 = eiaQC2.astype(yearAsText)
ggpt2 = ggpt2.astype(yearAsText)

In [None]:
# Cell-by-cell differences between the EIA and GGPT versions of the shared units
comparison = eiaQC2.compare(other=ggpt2, keep_shape=False)

# Flatten the two-level (field, side) header into single names, relabelling 'self' as 'eia'
multiCol = list(comparison.columns)
t2 = [field + side.replace('self', 'eia') for field, side in multiCol]

c2 = comparison.droplevel(level=[0], axis=1)
c2.columns = t2
c2 = c2.sort_values(by=t2, axis=0)

comparedPath = r'C:\Users\Sarah\Documents\GEM\code\GGPT-US-EIA\scratch\eiaComparedShared_' + stamp + '.xlsx'
c2.to_excel(comparedPath)

In [None]:
# Locate any new unit IDs

# Check capacity for any new units



## EIA Data Join

### Add in ownership information from separate EIA table

### Join CHP

In [None]:
# Check for missing dual-fuel plants in EIA-860
