In [2]:
import numpy as np
import os
import pandas as pd
import sys
import warnings
warnings.simplefilter('ignore')

pkg_dir = '/home/mrossol/NaTGenPD'
#pkg_dir = '..'
sys.path.append(pkg_dir)
import NaTGenPD as npd
import NaTGenPD.cluster as cluster
from NaTGenPD.analysis import ProcedureAnalysis, QuartileAnalysis


data_dir = '/scratch/mrossol/CEMS'
#data_dir = '/Users/mrossol/Downloads/CEMS'
out_dir = os.path.join(data_dir, 'analysis')
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    
logger = npd.setup_logger('NaTGenPD.analysis', log_level='INFO')

# Procedure Stats

In [None]:
fits_dir = os.path.join(data_dir, 'Final_Fits')
raw_paths = [os.path.join(data_dir, '{y}/SMOKE_{y}.h5'.format(y=y))
            for y in (2016, 2017)]
clean_path = os.path.join(data_dir, 'SMOKE_Clean_2016-2017.h5')
filter_path = os.path.join(data_dir, 'SMOKE_Filtered_2016-2017.h5')
process_dir = os.path.join(out_dir, 'process')
if not os.path.exists(process_dir):
    os.makedirs(process_dir)

out_path = os.path.join(process_dir, 'process_stats_2016-2017.csv')

ProcedureAnalysis.stats(fits_dir, raw_paths, clean_path, filter_path, out_path)

INFO - 2019-06-27 09:56:51,934 [analysis.py:371] : Extracting stats for Boiler (Coal)
INFO - 2019-06-27 10:17:30,703 [analysis.py:371] : Extracting stats for Boiler (NG)
INFO - 2019-06-27 10:21:03,862 [analysis.py:371] : Extracting stats for Boiler (Oil)
INFO - 2019-06-27 10:21:12,367 [analysis.py:371] : Extracting stats for Boiler (Other Solid Fuel)
INFO - 2019-06-27 10:21:17,259 [analysis.py:371] : Extracting stats for CC (Coal)
INFO - 2019-06-27 10:21:18,571 [analysis.py:371] : Extracting stats for CC (NG)
INFO - 2019-06-27 10:42:47,005 [analysis.py:371] : Extracting stats for CC (Oil)
INFO - 2019-06-27 10:42:48,422 [analysis.py:371] : Extracting stats for CT (NG)


In [3]:
process_dir = os.path.join(out_dir, 'process')
path = os.path.join(process_dir, 'process_stats_2016-2017.csv')
stats_df = pd.read_csv(path, index_col=0)
stats_df

Unnamed: 0,raw_units,raw_cf,raw_gen,total_points,non_zero_points,clean_units,clean_points,clean_cf,clean_gen,filtered_units,filtered_points,filtered_cf,filtered_gen,final_units,final_cf,final_gen,final_points
Boiler (Coal),843,270562,2427729410,13839192,7360242,679,6500148,265967,2059205040,669,6422929,261257,2049211553,639,258400,2030091027,6207362
Boiler (NG),457,67891,125499620,6816528,1117520,252,837142,61006,95670784,228,814892,56718,94366927,216,54739,90772489,781413
Boiler (Oil),68,15754,14850968,1074696,75997,35,49020,13879,11993545,27,46664,9915,11612754,27,9915,11612754,46664
Boiler (Other Solid Fuel),10,379,2877966,166656,51066,5,34902,378,1990083,5,34256,376,1965544,5,376,1965544,34256
CC (Coal),2,444,5740836,35088,28979,1,15456,445,5629717,1,15373,445,5614312,1,445,5614312,15373
CC (NG),1027,233018,1976581560,17824728,10524380,553,5731883,224694,1811103888,548,5674954,222206,1798917049,400,184824,1496937481,4223533
CC (Oil),6,605,2611003,105264,31079,2,10687,568,2484323,2,10478,508,2450424,2,508,2450424,10478
CT (NG),1555,117572,141850971,26209152,2365899,1107,1480535,94560,110782882,954,1378315,80173,104843975,910,76520,93365558,1286664
CT (Oil),438,20599,10964829,6356736,178314,150,54580,9113,3419799,57,46182,3909,2966651,57,3909,2966651,46182
Cement Kiln (Coal),2,0,0,13152,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
plant_types = ['Boiler (Coal)', 'Boiler (NG)', 'Boiler (Oil)',
               'Boiler (Other Solid Fuel)', 'CT (NG)', 'CT (Oil)', 'CC (NG)', 'CC (Oil)']
table_1 = stats_df.loc[plant_types].copy()
# Combine Boilers and Stokers
table_1.loc['Boiler (Coal)'] += stats_df.loc['Stoker (Coal)']
table_1.loc['Boiler (NG)'] += stats_df.loc['Stoker (NG)']
table_1.loc['Boiler (Other Solid Fuel)'] += stats_df.loc['Stoker (Other Solid Fuel)']

table_1['raw_cf (GW)'] = table_1['raw_cf'] / 1000
table_1['raw_gen (TWh)'] = table_1['raw_gen'] / 1000000

# Compute units/cf removed
table_1['step_1_cf_removed (GW)'] = (table_1['raw_cf'] - table_1['clean_cf']) / 1000
table_1['step_1_gen_removed (TWh)'] = (table_1['raw_gen'] - table_1['clean_gen']) / 1000000
table_1['step_3_cf_removed (GW)'] = (table_1['clean_cf'] - table_1['final_cf']) / 1000
table_1['step_3_gen_removed (TWh)'] = (table_1['clean_gen'] - table_1['final_gen']) / 1000000

table_1['final_cf (GW)'] = table_1['final_cf'] / 1000
table_1['final_gen (TWh)'] = table_1['final_gen'] / 1000000

drop_cols = ['total_points', 'non_zero_points', 'raw_cf', 'raw_gen',
             'clean_units', 'clean_cf', 'clean_gen', 'filtered_units',
             'filtered_cf', 'filtered_gen', 
             'final_cf', 'final_gen', 'final_points']
table_1 = table_1.drop(columns=drop_cols)
table_1.at['Total'] = table_1.sum()
cols = ['raw_cf (GW)', 'raw_gen (TWh)',
        'step_1_cf_removed (GW)', 'step_1_gen_removed (TWh)',
        'step_3_cf_removed (GW)', 'step_3_gen_removed (TWh)', 
        'final_cf (GW)', 'final_gen (TWh)']

out_path = os.path.join(out_dir, 'process/Table_1.csv')
table_1[cols].to_csv(out_path)
table_1[cols]

Unnamed: 0,raw_cf (GW),raw_gen (TWh),step_1_cf_removed (GW),step_1_gen_removed (TWh),step_3_cf_removed (GW),step_3_gen_removed (TWh),final_cf (GW),final_gen (TWh)
Boiler (Coal),270.636,2427.837301,4.669,368.632261,7.567,29.114013,258.4,2030.091027
Boiler (NG),67.969,125.611944,6.963,29.94116,6.267,4.898295,54.739,90.772489
Boiler (Oil),15.754,14.850968,1.875,2.857423,3.964,0.380791,9.915,11.612754
Boiler (Other Solid Fuel),0.864,6.340704,0.149,1.384143,0.014,0.080102,0.701,4.876459
CT (NG),117.572,141.850971,23.012,31.068089,18.04,17.417324,76.52,93.365558
CT (Oil),20.599,10.964829,11.486,7.54503,5.204,0.453148,3.909,2.966651
CC (NG),233.018,1976.58156,8.324,165.477672,39.87,314.166407,184.824,1496.937481
CC (Oil),0.605,2.611003,0.037,0.12668,0.06,0.033899,0.508,2.450424
Total,727.017,4706.64928,56.515,607.032458,80.986,366.543979,589.516,3733.072843


In [4]:
plant_types = ['Boiler (Coal)', 'Boiler (NG)', 'Boiler (Oil)',
               'Boiler (Other Solid Fuel)', 'CT (NG)', 'CT (Oil)', 'CC (NG)', 'CC (Oil)']
table_2 = stats_df.loc[plant_types].copy()
# Combine Boilers and Stokers
table_2.loc['Boiler (Coal)'] += stats_df.loc['Stoker (Coal)']
table_2.loc['Boiler (NG)'] += stats_df.loc['Stoker (NG)']
table_2.loc['Boiler (Other Solid Fuel)'] += stats_df.loc['Stoker (Other Solid Fuel)']

table_2['raw_points'] = table_2['total_points'] / 1000000
table_2['final_points'] /= 1000000

# Compute units/cf removed
table_2['step_1_units_removed'] = table_2['raw_units'] - table_2['clean_units']
table_2['step_1_points_removed'] = (table_2['raw_points'] - table_2['clean_points']) / 1000000
table_2['step_3_units_removed'] = table_2['clean_units'] - table_2['final_units']
table_2['step_3_points_removed'] = (table_2['clean_points'] - table_2['final_points']) / 1000000

drop_cols = ['total_points', 'non_zero_points', 'raw_cf', 'raw_gen',
             'clean_units', 'clean_points', 'clean_cf', 'clean_gen',
             'filtered_units', 'filtered_points', 'filtered_cf', 'filtered_gen', 
             'final_cf', 'final_gen',]
table_2 = table_2.drop(columns=drop_cols)
table_2.at['Total'] = table_2.sum()
cols = ['raw_units', 'raw_points',
       'step_1_units_removed', 'step_1_points_removed',
       'step_3_units_removed', 'step_3_points_removed',
        'final_units', 'final_points']

out_path = os.path.join(out_dir, 'process/table_2.csv')
table_2[cols].to_csv(out_path)
table_2[cols]

Unnamed: 0,raw_units,raw_points,step_1_units_removed,step_1_points_removed,step_3_units_removed,step_3_points_removed,final_units,final_points
Boiler (Coal),884.0,14.532216,205.0,-6.500133,40.0,6.500142,639.0,6.207362
Boiler (NG),460.0,6.8604,208.0,-0.837135,36.0,0.837141,216.0,0.781413
Boiler (Oil),68.0,1.074696,33.0,-0.049019,8.0,0.04902,27.0,0.046664
Boiler (Other Solid Fuel),38.0,0.627192,25.0,-0.139332,0.0,0.139333,13.0,0.13604
CT (NG),1555.0,26.209152,448.0,-1.480509,197.0,1.480534,910.0,1.286664
CT (Oil),438.0,6.356736,288.0,-0.054574,93.0,0.05458,57.0,0.046182
CC (NG),1027.0,17.824728,474.0,-5.731865,153.0,5.731879,400.0,4.223533
CC (Oil),6.0,0.105264,4.0,-0.010687,0.0,0.010687,2.0,0.010478
Total,4476.0,73.590384,1685.0,-14.803254,527.0,14.803315,2264.0,12.738336


# Quartile Stats

In [32]:
fits_dir = os.path.join(data_dir, 'Final_Fits')
filter_path = os.path.join(data_dir, 'SMOKE_Filtered_2016-2017.h5')
quartile_dir = os.path.join(out_dir, 'final_fits')

analysis = QuartileAnalysis(fits_dir, filter_path)

In [44]:
group_type = 'CC (NG)'

group_fits = analysis._fits[group_type]
if "CC" in group_type:
    group_fits['unit_id'] = group_fits['unit_id'].str.split('-').str[0]
    group_fits = group_fits.groupby('unit_id').mean().reset_index()

pos = group_fits['a0'].isnull()
group_fits = group_fits.loc[~pos]

with npd.CEMS(analysis._filtered_path, mode='r') as f:
    filtered_df = f[group_type].df


pos = filtered_df['cluster'] >= 0
filtered_df = filtered_df.loc[pos]
pos = filtered_df['unit_id'].isin(group_fits['unit_id'].to_list())
filtered_df = filtered_df.loc[pos]


ave_hr = filtered_df.groupby('unit_id')['heat_rate'].mean()
ave_hr.name = 'ave_heat_rate'
filtered_df = pd.merge(filtered_df,
                       ave_hr.to_frame().reset_index(),
                       on='unit_id')

load_max = filtered_df.groupby('unit_id')['load'].max()
load_max.name = 'load_max'
filtered_df = pd.merge(filtered_df,
                       load_max.to_frame().reset_index(),
                       on='unit_id')

filtered_df['cf'] = (filtered_df['load']
                             / filtered_df['load_max'])

filtered_df[['unit_id', 'load', 'load_max', 'cf', 'ave_heat_rate']].head()

Unnamed: 0,unit_id,time,load,HTINPUT,heat_rate,latitude,longitude,state,EPA_region,NERC_region,unit_type,fuel_type,group_type,cts,cluster
3,10030_CC1,2016-01-03 16:00:00,50.076,444.200012,8.870517,39.146702,-75.546097,DE,3.0,,Combined cycle,Pipeline Natural Gas,CC (NG),1.0,0
4,10030_CC1,2016-01-03 17:00:00,45.261002,400.399994,8.846468,39.146702,-75.546097,DE,3.0,,Combined cycle,Pipeline Natural Gas,CC (NG),1.0,0
5,10030_CC1,2016-01-03 18:00:00,45.261002,401.5,8.870771,39.146702,-75.546097,DE,3.0,,Combined cycle,Pipeline Natural Gas,CC (NG),1.0,0
6,10030_CC1,2016-01-03 19:00:00,45.261002,401.5,8.870771,39.146702,-75.546097,DE,3.0,,Combined cycle,Pipeline Natural Gas,CC (NG),1.0,0
7,10030_CC1,2016-01-03 20:00:00,45.261002,401.700012,8.875191,39.146702,-75.546097,DE,3.0,,Combined cycle,Pipeline Natural Gas,CC (NG),1.0,0


Unnamed: 0,unit_id,load,load_max,cf,ave_heat_rate
0,10030_CC1,50.076,65.484001,0.764706,8.774563
1,10030_CC1,45.261002,65.484001,0.691176,8.774563
2,10030_CC1,45.261002,65.484001,0.691176,8.774563
3,10030_CC1,45.261002,65.484001,0.691176,8.774563
4,10030_CC1,45.261002,65.484001,0.691176,8.774563


In [None]:
fits_dir = os.path.join(data_dir, 'CEMS_Fits')
filter_path = os.path.join(data_dir, 'SMOKE_Filtered_2016-2017.h5')
quartile_dir = os.path.join(out_dir, 'filtered_fits')
if not os.path.exists(quartile_dir):
    os.makedirs(quartile_dir)

out_path = os.path.join(quartile_dir, 'filtered_quartile_stats.csv')

QuartileAnalysis.stats(fits_dir, filter_path, out_path)

In [None]:
quartile_df = pd.read_csv(out_path, index_col=0)
quartile_df

In [None]:
fits_dir = os.path.join(data_dir, 'Final_Fits')
filter_path = os.path.join(data_dir, 'SMOKE_Filtered_2016-2017.h5')
quartile_dir = os.path.join(out_dir, 'final_fits')
if not os.path.exists(quartile_dir):
    os.makedirs(quartile_dir)

out_path = os.path.join(quartile_dir, 'final_quartile_stats.csv')

QuartileAnalysis.stats(fits_dir, filter_path, out_path)

In [None]:
quartile_df = pd.read_csv(out_path, index_col=0)
quartile_df