<h1 style="color:#27469c"> Preprocessing of MRB data </h1>

In [None]:
# third-party imports
import pandas as pd
import matplotlib.pyplot as pyplot
# presumably a project-local helper module; the cells below call its remove_missing_data,
# create_efficiency_col, create_season_col and remove_outliers_std functions
import preprocessor

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation warnings - consider narrowing the filter
warnings.filterwarnings('ignore')

# base directory that every data path below is built from
rootpath = ".."

In [None]:
# Rename the raw trend columns of each MRB cooling tower file to the
# "MRB_Tower_<i> <variable>" convention and write the result back to the same CSV
# (the source file is overwritten).
for i in range(1, 5):
    filename = f'{rootpath}/data/mrb/mrb_cooling_tower_{i}.csv'
    mrb_tower_df = pd.read_csv(filename, index_col='time')

    # a fresh mapping is built for every tower, so nothing carries over between iterations
    renaming = {
        f'Cooling_Tower_{i}_Cell_1Fan VFD %': f'MRB_Tower_{i} vfdPercent',
        f'Cooling_Tower_{i}_Cell_1Fan power': f'MRB_Tower_{i} fanA_vfdPower',
        f'Cooling_Tower_{i}_Cell_1Fan status': f'MRB_Tower_{i} fanA_fanStatus',
        'Entering Water Temp': f'MRB_Tower_{i} enteringWaterTemp',
        'Leaving Water Temp': f'MRB_Tower_{i} leavingWaterTemp',
        'Outdoor Air Wetbulb': f'MRB_Tower_{i} outdoorAirWetBulb',
    }

    # rename ignores keys that are not present, so re-running this cell on an
    # already-renamed file leaves the columns unchanged
    mrb_tower_df = mrb_tower_df.rename(columns=renaming)
    mrb_tower_df.to_csv(filename)

Cooling towers 2-4 have identical variables but cooling tower 1 has data on an additional variable (CW_Pump_1 pumpStatus). This variable isn't needed so we'll remove it and have 4 datasets with identical variables.

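Moreover, MRB cooling towers do not have outdoorAirDryBulb data, so I have taken this data from MRB's Air Handling Unit 1. Note that the code below reads outdoorAirDryBulb (together with outdoorAirHumidity) from the ESB cooling tower 1 file, `data/esb/esb_cooling_tower_1.csv`.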

<h2 style="color:#27469c">Add outdoor air dry bulb and humidity information from ESB trend in MRB</h2>

In [None]:
# Outdoor-air humidity and dry-bulb readings, read from the ESB cooling tower 1 file and
# stripped of their "ESB_Tower_1" prefix so they can be merged into each MRB tower frame.
ext_columns = {
    "ESB_Tower_1 outdoorAirHumidity": "outdoorAirHumidity",
    "ESB_Tower_1 outdoorAirDryBulb": "outdoorAirDryBulb",
}

ext_data = pd.read_csv(f'{rootpath}/data/esb/esb_cooling_tower_1.csv', index_col="time")[list(ext_columns)]
# utc takes a boolean; the string 'True' only worked because any non-empty string is truthy
ext_data.index = pd.to_datetime(ext_data.index, utc=True)

ext_data = ext_data.rename(columns=ext_columns)

<h2 style="color:#27469c"> Preprocessing of MRB III Cooling Tower 1 data </h2>

In [None]:
# load cooling tower 1 data, indexed by its "time" column
df = pd.read_csv(f'{rootpath}/data/mrb/mrb_cooling_tower_1.csv', index_col="time")
# utc takes a boolean; the string 'True' only worked because any non-empty string is truthy
df.index = pd.to_datetime(df.index, utc=True)
# row count before any merging or filtering, used for the log output below
initial_data_size = df.shape[0]

In [None]:
# add dry bulb and humidity data; merge defaults to an inner join, so only timestamps
# present in both frames are kept
# (df is reassigned from itself here: re-run the load cell before re-running this one)
df = df.merge(ext_data, on="time")
df = df.rename(columns={
    "outdoorAirHumidity": "MRB_Tower_1 outdoorAirHumidity",
    "outdoorAirDryBulb": "MRB_Tower_1 outdoorAirDryBulb",
})

# remove missing data
df = preprocessor.remove_missing_data(datadf=df, initial_data_size=initial_data_size)
print(f'After missing data removal, we are left with {df.shape[0]} rows out of {initial_data_size}.')

# remove columns that are not needed downstream so they do not interfere with outlier removal
# NOTE(review): this runs after remove_missing_data, so both columns are still present
# during missing-data removal - confirm that is intended, or move this drop above it
df = df.drop(columns=['Efficiency', 'CW_Pump_1 pumpStatus'])

# add efficiency, season, dayOfWeek and hourOfDay columns
# (the preprocessor helpers are called for their effect on df; their return values are not used)
preprocessor.create_efficiency_col(
    datadf=df,
    enteringWaterTemp="MRB_Tower_1 enteringWaterTemp",
    leavingWaterTemp='MRB_Tower_1 leavingWaterTemp',
    outdoorAirWetBulb='MRB_Tower_1 outdoorAirWetBulb',
    efficiency_col_name='MRB_Tower_1 efficiency',
)
preprocessor.create_season_col(datadf=df, season_col_name="MRB_Tower_1 season")
df["MRB_Tower_1 dayOfWeek"] = df.index.weekday
df['MRB_Tower_1 hourOfDay'] = df.index.hour

# outliers
# NOTE(review): the renaming cell at the top of the notebook produces
# "MRB_Tower_1 fanA_fanStatus"; confirm that "MRB_Tower_1 fanStatus" exists in the CSV
df_on_condition = (df["MRB_Tower_1 fanStatus"] == True)
df = preprocessor.remove_outliers_std(df, has_off_data=True, on_condition=df_on_condition, threshold=5, verbose=True)

# to streamline the vfdPower columns, since ESB has 2 and MRB has 1 - duplicate the
# MRB fan A power column as fan B
df['MRB_Tower_1 fanB_vfdPower'] = df['MRB_Tower_1 fanA_vfdPower']

# save preprocessed tower 1 data, with columns in alphabetical order
df.sort_index(axis=1).to_csv(f'{rootpath}/data/mrb/mrb_tower_1_preprocessed.csv')

<h2 style="color:#27469c"> Preprocessing of MRB III Cooling Towers 2-4 data </h2>

In [None]:
# Run the same preprocessing pipeline for cooling towers 2, 3 and 4 and write one
# preprocessed CSV per tower.
for i in range(2, 5):
    # column-name prefix shared by every variable of this tower
    tower = f"MRB_Tower_{i}"

    # load cooling tower data, indexed by its "time" column
    df = pd.read_csv(f'{rootpath}/data/mrb/mrb_cooling_tower_{i}.csv', index_col="time")
    # utc takes a boolean; the string 'True' only worked because any non-empty string is truthy
    df.index = pd.to_datetime(df.index, utc=True)
    # row count before any merging or filtering, used for the log output below
    initial_data_size = df.shape[0]

    print(f'\nPreprocessing logs for cooling tower {i}:\n')

    # add dry bulb and humidity data; merge defaults to an inner join, so only
    # timestamps present in both frames are kept
    df = df.merge(ext_data, on="time")
    df = df.rename(columns={
        "outdoorAirHumidity": f"{tower} outdoorAirHumidity",
        "outdoorAirDryBulb": f"{tower} outdoorAirDryBulb",
    })

    # remove missing data
    df = preprocessor.remove_missing_data(datadf=df, initial_data_size=initial_data_size)
    print(f'After missing data removal, we are left with {df.shape[0]} rows out of {initial_data_size}.')

    # remove the column that is not needed downstream so it does not interfere with outlier removal
    # NOTE(review): this runs after remove_missing_data, so the column is still present
    # during missing-data removal - confirm that is intended, or move this drop above it
    df = df.drop(columns=['Efficiency'])

    # add efficiency, season, dayOfWeek and hourOfDay columns
    # (the preprocessor helpers are called for their effect on df; their return values are not used)
    preprocessor.create_efficiency_col(
        datadf=df,
        enteringWaterTemp=f"{tower} enteringWaterTemp",
        leavingWaterTemp=f'{tower} leavingWaterTemp',
        outdoorAirWetBulb=f'{tower} outdoorAirWetBulb',
        efficiency_col_name=f'{tower} efficiency',
    )
    preprocessor.create_season_col(datadf=df, season_col_name=f"{tower} season")
    df[f"{tower} dayOfWeek"] = df.index.weekday
    df[f'{tower} hourOfDay'] = df.index.hour

    # outliers
    # NOTE(review): the renaming cell at the top of the notebook produces
    # "MRB_Tower_<i> fanA_fanStatus"; confirm that "MRB_Tower_<i> fanStatus" exists in the CSV
    df_on_condition = (df[f"{tower} fanStatus"] == True)
    df = preprocessor.remove_outliers_std(df, has_off_data=True, on_condition=df_on_condition, threshold=5, verbose=True)

    # to streamline the vfdPower columns, since ESB has 2 and MRB has 1 - duplicate the
    # MRB fan A power column as fan B
    df[f'{tower} fanB_vfdPower'] = df[f'{tower} fanA_vfdPower']

    # save preprocessed data for tower i, with columns in alphabetical order
    df.sort_index(axis=1).to_csv(f'{rootpath}/data/mrb/mrb_tower_{i}_preprocessed.csv')