<h1 style="color:#27469c">Preprocessing of Kissam Cooling Towers 1 and 2 data</h1>

In [1]:
import pandas as pd
import matplotlib.pyplot as pyplot
import preprocessor

import warnings
warnings.filterwarnings('ignore')

# Root directory for the input CSVs and the preprocessed outputs written by this notebook.
# It is a relative path, so it resolves against the notebook's working directory.
datapath = "../data"

In [2]:
# Load the raw Kissam cooling-tower export; the "time" column becomes the index.
df = pd.read_csv(f'{datapath}/kissam/kissam_cooling_towers.csv', index_col="time")

# Parse the index into timezone-aware UTC timestamps.
# `utc` has to be the boolean True: the string 'True' only worked by accident,
# because any non-empty string (including 'False') is truthy.
df.index = pd.to_datetime(df.index, utc=True)

# Row count before any cleaning; handed to the missing-data step as the reference size.
initial_data_size = df.shape[0]

<h2 style="color:#27469c">Rename columns and separate the 2 cooling towers' data</h2>

In [3]:
# Raw point-name templates -> suffix used in the renamed "Kissam_Tower_<i> <suffix>" columns.
# "{i}" is replaced by the tower / chiller number.
tower_ids = [1, 2]
point_templates = {
    "CT_{i}.TempCondIn": "enteringWaterTemp",
    "CT_{i}.TempCondOut": "leavingWaterTemp",
    "CT_{i}.PerFreqFan": "vfdPercent",
    "CT_{i}.PowFan": "vfdPower",
    "CT_{i}.StatusFan": "fanStatus",
    "CT_{i}.FlowCond": "flowCond",
    "CT_{i}.PerFreqConP": "perFreqConP",
    "CT_{i}.PowConP": "powConP",
    "CT_{i}.PressDiffCond": "pressDiffCond",
    "CH_{i}.PowChi": "CH_PowChi",
    "CH_{i}.Tonnage": "CH_Tonnage",
}

# Full rename map: one entry per tower per point, plus the shared weather columns.
renaming = {
    raw_name.format(i=tower_id): f"Kissam_Tower_{tower_id} {suffix}"
    for tower_id in tower_ids
    for raw_name, suffix in point_templates.items()
}
renaming["TempWetBulb"] = "outdoorAirWetBulb"
renaming["TempAmbient"] = "outdoorAirDryBulb"

# Reassign instead of renaming in place so re-running the cell stays predictable.
df = df.rename(columns=renaming)

# remove generated columns
df = df[[col for col in df.columns if "generated" not in col]]

# Split into one frame per tower. A column belongs to the *other* tower when it
# carries that tower's renamed prefix or one of its raw CT_/CH_ prefixes.
# Matching on explicit markers (rather than the bare digits "1"/"2") keeps shared
# columns whose names merely contain a digit from being dropped by accident.
# NOTE(review): assumes every tower-specific column uses one of these three
# prefixes - confirm against the raw CSV header.
tower_frames = {}
for tower_id in tower_ids:
    other_tower_markers = [
        marker
        for other_id in tower_ids
        if other_id != tower_id
        for marker in (f"Kissam_Tower_{other_id} ", f"CT_{other_id}.", f"CH_{other_id}.")
    ]
    kept_columns = [
        col for col in df.columns
        if not any(marker in col for marker in other_tower_markers)
    ]
    # .copy() makes each tower frame independent of `df`, so later column
    # assignments do not operate on a slice of the shared frame.
    tower_frames[tower_id] = df[kept_columns].copy()

df1 = tower_frames[1]  # cooling tower 1 data
df2 = tower_frames[2]  # cooling tower 2 data

<h2 style="color:#27469c">Cooling Tower 1</h2>

In [4]:
# ---- Tower 1 preprocessing ----
print("Tower 1:")

# Missing-data handling is delegated to the preprocessor module; the raw row
# count is passed along as the reference size.
df1 = preprocessor.remove_missing_data(
    datadf=df1,
    initial_data_size=initial_data_size,
)

# Outlier removal: rows where the fan status equals True are flagged as the
# "on" condition and passed to the std-based outlier filter.
df1_on_condition = df1["Kissam_Tower_1 fanStatus"].eq(True)
df1 = preprocessor.remove_outliers_std(
    df1,
    has_off_data=True,
    on_condition=df1_on_condition,
    threshold=5,
)

# Derived columns: efficiency and season (return values are not used here, so
# these helpers presumably add the columns to df1 in place - verify in preprocessor).
preprocessor.create_efficiency_col(
    datadf=df1,
    enteringWaterTemp="Kissam_Tower_1 enteringWaterTemp",
    leavingWaterTemp='Kissam_Tower_1 leavingWaterTemp',
    outdoorAirWetBulb='outdoorAirWetBulb',
    efficiency_col_name='Kissam_Tower_1 efficiency',
)
preprocessor.create_season_col(
    datadf=df1,
    season_col_name="Kissam_Tower_1 season",
)

# ESB has two vfdPower columns while Kissam has one, so the single Kissam column
# is duplicated into "a" and "b" variants and the original is dropped.
df1['Kissam_Tower_1a vfdPower'] = df1['Kissam_Tower_1 vfdPower']
df1['Kissam_Tower_1b vfdPower'] = df1['Kissam_Tower_1 vfdPower']
df1 = df1.drop(columns=['Kissam_Tower_1 vfdPower'])

# Persist the preprocessed tower 1 data.
df1.to_csv(f'{datapath}/kissam/kissam_tower_1_preprocessed.csv')

Tower 1:
After missing data removal, we are left with us with 102524 rows out of 104976.
Kissam_Tower_1 enteringWaterTemp has 0 outliers
Kissam_Tower_1 leavingWaterTemp has 0 outliers
outdoorAirDryBulb has 66 outliers
outdoorAirWetBulb has 17 outliers
Setpoint_Existing has 0 outliers
Kissam_Tower_1 vfdPercent has 0 outliers
Kissam_Tower_1 vfdPower has 1 outliers
Kissam_Tower_1 CH_PowChi has 1 outliers
Kissam_Tower_1 perFreqConP has 0 outliers
Kissam_Tower_1 powConP has 0 outliers
Setpoint_Python has 0 outliers
Kissam_Tower_1 CH_Tonnage has 0 outliers
Outier removal removed 85 rows (0.0829074167999688% of data) with outliers. Now left with 102439 rows.


<h2 style="color:#27469c">Replicate for Cooling Tower 2</h2>

In [5]:
# ---- Tower 2 preprocessing (same steps as tower 1) ----
print("Tower 2:")

# Missing-data handling is delegated to the preprocessor module; the raw row
# count is passed along as the reference size.
df2 = preprocessor.remove_missing_data(
    datadf=df2,
    initial_data_size=initial_data_size,
)

# Outlier removal: rows where the fan status equals True are flagged as the
# "on" condition and passed to the std-based outlier filter.
df2_on_condition = df2["Kissam_Tower_2 fanStatus"].eq(True)
df2 = preprocessor.remove_outliers_std(
    df2,
    has_off_data=True,
    on_condition=df2_on_condition,
    threshold=5,
)

# Derived columns: efficiency and season (return values are not used here, so
# these helpers presumably add the columns to df2 in place - verify in preprocessor).
preprocessor.create_efficiency_col(
    datadf=df2,
    enteringWaterTemp="Kissam_Tower_2 enteringWaterTemp",
    leavingWaterTemp='Kissam_Tower_2 leavingWaterTemp',
    outdoorAirWetBulb='outdoorAirWetBulb',
    efficiency_col_name='Kissam_Tower_2 efficiency',
)
preprocessor.create_season_col(
    datadf=df2,
    season_col_name="Kissam_Tower_2 season",
)

# ESB has two vfdPower columns while Kissam has one, so the single Kissam column
# is duplicated into "a" and "b" variants and the original is dropped.
df2['Kissam_Tower_2a vfdPower'] = df2['Kissam_Tower_2 vfdPower']
df2['Kissam_Tower_2b vfdPower'] = df2['Kissam_Tower_2 vfdPower']
df2 = df2.drop(columns=['Kissam_Tower_2 vfdPower'])

# Persist the preprocessed tower 2 data.
df2.to_csv(f'{datapath}/kissam/kissam_tower_2_preprocessed.csv')

Tower 2:
After missing data removal, we are left with us with 102525 rows out of 104976.
outdoorAirDryBulb has 66 outliers
outdoorAirWetBulb has 17 outliers
Setpoint_Existing has 0 outliers
Kissam_Tower_2 enteringWaterTemp has 0 outliers
Kissam_Tower_2 leavingWaterTemp has 0 outliers
Kissam_Tower_2 vfdPercent has 0 outliers
Kissam_Tower_2 vfdPower has 1 outliers
Kissam_Tower_2 CH_PowChi has 0 outliers
Kissam_Tower_2 perFreqConP has 0 outliers
Kissam_Tower_2 powConP has 9 outliers
Setpoint_Python has 0 outliers
Kissam_Tower_2 CH_Tonnage has 0 outliers
Outier removal removed 93 rows (0.09070958302852962% of data) with outliers. Now left with 102432 rows.
