# Preprocessing of Kissam Cooling Towers 1 and 2 data

In [1]:
import pandas as pd
import matplotlib.pyplot as pyplot
from preprocessor import remove_outliers_std

import warnings
warnings.filterwarnings('ignore')

datapath = "../data"

In [2]:
# load cooling towers data
df = pd.read_csv(f'{datapath}/kissam/kissam_cooling_towers.csv', index_col="time")
df.index = pd.to_datetime(df.index, utc='True')

In [3]:
df.columns

Index(['CT_1.TempCondIn', 'generated', 'CT_1.TempCondOut', 'generated.1',
       'TempAmbient', 'generated.2', 'TempWetBulb', 'generated.3',
       'Setpoint_Existing', 'generated.4', 'CT_1.PerFreqFan', 'generated.5',
       'CT_1.PowFan', 'generated.6', 'CT_1.StatusFan', 'generated.7',
       'CT_2.TempCondIn', 'generated.8', 'CT_2.TempCondOut', 'generated.9',
       'CT_2.PerFreqFan', 'generated.10', 'CT_2.PowFan', 'generated.11',
       'CT_2.StatusFan', 'generated.12', 'CH_1.PowChi', 'generated.13',
       'generated.14', 'CT_1.FlowCond', 'CT_1.PerFreqConP', 'generated.15',
       'CT_1.PowConP', 'generated.16', 'generated.17', 'CT_1.PressDiffCond',
       'CH_2.PowChi', 'generated.18', 'generated.19', 'CT_2.FlowCond',
       'CT_2.PerFreqConP', 'generated.20', 'CT_2.PowConP', 'generated.21',
       'generated.22', 'CT_2.PressDiffCond', 'Setpoint_Python', 'generated.23',
       'CH_2.Tonnage', 'generated.24', 'CH_1.Tonnage', 'generated.25'],
      dtype='object')

### Remove rows of data in which both cooling towers were off

In [4]:
# Number of rows in which neither cooling towers was on - 2078
# Remove these rows
df = df[df['CT_1.StatusFan'] | df['CT_2.StatusFan']]

This leaves us with 6723 rows out of 8807.

### Missing data

In [5]:
# remove all columns that are completely empty
df.dropna(axis=1, how="all", inplace=True)

# check how many rows in each column are empty
df.isna().sum()

CT_1.TempCondIn        948
generated            72402
CT_1.TempCondOut       948
generated.1          72402
TempAmbient            948
generated.2          72402
TempWetBulb            948
generated.3          72402
Setpoint_Existing      948
generated.4          72402
CT_1.PerFreqFan        948
generated.5          72402
CT_1.PowFan           1120
generated.6          72402
CT_1.StatusFan           0
generated.7          72402
CT_2.TempCondIn        948
generated.8          72402
CT_2.TempCondOut       948
generated.9          72402
CT_2.PerFreqFan        948
generated.10         72402
CT_2.PowFan           1120
generated.11         72402
CT_2.StatusFan           0
generated.12         72402
CH_1.PowChi            948
generated.13         73044
generated.14         72402
CT_1.PerFreqConP       948
generated.15         72402
CT_1.PowConP          1120
generated.16         72402
generated.17         72402
CH_2.PowChi            948
generated.18         73044
generated.19         72402
C

In [6]:
df.drop(['CH_1.Tonnage', 'CH_2.Tonnage'], axis=1, inplace=True)
df.isna().sum()

CT_1.TempCondIn        948
generated            72402
CT_1.TempCondOut       948
generated.1          72402
TempAmbient            948
generated.2          72402
TempWetBulb            948
generated.3          72402
Setpoint_Existing      948
generated.4          72402
CT_1.PerFreqFan        948
generated.5          72402
CT_1.PowFan           1120
generated.6          72402
CT_1.StatusFan           0
generated.7          72402
CT_2.TempCondIn        948
generated.8          72402
CT_2.TempCondOut       948
generated.9          72402
CT_2.PerFreqFan        948
generated.10         72402
CT_2.PowFan           1120
generated.11         72402
CT_2.StatusFan           0
generated.12         72402
CH_1.PowChi            948
generated.13         73044
generated.14         72402
CT_1.PerFreqConP       948
generated.15         72402
CT_1.PowConP          1120
generated.16         72402
generated.17         72402
CH_2.PowChi            948
generated.18         73044
generated.19         72402
C

In [7]:
# only 2 rows are empty so drop them
df = df.dropna()

This leaves us with 6107 rows out of 8807.

### Outliers

In [8]:
df = remove_outliers_std(df)

TypeError: remove_outliers_std() missing 1 required positional argument: 'has_off_data'

This leaves us with 5826 rows out of 8807.

In [None]:
# save as combined towers data
df.to_csv(f'{datapath}/kissam/kissam_preprocessed.csv')