# Overview

Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import os

# Convert Units

In [None]:
def convertRelevantToOnOffRemoveUncommonBAS(df_orig):
    """
    Convert pump signals to on/off flags, add the second-loop differential
    pressure and drop the columns that are not common BAS points.

    Input:
        df_orig            dataframe holding the 'GP*' pump columns, the two
                           'senRelPre.port_*.p' pressures and the
                           'ARU_*.dp1' / 'ARU_*.dp2' columns

    Returns:
        df                 modified copy of df_orig (df_orig is not changed)
    """
    df = df_orig.copy()

    # Convert speed to OnOff: above 0.05 becomes 1, everything else 0.
    # Both masks are taken before writing; NaN matches neither and stays NaN.
    for column in ('GP_1.y', 'GP2.y', 'GP_20.y', 'GP_5.y', 'GP6.y', 'GP_21.y'):
        is_on = df[column] > 0.05
        is_off = df[column] <= 0.05
        df.loc[is_on, column] = 1
        df.loc[is_off, column] = 0

    # GP4 is judged on 'GP4.N_actual' with a threshold of 1 instead of 0.05
    is_on = df['GP4.N_actual'] >= 1
    is_off = df['GP4.N_actual'] < 1
    df.loc[is_on, 'GP4.N_actual'] = 1
    df.loc[is_off, 'GP4.N_actual'] = 0

    # calculate the differential pressure of the second loop
    df['secloopdp'] = df['senRelPre.port_a.p'] - df['senRelPre.port_b.p']

    # Remove the raw pressures and the values identified as not being common
    # BAS points (the VOut*.V_flow columns are deliberately not dropped).
    # axis is passed by keyword: the positional form was removed in pandas 2.0.
    columns_to_drop = [
        'senRelPre.port_a.p',
        'senRelPre.port_b.p',
        'ARU_1.dp2',
        'ARU_1.dp1',
        'ARU_2.dp2',
        'ARU_2.dp1',
        'ARU_3.dp2',
        'ARU_3.dp1',
    ]
    df = df.drop(columns_to_drop, axis=1)

    return df

In [None]:
# Every entry of the cleaned-but-unconverted directory is an input file
input_p_files = os.listdir('../Data/MolecularFoundryPickleFiles_1-cleaned-but-units-unconverted')

# For the output directory: create it if needed. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs('../Data/MolecularFoundryPickleFiles_2-cleaned-units_converted', exist_ok=True)


In [None]:
# Convert each input pickle and store the result under the same file name
# in the units-converted directory.
for input_file in input_p_files:
    path = os.path.join('../Data/MolecularFoundryPickleFiles_1-cleaned-but-units-unconverted', input_file)
    new_path = os.path.join('../Data/MolecularFoundryPickleFiles_2-cleaned-units_converted', input_file)
    df = convertRelevantToOnOffRemoveUncommonBAS(pd.read_pickle(path))
    df.to_pickle(new_path)

# Create y labels

In [None]:
def labelWhenOn(pd_series):
    """
    Given a pandas series (which can be a column from a dataframe), create a
    label pandas series that is 1 where the input is ON (greater than 0) and
    0 everywhere else.

    Input:
        pd_series          pandas.Series of values to label

    Returns:
        labels             pandas.Series of 0/1 on the same index
    """
    is_on = pd_series > 0
    # Start from all zeros, then switch on the positions that are ON
    labels = pd.Series(0, index=pd_series.index)
    labels.loc[is_on] = 1
    return labels

In [None]:
# Maps each input file name to the column that labelWhenOn is applied to
# when creating the labels for that file.
filename_seriesname_dict = {
    'ARU1_max75min1.p': 'ARU_1.on',
    'ARU2_max75min1.p': 'ARU_2.on',
    'ARU3_max75min1.p': 'ARU_3.on',
    'coolingtower1_efficiency75percent.p': 'CT01.y',
    'coolingtower2_efficiency75percent.p': 'CT02.y',
    'GP1_efficiencyx75percent.p': 'GP_1.y',
    'GP20_efficiencyx75percent.p': 'GP_20.y',
    'GP21_efficiencyx75percent.p': 'GP_21.y',
    'GP2_efficiencyx75percent.p': 'GP2.y',
    'GP4_efficiencyx75percent.p': 'GP4.P',
    'GP5_efficiencyx75percent.p': 'GP_5.y',
    'GP6_efficiencyx75percent.p': 'GP6.y',
}

In [None]:
directory = '../Data/MolecularFoundryPickleFiles_1-cleaned-but-units-unconverted'
input_p_files = os.listdir(directory)
for filename in input_p_files:
    # Only files with an entry in filename_seriesname_dict get labels here.
    # Checking before reading avoids unpickling files whose data is never used.
    if filename not in filename_seriesname_dict:
        continue
    series_name = filename_seriesname_dict[filename]
    df = pd.read_pickle(os.path.join(directory, filename))
    labels = labelWhenOn(df[series_name])
    # The label file is written relative to the current working directory
    labels.to_pickle(os.path.splitext(filename)[0]+'-label.p')

# 'no_faults.p' gets an all-zero label series on its own index
df = pd.read_pickle(os.path.join(directory, 'no_faults.p'))
labels = pd.Series(index=df.index, data=0)
labels.to_pickle('no_faults-label.p')

# Create New Time Features

In [2]:
def DictMaxOfValues(orig_dict):
    """
    Reduce every value of a dict to its maximum.

    Input:
        orig_dict          dict of {key : iterable of comparable values}

    Returns:
        new dict of {key : max(values)}; orig_dict is not modified
    """
    return {name: max(candidates) for name, candidates in orig_dict.items()}

In [3]:
def mergeDicts(a,b):
    """
    Merge two dicts into a new dict whose values are lists.

    A value that is not a list is treated as a one-element list. For a key
    present in both dicts, the values from a come first, followed by the
    values from b.

    Input:
        a, b               dicts of {key : value or list of values}

    Returns:
        new_dict           dict of {key : list of values}; the lists are new
                           objects, so neither a nor b is modified
    """
    new_dict = {}
    for source in (a, b):
        for key, value in source.items():
            # Copy incoming lists: storing them directly would make the
            # later extend() write into the caller's own list.
            values = list(value) if isinstance(value, list) else [value]
            new_dict.setdefault(key, []).extend(values)
    return new_dict

In [4]:
import pickle
# NOTE: pickle.load can execute arbitrary code while loading; only use order
# files that come from a trusted source.
# The files are opened with 'with' so the handles are closed after loading.
with open('wt_aic_order_dict', 'rb') as aic_file:
    aic = pickle.load(aic_file)
with open('wt_bic_order_dict', 'rb') as bic_file:
    bic = pickle.load(bic_file)
# Combine both dicts and keep, per key, the largest of the collected values
merged_dict = mergeDicts(aic, bic)
order_dict = DictMaxOfValues(merged_dict)

In [6]:
def create_lagged_features(df, order_dict):
    """
    Creates lag dataframe

    Input:
        df                 dataframe
        order_dict         dict of {df.column name : number of lags}

    Returns:
        lagged_features    dataframe with only the indices that have full rows
                           (the lag columns are inner-joined on the index);
                           columns whose number of lags is not positive
                           contribute nothing
    """
    lagged_features = pd.DataFrame(index=df.index)

    for column, lag_count in order_dict.items():
        if not lag_count > 0:
            continue
        column_lags = CreateLagsForOnePDSeries(df[column], column, lag_count)
        lagged_features = pd.concat(
            [lagged_features, column_lags], axis=1, join='inner'
        )

    return lagged_features
    
    
def CreateLagsForOnePDSeries(pd_series, name, num_lags):
    """
    Creates lags for one pandas series

    Each lag k (1..num_lags) becomes a column named '<name>_t-<k>' that holds
    the series values with their index moved forward by k*10 minutes. The
    columns are inner-joined, so only index values present in the original
    index and in every shifted index remain.

    Input:
        pd_series          pandas.Series
        name               str name of the pandas.Series
        num_lags           int number of lags to calculate

    Returns:
        lagged_variables   dataframe with one column per lag
    """
    lagged_variables = pd.DataFrame(index=pd_series.index)
    for lag in range(1, num_lags+1):
        shift = pd.Timedelta(str(lag*10) + 'min')
        column_name = name + '_t-' + str(lag)
        # Same values, but stamped 'shift' later than they were observed
        shifted = pd.DataFrame(
            data=pd_series.values,
            index=pd_series.index + shift,
            columns=[column_name],
        )
        lagged_variables = pd.concat(
            [lagged_variables, shifted], axis=1, join='inner'
        )
    return lagged_variables


def create_mean_std_features(df, order_dict):
    """
    Calculate the sliding mean and standard deviation for columns of df

    Input:
        df                 dataframe
        order_dict         dict of {df.column name : window size}; a window
                           size of 0 is replaced by 6

    Returns:
        mean_std_features  dataframe with a '<column>_mean' and a
                           '<column>_std' column per key of order_dict,
                           inner-joined on the index
    """
    mean_std_features = pd.DataFrame(index=df.index)

    for key, n in order_dict.items():
        window = 6 if n == 0 else n

        print('Calculating Mean for %s \n' % key)
        sliding_mean = calculateSlidingMeanForSeries(df[key], window)
        mean_frame = pd.DataFrame(
            data=sliding_mean.values,
            index=sliding_mean.index,
            columns=[key + '_mean'],
        )

        print('Calculating Std for %s \n' % key)
        sliding_std = calculateSlidingStdForSeries(df[key], window)
        std_frame = pd.DataFrame(
            data=sliding_std.values,
            index=sliding_std.index,
            columns=[key + '_std'],
        )

        mean_std_features = pd.concat(
            [mean_std_features, mean_frame, std_frame], axis=1, join='inner'
        )
    return mean_std_features
    

def calculateSlidingMeanForSeries(pd_series, window_size):
    """
    Given a series, returns a series of sliding means

    For every position from window_size-1 onwards, the mean is taken over the
    label-based slice from (timestamp - window_size*10 minutes) to the
    timestamp itself. Earlier positions are NaN.

    Input:
        pd_series          pandas.Series; the index must support subtracting
                           a pandas.Timedelta
        window_size        int, window length in units of 10 minutes

    Returns:
        pandas.Series on the same index as pd_series
    """
    # Start from a copy filled with NaN so index and name are kept
    temp_features = pd_series.copy()
    temp_features[:] = np.nan
    look_back = pd.Timedelta(str(window_size*10) + 'min')
    for position in range(window_size-1, len(pd_series)):
        window_end = pd_series.index[position]
        window = pd_series.loc[window_end - look_back:window_end]
        temp_features.iloc[position] = window.mean(axis=0)
    return temp_features

def calculateSlidingStdForSeries(pd_series, window_size):
    """
    Given a series, returns a series of sliding standard deviations

    For every position from window_size-1 onwards, the standard deviation is
    taken over the label-based slice from (timestamp - window_size*10 minutes)
    to the timestamp itself. Earlier positions are NaN.

    Input:
        pd_series          pandas.Series; the index must support subtracting
                           a pandas.Timedelta
        window_size        int, window length in units of 10 minutes

    Returns:
        pandas.Series on the same index as pd_series
    """
    # Start from a copy filled with NaN so index and name are kept
    temp_features = pd_series.copy()
    temp_features[:] = np.nan
    look_back = pd.Timedelta(str(window_size*10) + 'min')
    for position in range(window_size-1, len(pd_series)):
        window_end = pd_series.index[position]
        window = pd_series.loc[window_end - look_back:window_end]
        temp_features.iloc[position] = window.std()
    return temp_features


In [None]:
# Load the stored data set and derive the new time features from it
df = pd.read_pickle('../Data/PickleFiles/orig_and_eng.p')
lags = create_lagged_features(df, order_dict)
meanstd = create_mean_std_features(df, order_dict)

# The mean/std features are also stored on their own
meanstd.to_pickle('../Data/PickleFiles/mean_std_new_features.p')

# Concatenate with the original data on the shared index and keep only the
# rows without missing values
new_df = pd.concat([df, lags, meanstd], axis=1, join='inner').dropna()
new_df.to_pickle('../Data/PickleFiles/orig_eng_mean_std_lags.p')