In [None]:
# Mount Google Drive at /content/drive (Colab-only; prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive') 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import Model
import time
import matplotlib.pyplot as plt
import random

In [None]:
# Work from the project folder on the mounted Drive; the relative pickle path used later resolves against it.
os.chdir("/content/drive/My Drive/Anomaly detection - BuildSys2020/")

In [None]:
# For reproducibility
# Seed every RNG the notebook draws from: numpy (np.random.permutation in mix_data),
# TensorFlow (weight init / dropout) and the stdlib `random` module
# (random.sample of buildings in load_data_ashrae).
np.random.seed(12)
tf.random.set_seed(12)
random.seed(1)

In [None]:
def usable_features(df_building, features_used):
  """Keep only the requested columns and replace missing values with 0.

  Args:
    df_building: pandas DataFrame holding the dataset.
    features_used: collection of column names to keep; every other column
      of `df_building` is dropped.

  Returns:
    A new DataFrame restricted to `features_used` (in the frame's original
    column order) with every NaN replaced by 0. The input frame is not modified.
  """
  # Everything the caller did not ask for gets dropped.
  unused_columns = [name for name in df_building.columns if name not in features_used]

  # Drop first, then fill the remaining gaps with 0.
  return df_building.drop(unused_columns, axis=1).fillna(0)

In [None]:
def normalize_minmax(df, col):
    """Min-max scale the listed columns to the range [0, 1].

    Args:
        df: DataFrame containing the columns.
        col: iterable of column names to normalize.

    Returns:
        A copy of `df` in which each listed column is replaced by
        (x - min) / (max - min). A constant column (max == min) is mapped
        to 0.0 instead of dividing by zero; missing values stay missing.
        The per-column max/min are printed as a side effect.
    """
    result = df.copy()
    for feature_name in col:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()

        print("Feature Name : {}  :  Max Value - {} ; Min Value - {}".format(feature_name, max_value, min_value))

        value_range = max_value - min_value
        if value_range == 0:
            # Constant column: 0/0 would turn every value into NaN, so emit 0.0
            # for present values and keep NaN where the value was missing.
            column = df[feature_name]
            result[feature_name] = column.where(column.isna(), 0.0).astype(float)
        else:
            result[feature_name] = (df[feature_name] - min_value) / value_range
    return result

In [None]:
def normalize_zAlgo(df, col):
    """Z-score standardize the listed columns.

    Args:
        df: DataFrame containing the columns.
        col: iterable of column names to normalize.

    Returns:
        A copy of `df` in which each listed column is replaced by
        (x - mean) / std, using pandas' default (sample) standard deviation.
        A constant column is mapped to 0.0 instead of dividing by a zero
        spread; missing values stay missing. The per-column std/mean are
        printed as a side effect.
    """
    result = df.copy()
    for feature_name in col:
        column = df[feature_name]
        std_value = column.std()
        mean_value = column.mean()

        print("Feature Name : {}  :  Std Value - {} ; Mean Value - {}".format(feature_name, std_value, mean_value))

        # Compare min and max rather than std == 0: the computed std of a constant
        # float column can be a tiny non-zero number due to rounding.
        if std_value == 0 or column.min() == column.max():
            result[feature_name] = column.where(column.isna(), 0.0).astype(float)
        else:
            result[feature_name] = (column - mean_value) / std_value
    return result

In [None]:
def clean_dataset_ashrae(dataset, meter_types, features_used, z_columns, minmax_columns):
  """Filter, encode, trim and normalize the raw ASHRAE frame.

  Args:
    dataset: DataFrame with (at least) 'meter', 'meter_reading' and
      'primary_use' columns.
    meter_types: list of meters to be included.
      NOTE(review): currently not applied - only meter 0 is kept (see below).
    features_used: list of features to be included in the final data.
    z_columns: list of columns to be normalized with the Z algorithm.
    minmax_columns: list of columns to be normalized with min-max normalization.

  Returns:
    Normalized DataFrame restricted to `features_used`.
  """
  # NOTE(review): the filter is hard-coded to meter 0 and ignores `meter_types`.
  # It is left as-is because load_data_ashrae groups by building only, so mixing
  # several meters would interleave their readings inside a window.
  # .copy() gives an independent frame so the column assignments below do not
  # raise pandas' SettingWithCopyWarning.
  meter_dataset = dataset.loc[dataset['meter'] == 0].copy()

  # log(1 + reading) as a new column
  meter_dataset['meter_reading_log'] = np.log(meter_dataset['meter_reading'] + 1)

  # Encode the categorical 'primary_use' as integer codes, numbered in order of
  # first appearance (missing values, if any, become -1).
  meter_dataset['primary_use'] = pd.factorize(meter_dataset['primary_use'])[0]

  # remove columns that won't be included in the model's input
  df_train = usable_features(meter_dataset, features_used)

  # normalize features
  df_train_norm1 = normalize_zAlgo(df_train, z_columns)  # z-score
  # Previously this step called normalize_zAlgo again, so the "min-max" columns
  # were silently z-scored instead.
  df_train_norm = normalize_minmax(df_train_norm1, minmax_columns)  # min-max scaler

  print("df_train_norm shape {}".format(df_train_norm.shape))
  return df_train_norm

In [None]:
## creating input sequence of length - seq_length
def create_windows(arr_data, arr_anomaly_label, arr_op, seq_length, nb_features):
  """Cut one building's series into non-overlapping windows, split by anomaly status.

  Args:
    arr_data: normalized feature data as a numpy array (one row per time step).
    arr_anomaly_label: per-time-step anomaly labels; any non-zero entry marks
      the whole window as anomalous.
    arr_op: per-time-step values of the output feature.
    seq_length: window length.
    nb_features: number of feature columns in `arr_data`.

  Returns:
    (non_anom_x, non_anom_y, anom_x, anom_y) - lists of arrays. Each y is the
    (seq_length, 1) output window; each x is that window with the first
    feature of the window's first time step appended, shape (seq_length + 1, 1).
    Trailing rows that do not fill a whole window are ignored.
  """
  normal_inputs, normal_targets = [], []
  anomalous_inputs, anomalous_targets = [], []

  nb_windows = arr_data.shape[0] // seq_length
  for window_no in range(nb_windows):
    span = slice(window_no * seq_length, (window_no + 1) * seq_length)

    features = arr_data[span].reshape((seq_length, nb_features))
    target = arr_op[span].reshape((seq_length, 1))

    # Conditioning value: first feature at the first time step of the window.
    condition = features[0][0].reshape((1, 1))
    model_input = np.concatenate((target, condition))

    # A single flagged time step is enough to call the window anomalous.
    if np.count_nonzero(arr_anomaly_label[span]) > 0:
      inputs, targets = anomalous_inputs, anomalous_targets
    else:
      inputs, targets = normal_inputs, normal_targets

    inputs.append(model_input)
    targets.append(target)

  return normal_inputs, normal_targets, anomalous_inputs, anomalous_targets

In [None]:
def mix_data(data_x, data_y):
  """Shuffle inputs and targets with one shared random permutation.

  Mixing the windows makes the data closer to i.i.d. The permutation comes
  from numpy's global RNG, so seed it (np.random.seed) for reproducibility.

  Args:
    data_x: list of input windows.
    data_y: list of target windows, aligned index-by-index with `data_x`.

  Returns:
    (shuffled_x, shuffled_y) as new lists; pairs stay aligned. Empty inputs
    give two empty lists (previously this raised IndexError in the print).
  """
  data_idx = np.random.permutation(len(data_x))

  # Apply the same order to both lists so (x, y) pairs stay matched.
  output_data_x = [data_x[idx] for idx in data_idx]
  output_data_y = [data_y[idx] for idx in data_idx]

  # There is no first element to inspect when no windows were passed in.
  sample_shape = output_data_x[0].shape if output_data_x else None
  print("ouput_x shape {} , {}".format(len(output_data_x), sample_shape))
  return output_data_x, output_data_y

In [None]:
# Load the training dataset (path is relative to the working directory set earlier).
# NOTE(review): pickle.load can execute arbitrary code - only load this file from a trusted source.
with open('ashrae_rank1_train_clean.pkl', 'rb') as f:
      dataset = pickle.load(f)

In [None]:
def load_data_ashrae(dataset, meter_types, features_used, z_columns, minmax_columns, seq_length, nb_buildings, output_feature):
  """Build shuffled normal / anomalous training windows from the ASHRAE frame.

  Args:
    dataset: raw DataFrame, passed to clean_dataset_ashrae.
    meter_types, features_used, z_columns, minmax_columns: forwarded to
      clean_dataset_ashrae. `features_used` must contain 'building_id',
      'timestamp', 'is_bad_meter_reading' and `output_feature`.
    seq_length: window length.
    nb_buildings: number of buildings randomly sampled (stdlib `random`).
    output_feature: feature that is taken as the output to be predicted.

  Returns:
    (non_anom_x, non_anom_y, anom_x, anom_y): lists of windows, each pair
    shuffled with a shared permutation.
  """
  df_final_train_norm = clean_dataset_ashrae(dataset, meter_types, features_used, z_columns, minmax_columns)

  # lists to store final input data
  anom_x = []
  anom_y = []
  non_anom_x = []
  non_anom_y = []

  # group data on the basis of their building_id
  grp_data = df_final_train_norm.groupby('building_id')

  # create a random list of buildings whose data will be considered
  list_buildings = random.sample(list(grp_data.groups.keys()), nb_buildings)

  # loop through the building ids - segregate them into anomalous and non-anomalous windows
  for grp_no in list_buildings:

    # Sort FIRST and keep the result: labels, targets and features must all be
    # read from the same chronologically ordered frame. (The sorted frame was
    # previously discarded, and labels/targets were extracted before sorting.)
    # mergesort is stable, so rows sharing a timestamp keep their relative order.
    building_data = grp_data.get_group(grp_no).sort_values(
        "timestamp", axis=0, ascending=True, kind="mergesort")

    # separate the anomaly labels and the output feature from the data
    arr_labels = building_data['is_bad_meter_reading'].to_numpy()
    arr_output = building_data[output_feature].to_numpy()

    # Everything left after removing bookkeeping / label / target columns is a
    # model feature; create_windows uses the first remaining column as the condition.
    building_data = building_data.drop(
        ['building_id', 'is_bad_meter_reading', output_feature, 'timestamp'], axis=1)

    # creating numpy array of remaining features
    building_data = building_data.to_numpy()

    # creating windowed data
    nb_features = building_data.shape[1]
    na_x, na_y, a_x, a_y = create_windows(building_data, arr_labels, arr_output, seq_length, nb_features)

    # A building may have no clean window at all; do not index into an empty list.
    na_shape = na_x[0].shape if na_x else None
    print(" na_x - len {}, shape {} ".format(len(na_x), na_shape))

    # accumulating single building data
    anom_x.extend(a_x)
    anom_y.extend(a_y)
    non_anom_x.extend(na_x)
    non_anom_y.extend(na_y)

  # making data similar to i.i.d.
  anom_x, anom_y = mix_data(anom_x, anom_y)
  non_anom_x, non_anom_y = mix_data(non_anom_x, non_anom_y)

  return non_anom_x, non_anom_y, anom_x, anom_y

In [None]:
## change these lists as per model's input 
# Meter ids handed to load_data_ashrae as `meter_types`.
meter_types = [0] 

# Columns kept after cleaning. load_data_ashrae needs 'building_id', 'timestamp',
# 'is_bad_meter_reading' and the output feature to be present.
features_used = ['building_id','timestamp', 'month', 'is_bad_meter_reading','meter_reading_log']

# Columns passed as `z_columns` (none here).
z_columns = []          

# Columns passed as `minmax_columns`.
minmax_columns = ['month']

# Window length, in time steps.
seq_length = 24 


# Number of buildings randomly sampled for the dataset.
nb_buildings = 145

# Column the models are trained to output.
output_feature = 'meter_reading_log'

In [None]:
# Build the shuffled normal / anomalous window lists from the loaded dataset.
non_anom_x, non_anom_y, anom_x, anom_y = load_data_ashrae(dataset, meter_types, features_used, z_columns, minmax_columns, seq_length, nb_buildings, output_feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Feature Name : month  :  Std Value - 3.4435748699115054 ; Mean Value - 6.552057846381409
df_train_norm shape (12060910, 5)
 na_x - len 354, shape (25, 1) 
 na_x - len 351, shape (25, 1) 
 na_x - len 361, shape (25, 1) 
 na_x - len 353, shape (25, 1) 
 na_x - len 357, shape (25, 1) 
 na_x - len 361, shape (25, 1) 
 na_x - len 339, shape (25, 1) 
 na_x - len 342, shape (25, 1) 
 na_x - len 311, shape (25, 1) 
 na_x - len 365, shape (25, 1) 
 na_x - len 363, shape (25, 1) 
 na_x - len 340, shape (25, 1) 
 na_x - len 268, shape (25, 1) 
 na_x - len 225, shape (25, 1) 
 na_x - len 252, shape (25, 1) 
 na_x - len 341, shape (25, 1) 
 na_x - len 335, shape (25, 1) 
 na_x - len 224, shape (25, 1) 
 na_x - len 343, shape (25, 1) 
 na_x - len 339, shape (25, 1) 
 na_x - len 365, shape (25, 1) 
 na_x - len 332, shape (25, 1) 
 na_x - len 339, shape (25, 1) 
 na_x - len 365, shape (25, 1) 
 na_x - len 224, shape (25, 1) 
 na_x - len 178, shape (25, 1) 
 na_x - len 225, shape (25, 1) 
 na_x - len 2

In [None]:
def split_data(train_percent, val_percent, test_percent, output_non_anom_x, output_non_anom_y ):
  """Split aligned x / y sequences into contiguous train, validation and test parts.

  Args:
    train_percent: fraction of samples (0-1) taken from the front for training.
    val_percent: accepted for symmetry but not used - validation is whatever
      lies between the train and test parts.
    test_percent: fraction of samples (0-1) taken from the end for testing.
    output_non_anom_x: sequence of inputs.
    output_non_anom_y: sequence of targets, same length as the inputs.

  Returns:
    (X_train, Y_train, X_val, Y_val, X_test, Y_test) as slices of the inputs.
  """
  nb_samples = len(output_non_anom_y)
  train_end = int(nb_samples * train_percent)
  # Use an explicit start index instead of a negative slice: when the test
  # count rounds down to 0, `[-0:]` is the WHOLE list and `[:-0]` is empty,
  # which leaked the training data into the test set and emptied validation.
  test_start = max(nb_samples - int(nb_samples * test_percent), 0)

  # train dataset
  X_train_non_anom = output_non_anom_x[:train_end]
  Y_train_non_anom = output_non_anom_y[:train_end]

  # validation dataset
  X_val_non_anom = output_non_anom_x[train_end:test_start]
  Y_val_non_anom = output_non_anom_y[train_end:test_start]

  # test dataset
  X_test_non_anom = output_non_anom_x[test_start:]
  Y_test_non_anom = output_non_anom_y[test_start:]

  return  X_train_non_anom, Y_train_non_anom, X_val_non_anom, Y_val_non_anom, X_test_non_anom, Y_test_non_anom

In [None]:
# Fractions of the windows used for training / validation / testing (passed to split_data).
train_percent = 0.8
val_percent = 0.1
test_percent = 0.1

In [None]:
# Split the non-anomalous windows; these are the ones the autoencoders are trained and validated on.
X_train_non_anom, Y_train_non_anom, X_val_non_anom, Y_val_non_anom, X_test_non_anom, Y_test_non_anom = split_data(train_percent, val_percent, test_percent, non_anom_x, non_anom_y)

In [None]:
from numpy import percentile

In [None]:
# Split the anomalous windows with the same ratios.
X_train_anom, Y_train_anom, X_val_anom, Y_val_anom, X_test_anom, Y_test_anom = split_data(train_percent, val_percent, test_percent, anom_x, anom_y)

In [None]:
# Switch working directory: the relative checkpoint and TensorBoard log paths used below resolve against this folder.
os.chdir("/content/drive/My Drive/ASHRAEData/")

In [None]:
## libraries to create the model 
from tensorflow.keras.layers import  MaxPool1D, Input, MaxPool1D, UpSampling1D
from tensorflow.keras.layers import  Dense, Activation, Dropout, Reshape, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K

In [None]:
# Reset Keras' global state before building the first model.
tf.keras.backend.clear_session()

In [None]:
def compile_model(model):
    """Compile `model` in place for MSE reconstruction training and return it.

    Uses Adam with a learning rate of 1e-4, mean-squared-error loss, and
    reports MSE as a metric.
    """
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    opt = Adam(learning_rate=0.0001)
    # `metrics` is documented as a list of metrics, not a bare string.
    model.compile(optimizer=opt, loss='mse', metrics=['mse'])
    return model

In [None]:
def generate_model_autoencoder1(input_len, name):
  """Build an (uncompiled) 1-D convolutional autoencoder.

  The flat input of length `input_len` goes through light dropout, is reshaped
  to (input_len, 1), encoded by three Conv1D(tanh) + MaxPool1D(2) stages with
  12, 6 and 1 filters, decoded by three Conv1D(tanh) + UpSampling1D(2) stages
  with 1, 6 and 12 filters, and projected back to one channel by a final
  Conv1D with ReLU.

  Args:
    input_len: number of time steps in the flat input vector.
    name: name given to the Keras model.

  Returns:
    A tf.keras Model mapping the input to the decoded sequence.
  """
  conv_options = dict(strides=1, padding='same', data_format='channels_last')

  input_encoder = Input(shape=(input_len,))  # input layer
  tensor = Dropout(0.05)(input_encoder)
  tensor = Reshape((input_len, 1))(tensor)

  # Encoder: each stage convolves, then halves the time axis.
  for nb_filters in (12, 6, 1):
    tensor = Conv1D(nb_filters, 2, activation='tanh', **conv_options)(tensor)
    tensor = MaxPool1D(pool_size=2, data_format='channels_last')(tensor)

  # Decoder: mirror of the encoder, doubling the time axis at each stage.
  for nb_filters in (1, 6, 12):
    tensor = Conv1D(nb_filters, 2, activation='tanh', **conv_options)(tensor)
    tensor = UpSampling1D(size=2)(tensor)

  # Single-channel reconstruction.
  decoded = Conv1D(1, 2, activation='relu', **conv_options)(tensor)

  return Model(input_encoder, decoded, name=name)

In [None]:
# Plain autoencoder over a 24-step window.
autoencoder1 = generate_model_autoencoder1(24,  "AutoencoderConv-")

In [None]:
# Attach optimizer, loss and metric (see compile_model).
autoencoder1 = compile_model(autoencoder1)

In [None]:
## create dataset 
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_VAL = 16

# Plain autoencoder: each window is both the input and the reconstruction target.
train_dataset_autoencoder1 = (
    tf.data.Dataset
    .from_tensor_slices((Y_train_non_anom, Y_train_non_anom))
    .batch(BATCH_SIZE_TRAIN)
)
val_dataset_autoencoder1 = (
    tf.data.Dataset
    .from_tensor_slices((Y_val_non_anom, Y_val_non_anom))
    .batch(BATCH_SIZE_VAL)
)

print(train_dataset_autoencoder1)
print(val_dataset_autoencoder1)

<BatchDataset shapes: ((None, 24, 1), (None, 24, 1)), types: (tf.float32, tf.float32)>
<BatchDataset shapes: ((None, 24, 1), (None, 24, 1)), types: (tf.float32, tf.float32)>


In [None]:
# Callbacks for the first autoencoder.
checkpoint_path = './final_models_4/baseline_model3.ckpt'

# Save weights only, and only when the validation loss improves (checked every epoch).
model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
)

# Training curves for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs/fit3/baseline_model3')

# Stop once val_loss has not improved by at least 1e-4 for 60 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=60,
)

In [None]:
# Train for up to 300 epochs; early stopping may end the run sooner.
autoencoder1.fit(train_dataset_autoencoder1, epochs=300, validation_data = val_dataset_autoencoder1, callbacks=[model_callback, tensorboard_callback, early_stopping])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f0652a49d68>

In [None]:
# Reset Keras' global state before building the second model.
tf.keras.backend.clear_session()

In [None]:
def generate_model_autoencoder2(input_len, name):
  """Build an (uncompiled) 1-D convolutional autoencoder.

  The layer stack is the same as in generate_model_autoencoder1: dropout,
  reshape to (input_len, 1), three Conv1D(tanh) + MaxPool1D(2) encoder stages
  (12, 6, 1 filters), three Conv1D(tanh) + UpSampling1D(2) decoder stages
  (1, 6, 12 filters) and a final single-filter Conv1D with ReLU.

  Args:
    input_len: number of time steps in the flat input vector.
    name: name given to the Keras model.

  Returns:
    A tf.keras Model mapping the input to the decoded sequence.
  """
  def tanh_conv(nb_filters):
    # All hidden convolutions share kernel size 2, stride 1 and 'same' padding.
    return Conv1D(nb_filters, 2, strides=1, padding='same',
                  data_format='channels_last', activation='tanh')

  input_encoder = Input(shape=(input_len,))  # input layer
  hidden = Reshape((input_len, 1))(Dropout(0.05)(input_encoder))

  # Encoder: convolve, then halve the time axis.
  for nb_filters in [12, 6, 1]:
    hidden = tanh_conv(nb_filters)(hidden)
    hidden = MaxPool1D(pool_size=2, data_format='channels_last')(hidden)

  # Decoder: convolve, then double the time axis.
  for nb_filters in [1, 6, 12]:
    hidden = tanh_conv(nb_filters)(hidden)
    hidden = UpSampling1D(size=2)(hidden)

  # Single-channel reconstruction.
  decoded = Conv1D(1, 2, strides=1, padding='same',
                   data_format='channels_last', activation='relu')(hidden)

  model = Model(input_encoder, decoded, name=name)
  return model

In [None]:
# Conditional variant: input length 25 = the 24-step window plus the one conditioning value appended by create_windows.
autoencoder2 = generate_model_autoencoder2(25,  "AutoencoderBasic-Cond")

In [None]:
# Attach optimizer, loss and metric (see compile_model).
autoencoder2 = compile_model(autoencoder2)

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
autoencoder2.summary()

Model: "AutoencoderBasic-Cond"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
dropout (Dropout)            (None, 25)                0         
_________________________________________________________________
reshape (Reshape)            (None, 25, 1)             0         
_________________________________________________________________
conv1d (Conv1D)              (None, 25, 12)            36        
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 12, 12)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 12, 6)             150       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 6, 6)    

In [None]:
# Callbacks for the second (conditional) autoencoder.
checkpoint_path = './final_models_4/baseline_model4.ckpt'

# Save weights only, and only when the validation loss improves (checked every epoch).
model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    save_freq='epoch',
)

# Training curves for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='logs/fit3/baseline_model4')

# Stop once val_loss has not improved by at least 1e-4 for 60 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=60,
)

In [None]:
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_VAL = 16

# Conditional autoencoder: input is the window plus its conditioning value (X),
# target is the plain window (Y).
train_dataset_autoencoder2 = (
    tf.data.Dataset
    .from_tensor_slices((X_train_non_anom, Y_train_non_anom))
    .batch(BATCH_SIZE_TRAIN)
)
val_dataset_autoencoder2 = (
    tf.data.Dataset
    .from_tensor_slices((X_val_non_anom, Y_val_non_anom))
    .batch(BATCH_SIZE_VAL)
)

print(train_dataset_autoencoder2)
print(val_dataset_autoencoder2)

<BatchDataset shapes: ((None, 25, 1), (None, 24, 1)), types: (tf.float64, tf.float32)>
<BatchDataset shapes: ((None, 25, 1), (None, 24, 1)), types: (tf.float64, tf.float32)>


In [None]:
# Train for up to 300 epochs; early stopping may end the run sooner.
autoencoder2.fit(train_dataset_autoencoder2, epochs=300, validation_data = val_dataset_autoencoder2, callbacks=[model_callback, tensorboard_callback, early_stopping])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f0650340f98>

In [None]:
#Add metric