In [None]:
# Mount Google Drive at /content/drive; the directories configured further down
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pickle
import pandas as pd
import tensorflow as tf
import numpy as np
import random

In [None]:
# For reproducibility: seed NumPy, TensorFlow and Python's `random` module with
# the same fixed value. The window shuffling in mix_data relies on NumPy's
# global random state (np.random.permutation), so it is affected by this seed.
np.random.seed(7)
tf.random.set_seed(7)
random.seed(7)

In [None]:
## CAN BE CHANGED
## Directory layout on the mounted Drive (all paths end with a trailing slash).

# directory that contains the pickled dataset 'ashrae_rank1_train_clean.pkl'
dir_dataset = "/content/drive/My Drive/Anomaly detection - BuildSys2020/"
# directory that contains the curated labels file 'anomaly_labels.csv'
dir_labels =  "/content/drive/My Drive/ASHRAEData/"
# directory under which the prepared train/val/test pickles are written
dir_save_data = "/content/drive/My Drive/ASHRAEData/data/"

In [None]:
## CAN BE CHANGED

## change these settings as per the model's input
## site whose rows are kept (clean_dataset_ashrae filters on dataset['site_id'] == site_id)
site_id = 0 

## features to be used in the model ('anomaly' included in the list for the labels);
## 'building_id' and 'timestamp' are needed by load_data_ashrae for grouping/ordering
## and are dropped there before the windows are built
features_used = ['building_id','timestamp', 'month', 'anomaly','meter_reading_log']

## columns to be normalized with the z-score ("Z algorithm"): (x - mean) / std
z_columns = []          

## columns intended for min-max normalization: (x - min) / (max - min)
minmax_columns = ['month']

## window length: number of consecutive time steps in one (non-overlapping) window
seq_length = 24 

## number of buildings to be considered if not using site id
## (passed to load_data_ashrae, which currently ignores it and uses every building of the site)
nb_buildings = 145

## output feature to be predicted; 'meter_reading_log' is created in clean_dataset_ashrae
## as log(meter_reading + 1)
output_feature = 'meter_reading_log'

## name of the model for which data is prepared (used as prefix of the output directory)
model_name = 'Baseline'

## fractions for splitting the non-Anomalous windows;
## split_data slices train from the front and test from the back, validation is what lies between
train_percent = 0.8
val_percent = 0.1
test_percent = 0.1

## fractions for splitting the Anomalous windows (same slicing scheme as above)
train_percent_anom = 0.1
val_percent_anom = 0.2
test_percent_anom = 0.7

In [None]:
## Reading dataset
# NOTE: os.chdir changes the working directory of the whole kernel; the relative
# file names below are resolved against it.
os.chdir(dir_dataset)
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open('ashrae_rank1_train_clean.pkl', 'rb') as f:
  dataset = pickle.load(f)

## Reading Curated Anomaly Labels 
os.chdir(dir_labels)
anomaly_labels = pd.read_csv("anomaly_labels.csv")

## Concatenating the two column-wise
# pd.concat(axis=1) aligns rows on the index, not on position.
# NOTE(review): this presumably relies on `dataset` having the same default
# 0..n-1 index and row order as the CSV - confirm, otherwise labels are misaligned.
dataset = pd.concat([dataset, anomaly_labels], axis=1)

In [None]:
def usable_features(df_building, features_used):
  """Keep only the requested feature columns and replace missing values with 0.

  df_building   - pandas DataFrame holding the dataset
  features_used - names of the columns to retain; every other column is dropped

  Returns a new DataFrame; the frame passed in is left untouched.
  """
  # every column that was not asked for gets discarded
  unwanted = [name for name in df_building.columns if name not in features_used]
  trimmed = df_building.drop(unwanted, axis = 1)

  # missing entries are simply replaced by 0
  return trimmed.fillna(0)

In [None]:
def normalize_minmax(df, col):
    """Min-max normalize the given columns to the range [0, 1].

    df  - data frame containing the columns
    col - iterable with the names of the columns to be normalized

    Each listed column is rescaled as (x - min) / (max - min), where min and
    max are taken over the whole column. A constant column (max == min) is
    mapped to 0 instead of dividing by zero.

    Returns a normalized copy; `df` itself is not modified.
    """
    result = df.copy()
    for feature_name in col:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()

        print("Feature Name : {}  :  Max Value - {} ; Min Value - {}".format(feature_name, max_value, min_value))

        value_range = max_value - min_value
        shifted = df[feature_name] - min_value
        if value_range == 0:
            # constant column: 0/0 would turn every entry into NaN, so keep
            # the shifted values (all zeros, missing entries stay missing)
            result[feature_name] = shifted.astype(float)
        else:
            result[feature_name] = shifted / value_range
    return result

In [None]:
def normalize_zAlgo(df, col):
    """Z-score normalize the given columns: (x - mean) / std.

    df  - data frame containing the columns
    col - iterable with the names of the columns to be normalized

    Mean and standard deviation (pandas default, i.e. sample std) are taken
    over the whole column. When the standard deviation is zero or undefined
    (constant column, single row) the column is only centered, instead of
    being divided by 0/NaN.

    Returns a normalized copy; `df` itself is not modified.
    """
    result = df.copy()
    for feature_name in col:
        std_value = df[feature_name].std()
        mean_value = df[feature_name].mean()

        print("Feature Name : {}  :  Std Value - {} ; Mean Value - {}".format(feature_name, std_value, mean_value))

        centered = df[feature_name] - mean_value
        if not std_value > 0:
            # std is 0 or NaN: dividing would fill the column with NaN/inf
            result[feature_name] = centered.astype(float)
        else:
            result[feature_name] = centered / std_value
    return result

In [None]:
def clean_dataset_ashrae(dataset, site_id, features_used, z_columns, minmax_columns):
  """Filter, feature-engineer and normalize the dataset for one site.

  dataset        - DataFrame with (at least) the columns 'site_id', 'meter',
                   'meter_reading' and 'primary_use'
  site_id        - site id to be included
  features_used  - list of features to be included in the final data
  z_columns      - list of columns to be normalized with the z-score
  minmax_columns - list of columns to be normalized with min-max normalization

  Only rows of the given site with meter == 0 are kept. Returns a new,
  normalized DataFrame restricted to `features_used`; `dataset` is not modified.
  """
  # keep the requested site and meter 0 only; the explicit copy makes the
  # column assignments below act on an independent frame instead of a slice
  # of `dataset` (this is what triggered pandas' SettingWithCopyWarning)
  row_mask = (dataset['site_id'] == site_id) & (dataset['meter'] == 0)
  meter_dataset = dataset.loc[row_mask].copy()

  # create a new column: log(meter_reading + 1)
  meter_dataset['meter_reading_log'] = np.log(meter_dataset['meter_reading'] + 1)

  # convert categorical data type to int (codes follow the order of first appearance)
  meter_dataset['primary_use'] = meter_dataset['primary_use'].astype('category')
  unique_primUse_list = list(meter_dataset.primary_use.unique())
  unique_primUse_dict = {value: code for code, value in enumerate(unique_primUse_list)}
  meter_dataset['primary_use'] = meter_dataset['primary_use'].map(unique_primUse_dict).astype(int)

  # remove columns that won't be included in the model's input
  df_train = usable_features(meter_dataset, features_used)

  # normalize features
  df_train_norm1 = normalize_zAlgo(df_train, z_columns) # z-score
  # min-max scaler (previously normalize_zAlgo was applied here by mistake)
  df_train_norm = normalize_minmax(df_train_norm1, minmax_columns)

  print("df_train_norm shape {}".format(df_train_norm.shape))
  return df_train_norm

In [None]:
## creating input sequence of length - seq_length
def create_windows(arr_data, arr_anomaly_label, arr_op, seq_length, nb_features):
  """Cut one series into non-overlapping windows and sort them by anomaly status.

  arr_data          - normalized feature data as a numpy array
  arr_anomaly_label - array containing the anomaly label of each time step
  arr_op            - array containing the output feature of each time step
  seq_length        - window length
  nb_features       - no. of feature columns in arr_data

  For each window the "y" entry is the (seq_length, 1) slice of arr_op and the
  "x" entry is that same slice with one extra row appended: the first feature
  of the window's first time step. Time steps left over after the last full
  window are ignored. A window counts as anomalous as soon as one of its
  labels is non-zero.

  Returns non_anom_x, non_anom_y, anom_x, anom_y (lists of numpy arrays).
  """
  non_anom_x, non_anom_y = [], []  # non-anomalous windows - input / output
  anom_x, anom_y = [], []          # anomalous windows - input / output

  nb_windows = arr_data.shape[0] // seq_length
  for window_idx in range(nb_windows):
    start = window_idx * seq_length
    stop = start + seq_length

    window_features = arr_data[start:stop].reshape((seq_length, nb_features))
    window_output = arr_op[start:stop].reshape((seq_length, 1))

    # conditional input = outputs of the window + first feature of its first step
    first_feature = window_features[0][0].reshape((1, 1))
    cond_input = np.concatenate((window_output, first_feature))

    # a single anomalous time point is enough to mark the whole window
    if np.count_nonzero(arr_anomaly_label[start:stop]) > 0:
      bucket_x, bucket_y = anom_x, anom_y
    else:
      bucket_x, bucket_y = non_anom_x, non_anom_y

    bucket_x.append(cond_input)
    bucket_y.append(window_output)

  return non_anom_x, non_anom_y, anom_x, anom_y

In [None]:
def mix_data(data_x, data_y):
  """Shuffle inputs and outputs with one shared permutation (to make the data similar to i.i.d.).

  data_x - list of input windows
  data_y - list of output windows, same length and order as data_x

  The permutation is drawn from NumPy's global random state, so the result
  depends on np.random.seed. Empty input is returned as two empty lists
  (previously this raised an IndexError while printing the first shape).

  Returns (shuffled_x, shuffled_y) as new lists; the pairing x[i] <-> y[i] is preserved.
  """
  data_idx = np.random.permutation(len(data_x))

  # apply the same index order to both lists so that pairs stay together
  output_data_x = [data_x[idx] for idx in data_idx]
  output_data_y = [data_y[idx] for idx in data_idx]

  # there is no first element to inspect when nothing was passed in
  sample_shape = output_data_x[0].shape if output_data_x else None
  print("ouput_x shape {} , {}".format(len(output_data_x), sample_shape))
  return output_data_x, output_data_y

In [None]:
def load_data_ashrae(dataset, site_id, features_used, z_columns, minmax_columns, seq_length, nb_buildings, output_feature):
  """Build shuffled anomalous / non-anomalous windows for every building of a site.

  dataset        - dataset DataFrame
  site_id        - site_id to be considered
  features_used  - columns kept by clean_dataset_ashrae; must contain
                   'building_id', 'timestamp', 'anomaly' and `output_feature`
  z_columns      - columns to be z-score normalized
  minmax_columns - columns to be min-max normalized
  seq_length     - window length
  nb_buildings   - currently unused: all buildings of the site are processed
                   (kept in the signature for existing callers)
  output_feature - feature that is taken as the output to be predicted

  Returns non_anom_x, non_anom_y, anom_x, anom_y (lists of numpy arrays).
  """
  df_final_train_norm = clean_dataset_ashrae(dataset, site_id, features_used, z_columns, minmax_columns)

  # lists to store final input data
  anom_x = []
  anom_y = []
  non_anom_x = []
  non_anom_y = []

  # columns that must not end up in the feature matrix
  non_feature_columns = ['building_id', 'anomaly', output_feature, 'timestamp']

  # loop through the buildings - segregate their data into anomalous and non-anomalous windows
  for _, building_data in df_final_train_norm.groupby('building_id'):

    # Make sure the data is sequential BEFORE labels/outputs are extracted.
    # sort_values returns a new frame; previously its result was discarded and
    # the label/output arrays were taken from the unsorted rows. A stable sort
    # keeps the original order of rows that share a timestamp.
    building_data = building_data.sort_values("timestamp", axis = 0, ascending = True, kind = "mergesort")

    # separating labels and output_feature from the data
    arr_labels = building_data['anomaly'].to_numpy()
    arr_output = building_data[output_feature].to_numpy()

    # remaining columns are the input features
    feature_values = building_data.drop(non_feature_columns, axis = 1).to_numpy()

    # creating windowed data
    nb_features = feature_values.shape[1]
    na_x, na_y, a_x, a_y = create_windows(feature_values, arr_labels, arr_output, seq_length, nb_features)

    # accumulating single building data
    anom_x.extend(a_x)
    anom_y.extend(a_y)
    non_anom_x.extend(na_x)
    non_anom_y.extend(na_y)

  # making data similar to i.i.d.
  anom_x, anom_y = mix_data(anom_x, anom_y)
  non_anom_x, non_anom_y = mix_data(non_anom_x, non_anom_y)

  return non_anom_x, non_anom_y, anom_x, anom_y

In [None]:
def split_data(train_percent, val_percent, test_percent, output_non_anom_x, output_non_anom_y ):
  """Split paired x/y sequences into train, validation and test parts (no shuffling).

  train_percent     - fraction of the samples taken from the front as training data
  val_percent       - not used directly: validation is everything between the
                      training and the test part (kept for existing callers)
  test_percent      - fraction of the samples taken from the back as test data
  output_non_anom_x - input samples (any sliceable sequence)
  output_non_anom_y - output samples, same length and order as the inputs

  The split positions are computed from len(output_non_anom_y) and applied to
  both sequences. Explicit start/stop positions are used instead of negative
  slice bounds: with a test size of 0 the old `[-0:]` / `[:-0]` slices returned
  the whole data as test set and an empty validation set.

  Returns X_train, Y_train, X_val, Y_val, X_test, Y_test.
  """
  nb_samples = len(output_non_anom_y)
  train_end = int(nb_samples * train_percent)
  nb_test = int(nb_samples * test_percent)
  # first index of the test part, never before the start of the data
  test_start = max(nb_samples - nb_test, 0)

  # train dataset
  X_train_non_anom = output_non_anom_x[:train_end]
  Y_train_non_anom = output_non_anom_y[:train_end]

  # validation dataset
  X_val_non_anom = output_non_anom_x[train_end:test_start]
  Y_val_non_anom = output_non_anom_y[train_end:test_start]

  # test dataset
  X_test_non_anom = output_non_anom_x[test_start:]
  Y_test_non_anom = output_non_anom_y[test_start:]

  return  X_train_non_anom, Y_train_non_anom, X_val_non_anom, Y_val_non_anom, X_test_non_anom, Y_test_non_anom

In [None]:
## Build the shuffled window lists for the configured site
## (non-anomalous inputs/outputs first, then the anomalous ones)
non_anom_x, non_anom_y, anom_x, anom_y = load_data_ashrae(
    dataset=dataset,
    site_id=site_id,
    features_used=features_used,
    z_columns=z_columns,
    minmax_columns=minmax_columns,
    seq_length=seq_length,
    nb_buildings=nb_buildings,
    output_feature=output_feature,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Feature Name : month  :  Std Value - 3.4521200118787347 ; Mean Value - 6.529371681698442
df_train_norm shape (908409, 5)
ouput_x shape 22919 , (25, 1)
ouput_x shape 14927 , (25, 1)


In [None]:
## Train / validation / test split of the non-anomalous windows
(X_train_non_anom, Y_train_non_anom,
 X_val_non_anom, Y_val_non_anom,
 X_test_non_anom, Y_test_non_anom) = split_data(
    train_percent=train_percent,
    val_percent=val_percent,
    test_percent=test_percent,
    output_non_anom_x=non_anom_x,
    output_non_anom_y=non_anom_y,
)

In [None]:
## Saving Non-Anomalous data - Conditional
os.chdir(dir_save_data)

# <model>_data/Conditional/site_id_<id>/non_anom (relative to dir_save_data);
# makedirs creates every missing level at once and does nothing if it already exists
non_anom_cond_dir = os.path.join(model_name + "_data", "Conditional", "site_id_" + str(site_id), "non_anom")
os.makedirs(non_anom_cond_dir, exist_ok=True)

# file name -> split to be pickled (training, validation and test data)
non_anom_cond_files = {
    "X_train_data.pkl": X_train_non_anom,
    "Y_train_data.pkl": Y_train_non_anom,
    "X_val_data.pkl": X_val_non_anom,
    "Y_val_data.pkl": Y_val_non_anom,
    "X_test_data.pkl": X_test_non_anom,
    "Y_test_data.pkl": Y_test_non_anom,
}

for file_name, split in non_anom_cond_files.items():
  with open(os.path.join(non_anom_cond_dir, file_name), 'wb') as f:
    pickle.dump(split, f)

In [None]:
## Saving Non-Anomalous data - Not Conditional (only the output windows are stored)
os.chdir(dir_save_data)

# <model>_data/Not_Conditional/site_id_<id>/non_anom (relative to dir_save_data);
# makedirs also creates missing parent directories, so this cell no longer
# depends on another cell having created "<model>_data/" first
non_anom_plain_dir = os.path.join(model_name + "_data", "Not_Conditional", "site_id_" + str(site_id), "non_anom")
os.makedirs(non_anom_plain_dir, exist_ok=True)

# file name -> split to be pickled (training, validation and test data)
non_anom_plain_files = {
    "train_data.pkl": Y_train_non_anom,
    "val_data.pkl": Y_val_non_anom,
    "test_data.pkl": Y_test_non_anom,
}

for file_name, split in non_anom_plain_files.items():
  with open(os.path.join(non_anom_plain_dir, file_name), 'wb') as f:
    pickle.dump(split, f)

In [None]:
## Train / validation / test split of the anomalous windows
(X_train_anom, Y_train_anom,
 X_val_anom, Y_val_anom,
 X_test_anom, Y_test_anom) = split_data(
    train_percent=train_percent_anom,
    val_percent=val_percent_anom,
    test_percent=test_percent_anom,
    output_non_anom_x=anom_x,
    output_non_anom_y=anom_y,
)

In [None]:
## Saving Anomalous data - Conditional Data
os.chdir(dir_save_data)

# <model>_data/Conditional/site_id_<id>/anom (relative to dir_save_data);
# makedirs also creates missing parent directories, so this cell no longer
# depends on the non-anomalous cell having created the site directory first
anom_cond_dir = os.path.join(model_name + "_data", "Conditional", "site_id_" + str(site_id), "anom")
os.makedirs(anom_cond_dir, exist_ok=True)

# file name -> split to be pickled (training, validation and test data)
anom_cond_files = {
    "X_train_data.pkl": X_train_anom,
    "Y_train_data.pkl": Y_train_anom,
    "X_val_data.pkl": X_val_anom,
    "Y_val_data.pkl": Y_val_anom,
    "X_test_data.pkl": X_test_anom,
    "Y_test_data.pkl": Y_test_anom,
}

for file_name, split in anom_cond_files.items():
  with open(os.path.join(anom_cond_dir, file_name), 'wb') as f:
    pickle.dump(split, f)

In [None]:
## Saving Anomalous data - Not Conditional (only the output windows are stored)
os.chdir(dir_save_data)

# <model>_data/Not_Conditional/site_id_<id>/anom (relative to dir_save_data);
# makedirs also creates missing parent directories, so this cell no longer
# depends on the non-anomalous cell having created the site directory first
anom_plain_dir = os.path.join(model_name + "_data", "Not_Conditional", "site_id_" + str(site_id), "anom")
os.makedirs(anom_plain_dir, exist_ok=True)

# file name -> split to be pickled (training, validation and test data)
anom_plain_files = {
    "train_data.pkl": Y_train_anom,
    "val_data.pkl": Y_val_anom,
    "test_data.pkl": Y_test_anom,
}

for file_name, split in anom_plain_files.items():
  with open(os.path.join(anom_plain_dir, file_name), 'wb') as f:
    pickle.dump(split, f)