In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import io
import os
import urllib
import zipfile
import time 
import boto3
import boto.s3
import sys
import json
from orion import Orion
import logging
import pickle
from typing import List, Union
import pandas as pd
from mlblocks import MLPipeline
from orion.evaluation import CONTEXTUAL_METRICS as METRICS
from orion.data import load_signal, load_anomalies
S3_URL = 'https://d3-ai-orion-analysis.s3.amazonaws.com/'
BUCKET = "d3-ai-orion-analysis"


In [2]:
signal = 'S-1'
# load signal
train_data = load_signal(signal)
# load ground truth anomalies
# known_anomalies = load_anomalies(signal)
train_data.head(5)

Unnamed: 0,timestamp,value
0,1222819200,-0.366359
1,1222840800,-0.394108
2,1222862400,0.403625
3,1222884000,-0.362759
4,1222905600,-0.370746


In [3]:
def build_df(data, start=0):
    index = np.array(range(start, start + len(data)))
    step = 300
    initial_time = 1222819200 
    timestamp = index * step + initial_time 
    return pd.DataFrame({'timestamp': timestamp}, dtype='int64')
def time_index(index):
    step = 300
    initial_time = 1222819200
    return index * step + initial_time

In [4]:
def get_creds():    
    credentials_file = "credentials_file.txt" #some path
    with open(credentials_file,) as file:
        lines = file.readlines()
        ACCESS_KEY_ID_LINE_NUM = 2
        SECRET_ACCESS_KEY_ID_LINE_NUM = 3
        access_key_id = lines[ACCESS_KEY_ID_LINE_NUM]
        secret_access_key = lines[SECRET_ACCESS_KEY_ID_LINE_NUM]
        
    return access_key_id, secret_access_key    
    
def send_file_to_s3(file_path, file_name): 
    #Creating Session With Boto3.
    #Creating S3 Resource From the Session.
    # Fetch S3 credentials from credentials file
    access_key_id, secret_access_key = get_creds()
    s3 = boto3.Session(
        aws_access_key_id = access_key_id,
        aws_secret_access_key = secret_access_key
    ).resource('s3')
    #     'https://d3-ai-orion-analysis.s3.amazonaws.com/'
    result = s3.Bucket(BUCKET).upload_file(file_path, file_name)
    print(result)    

def load_file_from_s3(file_name):
    # Fetch S3 credentials from credentials file
    access_key_id, secret_access_key = get_creds()
    s3 = boto3.Session(
        aws_access_key_id = access_key_id,
        aws_secret_access_key = secret_access_key
    ).resource('s3')
    
    file = s3.Object(BUCKET, file_name).get()['Body'].read()
    
    return file

In [5]:
def get_anomalies_and_viz_and_pickle(orion, train_data, pkl_file_substring):
    anomalies, viz = orion.detect(train_data, visualization=True)
    
    anomalies_path = os.path.join(os.path.abspath('.'), pkl_file_substring + "-anomalies.pkl")
    gen_time_series_path = os.path.join(os.path.abspath('.'), pkl_file_substring + "-generated_timeseries.pkl")

    # Creates local file path to load from
    with open(anomalies_path, 'wb') as new_file:
        pickle.dump(anomalies, new_file)
        # Send Anomalies to S3
#         send_file_to_s3(new_file, anomalies_path)
    with open(gen_time_series_path, 'wb') as new_file:
        pickle.dump(viz, new_file)
        # Send VIZ to S3
#         send_file_to_s3(new_file, get_time_series_path)
    
    return anomalies, viz

In [6]:
# Variables
# Num Epochs
# Strat
def get_date_index():
    step = 300
    initial_time = 1222819200
    
    time_struct = time.localtime()
    month, day = time_struct.tm_mon, time_struct.tm_mday
    hour, minute, sec = time_struct.tm_hour, time_struct.tm_min, time_struct.tm_sec
        
    ## CONVERT TO INTEGER WITH TIME STAMP
    index = month * 30 * 3600 * 24 + day * 3600 * 24 + hour * 3600 + minute * 60 + sec 
    index = index % step
    date_index =  str(index * step + initial_time)
    return date_index

def create_pipeline_file_name(pipeline_name):
    date_index = get_date_index()
    pkl_file_name = pipeline_name + date_index
    pkl_file_name += ".pkl"
    
    return pkl_file_name

def single_epoch_pipeline(num_epochs, train_data, s3_path = None):
    pipeline_name = 'lstm_dynamic_threshold'
    ## WARNING: OS DEPENDENT- WINDOWS = \\, LINUX = /
    ##################################################
    pkl_file_name = create_pipeline_file_name(pipeline_name)
    new_path = os.path.join(os.path.abspath('.'), pkl_file_name)
    
    print(new_path)
    
    if s3_path:
        # Load pkl file content from S3
        pkl_file = load_file_from_s3(s3_path)
        # Creates local file path to load from
        with open(new_path, 'wb') as new_file:
            pickle.dump(pkl_file, new_file)   
        
        orion.load(new_path)
        
    elif s3_path is None:
        # initialize pipeline if no previous pipeline data
        hyperparameters = {
            'keras.Sequential.LSTMTimeSeriesRegressor#1': {
                'epochs': 1,
                'verbose': True
            }  
        }
#         orion = Orion(
#             pipeline='lstm_dynamic_threshold_viz',
#             hyperparameters=hyperparameters
#         )
        lstm_viz_path = "C:\\Users\\rarh9\\Desktop\\MIT\\UROPs\\Orion\\orion_venv\\Lib\\site-packages\\orion\\pipelines\\verified\\lstm_dynamic_threshold\\lstm_dynamic_threshold_viz"        
        
        orion = Orion(lstm_viz_path, hyperparameters)
        # Creates local file path to load from
        orion.fit(train_data)
        orion.save(new_path)
    
    # Caching and then deleting 
    
    
    for i in range(num_epochs):
        orion = Orion.load(new_path)
        orion.fit(train_data)
        
        ## Extract MSE, F1 scores from here
        ## Maybe use Vizualization

        orion.save(new_path)
    pkl_file_name = 'd3-ai-orion-' + create_pipeline_file_name(pipeline_name)
    anomalies, viz = get_anomalies_and_viz_and_pickle(orion, train_data, pkl_file_name[:-4])
    # Slightly different pipeline file name to distinguish S3 files from local files
    # Upload to S3
    # Send PKL file to S3 Bucket
#     send_file_to_s3(new_path, pkl_file_name)        
    
    return orion 
            

In [7]:
orion = single_epoch_pipeline(1, train_data)


C:\Users\rarh9\Desktop\MIT\UROPs\Orion\orion_venv\MSE_F1\lstm_dynamic_threshold1222852200.pkl


Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Train on 7919 samples, validate on 1980 samples
Epoch 1/1
{'layers': [{'class': 'keras.layers.LSTM', 'parameters': {'input_shape': 'input_shape', 'units': 'lstm_1_units', 'return_sequences': True}}, {'class': 'keras.layers.Dropout', 'parameters': {'rate': 'dropout_1_rate'}}, {'class': 'keras.layers.LSTM', 'parameters': {'units': 'lstm_2_units', 'return_sequences': False}}, {'class': 'keras.layers.Dropout', 'parameters': {'rate': 'dropout_2_rate'}}, {'class': 'keras.layers.Dense', 'parameters': {'units': 'dense_units', 'activation': 'linear'}}], 'optimizer': <class 'keras.optimizers.Adam'>, 'loss': <function mean_squared_error at 0x0000029F9B6255E8>, 'metrics': ['mse'], 'epochs': 1, 'verbose': True, 'classification': False, 'hyperparameters': {'input_shape': None, 'dense_units': 1, 'return_sequences': False, 'lstm_1_units': 80, 'dropout_1_rate': 0.3, 'lstm_2_units': 80, 'dropout_2_rate': 0.3}, 'validation_

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train on 7919 samples, validate on 1980 samples
Epoch 1/1
{'layers': [{'class': 'keras.layers.LSTM', 'parameters': {'input_shape': 'input_shape', 'units': 'lstm_1_units', 'return_sequences': True}}, {'class': 'keras.layers.Dropout', 'parameters': {'rate': 'dropout_1_rate'}}, {'class': 'keras.layers.LSTM', 'parameters': {'units': 'lstm_2_units', 'return_sequences': False}}, {'class': 'keras.layers.Dropout', 'parameters': {'rate': 'dropout_2_rate'}}, {'class': 'keras.layers.Dense', 'parameters': {'units': 'dense_units', 'activation': 'linear'}}], 'optimizer': <class 'keras.optimizers.Adam'>, 'loss': <function mean_squared_error at 0x0000029F9B6255E8>, 'metrics': ['mse'], 'epochs': 1, 'verbose': True, 'classification': False, 'hyperparameters': {'input_shape': None, 'dense_units': 1, 'return_sequences': False, 'lstm_1_units': 80, 'dropout_1_rate': 0.3, 'lstm_2_units': 80, 'dropout_2_rate': 0.3}, 'validation_split': 0.2, 'batch_size': 64, 'shuffle': True, 'callbacks': [{'class': <class 'ke

In [19]:
print(anomalies)

        start         end  severity
0  1228219200  1229493600  0.721846


In [18]:
print(len(viz['generated_timeseries']))

9899


In [None]:
os.path.join(os.path.abspath('.'), create_pipeline_file_name(pipeline_name))

In [15]:
viz

{'generated_timeseries': array([[-0.03341556],
        [-0.04529629],
        [-0.03904901],
        ...,
        [-0.06958377],
        [-0.07055108],
        [-0.0603055 ]], dtype=float32)}

In [39]:
# send_file_to_s3("example.py", "ExampleFile.py")

example_file = load_file_from_s3("ExampleFile.py")

HTTPClientError: An HTTP Client raised an unhandled exception: Invalid header value b'AWS4-HMAC-SHA256 Credential=accessKeyId\n/20220302/us-east-1/s3/aws4_request, SignedHeaders=host;x-amz-content-sha256;x-amz-date, Signature=4936767c5fe094e88b7515824ce6715cc65bab01778b5446564d793af0216d96'

In [23]:
# # Create an S3 client
# s3_client = boto3.client('s3')

# # Call to S3 to retrieve the policy for the given bucket
# result = s3_client.get_bucket_acl(Bucket='d3-ai-orion-analysis')
# print(result)

In [None]:

# """Orion Core module.

# This module defines the Orion Class, which is responsible for the
# main anomaly detection functionality, as well as the interaction
# with the underlying MLBlocks pipelines.
# """
# import json
# import logging
# import os
# import pickle
# from typing import List, Union

# import pandas as pd
# from mlblocks import MLPipeline

# from orion.evaluation import CONTEXTUAL_METRICS as METRICS

# LOGGER = logging.getLogger(__name__)


# class Orion:
#     """Orion Class.

#     The Orion Class provides the main anomaly detection functionalities
#     of Orion and is responsible for the interaction with the underlying
#     MLBlocks pipelines.

#     Args:
#         pipeline (str, dict or MLPipeline):
#             Pipeline to use. It can be passed as:
#                 * An ``str`` with a path to a JSON file.
#                 * An ``str`` with the name of a registered pipeline.
#                 * An ``MLPipeline`` instance.
#                 * A ``dict`` with an ``MLPipeline`` specification.
#         hyperparameters (dict):
#             Additional hyperparameters to set to the Pipeline.
#     """

#     PIPELINES_DIR = tuple(
#         dirname
#         for dirname, _, _ in os.walk(os.path.join(os.path.dirname(__file__), 'pipelines'))
#         if os.path.exists(os.path.join(dirname, os.path.basename(dirname) + '.json'))
#     )
#     PIPELINES = tuple(os.path.basename(pipeline) for pipeline in PIPELINES_DIR)

#     DEFAULT_PIPELINE = 'lstm_dynamic_threshold'

#     def _get_mlpipeline(self):
#         pipeline = self._pipeline
#         if isinstance(pipeline, str) and os.path.isfile(pipeline):
#             with open(pipeline) as json_file:
#                 pipeline = json.load(json_file)

#         mlpipeline = MLPipeline(pipeline)
#         if self._hyperparameters:
#             mlpipeline.set_hyperparameters(self._hyperparameters)

#         return mlpipeline

#     def __init__(self, pipeline: Union[str, dict, MLPipeline] = None,
#                  hyperparameters: dict = None):
#         self._pipeline = pipeline or self.DEFAULT_PIPELINE
#         self._hyperparameters = hyperparameters
#         self._mlpipeline = self._get_mlpipeline()
#         self._fitted = False


#     def __eq__(self, other):
#         return (
#             isinstance(other, self.__class__) and
#             self._pipeline == other._pipeline and
#             self._hyperparameters == other._hyperparameters and
#             self._fitted == other._fitted
#         )

#     def fit(self, data: pd.DataFrame, **kwargs):
#         """Fit the pipeline to the given data.

#         Args:
#             data (DataFrame):
#                 Input data, passed as a ``pandas.DataFrame`` containing
#                 exactly two columns: timestamp and value.
#         """
#         if not self._fitted:
#             self._mlpipeline = self._get_mlpipeline()

#         self._mlpipeline.fit(data, **kwargs)
#         self._fitted = True


#     def _get_outputs_spec(self):
#         outputs_spec = ["default"]
#         try:
#             visualization_outputs = self._mlpipeline.get_output_names('visualization')
#             outputs_spec.append('visualization')
#         except ValueError:
#             visualization_outputs = []

#         return outputs_spec, visualization_outputs

#     @staticmethod
#     def _build_events_df(events):
#         events = pd.DataFrame(list(events), columns=['start', 'end', 'severity'])
#         events['start'] = events['start'].astype('int64')
#         events['end'] = events['end'].astype('int64')

#         return events

#     def _detect(self, method, data, visualization=False, **kwargs):
#         if visualization:
#             outputs_spec, visualization_names = self._get_outputs_spec()
#         else:
#             outputs_spec = 'default'

#         outputs = method(data, output_=outputs_spec, **kwargs)

#         if visualization:
#             if visualization_names:
#                 events = outputs[0]
#                 visualization_outputs = outputs[-len(visualization_names):]
#                 visualization_dict = dict(zip(visualization_names, visualization_outputs))
#             else:
#                 events = outputs
#                 visualization_dict = {}

#             return self._build_events_df(events), visualization_dict

#         return self._build_events_df(outputs)

#     def detect(self, data: pd.DataFrame, visualization: bool = False) -> pd.DataFrame:
#         """Detect anomalies in the given data..

#         If ``visualization=True``, also return the visualization
#         outputs from the MLPipeline object.

#         Args:
#             data (DataFrame):
#                 Input data, passed as a ``pandas.DataFrame`` containing
#                 exactly two columns: timestamp and value.
#             visualization (bool):
#                 If ``True``, also capture the ``visualization`` named
#                 output from the ``MLPipeline`` and return it as a second
#                 output.

#         Returns:
#             DataFrame or tuple:
#                 If visualization is ``False``, it returns the events
#                 DataFrame. If visualization is ``True``, it returns a
#                 tuple containing the events DataFrame followed by the
#                 visualization outputs dict.
#         """
#         return self._detect(self._mlpipeline.predict, data, visualization)


#     def fit_detect(self, data: pd.DataFrame, visualization: bool = False,
#                    **kwargs) -> pd.DataFrame:
#         """Fit the pipeline to the data and then detect anomalies.

#         This method is functionally equivalent to calling ``fit(data)``
#         and later on ``detect(data)`` but with the difference that
#         here the ``MLPipeline`` is called only once, using its ``fit``
#         method, and the output is directly captured without having
#         to execute the whole pipeline again during the ``predict`` phase.

#         If ``visualization=True``, also return the visualization
#         outputs from the MLPipeline object.

#         Args:
#             data (DataFrame):
#                 Input data, passed as a ``pandas.DataFrame`` containing
#                 exactly two columns: timestamp and value.
#             visualization (bool):
#                 If ``True``, also capture the ``visualization`` named
#                 output from the ``MLPipeline`` and return it as a second
#                 output.

#         Returns:
#             DataFrame or tuple:
#                 If visualization is ``False``, it returns the events
#                 DataFrame. If visualization is ``True``, it returns a
#                 tuple containing the events DataFrame followed by the
#                 visualization outputs dict.
#         """
#         if not self._fitted:
#             self._mlpipeline = self._get_mlpipeline()

#         result = self._detect(self._mlpipeline.fit, data, visualization, **kwargs)
#         self._fitted = True

#         return result

#     def save(self, path: str):
#         """Save this object using pickle.

#         Args:
#             path (str):
#                 Path to the file where the serialization of
#                 this object will be stored.
#         """
#         os.makedirs(os.path.dirname(path), exist_ok=True)
#         with open(path, 'wb') as pickle_file:
#             pickle.dump(self, pickle_file)


#     @classmethod
#     def load(cls, path: str):
#         """Load an Orion instance from a pickle file.

#         Args:
#             path (str):
#                 Path to the file where the instance has been
#                 previously serialized.

#         Returns:
#             Orion

#         Raises:
#             ValueError:
#                 If the serialized object is not an Orion instance.
#         """
#         with open(path, 'rb') as pickle_file:
#             orion = pickle.load(pickle_file)
#             if not isinstance(orion, cls):
#                 raise ValueError('Serialized object is not an Orion instance')

#             return orion


#     def evaluate(self, data: pd.DataFrame, ground_truth: pd.DataFrame, fit: bool = False,
#                  train_data: pd.DataFrame = None, metrics: List[str] = METRICS) -> pd.Series:
#         """Evaluate the performance against ground truth anomalies.

#         Args:
#             data (DataFrame):
#                 Input data, passed as a ``pandas.DataFrame`` containing
#                 exactly two columns: timestamp and value.
#             ground_truth (DataFrame):
#                 Ground truth anomalies passed as a ``pandas.DataFrame``
#                 containing two columns: start and stop.
#             fit (bool):
#                 Whether to fit the pipeline before evaluating it.
#                 Defaults to ``False``.
#             train_data (DataFrame):
#                 Training data, passed as a ``pandas.DataFrame`` containing
#                 exactly two columns: timestamp and value.
#                 If not given, the pipeline is fitted on ``data``.
#             metrics (list):
#                 List of metrics to used passed as a list of strings.
#                 If not given, it defaults to all the Orion metrics.

#         Returns:
#             Series:
#                 ``pandas.Series`` containing one element for each
#                 metric applied, with the metric name as index.
#         """
#         if not fit:
#             method = self._mlpipeline.predict
#         else:
#             if not self._fitted:
#                 mlpipeline = self._get_mlpipeline()

#             if train_data is not None:
#                 # Fit first and then predict
#                 mlpipeline.fit(train_data)
#                 method = mlpipeline.predict
#             else:
#                 # Fit and predict at once
#                 method = mlpipeline.fit

#         events = self._detect(method, data)

#         scores = {
#             metric: METRICS[metric](ground_truth, events, data=data)
#             for metric in metrics
#         }

#         return pd.Series(scores)
