In [None]:
import pickle 
import stumpy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Data Pre-Processing

## Reading Raw Data Files and Joining Them

In [None]:
# Reading Sensors Data that has Location and sensortype_id

sensor_df = pd.read_csv('./Data/Sensor.csv')

In [None]:
# Reading SensorType Data that has sensortype_id, sensor name and sensor type

sensor_type_df = pd.read_csv('./Data/SensorType.csv')

In [None]:
# Selecting only Relevant Columns from Sensor Data (sensor_df)

sensor_df = sensor_df[['id', 'sensortype_id', 'location_identifier']]

In [None]:
# Renaming Relevant Columns from Sensor Data (sensor_df)

sensor_df.columns = ['sensor_id', 'sensortype_id', 'location_identifier']

In [None]:
# Joining Sensor Data (sensor_df) and SensorType Data (sensor_type_df) based on the sensortype_id

sensor_data = sensor_df.merge(sensor_type_df, left_on='sensortype_id', right_on='id')

In [None]:
# Displaying Sensor Data after the join

sensor_data

In [None]:
# Selecting only Relevant Columns from Joined Sensor Data and SensorType Data (sensor_data)

sensor_data = sensor_data[['sensor_id', 'location_identifier', 'name', 'type']]

In [None]:
# Reading SensorHistory Data (sensor_history_df) that has sensor_id, Timestamp, sensor_reading and id

sensor_history_df = pd.read_csv("./Data/SensorDataHistorylarge.csv", header=None)

In [None]:
# Renaming Relevant Columns from SensorHistory Data (sensor_history_df)

sensor_history_df.columns = ["reading_id", "sensor_id", "value", "value1", "value2", "Timestamp"]

In [None]:
# Dropping 2 Columns (value1 and value2) from SensorHistory Data (sensor_history_df)
# as all the values in these columns were missing (NaN)

sensor_history_df.drop(["value1", "value2"], inplace=True, axis=1)

In [None]:
# Joining Previously Joined Sensor Data and SensorType Data (sensor_data) with our 
# SensorHistory Data (sensor_history_df) based on the sensor_id

sensor_data = sensor_history_df.merge(sensor_data, on='sensor_id')

In [None]:
# Converting Timestamp Column of Joined Data (sensor_data) from string type to datetime type 

sensor_data['Timestamp'] = sensor_data['Timestamp'].astype('datetime64[s]')

In [None]:
# Sorting the sensor_data by Timestamp column

sensor_data = sensor_data.sort_values(by='Timestamp')

## Creating Time Series DataFrame of Sensor Readings From Sensor Data

In [None]:
# Re-applying the datetime conversion of the Timestamp column (a no-op when the conversion cell above has already run)

sensor_data['Timestamp'] = sensor_data['Timestamp'].astype('datetime64[s]')

In [None]:
# Splitting sensor_data into one DataFrame per sensor_id, keeping only the Timestamp and value (sensor_reading) columns.
# This creates a dictionary (dfs) where key is the sensor_id and value is a dataframe containing rows
# that have only this specific sensor_id.
# A single groupby pass replaces one full boolean scan of sensor_data per sensor;
# sort=False keeps the sensors in order of first appearance, as unique() did.

dfs = {
    sensor_id: sensor_rows[["Timestamp", "value"]]
    for sensor_id, sensor_rows in tqdm(sensor_data.groupby("sensor_id", sort=False))
}

In [None]:
# Looping Over all the key, value pairs in the dictionary(dfs) and renaming the dataframe on the value of this dict  
for key, value in tqdm(dfs.items()):
    value.columns = ["Timestamp", key]

In [None]:
# Converting Dictionary of Dataframes to a list of dfs

dfs = [value for value in dfs.values()]

In [None]:
# Downsampling every per-sensor DataFrame to 5-minute buckets, taking the mean reading of each bucket

new_dfs = []
for df in tqdm(dfs):
    df = df.sort_values(by='Timestamp')
    # '5min' is the supported spelling of the offset; the '5T' alias is deprecated in recent pandas
    df = df.resample('5min', on="Timestamp").mean()
    new_dfs.append(df)

In [None]:
# Joining First 2 values in the list(new_dfs) on Timestamp

outer_join_df = pd.merge(new_dfs[0], new_dfs[1], on="Timestamp", how="outer")

In [None]:
# Joining rest of the values from 2 to onwards in the list(new_dfs) on Timestamp

for value in tqdm(new_dfs[2:]):
    outer_join_df = pd.merge(outer_join_df, value, on='Timestamp', how="outer")

In [None]:
# Displaying the final result of join performed above
outer_join_df

In [None]:
# Create a summary DataFrame from the time series DataFrame (outer_join_df)

summary = outer_join_df.describe()

In [None]:
# Displaying Summary DataFrame created from outer_join_df 
summary

In [None]:
# Creating a list of Sensor_ids where total number of Non-null values are less than 50
invaluable_cols = [summary_col for summary_col in summary.columns if summary[summary_col]['count'] < 50]

In [None]:
# Displaying the count of total useless sensor_ids
len(invaluable_cols)

In [None]:
# Removing all rows from sensor_data that belong to the useless sensor_ids.
# One vectorised isin() mask replaces re-filtering the whole frame once per sensor_id.

sensor_data = sensor_data[~sensor_data['sensor_id'].isin(invaluable_cols)]

In [None]:
# Dropping all the useless columns/sensor_ids from the Time Series DataFrame
outer_join_df.drop(invaluable_cols, inplace=True, axis=1)

In [None]:
outer_join_df

In [None]:
# Saving sensor_data as a csv file in the Processed_Data Directory

sensor_data.to_csv('./Processed_Data/Sensors_Data.csv', index=False)

In [None]:
# Saving the Time Series DataFrame as a csv file in Processed Data Directory

outer_join_df.to_csv("./Processed_Data/Final_Sensor_Time_Series.csv")

## Reading Processed Data File and Create Different Representations

In [None]:
# Reading sensor_data from csv file Sensor_Data.csv in the Processed_Data Directory

sensor_data = pd.read_csv('./Processed_Data/Sensors_Data.csv')

In [None]:
# Displaying sensor_data read from csv file Sensor_Data.csv in the Processed_Data Directory

sensor_data

In [None]:
# Creating Representation of SensorType Counts in Location
# Grouping Data First on SensorType (name) and then on Location (location_identifier)

sensor_location_type_count = sensor_data.groupby(['name', "location_identifier"])['sensor_id'].nunique()

In [None]:
# Unstacking the multiple level of grouping

sensor_location_type_count = sensor_location_type_count.unstack()

In [None]:
# Replacing the NaN values with 0
sensor_location_type_count.fillna(0, inplace=True)

In [None]:
# Displaying the Representation
sensor_location_type_count

In [None]:
# # Saving the Representation of Sensortype Counts
sensor_location_type_count.to_csv('./Processed_Data/Sensor_Location_Type_Count.csv')

In [None]:
# Mapping every sensor type to the list of distinct sensor names that have this type
sensor_type_names = sensor_data.groupby(['type'])['name'].unique()

sensor_type_names_dict = {
    sensor_type: names.tolist()
    for sensor_type, names in sensor_type_names.items()
}

In [None]:
# Saving Dictionary of sensor names
with open('./Processed_Data/sensor_type_names_dict.pkl', 'wb') as f:
    pickle.dump(sensor_type_names_dict, f)

In [None]:
# Creating Representation of Sensor Ids for each Sensor Type in Location
# Grouping Data First on SensorType(name) and then on Location(location_identifier)

sensor_location_type_ids = sensor_data.groupby(['name', "location_identifier"])['sensor_id'].unique()

In [None]:
# Unstacking the multiple level of grouping

sensor_location_type_ids = sensor_location_type_ids.unstack()

In [None]:
# Adding one extra row per sensor type (named '<type>_list', e.g. 'bool_list' and 'decimal_list').
# For every location column the new row holds the flattened list of sensor_ids (as strings)
# of all sensor names that belong to this sensor type.

for column in sensor_location_type_ids.columns:
    for key, values in sensor_type_names_dict.items():
        # `values` are the sensor names of type `key`; only cells holding an ndarray of ids are
        # flattened, anything else in the selected cells is skipped by the isinstance filter
        sensor_location_type_ids.loc[f'{key}_list', column] = [f"{item}" for sublist in sensor_location_type_ids.loc[values, column] if isinstance(sublist, np.ndarray) for item in sublist]
    

In [None]:
# Displaying the Representation

sensor_location_type_ids

In [None]:
# Saving the Representation 
sensor_location_type_ids.to_csv('./Processed_Data/Sensor_Location_Type_Ids.csv')

## Applying Interpolation to Impute Missing Values

In [None]:
# Reading Time Series DataFrame from Processed Data Directory
final_data = pd.read_csv("./Processed_Data/Final_Sensor_Time_Series.csv", index_col='Timestamp', parse_dates=True)

In [None]:
# Reading the Representation of Sensor Ids for each Sensor Type in Location from Processed_Data Directory
sensor_location_type_ids = pd.read_csv('./Processed_Data/Sensor_Location_Type_Ids.csv', index_col='name')

In [None]:
# Reading Dictionary of sensor names for a specific Sensor Type to pickle file in Processed_Data Directory
with open('./Processed_Data/sensor_type_names_dict.pkl', 'rb') as f:
    sensor_type_names_dict = pickle.load(f)

In [None]:
# Converting the boolean columns/sensor_ids of location 'B-1 407' to 0./1. flags:
# a reading of 0 becomes False, any other reading becomes True and missing values stay NaN.
# NOTE(review): eval() executes whatever text is stored in the CSV cell; this is only
# acceptable because the file is produced by this notebook itself.

bool_columns = eval(sensor_location_type_ids.loc['bool_list', 'B-1 407'])

for column in bool_columns:
    conditions = [(final_data[column] == 0.), (~final_data[column].isna())]
    values = [False, True]
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0
    final_data[column] = np.select(conditions, values, default=np.nan)

In [None]:
# Getting list of sensor_ids/columns from sensor_location_type_ids
columns = []
for key in sensor_type_names_dict.keys():
    columns.extend(eval(sensor_location_type_ids.loc[f'{key}_list', 'B-1 407']))

In [None]:
# Creating a Mapping of Sensor id to Device Type

sensors_id_type = sensor_location_type_ids.loc[:,'B-1 407'].to_dict()

sensor_id_type_mapping = {}
for key, values in sensors_id_type.items():
    # Rows whose name contains 'list' are the aggregated per-type rows; NaN cells are not str
    if isinstance(values, str) and 'list' not in key:
        # The per-name cells were saved as numpy array text such as "[101 102]" (space
        # separated, no commas), which eval() cannot parse once a name has more than one
        # sensor. Split the text into id tokens instead of evaluating it.
        for value in values.strip('[]').replace(',', ' ').split():
            sensor_id_type_mapping[value.strip("'\"")] = key

In [None]:
# Plotting the Selected Columns as Time Series

plt.rc('figure', figsize=(15, 8))
for column in columns:
    final_data[column].plot(label=sensor_id_type_mapping[column])
    plt.ylabel(column, fontsize=10)
    plt.legend()
    plt.show()

In [None]:
# Imputing missing values for the selected columns:
#   * boolean columns: forward fill, then backward fill for the leading gap
#   * other columns with gaps: STL seasonal decomposition (period = 12*24 samples) of a
#     linearly interpolated copy; the seasonal component is removed, the remainder is
#     linearly interpolated and the seasonal component is added back
#   * if no seasonal component is available: plain linear interpolation
# .ffill() / .bfill() replace the deprecated fillna(method=...) spelling.

for column in bool_columns:
    final_data[column] = final_data[column].ffill()
    final_data[column] = final_data[column].bfill()

for column in columns:
    if final_data[column].isna().any():
        # STL cannot handle NaN, so it is fitted on a gap-free copy of the column
        res = STL(final_data[column].interpolate(method='linear').bfill(), seasonal=15, period=12*24).fit()
        seasonal_component = res.seasonal

        if not seasonal_component.isna().all():
            deseasonalized_data = final_data[column] - seasonal_component
            deseasonalized_data_imputed = deseasonalized_data.interpolate(method='linear')
            values = (deseasonalized_data_imputed + seasonal_component).bfill()
            final_data[column] = values

        else:
            deseasonalized_data_imputed = final_data[column].interpolate(method='linear')
            final_data[column] = deseasonalized_data_imputed

In [None]:
# Plotting the Selected Columns 
plt.rc('figure', figsize=(15, 8))
for column in columns:
    final_data[column].plot(label=sensor_id_type_mapping[column])
    plt.ylabel(column, fontsize=10)
    plt.legend()
    plt.show()

In [None]:
# Creating a list of all the boolean columns/sensor_ids 
all_bool_columns = [f"{item}" for sublist in sensor_location_type_ids.loc['bool_list', :] 
                                                       if isinstance(eval(sublist), list) 
                                                       for item in eval(sublist)]


In [None]:
# Converting all the boolean columns/sensor_ids to 0./1. flags
#
# If a value is 0 it is set as False, any other non-missing value is set as True and NaN stays NaN

for column in tqdm(all_bool_columns):
    conditions = [(final_data[column] == 0.), (~final_data[column].isna())]
    values = [False, True]
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0
    final_data[column] = np.select(conditions, values, default=np.nan)

In [None]:
# Applying forward filling and then backward filling to all the boolean columns
# (.ffill() / .bfill() replace the deprecated fillna(method=...) spelling)
for column in tqdm(all_bool_columns):
    final_data[column] = final_data[column].ffill()
    final_data[column] = final_data[column].bfill()

In [None]:
# Imputing every non-boolean column:
#   * STL seasonal decomposition (period = 12*24 samples) of a linearly interpolated copy;
#     the seasonal component is removed, the remainder is linearly interpolated and the
#     seasonal component is added back
#   * if no seasonal component is available: plain linear interpolation
# Remaining leading gaps are backward filled in both cases.
# .bfill() replaces the deprecated fillna(method='bfill') spelling.
columns = final_data.columns
columns = list(set(columns) - set(all_bool_columns))

for column in tqdm(columns):
    # STL cannot handle NaN, so it is fitted on a gap-free copy of the column
    res = STL(final_data[column].interpolate(method='linear').bfill(), seasonal=15, period=12*24).fit()
    seasonal_component = res.seasonal

    if not seasonal_component.isna().all():
        deseasonalized_data = final_data[column] - seasonal_component
        deseasonalized_data_imputed = deseasonalized_data.interpolate(method='linear')
        values = (deseasonalized_data_imputed + seasonal_component).bfill()
        final_data[column] = values

    else:
        deseasonalized_data_imputed = final_data[column].interpolate(method='linear')
        final_data[column] = deseasonalized_data_imputed.bfill()

In [None]:
# Checking if decimal Columns have any NaN values left

for column in tqdm(columns):
    if final_data[column].isna().any():
        print(column)

In [None]:
# Checking if Time for the Minimum DateTime is 00:00AM if not then select data from the subsequent date

first_date = final_data.index.min()

if not first_date.time() == pd.to_datetime('00:00:00').time():
    final_data = final_data[final_data.index.date > first_date.date()]

In [None]:
final_data

In [None]:
# Saving the Time Series DataFrame as a csv file in Processed Data Directory

final_data.to_csv("./Processed_Data/Final_Sensor_Time_Series_Imputed.csv")

# Create Multidimensional Matrix Profile (MMP)

In [None]:
# Reading the Representation of Sensor Ids for each Sensor Type in Location from Processed_Data Directory

sensor_location_type_ids = pd.read_csv('./Processed_Data/Sensor_Location_Type_Ids.csv', index_col='name')

In [None]:
# Reading Dictionary of sensor names for a specific Sensor Type to pickle file in Processed_Data Directory

with open('./Processed_Data/sensor_type_names_dict.pkl', 'rb') as f:
    sensor_type_names_dict = pickle.load(f)

In [None]:
location_input = input("Enter a Specific Location For MMP: ")

In [None]:
# Getting list of sensor_ids/columns from sensor_location_type_ids

columns = []
for key in sensor_type_names_dict.keys():
    columns.extend(eval(sensor_location_type_ids.loc[f'{key}_list', location_input]))
    
columns.append('Timestamp')

In [None]:
# Reading Time Series DataFrame from Processed Data Directory for user selected Location only

final_data = pd.read_csv("./Processed_Data/Final_Sensor_Time_Series_Imputed.csv", usecols=columns, 
                         index_col='Timestamp', parse_dates=True)


In [None]:
# Displaying Time Series DataFrame for selected Location

final_data

In [None]:
# Creating a Mapping of Sensor id to Device Type for a Specific User Selected Location

sensors_id_type = sensor_location_type_ids.loc[:,location_input].to_dict()

sensor_id_type_mapping = {}
for key, values in sensors_id_type.items():
    # Rows whose name contains 'list' are the aggregated per-type rows; NaN cells are not str
    if isinstance(values, str) and 'list' not in key:
        # The per-name cells were saved as numpy array text such as "[101 102]" (space
        # separated, no commas), which eval() cannot parse once a name has more than one
        # sensor. Split the text into id tokens instead of evaluating it.
        for value in values.strip('[]').replace(',', ' ').split():
            sensor_id_type_mapping[value.strip("'\"")] = key

In [None]:
sensor_id_type_mapping

In [None]:
# Calculating Multidimensional Matrix Profile
# m is the subsequence (window) length in samples; the series was resampled to 5-minute
# steps earlier in this notebook, so 2016 samples span 7 days.
# NOTE(review): `mstump_m` is not a stumpy function I can confirm from the public API (the
# documented one is `stumpy.mstump`) - presumably a project-specific build; verify.
m = 2016

mps, indices = stumpy.mstump_m(final_data, m)

In [None]:
# Displaying the shape of MultiDimensional Matrix Profile 

mps.shape

In [None]:
# Getting the motifs_idx based on the standard np.argmin method 
motifs_idx = np.argmin(mps, axis=1)

In [None]:
# Displaying the motifs of each dimension

motifs_idx

In [None]:
# Getting the nearest neighbours of each individual motifs_idx

nn_idx = indices[np.arange(len(motifs_idx)), motifs_idx]

In [None]:
# Resetting the date index 
df = final_data.reset_index(drop=True)

In [None]:
# Plotting the dimensions of the time series dataframe df

fig, axs = plt.subplots(mps.shape[0] * 2, sharex=True, gridspec_kw={'hspace': 0}, figsize=(15, 20))

for k, dim_name in enumerate(df.columns):
    axs[k].set_ylabel(dim_name, fontsize=10)
    axs[k].set_xlabel('Time', fontsize=10) 
    axs[k].plot(df[dim_name], label=sensor_id_type_mapping[dim_name])
    axs[k].legend(loc="upper right")
    axs[k].plot(range(motifs_idx[k], motifs_idx[k] + m), df[dim_name].iloc[motifs_idx[k] : motifs_idx[k] + m], c='red', linewidth=4)
    axs[k].plot(range(nn_idx[k], nn_idx[k] + m), df[dim_name].iloc[nn_idx[k] : nn_idx[k] + m], c='red', linewidth=4)
    
    axs[k].axvline(x=motifs_idx[k], linestyle="dashed", c='black')
    axs[k].axvline(x=nn_idx[k], linestyle="dashed", c='black')

    axs[k + mps.shape[0]].set_ylabel(f"P_{dim_name}", fontsize=10)
    axs[k + mps.shape[0]].plot(mps[k], c='orange')
    axs[k + mps.shape[0]].set_xlabel('Time', fontsize=10)    
    
    axs[k + mps.shape[0]].axvline(x=motifs_idx[k], linestyle="dashed", c='black')
    axs[k + mps.shape[0]].axvline(x=nn_idx[k], linestyle="dashed", c='black')    
    
    axs[k + mps.shape[0]].plot(motifs_idx[k], mps[k, motifs_idx[k]] + 1, marker="v", markersize=10, color='red')
    axs[k + mps.shape[0]].plot(nn_idx[k], mps[k, nn_idx[k]] + 1, marker="v", markersize=10, color='red')

plt.show()



In [None]:
# Getting motifs_idx based on a threshold: every (dimension, position) whose matrix-profile
# value is below the threshold.
# The profile holds floating-point values, so the threshold is parsed as a float
# (int() rejected fractional inputs such as "1.5").
threshold = float(input("Enter a Specific Threshold Value For Motif Selection: "))
motifs_idx = np.argwhere(mps < threshold)

In [None]:
# Grouping the thresholded (dimension, position) pairs into a dict: dimension -> list of positions

motifs = {}
for dimension, position in motifs_idx:
    motifs.setdefault(dimension, []).append(position)

In [None]:
# Thinning the thresholded positions: keep every m-th entry of each dimension's list.
# The per-dimension list has its own name, so this cell no longer overwrites the
# notebook-level motifs_idx array (which made the cells above fail when re-run).

update_motifs = {key: positions[::m] for key, positions in motifs.items()}

In [None]:
# Pairing every motif position in update_motifs with its nearest neighbour. A position that was
# already recorded as the neighbour of an earlier motif of the same dimension is skipped.

final_nn = {}
final_motifs = {}

for key, values in update_motifs.items():
    seen_neighbours = set()
    for value in values:
        if value in seen_neighbours:
            continue
        neighbour = indices[key, value]
        final_nn.setdefault(key, []).append(neighbour)
        final_motifs.setdefault(key, []).append(value)
        seen_neighbours.add(neighbour)


In [None]:
# Displaying the final_motifs dict
# Key is the dimension index of mps like 0, 1, 2

final_motifs

In [None]:
# Displaying Nearest Neighbour of Each Motif Selected based on user defined threshold 
final_nn

In [None]:
# Plotting every dimension of the time series dataframe df (upper half of the figure) above its
# matrix profile (lower half), marking each selected motif and its nearest neighbour
fig, axs = plt.subplots(mps.shape[0] * 2, sharex=True, gridspec_kw={'hspace': 0}, figsize=(15, 20))

for k, dim_name in enumerate(df.columns):
    profile_ax = axs[k + mps.shape[0]]  # matrix-profile axis that belongs to dimension k

    axs[k].set_ylabel(dim_name, fontsize=10)
    axs[k].set_xlabel('Time', fontsize=10)
    axs[k].plot(df[dim_name], label=sensor_id_type_mapping[dim_name])
    axs[k].legend(loc="upper right")

    if final_motifs.get(k) and final_nn.get(k):
        # motif_start / nn_start are deliberately not called motifs_idx / nn_idx, so the loop
        # does not overwrite the notebook-level variables of those names
        for pair_no, (motif_start, nn_start) in enumerate(zip(final_motifs.get(k), final_nn.get(k)), start=1):

            axs[k].plot(df[dim_name].iloc[motif_start : motif_start + m], c='red', linewidth=4)
            axs[k].plot(df[dim_name].iloc[nn_start : nn_start + m], c='red', linewidth=4)
            axs[k].axvline(x=motif_start, linestyle="dashed", c='black')
            axs[k].axvline(x=nn_start, linestyle="dashed", c='black')

            # markers sit one unit above the profile so they do not cover the curve
            profile_ax.plot(motif_start, mps[k, motif_start] + 1, marker="v", markersize=10, color='red')
            profile_ax.plot(nn_start, mps[k, nn_start] + 1, marker="v", markersize=10, color='red')

            profile_ax.axvline(x=motif_start, linestyle="dashed", c='black')
            profile_ax.axvline(x=nn_start, linestyle="dashed", c='black')

            profile_ax.text(motif_start, mps[k][motif_start], f"{pair_no}m", fontsize="xx-large")
            # NOTE(review): the neighbour label is drawn at the motif's profile height, not
            # the neighbour's - confirm this is intended
            profile_ax.text(nn_start, mps[k][motif_start], f"{pair_no}n", fontsize="xx-large")

    profile_ax.set_ylabel(f"P_{dim_name}", fontsize=10)
    profile_ax.plot(mps[k], c='orange')
    profile_ax.set_xlabel('Time', fontsize=10)

plt.show()



# Threshold/Exclusion Zone For Dimensions

In [None]:
# Reading the Representation of Sensor Ids for each Sensor Type in Location from Processed_Data
sensor_location_type_ids = pd.read_csv('./Processed_Data/Sensor_Location_Type_Ids.csv', index_col='name')

In [None]:
# Reading Dictionary of sensor names for a specific Sensor Type to pickle file in Processed_Data 
with open('./Processed_Data/sensor_type_names_dict.pkl', 'rb') as f:
    sensor_type_names_dict = pickle.load(f)

In [None]:
location_input = input("Enter a Specific Location For MMP: ")

In [None]:
columns = []
for key in sensor_type_names_dict.keys():
    columns.extend(eval(sensor_location_type_ids.loc[f'{key}_list', location_input]))
    
columns.append('Timestamp')

In [None]:
# Reading Time Series DataFrame from Processed Data Directory for user selected Location only

final_data = pd.read_csv("./Processed_Data/Final_Sensor_Time_Series_Imputed.csv", usecols=columns, 
                         index_col='Timestamp', parse_dates=True)


In [None]:
# Calculating Multidimensional Matrix Profile
m = 2016

mps, indices = stumpy.mstump_m(final_data, m)

In [None]:
# Getting the motifs_idx based on the standard np.argmin


motifs_idx = np.argmin(mps, axis=1)

In [None]:
# Getting the nearest neighbours

nn_idx = indices[np.arange(len(motifs_idx)), motifs_idx]

In [None]:
mdls, subspaces = stumpy.mdl(final_data, m, motifs_idx, nn_idx)

In [None]:
mdls

In [None]:
plt.plot(np.arange(len(mdls)), mdls, c='red', linewidth='4')
plt.xlabel('k (zero-based)', fontsize='20')
plt.ylabel('Bit Size', fontsize='20')
plt.xticks(range(mps.shape[0]))
plt.show()

In [None]:
subspaces

In [None]:
final_data

In [None]:
k = 3 

In [None]:
columns = final_data.columns[subspaces[k]].tolist()

In [None]:
columns

In [None]:
final_data = final_data[columns]

In [None]:
final_data

In [None]:
# Calculating Multidimensional Matrix Profile For a weekly rhythm where subsequent length m=2016
m = 2016

mps, indices = stumpy.mstump_m(final_data, m)

In [None]:
# Further Processing the motifs 

def get_percent_motif_start_index(motifs, step=2016):
    """Return the index labels found at every ``step``-th position of ``motifs``.

    Parameters
    ----------
    motifs : pandas.Series
        Series whose index labels are the candidate start positions
        (in this notebook: the matrix-profile values below the motif threshold).
    step : int, default 2016
        Distance, counted in entries of ``motifs``, between two picked labels.
        The default equals the subsequence length used throughout the notebook.

    Returns
    -------
    list
        ``motifs.index[0]``, ``motifs.index[step]``, ``motifs.index[2 * step]``, ...
        An empty list when ``motifs`` is empty.
    """
    return [motifs.index[position] for position in range(0, len(motifs), step)]

In [None]:
# Further Processing the motifs

def get_motif_start_index(motifs):
    """Pick entries of ``motifs`` while stepping through it in strides of 2016.

    ``motifs`` must support ``len()`` and integer lookup via ``motifs[...]``; the caller in
    this notebook passes the result of ``np.argsort`` on one matrix-profile column.

    One entry is appended per stride. Until the switch described below happens, the entry
    at the stride position itself (``motifs[i]``) is taken. As soon as a taken entry plus
    2016 exceeds ``len(motifs)``, the distance ``len(motifs) - motifs[i]`` is remembered in
    ``value`` and every later stride takes the entry at ``previous stride position + value``
    instead.

    Returns the list of picked entries.
    """
    final_motifs = []
    j = 0
    check = False
    
    for i in range(0, len(motifs), 2016):
        # before the switch: look up the stride position itself
        if not check:
            j = i

        final_motifs.append(motifs[j])

        # after the switch: prepare the shifted lookup position for the NEXT stride
        if check:
            j = i + value

        # the switch: the entry at this stride lies within 2016 of len(motifs)
        elif motifs[i] + 2016 > len(motifs):
            value = len(motifs) - motifs[i]
            j = i + value
            check = True
            
                
    return final_motifs


In [None]:
# A function that would take in percentage for discord and percentage for motif 

def select_motifs_discords_percentage(mps, dimension, motif_percentage, discord_percentage, motifs=None, discords=None):
    """Select the motifs of one dimension as the lowest ``motif_percentage`` percent of its profile.

    Parameters
    ----------
    mps : pandas.Series
        Matrix-profile values of a single dimension.
    dimension : hashable
        Key under which the selected motifs are stored in ``motifs``.
    motif_percentage : int or float
        Percentage (0-100); values strictly below this quantile of ``mps`` are motif candidates.
    discord_percentage : int or float
        Accepted for interface compatibility. NOTE: discord selection is not implemented,
        ``discords`` is returned as passed in.
    motifs, discords : dict, optional
        Dicts to update and return. A new empty dict is created per call when omitted.

    Returns
    -------
    tuple of (dict, dict)
        ``(motifs, discords)``; ``motifs[dimension]`` is only set when at least one value
        lies below the motif threshold.
    """
    # Fresh dicts per call: the former `{}` defaults were shared between all calls
    if motifs is None:
        motifs = {}
    if discords is None:
        discords = {}

    motif_threshold = mps.quantile(motif_percentage / 100)
    motif = mps[mps < motif_threshold]

    if len(motif):
        motifs[dimension] = get_percent_motif_start_index(motif)

    return motifs, discords

In [None]:
# Select the Upper K of all points and Lowest J of all points as discords and motifs from mps

def select_top_k_motifs_discords(mps, dimension, k_motifs, k_discords, motifs=None, discords=None):
    """Select up to ``k_motifs`` motifs of one dimension from its argsorted profile.

    Parameters
    ----------
    mps : pandas.Series
        Matrix-profile values of a single dimension.
    dimension : hashable
        Key under which the selected motifs are stored in ``motifs``.
    k_motifs : int
        Maximum number of motifs to keep.
    k_discords : int
        Accepted for interface compatibility. NOTE: discord selection is not implemented,
        ``discords`` is returned as passed in.
    motifs, discords : dict, optional
        Dicts to update and return. A new empty dict is created per call when omitted.

    Returns
    -------
    tuple of (dict, dict)
        ``(motifs, discords)`` with ``motifs[dimension]`` set.
    """
    # Fresh dicts per call: the former `{}` defaults were shared between all calls
    if motifs is None:
        motifs = {}
    if discords is None:
        discords = {}

    # Positions of the profile values in ascending order (stable for ties)
    sorted_mps = np.argsort(mps, kind='stable')

    motifs[dimension] = get_motif_start_index(sorted_mps)[:k_motifs]

    return motifs, discords

In [None]:
# Asking, for every dimension, which selection function to apply and with which parameters.
# Dimensions answered with anything other than 1 or 2 are left out of the result.
motif_thresholds_for_all_dimensions = {}

for dimension in range(mps.shape[0]):
    function = input(f"Select A Function To Be Applied to Dimension {dimension}\n"
                     f"\tPress 1 For Selection Based on Percentage\n "
                     f"\tPress 2 For Top K Motifs and Discords Selection: ")
    choice = int(function)

    if choice == 1:
        motif_percentage = input(f"\tEnter a Specific Threshold Value For Motif Selection For Dimension {dimension}: ")
        discord_percentage = input(f"\tEnter a Specific Threshold Value For Discord Selection For Dimension {dimension}: ")
        motif_thresholds_for_single_dimension = {
            "function": choice,
            "motif_percentage": int(motif_percentage),
            "discord_percentage": int(discord_percentage),
        }

    elif choice == 2:
        k_motif = input(f"\tEnter Top K Motif Selection For Dimension {dimension}: ")
        k_discord = input(f"\tEnter Top K Discord Selection For Dimension {dimension}: ")
        motif_thresholds_for_single_dimension = {
            "function": choice,
            "k_motifs": int(k_motif),
            "k_discords": int(k_discord),
        }

    else:
        continue

    motif_thresholds_for_all_dimensions[dimension] = motif_thresholds_for_single_dimension

In [None]:
# Displaying Dictionary of Each Dimensions Function and Function Parameters 

motif_thresholds_for_all_dimensions

In [None]:
# Calculating Motifs and Discords 
motifs = {}
discords = {}

mps_df = pd.DataFrame(mps).T

for key, value in motif_thresholds_for_all_dimensions.items():
    if value.get('function') == 1:
        motifs, discords = select_motifs_discords_percentage(mps_df[key], key, value["motif_percentage"], 
                                                             value["discord_percentage"], motifs, discords)
    elif value.get('function') == 2:
        motifs, discords = select_top_k_motifs_discords(mps_df[key], key, value['k_motifs'], 
                                                        value['k_discords'], motifs, discords)
        

In [None]:
# Displaying Each Motif Selected based on user defined Parameters
motifs

In [None]:
# Pairing every selected motif position in the motifs dict with its nearest neighbour. A position
# that was already recorded as the neighbour of an earlier motif of the same dimension is skipped.

final_nn = {}
final_motifs = {}

for key, values in motifs.items():
    seen_neighbours = set()
    for value in values:
        if value in seen_neighbours:
            continue
        neighbour = indices[key, value]
        final_nn.setdefault(key, []).append(neighbour)
        final_motifs.setdefault(key, []).append(value)
        seen_neighbours.add(neighbour)


In [None]:
# Displaying Each Motif Selected based on user defined Parameters For Each Dimension After filtering

final_motifs

In [None]:
# Displaying Nearest Neighbour of Each Motif Selected based on user defined Parameters For Each Dimension

final_nn

In [None]:
df = final_data.reset_index(drop=True)

In [None]:
# Creating a Mapping of Sensor id to Device Type for the user selected Location
sensors_id_type = sensor_location_type_ids.loc[:,location_input].to_dict()

sensor_id_type_mapping = {}
for key, values in sensors_id_type.items():
    # Rows whose name contains 'list' are the aggregated per-type rows; NaN cells are not str
    if isinstance(values, str) and 'list' not in key:
        # The per-name cells were saved as numpy array text such as "[101 102]" (space
        # separated, no commas), which eval() cannot parse once a name has more than one
        # sensor. Split the text into id tokens instead of evaluating it.
        for value in values.strip('[]').replace(',', ' ').split():
            sensor_id_type_mapping[value.strip("'\"")] = key

In [None]:
# Plotting every dimension of the time series dataframe df (upper half of the figure) above its
# matrix profile (lower half), marking each selected motif and its nearest neighbour
fig, axs = plt.subplots(mps.shape[0] * 2, sharex=True, gridspec_kw={'hspace': 0}, figsize=(15, 20))

for k, dim_name in enumerate(df.columns):
    profile_ax = axs[k + mps.shape[0]]  # matrix-profile axis that belongs to dimension k

    axs[k].set_ylabel(dim_name, fontsize=10)
    axs[k].set_xlabel('Time', fontsize=10)
    axs[k].plot(df[dim_name], label=sensor_id_type_mapping[dim_name])
    axs[k].legend(loc="upper right")

    if final_motifs.get(k) and final_nn.get(k):
        # motif_start / nn_start are deliberately not called motifs_idx / nn_idx, so the loop
        # does not overwrite the notebook-level arrays that the stumpy.mdl cell relies on
        for pair_no, (motif_start, nn_start) in enumerate(zip(final_motifs.get(k), final_nn.get(k)), start=1):

            axs[k].plot(df[dim_name].iloc[motif_start : motif_start + m], c='red', linewidth=4)
            axs[k].plot(df[dim_name].iloc[nn_start : nn_start + m], c='red', linewidth=4)
            axs[k].axvline(x=motif_start, linestyle="dashed", c='black')
            axs[k].axvline(x=nn_start, linestyle="dashed", c='black')

            # markers sit one unit above the profile so they do not cover the curve
            profile_ax.plot(motif_start, mps[k, motif_start] + 1, marker="v", markersize=10, color='red')
            profile_ax.plot(nn_start, mps[k, nn_start] + 1, marker="v", markersize=10, color='red')

            profile_ax.axvline(x=motif_start, linestyle="dashed", c='black')
            profile_ax.axvline(x=nn_start, linestyle="dashed", c='black')

            profile_ax.text(motif_start, mps[k][motif_start], f"{pair_no}m", fontsize="xx-large")
            # NOTE(review): the neighbour label is drawn at the motif's profile height, not
            # the neighbour's - confirm this is intended
            profile_ax.text(nn_start, mps[k][motif_start], f"{pair_no}n", fontsize="xx-large")

    profile_ax.set_ylabel(f"P_{k}", fontsize=10)
    profile_ax.plot(mps[k], c='orange')
    profile_ax.set_xlabel('Time', fontsize=10)

plt.show()
