In [None]:
!pip install pymongo



In [None]:
# Standard Library Imports
import os
import json
import re
import pickle
import hashlib
from datetime import datetime
from bson.objectid import ObjectId
from hashlib import sha256
# Third-Party Imports
import numpy as np
import pandas as pd
import requests
from requests.auth import HTTPBasicAuth
from scipy.stats import zscore, entropy
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, InvalidName, PyMongoError
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.exceptions import NotFittedError
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, roc_auc_score, make_scorer, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from spacy.lang.en.stop_words import STOP_WORDS
import spacy

# Keras/TensorFlow Imports
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# LangChain Imports
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.output_parsers import StrOutputParser

# # Other Library Imports
# from sklearnex import patch_sklearn
# import pygwalker as pyg
# import sweetviz as sv
import calendar

import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import networkx as nx

from matplotlib import cm
cmap = cm.get_cmap('Set1')
from plotly.subplots import make_subplots

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Matches numeric dates written as d-m-yyyy / d/m/yyyy or yyyy-m-d / yyyy/m/d.
date_pattern = r'(?:\d{1,2}[-/]\d{1,2}[-/]\d{4})|(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})'

# Column names that are treated as datetime columns by the helpers below.
# NOTE: 'iothubenqueuedtime' and 'timestamp' are two separate entries; a
# missing comma previously fused them into one unusable keyword.
datetime_keywords = {
    'rx.ts', 'info.datecreated', 'date', 'time',
    'iothubenqueuedtime', 'timestamp', 'utc', 'published',
    'publishedat', 'payload.publishedat', 'datecreated',
    'payload.timestamp', 'noted_date', 'created_at',
    'updated_at', 'modified', 'expires', 'expiry_date',
    'accessed_at', 'deleted_at', 'published_on', 'event_time',
    'transaction_time', 'log_time', 'start_date', 'start_time', 'end_date',
    'end_time', 'recorded_at', 'received_at', 'sent_at',
}

# Default collection name and the list of selectable database labels.
dataselection = 'npksensor'
databaseoptions = ["Industry 4.0", "Cisco", "MDS", "Vertiv", "Smart City", "Smart Farm"]



def convert_to_contiguous_array(s):
    """Parse a comma-separated string of numbers into a C-contiguous array.

    :param s: Text such as ``"1,2,3"``.
    :return: The parsed NumPy array, or ``None`` if parsing raised (the
        offending input and the error are printed in that case).
    """
    try:
        parsed = np.fromstring(s, sep=',')
        result = np.ascontiguousarray(parsed)
    except Exception as err:
        print(f"Error converting: {s}")
        print(err)
        result = None
    return result




  cmap = cm.get_cmap('Set1')


# Mongo DB Stuff

In [None]:
def mongoDBConnection(meip):
    """Create a MongoDB client and return handles to the databases used here.

    :param meip: Connection string (or host) passed straight to ``MongoClient``.
    :return: Tuple ``(selected_db, dev_db, Modelsdb, Models_evaluation,
        Anomalydb, vector_db, db_client)`` where the first six are database
        handles and the last is the client itself.
    """
    client = MongoClient(meip)

    # The database handles are bound unconditionally. They used to sit behind
    # an ``if client:`` guard, which would have ended in an UnboundLocalError
    # at the return statement had the guard ever been falsy.
    selected_db = client["sensorsdb"]
    dev_db = client["dev_db"]
    Modelsdb = client["Modelsdb"]
    Models_evaluation = client["Models_evaluation"]
    Anomalydb = client["Anomaly_db"]
    vector_db = client["document_embeddings"]

    return selected_db, dev_db, Modelsdb, Models_evaluation, Anomalydb, vector_db, client




def save_to_mongo(dataframe, dataset_name, collection_suffix, db_connection):
    """Replace the contents of a collection with the rows of ``dataframe``.

    :param dataframe: DataFrame whose rows become the stored documents.
    :param dataset_name: First part of the collection name.
    :param collection_suffix: Appended to ``dataset_name`` to form the name.
    :param db_connection: Database handle, indexed by collection name.
    """
    collection = db_connection[f"{dataset_name}{collection_suffix}"]
    # Serialise first so a conversion failure cannot leave the collection empty.
    records = dataframe.to_dict('records')

    collection.delete_many({})  # Clear the collection before saving
    # insert_many rejects an empty document list; an empty frame therefore
    # simply leaves the collection cleared.
    if records:
        collection.insert_many(records)


def load_from_mongo(dataset_name, collection_suffix, db_connection):
    """Load a whole collection into a DataFrame, without the ``_id`` column.

    The collection name is ``dataset_name`` followed by ``collection_suffix``.
    An empty DataFrame is returned when the collection holds no documents or
    when any error occurs while reading; a message is printed in both cases.
    """
    collection = db_connection[f"{dataset_name}{collection_suffix}"]

    try:
        has_documents = collection.count_documents({}) != 0
        if has_documents:
            frame = pd.DataFrame(list(collection.find()))
            frame.drop(columns=['_id'], inplace=True, errors='ignore')
            return frame

        print('Data not found. Please first run the appropriate section.')
        return pd.DataFrame()
    except Exception as e:
        print(f'Error loading data from MongoDB: {e}')
        return pd.DataFrame()

def model_hash(model):
    """Return an MD5 hex digest built from the model's class name and parameters.

    The digest input is the class name immediately followed by the string form
    of ``model.get_params()``.

    NOTE(review): a second ``model_hash`` defined further down this file
    rebinds the name, so this version is not the one in effect afterwards.
    """
    class_name = str(model.__class__.__name__)
    params_text = str(model.get_params())
    fingerprint = (class_name + params_text).encode()
    return hashlib.md5(fingerprint).hexdigest()

def save_sklearn_scaler_to_mongo(artifact, artifact_type, db):
    """Pickle an artifact and insert or replace it in ``db.models``.

    A stored document is matched on the artifact's class name and its
    ``model_hash`` value; a match is replaced, otherwise a new document is
    inserted. Any exception is caught and reported with ``print``.

    :param artifact: Object to pickle and store.
    :param artifact_type: Label stored in the document's ``type`` field.
    :param db: Database handle exposing a ``models`` collection.
    """
    try:
        class_name = artifact.__class__.__name__
        digest = model_hash(artifact)
        lookup = {'class': class_name, 'hash': digest}

        document = {
            'class': class_name,
            'hash': digest,
            'type': artifact_type,
            'artifact_data': pickle.dumps(artifact),
            'saved_on': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

        if db.models.find_one(lookup) is None:
            # Nothing stored under this class/hash yet.
            db.models.insert_one(document)
            print(f"Saved new {artifact_type} as {class_name} with hash {digest}.")
        else:
            # Overwrite the stored copy.
            db.models.replace_one(lookup, document)
            print(f"Updated {artifact_type} as {class_name} with hash {digest}.")

    except Exception as e:
        print(f"Error saving {artifact_type}: {e}")



def model_hash(model):
    """Return the SHA-256 hex digest of the pickled ``model``."""
    payload = pickle.dumps(model)
    digest = sha256(payload)
    return digest.hexdigest()

def save_sklearn_model_to_mongo(model, db, classificationReport):
    """Pickle a model and insert or replace it in ``db.models``.

    A stored document is matched on the model's class name and its
    ``model_hash`` value; a match is replaced, otherwise a new document is
    inserted. Any exception is caught and reported with ``print``.

    :param model: Model object to pickle and store.
    :param db: Database handle exposing a ``models`` collection.
    :param classificationReport: Currently unused. TODO (from the original
        author): save the new model only if its classification report
        outperforms the historical model.
    """
    try:
        class_name = model.__class__.__name__
        digest = model_hash(model)
        lookup = {'class': class_name, 'hash': digest}

        document = {
            'class': class_name,
            'hash': digest,
            'model_data': pickle.dumps(model),
            'saved_on': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }

        if db.models.find_one(lookup) is None:
            # Nothing stored under this class/hash yet.
            db.models.insert_one(document)
            print(f"Saved new model {class_name} with hash {digest}.")
        else:
            # Overwrite the stored copy.
            db.models.replace_one(lookup, document)
            print(f"Updated model {class_name} with hash {digest}.")

    except Exception as e:
        print(f"Error saving model: {e}")



def clean_column_names(columns):
    """Shorten flattened column names to their trailing field name.

    For names containing ``filtered/application/<n>/device/<id>/event/up.`` the
    text after that prefix is kept; otherwise the text after the last ``/``.
    In both cases everything up to and including the first ``.`` of what
    remains is then removed.

    :param columns: Iterable of column-name strings.
    :return: List of cleaned names, in the same order.
    """
    pattern = re.compile(r'filtered/application/\d+/device/[^/]+/event/up\.(.+)')

    def _shorten(name):
        hit = pattern.search(name)
        tail = hit.group(1) if hit else name.rsplit("/", 1)[-1]
        # With no dot present, split() returns the name unchanged.
        return tail.split('.', 1)[-1]

    return [_shorten(name) for name in columns]

def mongodbdatasets(iotdatabase, nameOfCollection: str):
    """Load a MongoDB collection into a flattened, cleaned DataFrame.

    Steps: flatten the documents with ``json_normalize``; expand the
    ``payload.objectJSON`` column (JSON text or objects) into columns; explode
    and expand any ``.../event/up.rxInfo`` columns; convert columns to numeric
    where the whole column converts; shorten and lower-case the column names.

    :param iotdatabase: Database handle, indexed by collection name.
    :param nameOfCollection: Name of the collection to read.
    :return: The resulting DataFrame, or ``None`` if any step raised (the
        error is printed).
    """
    try:
        data = list(iotdatabase[nameOfCollection].find())
        df = pd.json_normalize(data)

        # Convert JSON strings to objects where applicable
        if 'payload.objectJSON' in df.columns:
            df['payload.objectJSON'] = df['payload.objectJSON'].apply(
                lambda x: json.loads(x) if isinstance(x, str) else x)
            payloads = df['payload.objectJSON'].dropna()
            object_df = pd.json_normalize(payloads.tolist())
            # json_normalize numbers its rows 0..n-1. Restore the labels of the
            # rows the payloads came from so the join lines up even when some
            # rows had no payload.
            if len(object_df) == len(payloads):
                object_df.index = payloads.index
            df = df.drop(columns='payload.objectJSON').join(object_df, lsuffix='_left')

        # Regex pattern for rxInfo columns
        rxInfo_pattern = re.compile(r'filtered/application/\d+/device/.+/event/up\.rxInfo')
        rxInfo_columns = [col for col in df.columns if rxInfo_pattern.match(col)]

        for rxInfo_column in rxInfo_columns:
            # Safe JSON parsing
            df[rxInfo_column] = df[rxInfo_column].apply(
                lambda x: json.loads(x) if isinstance(x, str) else x)

            # Explode and normalize rxInfo columns
            exploded_df = df.explode(rxInfo_column).reset_index(drop=True)
            entries = exploded_df[rxInfo_column].dropna()
            normalized_df = pd.json_normalize(entries.tolist())
            # Same alignment concern as above: keep the exploded row labels.
            if len(normalized_df) == len(entries):
                normalized_df.index = entries.index
            df = exploded_df.drop(columns=rxInfo_column).join(normalized_df, rsuffix='_rxInfo')

        # Attempt to convert columns to numeric, leaving non-numeric ones as is
        for col in df.columns:
            if not df.empty and not isinstance(df[col].iloc[0], ObjectId):
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    # Column is not fully numeric: keep it unchanged. This is
                    # what the deprecated errors='ignore' option used to do.
                    pass
                except Exception as e:
                    print(f"Failed to convert column {col} to numeric. Error: {e}")

        cleaned_columns = clean_column_names(df.columns)
        df.columns = cleaned_columns
        df.columns = df.columns.str.lower()

        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return None



In [None]:
def identify_datetime_cols(df, datetime_keywords):
    """List the columns that are candidates for datetime conversion.

    A column is selected when its dtype is ``datetime64[ns]``, when its dtype
    is ``object``, or when its name is in ``datetime_keywords``.

    NOTE(review): every object-dtype column qualifies regardless of its name
    or contents, so ordinary text columns are returned as well.

    :param df: DataFrame to inspect.
    :param datetime_keywords: Collection of column names treated as datetimes.
    :return: Selected column names, in DataFrame column order.
    """
    selected = []
    for name in df.columns:
        try:
            kind = df[name].dtype
            if kind == 'datetime64[ns]' or kind == 'object' or name in datetime_keywords:
                selected.append(name)
        except Exception as e:
            # Inspection failed for this column; report it and move on.
            print(f'{e}')

    return selected

def identify_and_convert_datetime_columns(df, datetime_keywords):
    """Convert candidate datetime columns in place and pick a main one.

    Candidates come from ``identify_datetime_cols``. Each is parsed with
    ``pd.to_datetime(errors='coerce')``. A candidate in which no value parses
    is left untouched and is not reported as a datetime column; previously
    such a column was overwritten with all-missing values and could even be
    chosen as the main datetime column.

    Converted timezone-naive columns are stored as ``'%Y-%m-%d %H:%M:%S'``
    strings (sub-second precision is dropped by that format). The first
    converted column is the main datetime column and is parsed back to
    datetime.

    :param df: DataFrame to convert (modified in place).
    :param datetime_keywords: Column names treated as datetime columns.
    :return: ``(df, datetime_cols, main_datetime_col)``; the last item is
        ``None`` when no column could be converted.
    """
    candidates = identify_datetime_cols(df, datetime_keywords)

    datetime_cols = []
    for col in candidates:
        try:
            converted = pd.to_datetime(df[col], errors='coerce')
        except Exception as e:
            print(f"Error converting column '{col}' to datetime format: {e}")
            continue

        # Nothing parsed as a date: not a datetime column, keep the original data.
        if not converted.notna().any():
            continue

        if pd.api.types.is_datetime64_dtype(converted):
            # Accept any timezone-naive datetime resolution, not just ns.
            df[col] = converted.dt.strftime('%Y-%m-%d %H:%M:%S')
        else:
            df[col] = converted
        datetime_cols.append(col)

    # Compare against None so a falsy column label (0, '') is still honoured.
    main_datetime_col = datetime_cols[0] if datetime_cols else None
    if main_datetime_col is not None:
        df[main_datetime_col] = pd.to_datetime(df[main_datetime_col])

    return df, datetime_cols, main_datetime_col


def drop_unwanted_columns(df: pd.DataFrame):
    """Return ``df`` without id/index bookkeeping columns.

    Removes ``_id``, ``index``, ``Unnamed: 0``, ``level_0`` and ``unnamed: 0``
    where present; missing names are ignored. The input is not modified.
    """
    unwanted = ('_id', 'index', 'Unnamed: 0', 'level_0', 'unnamed: 0')
    return df.drop(columns=list(unwanted), errors='ignore')


def create_features(df, datecol=None, label=None):
    """Add calendar features derived from a datetime column (in place).

    Columns added, in this order:

    - ``timeofday``: ``'morning'`` for hours 5-12, ``'afternoon'`` for 13-17,
      ``'evening'`` otherwise
    - ``month``: lower-case month name
    - ``season``: Winter (Dec-Feb), Spring (Mar-May), Summer (Jun-Aug),
      Fall (Sep-Nov)
    - ``dayofweek``: lower-case weekday name
    - ``hour``: hour of day

    Rows whose date is missing get missing values in every added column
    instead of raising.

    :param df: DataFrame to extend; modified in place and returned.
    :param datecol: Name of the column holding the timestamps.
    :param label: Unused; kept for call compatibility.
    :return: ``df`` with the feature columns added.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
    }

    def _time_of_day(hour):
        # Same bucketing that was effective before (the later of the two
        # former assignments), plus explicit handling of missing hours.
        if pd.isna(hour):
            return np.nan
        if 5 <= hour <= 12:
            return 'morning'
        if 13 <= hour <= 17:
            return 'afternoon'
        return 'evening'

    def _season(month):
        # The month arrives as a float when the column contains missing dates.
        return np.nan if pd.isna(month) else season_by_month[int(month)]

    df[datecol] = pd.to_datetime(df[datecol])
    stamps = df[datecol].dt

    df['timeofday'] = stamps.hour.apply(_time_of_day)
    df['month'] = stamps.month_name().str.lower()
    df['season'] = stamps.month.apply(_season)
    df['dayofweek'] = stamps.day_name().str.lower()
    df['hour'] = stamps.hour

    return df



def best_fill_na_method(column):
    """Pick the fill strategy that changes the column's sum the least.

    Candidates are tried in the order ``mean``, ``median``, ``zero``,
    ``bfill``, ``ffill``; the first one with the smallest absolute difference
    between the filled sum and the original sum wins.

    ``bfill`` and ``ffill`` are applied as real backward/forward fills.
    Previously the method name itself was passed as the fill *value*, which
    put strings into the numeric column and broke the sum.

    NOTE(review): filling with zero never changes the sum, so ``zero`` is
    chosen whenever the mean or median fill does not also give a zero
    difference.

    :param column: Numeric pandas Series, possibly with missing values.
    :return: Name of the chosen method, or ``None`` if no candidate yields a
        finite difference.
    """
    fillers = {
        'mean': lambda s: s.fillna(s.mean()),
        'median': lambda s: s.fillna(s.median()),
        'zero': lambda s: s.fillna(0),
        'bfill': lambda s: s.bfill(),
        'ffill': lambda s: s.ffill(),
    }

    original_total = column.sum()
    best_method = None
    best_difference = float('inf')

    for method, fill in fillers.items():
        difference = abs(fill(column).sum() - original_total)
        if difference < best_difference:
            best_difference = difference
            best_method = method

    return best_method

def fill_missing_values(df):
    """Fill missing values in every numeric column of ``df``.

    The strategy for each column is chosen by ``best_fill_na_method``. Filled
    columns are assigned back to ``df``, because chained in-place ``fillna``
    on a selected column is not guaranteed to update the frame.

    :param df: DataFrame to fill; modified and returned.
    :return: ``df`` with numeric columns filled.
    """
    for col_name in df.select_dtypes(include=[np.number]).columns:
        series = df[col_name]
        best_method = best_fill_na_method(series)

        if best_method == 'zero':
            df[col_name] = series.fillna(0)
        elif best_method in ('mean', 'median'):
            df[col_name] = series.fillna(getattr(series, best_method)())
        elif best_method == 'bfill':
            df[col_name] = series.bfill()
        elif best_method == 'ffill':
            df[col_name] = series.ffill()
        # best_method is None: no usable strategy, leave the column as is.
    return df


def remove_low_variance_features(dataset):
    """Drop columns with at most two distinct values, except protected ones.

    Columns named in the exclusion list (identifiers, coordinates, labels and
    alarm/target columns) are always kept. A column whose distinct values
    cannot be counted (for example unhashable cell values) is kept and a
    message is printed.

    :param dataset: DataFrame to prune; modified in place and returned.
    :return: ``dataset`` without the low-variance columns.
    """
    exclusion_list = {'deveui', 'devicename', 'devicetype', 'latitude', 'longitude', 'gatewayid',
                      'fire alarm', 'occupancy', 'hour', 'alarm', 'label', 'target', "Fire_Alarm"}

    to_drop = []
    for col in dataset.columns:
        if col in exclusion_list:
            continue
        try:
            if dataset[col].nunique() <= 2:
                to_drop.append(col)
        except Exception as e:
            print(f"Could not count unique values in column {col}; keeping it. Error: {e}")

    # Drop once, after the scan, rather than mutating the frame mid-iteration.
    if to_drop:
        dataset.drop(columns=to_drop, inplace=True)

    return dataset


def train_test_split_datetime(df, date_column, test_size=0.2):
    """Split ``df`` chronologically into an earlier and a later part.

    ``date_column`` is converted to datetime in place on ``df``; the rows are
    then sorted by it and the last ``test_size`` fraction becomes the test set.

    :raises ValueError: If ``date_column`` is not a column of ``df``.
    :return: ``(train, test)`` DataFrames.
    """
    if date_column in df.columns:
        df[date_column] = pd.to_datetime(df[date_column])
        ordered = df.sort_values(by=date_column)
        cut = int((1 - test_size) * len(ordered))
        earlier, later = ordered[:cut], ordered[cut:]
        return earlier, later

    raise ValueError(f"{date_column} column not found in the DataFrame.")


def split_data_by_datetime(x, y, date_column, test_size=0.4):
    """Split features and targets into train/test parts ordered by a date column.

    ``x`` is sorted by ``date_column`` and ``y`` is reindexed to follow the
    same row order; the first ``1 - test_size`` fraction of rows is the
    training part.

    :raises ValueError: If ``date_column`` is not a column of ``x``.
    :return: ``(x_train, x_test, y_train, y_test)``.
    """
    if date_column not in x.columns:
        raise ValueError(f"{date_column} column not found in the DataFrame.")

    ordered_x = x.sort_values(by=date_column)
    ordered_y = y.reindex(ordered_x.index)
    cut = int(len(x) * (1 - test_size))

    train_rows, test_rows = slice(None, cut), slice(cut, None)
    return (ordered_x.iloc[train_rows], ordered_x.iloc[test_rows],
            ordered_y.iloc[train_rows], ordered_y.iloc[test_rows])


def scale_ml_data(data, main_datetime_col):
    """Scale numeric columns with ``RobustScaler`` on a re-indexed copy.

    The returned frame is indexed by ``main_datetime_col``. Numeric columns
    not named in the exclusion list are replaced by their scaled values; all
    other columns are preserved. The caller's DataFrame is left untouched
    (it used to be re-indexed and scaled in place, which silently altered
    data the caller still treated as raw).

    :param data: Input DataFrame containing ``main_datetime_col``.
    :param main_datetime_col: Column to use as the index of the result.
    :return: ``(scaled_frame, scaler_info)`` where ``scaler_info`` is a dict
        with the scaler under ``'scaler'`` and the scaled column names under
        ``'columns'``. The scaler is left unfitted when there is nothing to
        scale.
    """
    exclusion_list = ['deveui', 'devicename', 'devicetype', 'latitude', 'longitude', 'gatewayid',
                      'fire alarm', 'occupancy', 'hour', 'alarm', 'label', 'target', "Fire_Alarm",
                      'frequency']

    # set_index without inplace returns a new frame, leaving the input intact.
    data = data.set_index(main_datetime_col)
    numeric_cols = [col for col in data.select_dtypes(include=['number']).columns
                    if col not in exclusion_list]

    scaler = RobustScaler()
    # Fitting on zero columns raises, so only scale when there is something to scale.
    if numeric_cols:
        data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

    scaler_info = {'scaler': scaler, 'columns': numeric_cols}

    return data, scaler_info

def save_to_mongo(dataframe, dataset_name, collection_suffix, db_connection):
    """Replace the contents of a collection with the rows of ``dataframe``.

    :param dataframe: DataFrame whose rows become the stored documents.
    :param dataset_name: First part of the collection name.
    :param collection_suffix: Appended to ``dataset_name`` to form the name.
    :param db_connection: Database handle, indexed by collection name.
    """
    collection = db_connection[f"{dataset_name}{collection_suffix}"]
    # Serialise first so a conversion failure cannot leave the collection empty.
    records = dataframe.to_dict('records')

    collection.delete_many({})  # Clear the collection before saving
    # insert_many rejects an empty document list; an empty frame therefore
    # simply leaves the collection cleared.
    if records:
        collection.insert_many(records)



In [None]:

def split_and_store_data(df, datetime_col, dataset_name, db_connection):
    """Scale the data, split raw and scaled versions by time, and store them.

    Collections written to ``db_connection`` (names prefixed with
    ``dataset_name``): ``_train``, ``_test``, ``Scaled_train`` and
    ``Scaled_test``. The scaler information returned by ``scale_ml_data`` is
    stored through ``save_sklearn_scaler_to_mongo`` under the label
    ``<dataset_name>_scaler``.

    :param df: Processed, unscaled DataFrame containing ``datetime_col``.
    :param datetime_col: Column used for scaling's index and for the split.
    :param dataset_name: Prefix for the collection names.
    :param db_connection: Database handle to write to.
    """
    # Scale a private copy so the unscaled train/test split below is taken
    # from untouched data, whether or not the scaling helper mutates its
    # argument. Previously the "raw" split was cut from already-scaled data.
    scaled_data, scaler_info = scale_ml_data(df.copy(), datetime_col)

    # Bring the datetime index back as a column before splitting on it.
    scaled_data = scaled_data.reset_index()

    scaled_train, scaled_test = train_test_split_datetime(scaled_data, datetime_col)
    train, test = train_test_split_datetime(df, datetime_col)

    # Save to MongoDB
    scaler_name = f'{dataset_name}_scaler'
    save_sklearn_scaler_to_mongo(scaler_info, scaler_name, db_connection)

    save_to_mongo(train, dataset_name, "_train", db_connection)
    save_to_mongo(test, dataset_name, "_test", db_connection)

    save_to_mongo(scaled_train, dataset_name, "Scaled_train", db_connection)
    save_to_mongo(scaled_test, dataset_name, "Scaled_test", db_connection)

    print("Data processing complete. Train and test sets saved to MongoDB.")


def process_datafile(df, datetime_keywords):
    """Clean a raw DataFrame and derive time features from it.

    Pipeline: convert datetime columns, drop bookkeeping columns, drop
    low-variance columns, shorten and lower-case column names, fill missing
    numeric values, then add calendar features.

    :param df: Raw input DataFrame.
    :param datetime_keywords: Column names treated as datetime columns.
    :return: ``(processedNoFeats, fullyprocessed, main_datetime_col)``: the
        cleaned frame without the calendar features, a separate frame with
        them, and the (cleaned, lower-cased) name of the main datetime column.
    :raises ValueError: If no datetime column could be identified.
    """
    Facingdf, datetime_cols, main_datetime_col = identify_and_convert_datetime_columns(df, datetime_keywords)

    if main_datetime_col is None:
        raise ValueError("No datetime column could be identified in the data.")

    Facingdf = drop_unwanted_columns(Facingdf)

    Facingdf = remove_low_variance_features(Facingdf)

    Facingdf.columns = clean_column_names(Facingdf.columns)

    processedNoFeats = fill_missing_values(Facingdf)

    processedNoFeats.columns = processedNoFeats.columns.str.lower()

    # The columns were just renamed; apply the same renaming to the datetime
    # column's name so it still refers to an existing column.
    main_datetime_col = clean_column_names([main_datetime_col])[0].lower()

    # Feature creation works in place, so run it on a copy to keep the
    # feature-free frame genuinely feature-free.
    fullyprocessed = create_features(processedNoFeats.copy(), main_datetime_col)

    return processedNoFeats, fullyprocessed, main_datetime_col




def load_and_process_data(dataselection, datetime_keywords, db_connection):
    """
    Loads a collection from MongoDB, processes it, and stores split datasets back into MongoDB.

    Parameters:
    - dataselection: The name of the collection from which to load the data; also
      used as the prefix of the collections written back.
    - datetime_keywords: Set of keywords to identify datetime columns for conversion.
    - db_connection: MongoDB database handle used both for loading and for storing.

    Returns:
    - (fullyprocessed, main_datetime_col): the processed DataFrame with time
      features and the name of its main datetime column.

    Raises:
    - ValueError: if the collection could not be loaded or is empty.
    """
    df = mongodbdatasets(db_connection, dataselection)

    # The loader signals failure by returning None; stop with a clear message
    # instead of failing somewhere inside the processing steps.
    if df is None or df.empty:
        raise ValueError(f"No data could be loaded from collection '{dataselection}'.")

    processedNoFeats, fullyprocessed, main_datetime_col = process_datafile(df, datetime_keywords)

    split_and_store_data(fullyprocessed, main_datetime_col, dataselection, db_connection)

    print("Data processing and storage complete.")

    return fullyprocessed, main_datetime_col

    # except Exception as e:
    #     print(f"Processing failed: {e}")


def load_from_mongo(dataset_name, collection_suffix, db_connection):
    """Read every document of a collection into a DataFrame.

    The collection is ``dataset_name + collection_suffix``. The ``_id`` column
    is removed when present. When the collection is empty, or reading fails,
    a message is printed and an empty DataFrame is returned.
    """
    target = db_connection[f"{dataset_name}{collection_suffix}"]

    try:
        if target.count_documents({}) == 0:
            print('Data not found. Please first run the appropriate section.')
            loaded = pd.DataFrame()
        else:
            loaded = pd.DataFrame(list(target.find()))
            loaded.drop(columns=['_id'], inplace=True, errors='ignore')
        return loaded
    except Exception as e:
        print(f'Error loading data from MongoDB: {e}')
        return pd.DataFrame()


def keywordConditions():
    """Build the column-name predicates used to pick out result columns.

    :return: ``(anomaly_condition, keyword_condition, scores_condition,
        all_conditions)``. Each condition takes a column name and returns a
        truth value; ``all_conditions`` lists the three in that order.
    """
    keywords = ['Anomaly', 'error', 'anomaly', 'severity', 'optimization', 'prediction','distance']

    def anomaly_condition(col):
        # Anomaly flag/score columns and reconstruction-error columns.
        return 'Anomaly' in col or 'error' in col

    def keyword_condition(col):
        # Any column mentioning one of the keywords above.
        return any(keyword in col for keyword in keywords)

    def scores_condition(col):
        # Score columns only (plus error columns).
        return ('Anomaly' in col and 'Score' in col) or 'error' in col

    all_conditions = [anomaly_condition, keyword_condition, scores_condition]
    return anomaly_condition, keyword_condition, scores_condition, all_conditions

def extract_columns_and_data(df, anomaly_condition, keyword_condition, scores_condition):
    """Group the columns of ``df`` by which name predicate they satisfy.

    :param df: DataFrame whose column names are tested.
    :param anomaly_condition: Predicate selecting anomaly columns.
    :param keyword_condition: Predicate selecting keyword columns.
    :param scores_condition: Predicate selecting score columns.
    :return: ``(anomaly_columns, keyword_columns, scores_columns,
        collected_columns)``. The first three keep DataFrame column order;
        ``collected_columns`` is their de-duplicated union in first-seen
        order, so the result is the same on every run.
    """
    anomaly_columns = [col for col in df.columns if anomaly_condition(col)]
    keyword_columns = [col for col in df.columns if keyword_condition(col)]
    scores_columns = [col for col in df.columns if scores_condition(col)]

    # dict.fromkeys de-duplicates while keeping insertion order, unlike set().
    collected_columns = list(dict.fromkeys(anomaly_columns + keyword_columns + scores_columns))
    return anomaly_columns, keyword_columns, scores_columns, collected_columns


In [None]:

# SECURITY: the connection string (which embeds a user name and password) is
# read from the environment instead of being hard-coded in the notebook.
# Set MONGODB_URI before running this cell.
_mongo_uri = os.environ.get("MONGODB_URI")
if not _mongo_uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string.")

selected_db, dev_db, Modelsdb, Models_evaluation, Anomalydb, vector_db, db_client = mongoDBConnection(_mongo_uri)
anomaly_condition, keyword_condition, scores_condition, all_conditions = keywordConditions()

# interfacelatestdata = mongodbdatasets(cisco_db, 'interfacelatestdata')
# fullyprocessed, main_datetime_col = load_and_process_data('VertivRackDewPoint_csv', datetime_keywords, cisco_db)

# Process the local CSV export and store its train/test splits in dev_db.
sensor_data_with_fire_alarms = pd.read_csv('AirQualitySensor.csv')
dashFacingdf, fullyprocessed, main_datetime_col = process_datafile(sensor_data_with_fire_alarms, datetime_keywords)
split_and_store_data(fullyprocessed, main_datetime_col, 'AirQualitySensor', dev_db)


Updated AirQualitySensor_scaler as dict with hash 23cc58fa5aa36c7b26d1d300753845578ac1412b7a2492c1d0be8856a5a00bdb.
Data processing complete. Train and test sets saved to MongoDB.


In [None]:

# NOTE(review): RANDOM_STATE and CV are not referenced in the visible part of
# this file; presumably a shared seed and a cross-validation fold count.
RANDOM_STATE = 42
CV = 4
# Values written to the *_Anomaly columns by sklearnModels.
ANOMALY = 1
NOT_ANOMALY = 0
# The fit_predict() output that sklearnModels maps to ANOMALY; every other
# prediction is mapped to NOT_ANOMALY.
UNFITTED = -1



def sklearnModels(rawinputdata, ScaledMLInputdata):
    """
    Fit three scikit-learn outlier detectors (Elliptic Envelope, Isolation
    Forest and One-Class SVM) on the scaled data and record their results
    as new columns of the raw data.

    For each detector two columns are added to ``rawinputdata``:
    ``<Name>_Anomaly`` (ANOMALY where ``fit_predict`` returned UNFITTED,
    otherwise NOT_ANOMALY) and ``<Name>_AnomalyScore``.

    :param rawinputdata: DataFrame that receives the result columns (modified in place).
    :param ScaledMLInputdata: Scaled feature data the detectors are fitted on.
    :return: ``(ScaledMLInputdata, rawinputdata)``.
    """
    # (column prefix, estimator, how to obtain its score)
    # NOTE(review): the three scores are not on a common scale or direction:
    # Mahalanobis distance / 100 for EllipticEnvelope, decision_function for
    # IsolationForest, decision_function / 100 for OneClassSVM.
    detectors = (
        ('EllipticEnvelope', EllipticEnvelope(support_fraction=.35),
         lambda model, X: model.mahalanobis(X) / 100),
        ('IsolationForest', IsolationForest(),
         lambda model, X: model.decision_function(X)),
        ('OneClassSVM', OneClassSVM(kernel='rbf'),
         lambda model, X: model.decision_function(X) / 100),
    )

    for prefix, model, score_of in detectors:
        predicted = model.fit_predict(ScaledMLInputdata)
        rawinputdata[f"{prefix}_Anomaly"] = np.where(predicted == UNFITTED, ANOMALY, NOT_ANOMALY)
        rawinputdata[f"{prefix}_AnomalyScore"] = score_of(model, ScaledMLInputdata)

    return ScaledMLInputdata, rawinputdata


def trainAutoEncoder(data, epochs=50, batch_size=32, test_size=0.2):
    """
    Train a single-hidden-layer autoencoder and score every row of ``data``.

    The data is split into train/validation parts, an autoencoder with a
    bottleneck of half the input width is fitted, and the per-row mean squared
    reconstruction error is min-max normalized to the range 0..1.

    :param data: Input data for training the autoencoder.
    :param epochs: Number of training epochs (default=50).
    :param batch_size: Size of training batch (default=32).
    :param test_size: Fraction of data held out for validation (default=0.2).
    :return: ``(normalized_error, threshold)`` where ``threshold`` is the 95th
        percentile of the normalized errors.
    """
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=42)
    input_dim = train_data.shape[1]
    # Keep at least one unit in the bottleneck for single-feature input.
    encoding_dim = max(1, input_dim // 2)

    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='relu')(input_layer)
    decoded = Dense(input_dim, activation='relu')(encoded)

    autoencoder = Model(inputs=input_layer, outputs=decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    autoencoder.fit(train_data, train_data, epochs=epochs, batch_size=batch_size, shuffle=True,
                    validation_data=(test_data, test_data), verbose=0)

    scaled_data_predictions = autoencoder.predict(data)
    reconstruction_error = np.mean(np.power(data - scaled_data_predictions, 2), axis=1)

    lowest = np.min(reconstruction_error)
    spread = np.max(reconstruction_error) - lowest
    normalized_error = reconstruction_error - lowest
    # When every row has the same error the spread is zero; leave the errors
    # at 0 rather than dividing by zero and producing NaN.
    if spread > 0:
        normalized_error = normalized_error / spread
    threshold = np.percentile(normalized_error, 95)

    return normalized_error, threshold

def runDeepAutoEncoder(ScaledMLInputdata, rawinputdata):
    """
    Train the autoencoder on the scaled data and record its verdicts.

    Adds two columns to ``rawinputdata``: ``Autoencoder_Anomaly`` (1 where
    the normalized reconstruction error exceeds the threshold, else 0) and
    ``Autoencoder_reconstruction_error``.

    :param ScaledMLInputdata: Scaled input data passed to ``trainAutoEncoder``.
    :param rawinputdata: DataFrame that receives the two columns (modified in place).
    :return: ``(ScaledMLInputdata, rawinputdata)``.
    """
    errors, cutoff = trainAutoEncoder(ScaledMLInputdata)
    print("Autoencoder training successful.")

    exceeds_cutoff = errors > cutoff
    rawinputdata['Autoencoder_Anomaly'] = np.where(exceeds_cutoff, 1, 0)  # 1 = anomaly, 0 = not anomaly
    rawinputdata['Autoencoder_reconstruction_error'] = errors.tolist()

    return ScaledMLInputdata, rawinputdata

def classify_anomaly_enhanced(score, std_dev, quantiles, std_quantiles):
    """
    Map an anomaly score to a level name using quantile thresholds.

    The two upper thresholds are shifted by 0.05 upwards when ``std_dev`` is
    at or above its 0.75 quantile, and by 0.05 downwards when it is at or
    below its 0.25 quantile.

    :param score: Anomaly score of the data point.
    :param std_dev: Standard deviation of the data point's model scores.
    :param quantiles: Mapping with keys 0.25, 0.50, 0.75 and 0.95 for the score.
    :param std_quantiles: Mapping with keys 0.25 and 0.75 for the standard deviation.
    :return: One of 'Critical', 'Concern', 'Notice', 'Low', 'Normal'.
    """
    # +1 for high-variability points, -1 for low-variability ones, else 0.
    shift = 1 if std_dev >= std_quantiles[0.75] else (-1 if std_dev <= std_quantiles[0.25] else 0)

    critical_cutoff = quantiles[0.95] + (0.05 * shift)
    concern_cutoff = quantiles[0.75] + (0.05 * shift)

    if score >= critical_cutoff:
        return 'Critical'
    if score >= concern_cutoff:
        return 'Concern'
    # The two lower levels use the unshifted quantiles.
    if score >= quantiles[0.50]:
        return 'Notice'
    if score >= quantiles[0.25]:
        return 'Low'
    return 'Normal'


def run_anomaly_detection_models(raw_data, scaled_data):
    """
    Run the scikit-learn detectors and then the autoencoder on the data.

    :param raw_data: Original input data; receives the result columns.
    :param scaled_data: Preprocessed and scaled input data the models are fitted on.
    :return: ``(scaled_data, raw_data)`` as returned by the autoencoder step.
    """
    scaled_after_sklearn, raw_after_sklearn = sklearnModels(raw_data, scaled_data)
    final_scaled, final_raw = runDeepAutoEncoder(scaled_after_sklearn, raw_after_sklearn)
    return final_scaled, final_raw





In [None]:

def anomaly_insights(RetFromSkModels):
    """Combine the four detectors' outputs into voted and weighted scores.

    Reads the ``*_Anomaly`` flag columns and the score columns of the
    Isolation Forest, Elliptic Envelope, One-Class SVM and autoencoder
    results and adds these columns (in place):

    - ``Voted_Anomaly``: 1 when at least three detectors flag the row
    - ``Voted_Anomaly_Score_Mean``: mean of the four score columns
    - ``Voted_Anomaly_Score``: number of flags times 0.25
    - ``Anomaly_Score_STD``: standard deviation of the four score columns
    - ``Weighted_Anomaly_Score``: flags weighted so that detectors that flag
      more often count less
    - ``Dynamic_Anomaly``: 1 when the number of flags reaches the median
      number of flags over all rows
    - ``Enhanced_Anomaly_Score``: weighted score minus 0.1 x the score
      standard deviation, plus 0.05 when all detectors agree
    - ``Anomaly_Level``: label from ``classify_anomaly_enhanced``

    :param RetFromSkModels: DataFrame holding the detector output columns.
    :return: The same DataFrame with the columns above added.
    """
    anoms = ['IsolationForest_Anomaly', 'EllipticEnvelope_Anomaly',
             'OneClassSVM_Anomaly', 'Autoencoder_Anomaly']

    anomscore = ['IsolationForest_AnomalyScore', 'EllipticEnvelope_AnomalyScore',
                 'OneClassSVM_AnomalyScore', 'Autoencoder_reconstruction_error']

    Scores = RetFromSkModels[anomscore]
    Anomalies = RetFromSkModels[anoms]

    Anomaly_Score_STD = Scores.std(axis=1)
    anomalyscore_mean = Scores.mean(axis=1)
    anomaly_sum = Anomalies.sum(axis=1)
    positive_classification_rates = Anomalies.mean(axis=0)

    RetFromSkModels['Voted_Anomaly'] = np.where(anomaly_sum >= 3, 1, 0)
    RetFromSkModels['Voted_Anomaly_Score_Mean'] = anomalyscore_mean
    RetFromSkModels['Voted_Anomaly_Score'] = anomaly_sum * 0.25

    # Threshold for Dynamic_Anomaly: the median number of flags per row.
    dynamic_threshold = anomaly_sum.median()

    # Per-row variability of the detector scores.
    RetFromSkModels['Anomaly_Score_STD'] = Anomaly_Score_STD

    total_rate = positive_classification_rates.sum()
    if total_rate > 0:
        normalized_rates = positive_classification_rates / total_rate
        # Invert so that detectors with more frequent positives get a lower
        # weight, then renormalize so the weights sum to 1.
        inverted_rates = 1 - normalized_rates
        weights = inverted_rates / inverted_rates.sum()
    else:
        # No detector flagged anything: the rates cannot be normalized
        # (0 / 0), so fall back to equal weights instead of NaN.
        weights = pd.Series(1.0 / len(anoms), index=anoms)

    RetFromSkModels['Weighted_Anomaly_Score'] = Anomalies.dot(weights) / weights.sum()

    # Flag rows whose number of detector votes reaches the dynamic threshold.
    RetFromSkModels['Dynamic_Anomaly'] = np.where(anomaly_sum >= dynamic_threshold, 1, 0)

    # Penalize rows whose detector scores disagree strongly.
    penalty_factor = 0.1
    RetFromSkModels['Enhanced_Anomaly_Score'] = (
        RetFromSkModels['Weighted_Anomaly_Score']
        - (RetFromSkModels['Anomaly_Score_STD'] * penalty_factor)
    )

    # Bonus when every detector flags the row.
    unanimous_bonus = 0.05
    RetFromSkModels['Enhanced_Anomaly_Score'] += np.where(anomaly_sum == len(anoms), unanimous_bonus, 0)

    Scorequantile = RetFromSkModels['Enhanced_Anomaly_Score'].quantile(q=[0.25, 0.50, 0.75, 0.95])
    std_quantiles = RetFromSkModels['Anomaly_Score_STD'].quantile(q=[0.25, 0.50, 0.75])

    RetFromSkModels['Anomaly_Level'] = RetFromSkModels.apply(
        lambda row: classify_anomaly_enhanced(row['Enhanced_Anomaly_Score'], row['Anomaly_Score_STD'],
                                              Scorequantile, std_quantiles),
        axis=1)

    return RetFromSkModels


In [None]:
def perform_grid_search_with_lgbm(X_train, y_train, scoring_metric='roc_auc', cv_folds=3):
    """
    Grid-search LGBMClassifier hyper-parameters for an imbalanced binary target.

    :param X_train: Training feature matrix.
    :param y_train: Binary training labels.
    :param scoring_metric: Scorer name or scorer callable forwarded unchanged to
        ``GridSearchCV(scoring=...)``.
    :param cv_folds: Value forwarded to ``GridSearchCV(cv=...)``.
    :return: Tuple ``(best_estimator_, best_score_, best_params_)`` of the fitted search.
    """
    # Candidate values are examples; adjust them to the dataset at hand.
    param_grid = {
        'num_leaves': [31, 50, 100],
        'is_unbalance': [True],  # let LightGBM re-weight the minority class
        'learning_rate': [0.01, 0.1, 0.05],
        'n_estimators': [100, 200, 500],
    }

    classifier = LGBMClassifier(objective='binary')

    # Pass the metric name straight through. Wrapping roc_auc_score in a plain
    # make_scorer() would evaluate it on hard 0/1 predictions from predict(),
    # whereas the built-in 'roc_auc' scorer ranks on continuous scores.
    grid_search = GridSearchCV(classifier, param_grid, cv=cv_folds, scoring=scoring_metric, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    return grid_search.best_estimator_, grid_search.best_score_, grid_search.best_params_

def optimize_xgboost(X_train, y_train):
    """
    Fit one LGBMClassifier with fixed settings for an imbalanced binary target.

    Despite the name, no XGBoost model is built and no hyper-parameter search is run:
    the classifier uses ``is_unbalance=True``, ``learning_rate=0.1`` and the binary
    objective. ``perform_grid_search_with_lgbm`` is the search-based alternative.

    :param X_train: Training feature matrix.
    :param y_train: Binary training labels.
    :return: The fitted LGBMClassifier.
    """
    model = LGBMClassifier(is_unbalance=True, learning_rate=0.1, objective='binary')
    model.fit(X_train, y_train)
    return model

def anomaly_classifier(raw_data, main_datetime_col, col, test_size=0.2, random_state=None):
    """
    Train an LGBM classifier to reproduce an anomaly label and evaluate it on a hold-out split.

    Every column whose lower-cased name contains one of the anomaly-related keywords is
    removed from the feature matrix. The target column itself is always removed as well,
    so the label cannot leak into the features even when its name matches no keyword.

    :param raw_data: DataFrame holding the feature columns plus the label column ``col``.
    :param main_datetime_col: Unused; kept so existing callers keep working.
    :param col: Name of the label column to predict.
    :param test_size: Fraction of rows held out for evaluation.
    :param random_state: Seed forwarded to ``train_test_split``; ``None`` keeps the
        split random, as before.
    :return: Tuple ``(Test_prediction, best_model, x_test, y_test, test_cm,
        test_classreport, accuracy)``. ``Test_prediction`` is ``x_test`` extended with
        the columns ``Anomaly_Labels``, ``Anomaly_Predictions`` and
        ``Anomaly_PredictedProb``; ``test_classreport`` is the dict form of the
        classification report.
    """
    keywords = ['anomaly', 'error', 'severity', 'distance', 'cluster']

    # Names are lower-cased before matching, so only lower-case keywords can ever hit.
    excluded_cols = [
        name for name in raw_data.columns
        if any(keyword in name.lower() for keyword in keywords)
    ]
    if col not in excluded_cols:
        excluded_cols.append(col)

    y = raw_data[col]
    X = raw_data.drop(columns=excluded_cols)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    best_model = optimize_xgboost(x_train, y_train)

    y_pred = best_model.predict(x_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_prob = best_model.predict_proba(x_test)[:, 1]

    Test_prediction = x_test.copy()
    # x_test and y_test come from the same split, so positional assignment is exact
    # and does not depend on the index being unique.
    Test_prediction['Anomaly_Labels'] = y_test.to_numpy()
    Test_prediction['Anomaly_Predictions'] = y_pred
    Test_prediction['Anomaly_PredictedProb'] = y_pred_prob

    test_cm = confusion_matrix(Test_prediction['Anomaly_Labels'], Test_prediction['Anomaly_Predictions'])
    test_classreport = classification_report(
        Test_prediction['Anomaly_Labels'], Test_prediction['Anomaly_Predictions'], output_dict=True
    )
    accuracy = accuracy_score(Test_prediction['Anomaly_Labels'], Test_prediction['Anomaly_Predictions'])

    return Test_prediction, best_model, x_test, y_test, test_cm, test_classreport, accuracy


def feature_selection(X, y, n_features=None):
    """
    Rank features by combining ANOVA F-scores, random-forest importances and mutual information.

    Each of the three score columns is standardised with ``StandardScaler`` and the
    standardised values are summed per feature into ``feature_importance``.

    :param X: Feature DataFrame; its column names become the index of the score table.
    :param y: Class labels.
    :param n_features: Number of top features to return; a falsy value returns all of them.
    :return: Tuple ``(selected_features, feature_scores)``: the list of selected column
        names ordered by decreasing ``feature_importance``, and the DataFrame of raw
        scores plus the combined ``feature_importance`` column.
    """
    anova_scores = SelectKBest(score_func=f_classif, k='all').fit(X, y).scores_

    forest = RandomForestClassifier(n_estimators=50, random_state=42).fit(X, y)

    # Seeded like the forest above so repeated runs produce the same ranking.
    mi_scores = mutual_info_classif(X, y, random_state=42)

    feature_scores = pd.DataFrame({
        'ANOVA': anova_scores,
        'RandomForest': forest.feature_importances_,
        'MutualInformation': mi_scores}, index=X.columns)

    # Put the three measures on a common scale before summing them.
    scaled_scores = pd.DataFrame(
        StandardScaler().fit_transform(feature_scores),
        columns=feature_scores.columns,
        index=feature_scores.index,
    )
    feature_scores['feature_importance'] = scaled_scores.sum(axis=1)

    top_n = n_features if n_features else X.shape[1]
    selected_features = feature_scores.nlargest(top_n, 'feature_importance').index.tolist()

    return selected_features, feature_scores

def compute_feature_values(X_train, y_train, n_features=None):
    """
    Thin wrapper around ``feature_selection``.

    :return: Tuple ``(selected_features, feature_scores)`` exactly as produced by
        ``feature_selection(X_train, y_train, n_features=n_features)``.
    """
    ranked_names, score_table = feature_selection(X_train, y_train, n_features=n_features)
    return ranked_names, score_table





In [None]:
def display_anomaly_level_summary(Anomalies):
    """
    Show a donut chart of the ``Anomaly_Level`` distribution and return a text summary.

    :param Anomalies: pandas DataFrame with an ``Anomaly_Level`` column.
    :return: The summary string that is also printed: a header line followed by one
        ``level: count`` line per anomaly level.
    """
    level_counts = Anomalies['Anomaly_Level'].value_counts()

    donut = px.pie(
        names=level_counts.index,
        values=level_counts.values,
        title='Distribution of Anomaly Levels',
        hole=0.4,
        template='plotly_white'
    )
    donut.update_traces(textinfo='percent+label')
    donut.update_layout(width=550,  height=500, margin=dict(t=20, b=40))

    # Header first, then one line per level in value_counts() order.
    summary_parts = ["The pie chart above shows the distribution of different anomaly levels.\n"]
    summary_parts.extend(f"{level}: {count}\n" for level, count in level_counts.items())
    pie_chart_info = "".join(summary_parts)

    print(pie_chart_info)
    donut.show()

    return pie_chart_info


In [None]:
def display_anomaly_complementary_summary(Anomalies):
    """
    Show a donut chart of anomalies vs. non-anomalies for the voted decision and
    return a text summary.

    :param Anomalies: pandas DataFrame holding 0/1 flags in the columns
        ``IsolationForest_Anomaly``, ``EllipticEnvelope_Anomaly``,
        ``OneClassSVM_Anomaly``, ``Autoencoder_Anomaly`` and ``Voted_Anomaly``.
    :return: Summary string. The part up to the totals is also printed; the pooled
        averages (mean count across the five columns above) are only in the return value.
    """
    flag_columns = [
        'IsolationForest_Anomaly', 'EllipticEnvelope_Anomaly', 'OneClassSVM_Anomaly',
        'Autoencoder_Anomaly', 'Voted_Anomaly',
    ]
    flags = Anomalies[flag_columns]

    # Per-column counts of rows flagged 1 and rows flagged 0.
    anomaly_counts = (flags == 1).sum()
    non_anomaly_counts = (flags == 0).sum()

    # The chart reports the final (voted) decision only.
    total_anomalies = anomaly_counts['Voted_Anomaly']
    total_non_anomalies = non_anomaly_counts['Voted_Anomaly']

    pooled_average_anomalies = anomaly_counts.mean()
    pooled_average_non_anomalies = non_anomaly_counts.mean()

    fig = px.pie(
        names=['Anomalies', 'Non-Anomalies'],
        values=[total_anomalies, total_non_anomalies],
        title='Distribution of Anomalies and Non-Anomalies for Final Classification',
        hole=0.4,
        template='plotly_white'
    )
    fig.update_traces(textinfo='percent+label')
    fig.update_layout(width=550, height=500, margin=dict(t=50, b=20))

    pie_chart_info = "The pie chart above shows the distribution of anomalies\nand non-anomalies Final Classification.\n"
    pie_chart_info += f"Total Anomalies: {total_anomalies}\n"
    pie_chart_info += f"Total Non-Anomalies: {total_non_anomalies}\n"

    print(pie_chart_info)
    fig.show()

    # Appended after printing on purpose: the pooled averages are returned, not shown.
    pie_chart_info += f"Pooled average anomalies in the final decision:\n{round(pooled_average_anomalies, 4)}\n"
    pie_chart_info += f"Pooled average non-anomalies in the final decision:\n{round(pooled_average_non_anomalies, 4)}"

    return pie_chart_info



In [None]:
def plot_feature_importance(data, anomaly_scores):
    """
    Plot, per feature, the absolute correlation with each detector's anomaly score.

    :param data: pandas DataFrame of features, one column per feature.
    :param anomaly_scores: Mapping-like object (e.g. a DataFrame) that yields a score
        Series for each of ``IsolationForest_AnomalyScore``,
        ``EllipticEnvelope_AnomalyScore``, ``OneClassSVM_AnomalyScore`` and
        ``Autoencoder_reconstruction_error``.
    :return: DataFrame indexed by feature with one absolute-correlation column per
        score plus their row-wise ``Average``, sorted by ``Average`` descending.
    """
    score_names = ['IsolationForest_AnomalyScore', 'EllipticEnvelope_AnomalyScore',
                   'OneClassSVM_AnomalyScore', 'Autoencoder_reconstruction_error']

    feature_importance = pd.DataFrame(index=data.columns)
    for score_name in score_names:
        feature_importance[score_name] = data.corrwith(anomaly_scores[score_name]).abs()

    feature_importance['Average'] = feature_importance.mean(axis=1)
    feature_importance = feature_importance.sort_values(by='Average', ascending=False)

    # One bar series per column (the four scores and the average), cycling the palette.
    palette = px.colors.qualitative.Plotly
    bar_traces = [
        go.Bar(
            x=feature_importance.index,
            y=feature_importance[label],
            name=label,
            marker_color=palette[position % len(palette)],
            hoverinfo='y',
        )
        for position, label in enumerate(feature_importance.columns)
    ]
    fig = go.Figure(data=bar_traces)

    fig.update_layout(
        title='Feature Importance in Anomaly Detection Analysis',
        xaxis_title='Features',
        yaxis_title='Importance',
        barmode='group',
        yaxis_type='log',
        font=dict(size=10),
        hovermode='x unified',
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
        width=450,
        height=500)

    fig.show()

    return feature_importance

In [None]:
def anomaly_detection(datapoints, anomaly_results_rela):
    """
    Compute the absolute correlation between every feature and every anomaly result
    column, and show the result as a heatmap.

    :param datapoints: DataFrame of features, one column per feature.
    :param anomaly_results_rela: DataFrame with one column per anomaly score or label;
        each column is correlated against the features.
    :return: DataFrame indexed by feature with one absolute-correlation column per
        column of ``anomaly_results_rela`` plus their row-wise ``average_importance``.
    """
    feature_importance = pd.DataFrame()
    for name in anomaly_results_rela.columns:
        feature_importance[name] = datapoints.corrwith(anomaly_results_rela[name]).abs()

    feature_importance['average_importance'] = feature_importance.mean(axis=1)

    # Rows of the heatmap are the result columns, columns are the features.
    by_algorithm = feature_importance.T
    # Plain formatting loop instead of DataFrame.applymap, which is deprecated in
    # recent pandas releases.
    text_labels = [[f'{value:.2f}' for value in row] for row in by_algorithm.to_numpy()]

    heatmap = go.Heatmap(
        z=by_algorithm.values,
        x=by_algorithm.columns,
        y=by_algorithm.index,
        # Absolute correlations lie in [0, 1], so the colour range matches that
        # interval. (zmid is ignored once zmin/zmax are given, hence omitted.)
        zmin=0,
        zmax=1,
        colorscale='RdBu_r',
        xgap=10,
        ygap=10,
        text=text_labels,
        hoverinfo='text'
    )

    layout = go.Layout(
        title='Heatmap of Feature Importance',
        xaxis=dict(tickangle=-45, nticks=30),
        yaxis=dict(nticks=30),
        font=dict(size=12),
        width=450,
        height=500
    )

    fig = go.Figure(data=[heatmap], layout=layout)
    fig.show()

    return feature_importance

In [None]:
def generate_anomaly_color_map(AnomalyLevels):
    """
    Map each anomaly level to a hex colour on a light-to-dark red scale.

    :param AnomalyLevels: Iterable of level names.
    :return: Dict ``{level: colour}``; levels other than ``Low``, ``Notice``,
        ``Concern`` and ``Critical`` receive the grey fallback ``#cccccc``.
    """
    fallback = '#cccccc'
    palette = {
        'Low': '#ffcccc',       # lighter red
        'Notice': '#ff9999',    # light red
        'Concern': '#ff6666',   # medium red
        'Critical': '#cc0000',  # dark red
    }

    color_map = {}
    for level in AnomalyLevels:
        color_map[level] = palette.get(level, fallback)
    return color_map


def make_anomaly_map(df, col, time_col,lat_col='latitude', lon_col='longitude', width=600, height=700): #time_col:str,
    """
    Show an animated map of the rows in ``df``, coloured by ``col`` and stepped by day.

    :param df: DataFrame with the category column, the time column and coordinates.
        It is not modified; the function works on a copy.
    :param col: Column whose values select the marker colour
        (see ``generate_anomaly_color_map``).
    :param time_col: Column convertible by ``pd.to_datetime``; formatted as
        ``YYYY-MM-DD`` and used as the animation frame.
    :param lat_col: Latitude column name.
    :param lon_col: Longitude column name.
    :param width: Figure width in pixels.
    :param height: Figure height in pixels.
    :return: None; the figure is displayed with ``fig.show()``.
    """
    # Copy first: the steps below add a column and overwrite the time column with
    # strings, which must not leak back into the caller's frame.
    df = df.copy()

    color_map = generate_anomaly_color_map(df[col].unique())
    df['color'] = df[col].map(color_map)

    df[time_col] = pd.to_datetime(df[time_col]).dt.strftime('%Y-%m-%d')

    # Columns kept out of the hover tooltip.
    hidden_in_hover = ['color','Detected_Anomaly', 'Anomaly_Score','Detected_Anomaly_Probability','Anomaly_lvl','devicetype','gatewayid']
    hover_data = {c: True for c in df.columns if c not in hidden_in_hover}

    fig = px.scatter_mapbox(df, lat=lat_col, lon=lon_col, color=col,
                            color_discrete_map=color_map,
                            hover_data=hover_data,
                            animation_frame=time_col,
                            size_max=18, zoom=13,
                            title='Anomalies Over Time',
                            mapbox_style="open-street-map")

    fig.update_traces(marker=dict(size=15))

    # Centre the map on the mean coordinate of the plotted rows.
    fig.update_layout(mapbox=dict(center=dict(lat=df[lat_col].mean(), lon=df[lon_col].mean())),
                    margin={"r":0,"t":0,"l":0,"b":0},
                    width=width,
                    height=height)

    fig.show()

In [None]:
def plot_feature_importances_interactive(feature_scores):
    """
    Show a bar chart of feature scores and return them in long format.

    :param feature_scores: DataFrame with an ``index`` column holding the feature
        names and one or more score columns.
    :return: The melted DataFrame with columns ``index``, ``feature_importance``
        (name of the original score column) and ``Score``.
    """
    long_df = feature_scores.melt(id_vars='index', var_name='feature_importance', value_name='Score')

    fig = px.bar(long_df, x="index", y="Score", color="index",template="simple_white",
                    hover_data=["Score"], title="Feature Importance in Anomaly Classification Analysis")

    # Zero line: combined scores are sums of standardised values and can be negative.
    fig.add_hline(y=0, line_dash="dash", line_color="black")

    fig.update_layout(
        width=900,
        height=600,
        hoverlabel=dict(
            bgcolor="black",
            font_size=15,
            font_family="Rockwell"
        ),
        xaxis_title="Feature",
        yaxis_title="Importance Score"
    )

    fig.show()
    return long_df

In [None]:
from statsmodels.regression.linear_model import OLS, GLSAR
import statsmodels.api as sm

import statsmodels.regression.tests.results.results_macro_ols_robust as res

def genAnmAgg(rawvaluesFull, anomalycolumns, aggregations):
    """
    Aggregate columns per anomaly level and per (anomaly level, hour).

    :param rawvaluesFull: DataFrame with ``Anomaly_Level`` and ``hour`` columns plus
        the columns named in ``aggregations``.
    :param anomalycolumns: Unused; kept for interface compatibility.
    :param aggregations: Dict ``{column: [aggregation names]}`` passed to ``agg``.
    :return: Tuple ``(groupedhours, grouped_aggregations)`` whose columns are
        flattened to ``<column>_<aggregation>``.
    """
    def _flatten_columns(frame):
        # Collapse the (column, aggregation) pairs into single underscore-joined names.
        frame.columns = [
            '_'.join(pair).strip() if pair[1] else pair[0]
            for pair in frame.columns.values
        ]
        return frame

    numeric_names = rawvaluesFull.select_dtypes(np.number).columns

    # Per-level view is restricted to numeric columns; the per-hour view is not.
    per_level = rawvaluesFull.groupby('Anomaly_Level')[numeric_names].agg(aggregations)
    per_level_hour = rawvaluesFull.groupby(['Anomaly_Level', 'hour']).agg(aggregations)

    return _flatten_columns(per_level_hour), _flatten_columns(per_level)

def statsRegModel(data, target: str):
    """
    Fit an ordinary-least-squares regression of ``target`` on the non-anomaly columns.

    Columns whose lower-cased name contains an anomaly-related keyword are excluded
    from the regressors, and so is ``target`` itself, so the response can never appear
    on both sides of the model.

    :param data: DataFrame holding ``target`` and the candidate regressor columns.
    :param target: Name of the response column.
    :return: The fitted statsmodels OLS results object.
    """
    # 'anomaly' already covers names such as 'anomaly_level' or 'voted_anomaly_score'.
    keywords = ['anomaly', 'error', 'severity', 'distance', 'cluster']

    excluded_cols = [
        name for name in data.columns
        if any(keyword in name.lower() for keyword in keywords)
    ]
    if target not in excluded_cols:
        excluded_cols.append(target)

    y = data[target]
    # Add the intercept term once.
    X = sm.add_constant(data.drop(columns=excluded_cols))

    lin_reg = sm.OLS(y, X).fit()

    return lin_reg


In [None]:

# Use the last entry of `databaseoptions` as the selected vertical.
Vertical_selection = databaseoptions[-1]

# Load the scaled and the raw training collections, keep the first 1200 rows and
# drop every column that contains a missing value.
Scaled_trainData = load_from_mongo(dataselection, 'Scaled_train', dev_db).iloc[:1200].dropna(axis=1)
train_data = load_from_mongo(dataselection, '_train', dev_db).iloc[:1200].dropna(axis=1)

# Index both frames by the datetime column and keep numeric columns only.
cleantraindata = train_data.set_index([main_datetime_col]).select_dtypes(np.number)
scaled_cleantraindata = Scaled_trainData.set_index([main_datetime_col]).select_dtypes(np.number)

# Run the detection models on both frames, then derive the combined insight columns.
# NOTE(review): later cells read 'Anomaly_Level' from RETrawinputdata, so
# anomaly_insights presumably adds its columns to that frame in place; confirm.
RETScaledMLInputdata, RETrawinputdata = run_anomaly_detection_models(cleantraindata,scaled_cleantraindata)
RetFromSkModels = anomaly_insights(RETrawinputdata)


Autoencoder training successful.


In [None]:

# Move the index of the model output back into a regular column.
RetFromSkModels.reset_index(inplace=True)
anomaly_columns, keyword_columns, scores_columns, collected_columns = extract_columns_and_data(RetFromSkModels, anomaly_condition, keyword_condition, scores_condition)

# Anomaly columns that are labels rather than scores.
AnomalyLabelsCols = [col for col in anomaly_columns if col not in scores_columns]

# After reset_index RetFromSkModels carries a RangeIndex while scaled_cleantraindata
# is still indexed by datetime. Assign positionally (ndarray / .values): a bare
# Series would be aligned on index labels and produce an all-NaN column.
scaled_cleantraindata['Detected_Anomaly'] = RetFromSkModels['Dynamic_Anomaly'].values
scaled_cleantraindata['Anomaly_Level_Cat'] = RetFromSkModels['Anomaly_Level'].values
# Integer codes for the anomaly levels, in order of first appearance.
scaled_cleantraindata['Anomaly_Level'] = RetFromSkModels['Anomaly_Level'].factorize()[0]
scaled_cleantraindata['Voted_Anomaly_Score'] = RetFromSkModels['Voted_Anomaly_Score'].values



In [None]:

# Names of the ensemble output columns; passed to genAnmAgg, which does not use them.
anomalycolumns = ['Anomaly_Level','Enhanced_Anomaly_Score','Voted_Anomaly','Anomaly_Score_STD']
# Aggregation functions to apply per column within each group.
aggregations = {
    'Enhanced_Anomaly_Score': ['mean', 'std', 'max'],
    'Voted_Anomaly': ['sum', 'mean'],
    'Anomaly_Score_STD':['mean', 'var', 'max']
    }

# Aggregate per (Anomaly_Level, hour) and per Anomaly_Level.
groupedhours, grouped_aggregations = genAnmAgg(RETrawinputdata, anomalycolumns, aggregations)
# Drop every column that contains a NaN before fitting the regression.
scaled_cleantraindata.dropna(axis=1, inplace=True)
# OLS of the 0/1 Detected_Anomaly flag on the remaining non-anomaly columns.
anomaly_lin_reg = statsRegModel(scaled_cleantraindata, 'Detected_Anomaly')
anomaly_lin_reg_summary = anomaly_lin_reg.summary()

In [None]:
# Train the LGBM classifier on the Detected_Anomaly label and collect the hold-out
# predictions, the fitted model, the test split and its evaluation metrics.
# Note: `fullprediction` only contains the rows of the test split.
fullprediction, grid_search_best_estimator_, x_test, y_test, test_cm, test_classreport, accuracy = anomaly_classifier(scaled_cleantraindata, main_datetime_col, col='Detected_Anomaly')

[LightGBM] [Info] Number of positive: 650, number of negative: 310
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the train set: 960, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.677083 -> initscore=0.740400
[LightGBM] [Info] Start training from score 0.740400


In [None]:

# Rank the features with the combined ANOVA / random-forest / mutual-information score.
# NOTE(review): the scores are computed on the held-out test split (x_test, y_test),
# although compute_feature_values names its parameters X_train / y_train; confirm
# that this is intended.
selected_features, feature_scores = compute_feature_values(x_test, y_test)
# Keep only the combined score, with the feature names as an 'index' column.
Aggregatefeat = feature_scores[['feature_importance']].reset_index()



In [None]:

# Raw (unscaled) rows with the datetime index restored as a column.
fromtrain = cleantraindata.reset_index()
fullprediction = fullprediction.reset_index()

# `fullprediction` only holds the shuffled test split, so after reset_index its
# RangeIndex no longer identifies the original rows: assigning its columns to
# `fromtrain` would attach predictions to the wrong rows and leave the rest NaN.
# Score every row with the fitted model instead. This relies on cleantraindata and
# scaled_cleantraindata being row-aligned, which the positional `.values`
# assignments onto scaled_cleantraindata earlier in the notebook already assume.
_model_inputs = scaled_cleantraindata[x_test.columns]
fromtrain['Detected_Anomaly'] = grid_search_best_estimator_.predict(_model_inputs)
fromtrain['Detected_Anomaly_Probability'] = grid_search_best_estimator_.predict_proba(_model_inputs)[:, 1]



In [None]:

# Persist the ensemble output, the fitted classifier with its report, the test
# predictions, the feature scores and the annotated rows.
# NOTE(review): `model_name` is not used in this cell.
# NOTE(review): `fullprediction` and `fromtrain` were already reset_index()-ed in an
# earlier cell, so resetting again presumably adds an extra 'index' column; confirm.
model_name = f'{dataselection}model'
save_to_mongo(RETrawinputdata.reset_index(), dataselection, "_Anomaly_Ensemble", dev_db)
save_sklearn_model_to_mongo(grid_search_best_estimator_, Modelsdb, test_classreport)
save_to_mongo(fullprediction.reset_index(), dataselection, "_Test_Prediction", dev_db)
save_to_mongo(Aggregatefeat, dataselection, "_feature_importance_scores", Models_evaluation)
save_to_mongo(fromtrain.reset_index(), dataselection, "_Test_warning", dev_db)


# Split the ensemble output into label and score columns for the plots below.
anomaly_columns, keyword_columns, scores_columns, collected_columns = extract_columns_and_data(RETrawinputdata, anomaly_condition, keyword_condition, scores_condition)
ISO_anomaly_columns = [col for col in anomaly_columns if col not in scores_columns]
AnomaliesScores = RETrawinputdata[scores_columns]
# First nine columns of the raw data, minus the location and frequency columns.
dataclean = cleantraindata.iloc[:,:9]
feature_importanceCols = [k for k in dataclean.columns if k not in ['latitude', 'longitude', 'frequency']]

Saved new model LGBMClassifier with hash 0d5fa9e6e70ea5eb1f9bd87b10ee1aa9d70d04949ce8a3037f8ce4bee47adf7c.


In [None]:
# Donut chart and text summary of the Anomaly_Level distribution.
pie_chart_info = display_anomaly_level_summary(RETrawinputdata)

The pie chart above shows the distribution of different anomaly levels.
Notice: 300
Low: 300
Normal: 300
Concern: 245
Critical: 55



In [None]:
# Donut chart and text summary of anomalies vs. non-anomalies for the voted decision.
distributionofanomalies = display_anomaly_complementary_summary(RETrawinputdata)

The pie chart above shows the distribution of anomalies
and non-anomalies Final Classification.
Total Anomalies: 121
Total Non-Anomalies: 1079



In [None]:
# Absolute correlation of each selected feature with the four detector scores (bar chart).
Feat_toModel_feature_importance = plot_feature_importance(dataclean[feature_importanceCols], AnomaliesScores)











In [None]:
# Absolute correlation of each selected feature with every score column (heatmap).
featToLabelImportance = anomaly_detection(dataclean[feature_importanceCols], AnomaliesScores)





















In [None]:
# Bar chart of the combined feature-importance score; returns the long-format table.
multimodelFeatimpToAnomaly = plot_feature_importances_interactive(Aggregatefeat)

In [None]:

# Only draw the map when the data carries coordinates.
if 'latitude' in RETrawinputdata.columns and 'longitude' in RETrawinputdata.columns:

  make_anomaly_map(RETrawinputdata.reset_index(), 'Anomaly_Level', main_datetime_col,lat_col='latitude', lon_col='longitude', width=600, height=500)

In [None]:
# Show the per-(level, hour) and per-level aggregates and the OLS summary.
display(groupedhours, grouped_aggregations, anomaly_lin_reg_summary)


Unnamed: 0_level_0,Unnamed: 1_level_0,Enhanced_Anomaly_Score_mean,Enhanced_Anomaly_Score_std,Enhanced_Anomaly_Score_max,Voted_Anomaly_sum,Voted_Anomaly_mean,Anomaly_Score_STD_mean,Anomaly_Score_STD_var,Anomaly_Score_STD_max
Anomaly_Level,hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Concern,0,0.454362,0.119459,0.647616,9,0.257143,0.181654,0.036751,0.488807
Concern,2,0.475043,0.114909,0.648333,7,0.269231,0.173489,0.026533,0.484784
Concern,4,0.423135,0.090302,0.647551,2,0.105263,0.129549,0.025389,0.485911
Concern,6,0.428658,0.105822,0.647737,4,0.173913,0.136103,0.027239,0.488735
Concern,8,0.468987,0.113202,0.647706,3,0.230769,0.179108,0.032271,0.488458
Concern,10,0.497162,0.130901,0.64906,8,0.4,0.235343,0.037272,0.488953
Concern,12,0.436467,0.099122,0.647945,3,0.142857,0.135595,0.019706,0.48919
Concern,14,0.475381,0.11624,0.648914,6,0.272727,0.18791,0.025451,0.48485
Concern,16,0.495545,0.113079,0.647574,5,0.3125,0.234566,0.029247,0.487683
Concern,18,0.502604,0.132958,0.648349,6,0.428571,0.266997,0.041354,0.487764


Unnamed: 0_level_0,Enhanced_Anomaly_Score_mean,Enhanced_Anomaly_Score_std,Enhanced_Anomaly_Score_max,Voted_Anomaly_sum,Voted_Anomaly_mean,Anomaly_Score_STD_mean,Anomaly_Score_STD_var,Anomaly_Score_STD_max
Anomaly_Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Concern,0.466063,0.116837,0.64906,66,0.269388,0.184254,0.030885,0.48955
Critical,0.74398,0.137112,1.001569,55,1.0,0.177784,0.027692,0.493342
Low,0.118383,0.072844,0.181854,0,0.0,0.217878,0.02119,0.483611
Normal,-0.014841,0.006387,-0.005591,0,0.0,0.148405,0.004079,0.251764
Notice,0.241448,0.077067,0.370076,0,0.0,0.154629,0.026222,0.485866


0,1,2,3
Dep. Variable:,Detected_Anomaly,R-squared:,0.053
Model:,OLS,Adj. R-squared:,0.046
Method:,Least Squares,F-statistic:,7.406
Date:,"Sat, 30 Mar 2024",Prob (F-statistic):,1.41e-10
Time:,23:45:51,Log-Likelihood:,-749.05
No. Observations:,1200,AIC:,1518.0
Df Residuals:,1190,BIC:,1569.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1162.7101,1351.672,-0.860,0.390,-3814.635,1489.215
frequency,0.0854,0.046,1.856,0.064,-0.005,0.176
latitude,61.7678,8.479,7.285,0.000,45.133,78.402
longitude,10.2959,10.198,1.010,0.313,-9.712,30.304
batterylevel,0.0315,0.023,1.396,0.163,-0.013,0.076
moisture,0.0073,0.022,0.325,0.745,-0.037,0.051
ambienttemp,0.0052,0.022,0.233,0.816,-0.038,0.049
soiltemp,-0.0128,0.022,-0.586,0.558,-0.055,0.030
humidity,-0.0016,0.021,-0.076,0.940,-0.044,0.040

0,1,2,3
Omnibus:,3507.604,Durbin-Watson:,1.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,182.217
Skew:,-0.694,Prob(JB):,2.7e-40
Kurtosis:,1.689,Cond. No.,90600000.0


In [None]:
# Show the two text summaries, the classifier's best_iteration_ attribute and the
# classification report of the hold-out split.
display(distributionofanomalies,
      grid_search_best_estimator_.best_iteration_, test_classreport,
      pie_chart_info)

'The pie chart above shows the distribution of anomalies\nand non-anomalies Final Classification.\nTotal Anomalies: 121\nTotal Non-Anomalies: 1079\nPooled average anomalies in the final decision:\n297.2\nPooled average non-anomalies in the final decision:\n902.8'

0

{'0': {'precision': 0.7529411764705882,
  'recall': 0.9552238805970149,
  'f1-score': 0.8421052631578946,
  'support': 67},
 '1': {'precision': 0.9806451612903225,
  'recall': 0.8786127167630058,
  'f1-score': 0.926829268292683,
  'support': 173},
 'accuracy': 0.9,
 'macro avg': {'precision': 0.8667931688804553,
  'recall': 0.9169182986800104,
  'f1-score': 0.8844672657252888,
  'support': 240},
 'weighted avg': {'precision': 0.9170777988614801,
  'recall': 0.9,
  'f1-score': 0.9031771501925545,
  'support': 240}}

'The pie chart above shows the distribution of different anomaly levels.\nNotice: 300\nLow: 300\nNormal: 300\nConcern: 245\nCritical: 55\n'

In [None]:

# Show the three feature-importance tables produced by the plotting cells.
display(multimodelFeatimpToAnomaly,
      featToLabelImportance,
      Feat_toModel_feature_importance)

Unnamed: 0,index,feature_importance,Score
0,frequency,feature_importance,0.267884
1,latitude,feature_importance,2.802147
2,longitude,feature_importance,-2.073141
3,batterylevel,feature_importance,-0.873397
4,moisture,feature_importance,-1.585313
5,ambienttemp,feature_importance,-1.052342
6,soiltemp,feature_importance,-0.805839
7,humidity,feature_importance,-1.560833
8,hour,feature_importance,4.880833


Unnamed: 0,EllipticEnvelope_AnomalyScore,IsolationForest_AnomalyScore,OneClassSVM_AnomalyScore,Autoencoder_reconstruction_error,Voted_Anomaly_Score_Mean,Voted_Anomaly_Score,Anomaly_Score_STD,Weighted_Anomaly_Score,Enhanced_Anomaly_Score,average_importance
batterylevel,0.017227,0.012492,0.017502,0.004283,0.003594,0.010449,0.001717,0.010704,0.011762,0.00997
moisture,0.03518,0.032861,0.024286,0.036092,0.036954,0.029179,0.039244,0.035512,0.033665,0.033664
ambienttemp,0.011698,0.022388,0.045608,0.009871,0.010691,0.035857,0.003824,0.030082,0.030412,0.02227
soiltemp,0.017599,0.027039,0.020528,0.006433,0.007005,0.026729,0.001943,0.023116,0.023305,0.017078
humidity,0.037403,0.031299,0.014618,0.008485,0.009674,0.047371,0.013216,0.05403,0.054467,0.030063


Unnamed: 0,IsolationForest_AnomalyScore,EllipticEnvelope_AnomalyScore,OneClassSVM_AnomalyScore,Autoencoder_reconstruction_error,Average
moisture,0.032861,0.03518,0.024286,0.036092,0.032105
humidity,0.031299,0.037403,0.014618,0.008485,0.022951
ambienttemp,0.022388,0.011698,0.045608,0.009871,0.022391
soiltemp,0.027039,0.017599,0.020528,0.006433,0.0179
batterylevel,0.012492,0.017227,0.017502,0.004283,0.012876
date,,,,,


In [None]:
# env_data = getENVdata(dashboardFacingdf, dataselection, Vertical_selection)

# explorerPDDF_set = dashboardFacingdf.set_index([main_datetime_col])
# explorerPDDF_formatted = optimized_data_format_for_llm(explorerPDDF_set)

# analysis_focus = 'EfficiencyImprovements'

# CustomizablePrompt = f"""
# **{analysis_focus}**:
# - **Current Conditions**: {env_data['message']}
# - **Overview**: {explorerPDDF_formatted}.
# - **Temporal Insights**: Dive into data points for insights: Sampled Data insights.
# - **Environmental Impact**: How do current conditions affect our dataset? Let's explore.
# """
# print(CustomizablePrompt)

In [None]:
# schedulerfunction

# We need a function that can run the anomaly system logic on a scheduled cycle for retraining purposes.

def prepare_data(dataselection, main_datetime_col, db_connection):
    # Load data once
    scaled_train = load_from_mongo(dataselection, 'Scaled_train', db_connection)
    train = load_from_mongo(dataselection, '_train', db_connection)

    # Set index and filter numeric data once
    def prepare_df(df):
        return df.set_index([main_datetime_col]).select_dtypes(np.number).dropna(axis=1).iloc[:1200]

    cleantraindata = prepare_df(train)
    scaled_cleantraindata = prepare_df(scaled_train)

    # Run anomaly detection models
    Anomaliestopass = run_anomaly_detection_models(cleantraindata, scaled_cleantraindata)

    # Assign new columns based on model results
    cleantraindata['Detected_Anomaly'] = Anomaliestopass['Dynamic_Anomaly'].values
    cleantraindata['Anomaly_Level'] = Anomaliestopass['Anomaly_Level'].factorize()[0]
    cleantraindata['Voted_Anomaly_Score'] = Anomaliestopass['Voted_Anomaly_Score'].values

    # Further processing
    fullprediction, grid_search_best_estimator_, x_test, y_test, test_cm, test_classreport, accuracy = anomaly_classifier(cleantraindata, main_datetime_col, col='Detected_Anomaly')

    # Prepare for saving
    fromtrain = cleantraindata.reset_index()
    fullprediction = fullprediction.reset_index()

    selected_features, feature_scores = compute_feature_values(x_test, y_test)
    Aggregatefeat = feature_scores[['feature_importance']].reset_index()

    # Save results
    save_to_mongo(Anomaliestopass.reset_index(), dataselection, "_Anomaly_Ensemble", db_connection)
    save_sklearn_model_to_mongo(grid_search_best_estimator_, db_connection, test_classreport)
    save_to_mongo(fullprediction.reset_index(), dataselection, "_Test_Prediction", db_connection)
    save_to_mongo(Aggregatefeat, dataselection, "_feature_importance_scores", db_connection)
    save_to_mongo(fromtrain.reset_index(), dataselection, "_Test_warning", db_connection)