In [1]:
import pandas as pd

import random

from datetime import datetime, timedelta

 

# Generate dummy timestamps

start_time = datetime(2023, 1, 1, 0, 0, 0)

end_time = datetime(2023, 1, 10, 23, 0, 0)

time_range = end_time - start_time

timestamps = [start_time + timedelta(hours=i) for i in range(int(time_range.total_seconds() / 3600))]

 

# Generate dummy OS and WebLogic metrics

data = []

for timestamp in timestamps:

    cpu_usage = random.uniform(0, 100)  # Random CPU usage percentage

    memory_usage = random.uniform(0, 100)  # Random memory usage percentage

    disk_usage = random.uniform(0, 100)  # Random disk usage percentage

    network_traffic = random.uniform(0, 1000)  # Random network traffic in KB/s

    weblogic_requests = random.randint(0, 1000)  # Random WebLogic requests

    weblogic_heap_usage = random.uniform(0, 100)  # Random WebLogic heap usage percentage

    io_wait = random.uniform(0, 100)  # Random I/O wait percentage

    thread_count = random.randint(1, 100)  # Random thread count

    response_time = random.uniform(0, 1000)  # Random response time in ms

    database_connections = random.randint(1, 50)  # Random database connections

    jvm_gc_count = random.randint(0, 100)  # Random JVM garbage collection count

    jvm_gc_time = random.uniform(0, 1000)  # Random JVM garbage collection time in ms

    data.append({

        'Timestamp': timestamp,

        'CPU_Usage': cpu_usage,

        'Memory_Usage': memory_usage,

        'Disk_Usage': disk_usage,

        'Network_Traffic': network_traffic,

        'WebLogic_Requests': weblogic_requests,

        'WebLogic_Heap_Usage': weblogic_heap_usage,

        'IO_Wait': io_wait,

        'Thread_Count': thread_count,

        'Response_Time': response_time,

        'Database_Connections': database_connections,

        'JVM_GC_Count': jvm_gc_count,

        'JVM_GC_Time': jvm_gc_time

    })


df = pd.DataFrame(data)

 

# Display the DataFrame

#print(df)
df_original = df.copy()
df

Unnamed: 0,Timestamp,CPU_Usage,Memory_Usage,Disk_Usage,Network_Traffic,WebLogic_Requests,WebLogic_Heap_Usage,IO_Wait,Thread_Count,Response_Time,Database_Connections,JVM_GC_Count,JVM_GC_Time
0,2023-01-01 00:00:00,30.197401,87.635362,1.318149,751.266804,951,27.213015,85.015518,21,75.531343,48,49,816.171301
1,2023-01-01 01:00:00,88.941371,1.217253,30.245475,292.537195,131,24.182415,9.138269,88,956.687021,40,61,802.390758
2,2023-01-01 02:00:00,73.165089,68.258250,32.613166,418.010451,397,3.078107,40.972863,45,680.180810,19,29,511.505697
3,2023-01-01 03:00:00,38.838927,90.531147,14.940235,933.070253,244,85.304744,49.633783,5,805.066618,42,60,38.405221
4,2023-01-01 04:00:00,37.442944,95.431364,64.095636,418.247873,516,71.257146,61.267984,73,880.671577,5,48,445.645436
...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,2023-01-10 18:00:00,61.455113,25.007623,57.890077,104.821295,486,89.976409,17.161216,13,541.311784,8,14,888.909358
235,2023-01-10 19:00:00,52.576436,43.873514,7.847345,459.645100,611,6.114673,54.663898,7,384.910851,13,41,639.186571
236,2023-01-10 20:00:00,86.635364,11.458160,32.635947,117.145905,743,10.679366,58.380675,1,651.315081,49,70,108.731669
237,2023-01-10 21:00:00,66.021348,38.276508,5.662059,594.661713,294,38.356914,83.073778,43,637.298509,39,79,429.391313


In [2]:
# Importing necessary libraries for model implementation
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import datetime
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
import boto3
import pytz
import os
import warnings

In [3]:
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)

In [4]:

# Function to convert logtime into a fraction of the day

def fraction_of_day_to_hms(fraction): 

    seconds = int(round(86400 * fraction)) 

    hours, remainder = divmod(seconds, 3600) 

    minutes, seconds = divmod(remainder, 60) 

    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)

 

# Function to extract last 4 words of a column (Targetting JobNames)

def extract_last_4(JNAME):

    words = JNAME.split('_')

    last_4 = words[-4:]

    return pd.Series(last_4)

 

# Function to get running period (Morning, Afternoon, Evening, Night)

def get_period(time_str):

    hour = int(time_str.split(':')[0])

    if hour >= 21 or hour < 9:

        return 'Night'

    elif hour < 13:

        return 'Morning'

    elif hour < 17:

        return 'Afternoon'

    else:

        return 'Evening'

def time_to_seconds(time_value):

    time_str = str(time_value).strip()

    hours = int(time_str[:2])

    minutes = int(time_str[2:4])

    seconds = int(time_str[4:6])

 

    total_seconds = (hours * 3600) + (minutes * 60) + seconds

    fraction = total_seconds/86400

 

    return fraction

 

def add_one_if_seconds_over_30(row):

    time_obj = row['ActLogTime']

    seconds = time_obj.second

    if seconds >= 30:

        row['Time_label'] += 1

    return row

 

def drop_date_part(df, column_name):

    df[column_name] = df[column_name].dt.time

    return df

In [5]:
# Feature Engineering Steps

df = df.drop_duplicates()

df['ActLogTime'] = pd.to_datetime(df['Timestamp'], format='%H%M%S').dt.strftime("%Y-%m-%d %H:%M:%S").str.split().str[1]

#df['ActTime'] = df['ActLogTime'].apply(time_to_seconds)

df["ActLogDate"] = pd.to_datetime(pd.to_datetime(df['Timestamp'], format='%Y%m%d').dt.strftime("%Y-%m-%d %H:%M:%S").str.split().str[0])

df['Run_Period'] = df['ActLogTime'].apply(get_period)

df['Weekday'] = df['ActLogDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)

df['ActMonth'] = df['ActLogDate'].dt.month

df['ActDay'] = df['ActLogDate'].dt.day

 

time_labels = {}

for i in range(60*24):

    time_str = '{:02d}:{:02d}'.format(*divmod(i,60))

    label = i % (60*24)

    time_labels[time_str] = label

df['ActLogTime'] = pd.to_datetime(df['ActLogTime'] , format = '%H:%M:%S')

df['Time_label'] = df['ActLogTime'].apply(lambda x: time_labels[x.strftime('%H:%M')])

df = df.apply(add_one_if_seconds_over_30, axis=1)

 

df

Unnamed: 0,Timestamp,CPU_Usage,Memory_Usage,Disk_Usage,Network_Traffic,WebLogic_Requests,WebLogic_Heap_Usage,IO_Wait,Thread_Count,Response_Time,Database_Connections,JVM_GC_Count,JVM_GC_Time,ActLogTime,ActLogDate,Run_Period,Weekday,ActMonth,ActDay,Time_label
0,2023-01-01 00:00:00,30.197401,87.635362,1.318149,751.266804,951,27.213015,85.015518,21,75.531343,48,49,816.171301,1900-01-01 00:00:00,2023-01-01,Night,0,1,1,0
1,2023-01-01 01:00:00,88.941371,1.217253,30.245475,292.537195,131,24.182415,9.138269,88,956.687021,40,61,802.390758,1900-01-01 01:00:00,2023-01-01,Night,0,1,1,60
2,2023-01-01 02:00:00,73.165089,68.258250,32.613166,418.010451,397,3.078107,40.972863,45,680.180810,19,29,511.505697,1900-01-01 02:00:00,2023-01-01,Night,0,1,1,120
3,2023-01-01 03:00:00,38.838927,90.531147,14.940235,933.070253,244,85.304744,49.633783,5,805.066618,42,60,38.405221,1900-01-01 03:00:00,2023-01-01,Night,0,1,1,180
4,2023-01-01 04:00:00,37.442944,95.431364,64.095636,418.247873,516,71.257146,61.267984,73,880.671577,5,48,445.645436,1900-01-01 04:00:00,2023-01-01,Night,0,1,1,240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,2023-01-10 18:00:00,61.455113,25.007623,57.890077,104.821295,486,89.976409,17.161216,13,541.311784,8,14,888.909358,1900-01-01 18:00:00,2023-01-10,Evening,1,1,10,1080
235,2023-01-10 19:00:00,52.576436,43.873514,7.847345,459.645100,611,6.114673,54.663898,7,384.910851,13,41,639.186571,1900-01-01 19:00:00,2023-01-10,Evening,1,1,10,1140
236,2023-01-10 20:00:00,86.635364,11.458160,32.635947,117.145905,743,10.679366,58.380675,1,651.315081,49,70,108.731669,1900-01-01 20:00:00,2023-01-10,Evening,1,1,10,1200
237,2023-01-10 21:00:00,66.021348,38.276508,5.662059,594.661713,294,38.356914,83.073778,43,637.298509,39,79,429.391313,1900-01-01 21:00:00,2023-01-10,Night,1,1,10,1260


In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def min_max(df):
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()

    # Convert the 'Timestamp' column to Unix timestamps
    df_copy['timestamp'] = (df_copy['Timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # Select only the columns you want to normalize (excluding non-numeric columns)
    numeric_columns = df_copy.select_dtypes(include=['number']).columns
    numeric_df = df_copy[numeric_columns]

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()

    # Fit and transform the scaler on the numeric DataFrame
    normalized_data = scaler.fit_transform(numeric_df)

    # Create a DataFrame with the normalized data
    normalized_df = pd.DataFrame(normalized_data, columns=numeric_columns)

    # Rename columns with a suffix
    i = 0
    new_names = []
    for cname in normalized_df.columns:
        new_names.append(f'{cname}_{i}')
        i += 1
    normalized_df.columns = new_names

    return normalized_df, scaler


In [7]:
JOBNAME_enc = LabelEncoder()
df['Run_Period'] = JOBNAME_enc.fit_transform(df['Run_Period'])

In [8]:
df_n, scaler = min_max(df)

df_n

Unnamed: 0,CPU_Usage_0,Memory_Usage_1,Disk_Usage_2,Network_Traffic_3,WebLogic_Requests_4,WebLogic_Heap_Usage_5,IO_Wait_6,Thread_Count_7,Response_Time_8,Database_Connections_9,JVM_GC_Count_10,JVM_GC_Time_11,Run_Period_12,Weekday_13,ActMonth_14,ActDay_15,Time_label_16,timestamp_17
0,0.303988,0.881983,0.010433,0.754310,0.957317,0.260369,0.857173,0.202020,0.074050,0.959184,0.49,0.818418,1.000000,0.0,0.0,0.0,0.000000,0.000000
1,0.899663,0.008431,0.302198,0.291644,0.123984,0.229566,0.091229,0.878788,0.960665,0.795918,0.61,0.804516,1.000000,0.0,0.0,0.0,0.043478,0.004202
2,0.739688,0.686111,0.326079,0.418194,0.394309,0.015062,0.412584,0.444444,0.682446,0.367347,0.29,0.511070,1.000000,0.0,0.0,0.0,0.086957,0.008403
3,0.391615,0.911255,0.147828,0.937673,0.238821,0.850812,0.500012,0.040404,0.808105,0.836735,0.60,0.033804,1.000000,0.0,0.0,0.0,0.130435,0.012605
4,0.377459,0.960789,0.643615,0.418433,0.515244,0.708032,0.617454,0.727273,0.884179,0.081633,0.48,0.444630,1.000000,0.0,0.0,0.0,0.173913,0.016807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,0.620947,0.248915,0.581025,0.102317,0.484756,0.898295,0.172217,0.121212,0.542716,0.142857,0.14,0.891797,0.333333,1.0,0.0,1.0,0.782609,0.983193
235,0.530916,0.439619,0.076288,0.460186,0.611789,0.045926,0.550789,0.060606,0.385346,0.244898,0.41,0.639875,0.333333,1.0,0.0,1.0,0.826087,0.987395
236,0.876279,0.111951,0.326309,0.114747,0.745935,0.092321,0.588308,0.000000,0.653401,0.979592,0.70,0.104750,0.333333,1.0,0.0,1.0,0.869565,0.991597
237,0.667249,0.383042,0.054247,0.596361,0.289634,0.373635,0.837573,0.424242,0.639298,0.775510,0.79,0.428233,1.000000,1.0,0.0,1.0,0.913043,0.995798


In [9]:
def IsoForest(df):

    #df['timestamp'] = df['Timestamp'].astype(int) // 10**9 

    # Create an instance of the Isolation Forest model

    isolation_forest = IsolationForest(n_estimators=1000, contamination=0.05)

 

    # Fit the model to the data

    isolation_forest.fit(df)

 

    # Predict the anomalies

    anomaly_scores = isolation_forest.decision_function(df)

    anomaly_predictions = isolation_forest.predict(df)



    # Add the anomaly scores and predictions to the original DataFrame

    df['anomaly_score'] = anomaly_scores

    df['anomaly_prediction'] = anomaly_predictions

 

    return df

In [10]:
iso_df = IsoForest(df_n)

In [11]:
iso_df.shape

(239, 20)

In [12]:
def split_and_clean_df(df):


    df_train = df[df['anomaly_prediction'] == 1].copy()

    df_test = df[df['anomaly_prediction'] == -1].copy()

 

    columns_to_remove = ['anomaly_score', 'anomaly_prediction']

    df_train.drop(columns=columns_to_remove, inplace=True)

    df_test.drop(columns=columns_to_remove, inplace=True)

 

    return df_train, df_test

In [13]:
df_train, df_test = split_and_clean_df(iso_df)

print(f'shape of training {df_train.shape}')

print(f'shape of testing {df_test.shape}')

shape of training (227, 18)
shape of testing (12, 18)


In [14]:
def autoencoder_prep(df_act):

 

    print("Dataframe --> Matrix")

    matrix = df_act.to_numpy()


    print("Setting Dimensional values")

    # input_dim = 5

    # hidden_dim_1 = 4

    # hidden_dim_2 = 3

    # bottleneck_dim = 2

    # hidden_dim_3 = 3

    # hidden_dim_4 = 4

    # output_dim = 5


    input_dim = 18

    hidden_dim_1 = 12

    hidden_dim_2 = 7

    bottleneck_dim = 3

    hidden_dim_3 = 7

    hidden_dim_4 = 12

    output_dim = 18

 

    print("Specifying Layers of the Architecture")

    input_layer = tf.keras.layers.Input(shape=(input_dim))


    hidden_layer_1 = tf.keras.layers.Dense(hidden_dim_1, activation='relu')(input_layer)

    hidden_layer_2 = tf.keras.layers.Dense(hidden_dim_2, activation='relu')(hidden_layer_1)


    bottleneck_layer = tf.keras.layers.Dense(bottleneck_dim, activation='relu')(hidden_layer_2)


    hidden_layer_3 = tf.keras.layers.Dense(hidden_dim_3, activation='relu')(bottleneck_layer)

    hidden_layer_4 = tf.keras.layers.Dense(hidden_dim_4, activation='relu')(hidden_layer_3)


    output_layer = tf.keras.layers.Dense(output_dim, activation= 'sigmoid')(hidden_layer_4)


    return matrix , input_layer , hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4 , output_layer

In [15]:
matrix , input_layer , hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4 , output_layer = autoencoder_prep(df_train)

Dataframe --> Matrix
Setting Dimensional values
Specifying Layers of the Architecture


In [16]:
#Training Process

def autoencoder_train(input_layer, output_layer, matrix,  hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4, e):

 

    print("Building the Model")

    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)

 

    print("Training the Model")

    autoencoder.compile(optimizer='adam', loss='mse')

    autoencoder.fit(matrix, matrix, epochs=e, batch_size=64)

    print("Successfully Trained the Model")

 

    print("Calculate Reconstruction Error")

    reconstructed_data = autoencoder.predict(matrix)

    mse = np.mean(np.power(matrix - reconstructed_data, 2), axis=1)

    threshold = np.percentile(mse, 99.50) # Set threshold based on the 99.9th percentile

    print("Threshold:", threshold)


    return autoencoder, threshold

In [17]:

autoencoder, threshold = autoencoder_train(input_layer, output_layer, matrix,  hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4, 50)

Building the Model
Training the Model
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Successfully Trained the Model
Calculate Reconstruction Error
Threshold: 0.12541261957233002


In [18]:
#Identifying Anomalies

def autoencoder_anom(df_act, autoencoder, threshold, scaler):

    print("Test Dataframe --> Matrix")

 

    X = df_act.to_numpy()

 

    print("Identifying Anomalies")

 

    normalized_test_data = X

    reconstructed_test_data = autoencoder.predict(normalized_test_data)

 

    test_mse = np.mean(np.power(normalized_test_data - reconstructed_test_data, 2), axis=1)

 

    anomalies = X[test_mse > threshold]

 

    df_anom = pd.DataFrame(anomalies, columns=df_act.columns)


    df_anom["MSE"] = test_mse[test_mse > threshold]


    df_mse = pd.DataFrame(df_anom["MSE"], columns = ["MSE"])

    df_anom.drop(columns = ['MSE'], inplace = True)


    df_anom = scaler.inverse_transform(df_anom)

 

    return anomalies, df_anom, df_mse, normalized_test_data, reconstructed_test_data

In [19]:
anomalies, df_anom, df_mse, normalized_test_data, reconstructed_test_data = autoencoder_anom(df_test, autoencoder, threshold, scaler)

print(" ")

print("df_anom")

print(df_anom)

print(" ")

print("MSE")

print(df_mse)

Test Dataframe --> Matrix
Identifying Anomalies
 
df_anom
[[3.01974012e+01 8.76353616e+01 1.31814890e+00 7.51266804e+02
  9.51000000e+02 2.72130147e+01 8.50155185e+01 2.10000000e+01
  7.55313428e+01 4.80000000e+01 4.90000000e+01 8.16171301e+02
  3.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  0.00000000e+00 1.67253120e+09]
 [8.89413709e+01 1.21725318e+00 3.02454754e+01 2.92537195e+02
  1.31000000e+02 2.41824155e+01 9.13826921e+00 8.80000000e+01
  9.56687021e+02 4.00000000e+01 6.10000000e+01 8.02390758e+02
  3.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  6.00000000e+01 1.67253480e+09]
 [3.88389270e+01 9.05311473e+01 1.49402351e+01 9.33070253e+02
  2.44000000e+02 8.53047435e+01 4.96337831e+01 5.00000000e+00
  8.05066618e+02 4.20000000e+01 6.00000000e+01 3.84052212e+01
  3.00000000e+00 0.00000000e+00 1.00000000e+00 1.00000000e+00
  1.80000000e+02 1.67254200e+09]
 [5.46454609e-01 4.88525911e+01 7.58859346e+01 6.75407023e+01
  7.42000000e+02 8.07106344e+01 8.812

In [20]:
len(df.columns)

20

In [21]:
#Decoding

def autoencoder_decode(df_anom,df, JOBNAME_enc):


    df_anom = pd.DataFrame(df_anom)

    #df_anom = df_anom.rename(columns={0: "JOBNAME_0", 1: "ODateWeekNum_1", 2:"ODateWeekDay_2", 3:"HELD_3", 4:"FREED_4"})

    #new_names = []

    i=0

#     for cname in df.columns:

#         #print(cname)

#         new_names.append(f'{cname}_{i}')

#         i=i+1

    df_anom.columns = df.columns

    df_anom = df_anom.astype(int)

    #Decoding

    #print(new_names)

    #JOBNAME_enc = LabelEncoder()

    df_anom['Run_Period_12'] = JOBNAME_enc.inverse_transform(df_anom['Run_Period_12'])


    return df_anom

In [22]:
 df_anom = pd.DataFrame(df_anom)

In [23]:
df_anom

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,30.197401,87.635362,1.318149,751.266804,951.0,27.213015,85.015518,21.0,75.531343,48.0,49.0,816.171301,3.0,0.0,1.0,1.0,0.0,1672531000.0
1,88.941371,1.217253,30.245475,292.537195,131.0,24.182415,9.138269,88.0,956.687021,40.0,61.0,802.390758,3.0,0.0,1.0,1.0,60.0,1672535000.0
2,38.838927,90.531147,14.940235,933.070253,244.0,85.304744,49.633783,5.0,805.066618,42.0,60.0,38.405221,3.0,0.0,1.0,1.0,180.0,1672542000.0
3,0.546455,48.852591,75.885935,67.540702,742.0,80.710634,88.12763,88.0,985.732797,18.0,35.0,107.83273,3.0,0.0,1.0,1.0,480.0,1672560000.0
4,73.233092,98.644717,56.010905,495.555391,127.0,8.959627,21.051303,53.0,920.507509,43.0,83.0,233.802341,0.0,0.0,1.0,1.0,840.0,1672582000.0
5,66.215004,86.672214,88.124893,965.208655,39.0,25.135551,25.516672,65.0,174.648592,21.0,26.0,30.159599,0.0,0.0,1.0,1.0,960.0,1672589000.0
6,91.682333,32.979762,74.835842,962.27823,993.0,79.217749,4.3027,88.0,995.779948,3.0,6.0,590.870486,1.0,1.0,1.0,2.0,1200.0,1672690000.0
7,73.973604,73.749314,68.220298,976.990265,908.0,5.777651,14.119019,80.0,810.001566,15.0,95.0,139.734538,0.0,0.0,1.0,8.0,960.0,1673194000.0
8,3.026389,2.66337,85.555145,136.305143,820.0,16.806738,31.160544,93.0,614.596687,11.0,31.0,54.580862,3.0,1.0,1.0,10.0,1320.0,1673388000.0


In [24]:
df_test.columns

Index(['CPU_Usage_0', 'Memory_Usage_1', 'Disk_Usage_2', 'Network_Traffic_3',
       'WebLogic_Requests_4', 'WebLogic_Heap_Usage_5', 'IO_Wait_6',
       'Thread_Count_7', 'Response_Time_8', 'Database_Connections_9',
       'JVM_GC_Count_10', 'JVM_GC_Time_11', 'Run_Period_12', 'Weekday_13',
       'ActMonth_14', 'ActDay_15', 'Time_label_16', 'timestamp_17'],
      dtype='object')

In [25]:
df_anom = autoencoder_decode(df_anom,df_test,JOBNAME_enc)

df_anom_mse = pd.concat([df_anom, df_mse], axis = 1)

In [26]:
df_anom

Unnamed: 0,CPU_Usage_0,Memory_Usage_1,Disk_Usage_2,Network_Traffic_3,WebLogic_Requests_4,WebLogic_Heap_Usage_5,IO_Wait_6,Thread_Count_7,Response_Time_8,Database_Connections_9,JVM_GC_Count_10,JVM_GC_Time_11,Run_Period_12,Weekday_13,ActMonth_14,ActDay_15,Time_label_16,timestamp_17
0,30,87,1,751,951,27,85,21,75,48,49,816,Night,0,1,1,0,1672531200
1,88,1,30,292,131,24,9,88,956,40,61,802,Night,0,1,1,60,1672534800
2,38,90,14,933,244,85,49,5,805,42,60,38,Night,0,1,1,180,1672542000
3,0,48,75,67,742,80,88,88,985,18,35,107,Night,0,1,1,480,1672560000
4,73,98,56,495,127,8,21,52,920,43,83,233,Afternoon,0,1,1,840,1672581600
5,66,86,88,965,39,25,25,65,174,21,26,30,Afternoon,0,1,1,960,1672588800
6,91,32,74,962,993,79,4,88,995,3,6,590,Evening,1,1,2,1200,1672689600
7,73,73,68,976,908,5,14,80,810,14,95,139,Afternoon,0,1,8,960,1673193600
8,3,2,85,136,820,16,31,93,614,11,31,54,Night,1,1,10,1320,1673388000


In [27]:
df_anom_mse

Unnamed: 0,CPU_Usage_0,Memory_Usage_1,Disk_Usage_2,Network_Traffic_3,WebLogic_Requests_4,WebLogic_Heap_Usage_5,IO_Wait_6,Thread_Count_7,Response_Time_8,Database_Connections_9,JVM_GC_Count_10,JVM_GC_Time_11,Run_Period_12,Weekday_13,ActMonth_14,ActDay_15,Time_label_16,timestamp_17,MSE
0,30,87,1,751,951,27,85,21,75,48,49,816,Night,0,1,1,0,1672531200,0.157485
1,88,1,30,292,131,24,9,88,956,40,61,802,Night,0,1,1,60,1672534800,0.155207
2,38,90,14,933,244,85,49,5,805,42,60,38,Night,0,1,1,180,1672542000,0.145341
3,0,48,75,67,742,80,88,88,985,18,35,107,Night,0,1,1,480,1672560000,0.14196
4,73,98,56,495,127,8,21,52,920,43,83,233,Afternoon,0,1,1,840,1672581600,0.127305
5,66,86,88,965,39,25,25,65,174,21,26,30,Afternoon,0,1,1,960,1672588800,0.137459
6,91,32,74,962,993,79,4,88,995,3,6,590,Evening,1,1,2,1200,1672689600,0.138436
7,73,73,68,976,908,5,14,80,810,14,95,139,Afternoon,0,1,8,960,1673193600,0.133164
8,3,2,85,136,820,16,31,93,614,11,31,54,Night,1,1,10,1320,1673388000,0.13228


In [28]:
#Explainable AI

def autoencoder_insight(anomalies, normalized_test_data, reconstructed_test_data, df_anom):

 

    print("Anomaly Identification Complete")

    print("Refer to df_anom for anomalies")

    print(" ")

    print("Explaining each Anomalies")

    feature_contributions = np.abs(normalized_test_data - reconstructed_test_data)

 

    # Find the most important features for each anomaly

    most_important_features = np.argsort(feature_contributions, axis=1)[:, ::-1]

 

    # Print the most important features for each anomaly along with df_anom information

    for i, anomaly in enumerate(anomalies):

        print(f"Anomaly {i + 1}:")

        print(df_anom.iloc[i])  # Print df_anom information for the current anomaly

        print("Most important features:")

        for j, feature in enumerate(most_important_features[i]):

            print(f"   {j + 1}. Feature {feature}: Contribution = {feature_contributions[i][feature]}")

        print()

In [29]:
autoencoder_insight(anomalies, normalized_test_data, reconstructed_test_data, df_anom)

Anomaly Identification Complete
Refer to df_anom for anomalies
 
Explaining each Anomalies
Anomaly 1:
CPU_Usage_0                       30
Memory_Usage_1                    87
Disk_Usage_2                       1
Network_Traffic_3                751
WebLogic_Requests_4              951
WebLogic_Heap_Usage_5             27
IO_Wait_6                         85
Thread_Count_7                    21
Response_Time_8                   75
Database_Connections_9            48
JVM_GC_Count_10                   49
JVM_GC_Time_11                   816
Run_Period_12                  Night
Weekday_13                         0
ActMonth_14                        1
ActDay_15                          1
Time_label_16                      0
timestamp_17              1672531200
Name: 0, dtype: object
Most important features:
   1. Feature 13: Contribution = 0.7966412901878357
   2. Feature 15: Contribution = 0.5207325220108032
   3. Feature 2: Contribution = 0.5185960106861639
   4. Feature 17: Contributio

In [30]:
df_anom

Unnamed: 0,CPU_Usage_0,Memory_Usage_1,Disk_Usage_2,Network_Traffic_3,WebLogic_Requests_4,WebLogic_Heap_Usage_5,IO_Wait_6,Thread_Count_7,Response_Time_8,Database_Connections_9,JVM_GC_Count_10,JVM_GC_Time_11,Run_Period_12,Weekday_13,ActMonth_14,ActDay_15,Time_label_16,timestamp_17
0,30,87,1,751,951,27,85,21,75,48,49,816,Night,0,1,1,0,1672531200
1,88,1,30,292,131,24,9,88,956,40,61,802,Night,0,1,1,60,1672534800
2,38,90,14,933,244,85,49,5,805,42,60,38,Night,0,1,1,180,1672542000
3,0,48,75,67,742,80,88,88,985,18,35,107,Night,0,1,1,480,1672560000
4,73,98,56,495,127,8,21,52,920,43,83,233,Afternoon,0,1,1,840,1672581600
5,66,86,88,965,39,25,25,65,174,21,26,30,Afternoon,0,1,1,960,1672588800
6,91,32,74,962,993,79,4,88,995,3,6,590,Evening,1,1,2,1200,1672689600
7,73,73,68,976,908,5,14,80,810,14,95,139,Afternoon,0,1,8,960,1673193600
8,3,2,85,136,820,16,31,93,614,11,31,54,Night,1,1,10,1320,1673388000


In [31]:
final_anom_df = df_anom.copy()

In [32]:
modified_list = [s.rsplit('_', 1)[0] for s in final_anom_df.columns]
final_anom_df.columns  = modified_list

In [33]:
final_anom_df

Unnamed: 0,CPU_Usage,Memory_Usage,Disk_Usage,Network_Traffic,WebLogic_Requests,WebLogic_Heap_Usage,IO_Wait,Thread_Count,Response_Time,Database_Connections,JVM_GC_Count,JVM_GC_Time,Run_Period,Weekday,ActMonth,ActDay,Time_label,timestamp
0,30,87,1,751,951,27,85,21,75,48,49,816,Night,0,1,1,0,1672531200
1,88,1,30,292,131,24,9,88,956,40,61,802,Night,0,1,1,60,1672534800
2,38,90,14,933,244,85,49,5,805,42,60,38,Night,0,1,1,180,1672542000
3,0,48,75,67,742,80,88,88,985,18,35,107,Night,0,1,1,480,1672560000
4,73,98,56,495,127,8,21,52,920,43,83,233,Afternoon,0,1,1,840,1672581600
5,66,86,88,965,39,25,25,65,174,21,26,30,Afternoon,0,1,1,960,1672588800
6,91,32,74,962,993,79,4,88,995,3,6,590,Evening,1,1,2,1200,1672689600
7,73,73,68,976,908,5,14,80,810,14,95,139,Afternoon,0,1,8,960,1673193600
8,3,2,85,136,820,16,31,93,614,11,31,54,Night,1,1,10,1320,1673388000


In [34]:
df_original.columns

Index(['Timestamp', 'CPU_Usage', 'Memory_Usage', 'Disk_Usage',
       'Network_Traffic', 'WebLogic_Requests', 'WebLogic_Heap_Usage',
       'IO_Wait', 'Thread_Count', 'Response_Time', 'Database_Connections',
       'JVM_GC_Count', 'JVM_GC_Time'],
      dtype='object')

In [48]:
columns_keep =['CPU_Usage'] #, 'Memory_Usage', 'Disk_Usage','Network_Traffic', 'WebLogic_Requests', 'WebLogic_Heap_Usage',]
#'IO_Wait', 'Thread_Count', 'Response_Time', 'Database_Connections','JVM_GC_Count', 'JVM_GC_Time']

In [49]:
final_anom_df.columns

Index(['CPU_Usage', 'Memory_Usage', 'Disk_Usage', 'Network_Traffic',
       'WebLogic_Requests', 'WebLogic_Heap_Usage', 'IO_Wait', 'Thread_Count',
       'Response_Time', 'Database_Connections', 'JVM_GC_Count', 'JVM_GC_Time',
       'Run_Period', 'Weekday', 'ActMonth', 'ActDay', 'Time_label',
       'timestamp'],
      dtype='object')

In [50]:
final_df = pd.merge(df_original,final_anom_df[columns_keep], on=columns_keep, how='inner')
final_df

Unnamed: 0,Timestamp,CPU_Usage,Memory_Usage,Disk_Usage,Network_Traffic,WebLogic_Requests,WebLogic_Heap_Usage,IO_Wait,Thread_Count,Response_Time,Database_Connections,JVM_GC_Count,JVM_GC_Time
