In [None]:
[9:33 PM] Piumika, Tinul

import pandas as pd

import random

from datetime import datetime, timedelta

 

# Generate dummy timestamps

# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# dummy data set on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

start_time = datetime(2023, 1, 1, 0, 0, 0)
end_time = datetime(2023, 1, 10, 23, 0, 0)
time_range = end_time - start_time

# One sample per hour, inclusive of BOTH start_time and end_time.
# The "+ 1" matters: without it the last hour (end_time itself) is dropped.
n_hours = int(time_range.total_seconds() // 3600)
timestamps = [start_time + timedelta(hours=i) for i in range(n_hours + 1)]

# Generate dummy OS and WebLogic metrics
# Build a list of records first and create the DataFrame once at the end.
data = []
for timestamp in timestamps:
    data.append({
        'Timestamp': timestamp,
        'CPU_Usage': random.uniform(0, 100),            # CPU usage percentage
        'Memory_Usage': random.uniform(0, 100),         # memory usage percentage
        'Disk_Usage': random.uniform(0, 100),           # disk usage percentage
        'Network_Traffic': random.uniform(0, 1000),     # network traffic in KB/s
        'WebLogic_Requests': random.randint(0, 1000),   # WebLogic request count
        'WebLogic_Heap_Usage': random.uniform(0, 100),  # WebLogic heap usage percentage
        'IO_Wait': random.uniform(0, 100),              # I/O wait percentage
        'Thread_Count': random.randint(1, 100),         # thread count
        'Response_Time': random.uniform(0, 1000),       # response time in ms
        'Database_Connections': random.randint(1, 50),  # open database connections
        'JVM_GC_Count': random.randint(0, 100),         # JVM garbage collection count
        'JVM_GC_Time': random.uniform(0, 1000),         # JVM garbage collection time in ms
    })

# Create DataFrame
df = pd.DataFrame(data)

# Display a preview of the DataFrame (not the whole frame)
df.head()

In [None]:
[9:34 PM] Piumika, Tinul

# Feature Engineering Steps
# Derives time-of-day, date and calendar features from the 'Timestamp' column.
# NOTE(review): get_period and add_one_if_seconds_over_30 are not defined in the
# visible cells; they must already exist in the kernel for this cell to run.

# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Time-of-day as an 'HH:MM:SS' string: the timestamp is rendered as
# 'YYYY-MM-DD HH:MM:SS', split on whitespace, and the second piece is kept.
# NOTE(review): 'Timestamp' is built from datetime objects in the data-generation
# cell, so the format='%H%M%S' argument looks unnecessary here - confirm.
df['ActLogTime'] = pd.to_datetime(df['Timestamp'], format='%H%M%S').dt.strftime("%Y-%m-%d %H:%M:%S").str.split().str[1]

#df['ActTime'] = df['ActLogTime'].apply(time_to_seconds)

# Calendar date: same render-and-split trick, keeping the first piece
# ('YYYY-MM-DD') and parsing it back into a datetime value.
# NOTE(review): same remark as above for format='%Y%m%d' - confirm.
df["ActLogDate"] = pd.to_datetime(pd.to_datetime(df['Timestamp'], format='%Y%m%d').dt.strftime("%Y-%m-%d %H:%M:%S").str.split().str[0])

# get_period receives the 'HH:MM:SS' string; presumably it maps the time of day
# to a named period - verify against its definition.
df['Run_Period'] = df['ActLogTime'].apply(get_period)

# 1 for Monday-Friday (weekday() 0-4), 0 for Saturday/Sunday.
df['Weekday'] = df['ActLogDate'].apply(lambda x: 1 if x.weekday() < 5 else 0)

df['ActMonth'] = df['ActLogDate'].dt.month

df['ActDay'] = df['ActLogDate'].dt.day

 

 

# Lookup table 'HH:MM' -> minute of the day (0 .. 1439).
time_labels = {}

for i in range(60*24):

    # divmod(i, 60) yields (hour, minute) for minute-of-day i.
    time_str = '{:02d}:{:02d}'.format(*divmod(i,60))

    # i is always below 60*24 here, so the modulo leaves it unchanged.
    label = i % (60*24)

    time_labels[time_str] = label

 

# Re-parse the 'HH:MM:SS' strings into datetime values so strftime can be used below.
df['ActLogTime'] = pd.to_datetime(df['ActLogTime'] , format = '%H:%M:%S')

# Minute-of-day label looked up by the 'HH:MM' part of the time.
df['Time_label'] = df['ActLogTime'].apply(lambda x: time_labels[x.strftime('%H:%M')])

# Applied row by row (axis=1). Presumably bumps Time_label when the seconds part
# exceeds 30 - verify against the helper's definition.
df = df.apply(add_one_if_seconds_over_30, axis=1)

 

df

In [None]:
[9:34 PM] Piumika, Tinul

def min_max(df):
    """Min-max scale every column of ``df`` and return the scaled frame and scaler.

    Side effect (kept on purpose): a ``'timestamp'`` column holding the
    ``'Timestamp'`` value as whole seconds since the epoch is added to ``df``
    in place. ``autoencoder_decode`` later rebuilds column names from the
    caller's frame, so that frame must contain the same columns that were scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``'Timestamp'`` column. Every column is handed
        to ``MinMaxScaler``.
        NOTE(review): the feature-engineering cell also leaves datetime columns
        ('Timestamp', 'ActLogTime', 'ActLogDate') in the frame; confirm that
        the scaler accepts them, or convert/drop them before calling.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The scaled data with a fresh ``RangeIndex`` and columns renamed to
        ``'<original name>_<position>'``, plus the fitted scaler (needed for
        ``inverse_transform`` later).
    """
    # Explicit 'int64': plain ``int`` can resolve to a 32-bit type on some
    # platforms, which cannot hold nanosecond timestamps.
    df['timestamp'] = df['Timestamp'].astype('int64') // 10**9

    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(df)

    # Suffix each name with its position so the columns stay unique and traceable.
    new_names = [f'{cname}_{i}' for i, cname in enumerate(df.columns)]
    normalized_df = pd.DataFrame(normalized_data, columns=new_names)

    return normalized_df, scaler

In [None]:
[9:34 PM] Piumika, Tinul

# Replace the Run_Period labels with the codes produced by LabelEncoder.
# The fitted encoder stays available at module level as JOBNAME_enc.
# NOTE(review): the variable name mentions JOBNAME, but it is fitted on Run_Period.
JOBNAME_enc = LabelEncoder()

df['Run_Period'] = JOBNAME_enc.fit_transform(df['Run_Period'])

In [None]:
[9:34 PM] Piumika, Tinul

# Scale the engineered frame; the fitted scaler is kept because
# autoencoder_anom needs it to map anomalies back to original units.
df_n, scaler = min_max(df)

# Display the scaled frame.
df_n

In [None]:
[9:35 PM] Piumika, Tinul

def IsoForest(df, n_estimators=1000, contamination=0.05, random_state=None):
    """Score every row of ``df`` with an Isolation Forest.

    The input frame is NOT modified. Scores are written to a copy, so
    re-running the calling cell never feeds the previous run's
    ``anomaly_score`` / ``anomaly_prediction`` columns back into the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature frame; every column is used as a feature.
    n_estimators : int, default 1000
        Number of trees in the forest.
    contamination : float, default 0.05
        Expected share of outliers; sets the prediction cut-off.
    random_state : int or None, default None
        Pass an integer for reproducible results.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with two extra columns: ``anomaly_score`` (the
        ``decision_function`` value) and ``anomaly_prediction`` (the
        ``predict`` value; the notebook treats 1 as normal and -1 as anomalous).
    """
    # Create an instance of the Isolation Forest model
    isolation_forest = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state,
    )

    # Fit the model to the data
    isolation_forest.fit(df)

    # Predict the anomalies
    anomaly_scores = isolation_forest.decision_function(df)
    anomaly_predictions = isolation_forest.predict(df)

    # Attach the results to a copy so the caller's feature frame stays clean.
    scored = df.copy()
    scored['anomaly_score'] = anomaly_scores
    scored['anomaly_prediction'] = anomaly_predictions

    return scored

In [None]:
[9:35 PM] Piumika, Tinul

# Score the scaled frame; iso_df carries the anomaly_score and
# anomaly_prediction columns that split_and_clean_df filters on.
iso_df = IsoForest(df_n)

In [None]:
[9:35 PM] Piumika, Tinul

def split_and_clean_df(df):
    """Separate Isolation Forest inliers from outliers.

    Rows whose ``anomaly_prediction`` is 1 form the first frame, rows whose
    value is -1 form the second. Both results are independent of ``df``,
    keep the original row index, and no longer contain the two helper
    columns ``anomaly_score`` and ``anomaly_prediction``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)`` - inliers first, outliers second.
    """
    helper_columns = ['anomaly_score', 'anomaly_prediction']

    is_inlier = df['anomaly_prediction'] == 1
    is_outlier = df['anomaly_prediction'] == -1

    # Boolean .loc selection yields a new frame; .drop returns another new one.
    df_train = df.loc[is_inlier].drop(columns=helper_columns)
    df_test = df.loc[is_outlier].drop(columns=helper_columns)

    return df_train, df_test

In [None]:
[9:35 PM] Piumika, Tinul

# Inliers become the autoencoder's training set, outliers its test set.
df_train, df_test = split_and_clean_df(iso_df)

# Report both shapes with a single call; the output is the same two lines.
print(
    f'shape of training {df_train.shape}',
    f'shape of testing {df_test.shape}',
    sep='\n',
)

In [None]:
[9:35 PM] Piumika, Tinul

def autoencoder_prep(df_act, hidden_dims=(15, 9), bottleneck_dim=4):
    """Convert the training frame to a matrix and lay out the autoencoder.

    The network is symmetric:
    ``input -> hidden_dims[0] -> hidden_dims[1] -> bottleneck ->
    hidden_dims[1] -> hidden_dims[0] -> output``. The input and output widths
    are taken from the number of columns of ``df_act`` instead of being
    hard-coded, so the layout follows the data if a feature is added or removed.

    Parameters
    ----------
    df_act : pandas.DataFrame
        Training data; one column per feature. The output layer uses a
        sigmoid, so values are expected to be scaled to [0, 1].
    hidden_dims : tuple of two ints, default (15, 9)
        Widths of the first and second encoder layers (mirrored in the decoder).
    bottleneck_dim : int, default 4
        Width of the bottleneck layer.

    Returns
    -------
    tuple
        ``(matrix, input_layer, hidden_layer_1, hidden_layer_2,
        bottleneck_layer, hidden_layer_3, hidden_layer_4, output_layer)``.
    """
    print("Dataframe --> Matrix")
    matrix = df_act.to_numpy()

    print("Setting Dimensional values")
    input_dim = matrix.shape[1]
    hidden_dim_1, hidden_dim_2 = hidden_dims
    hidden_dim_3, hidden_dim_4 = hidden_dim_2, hidden_dim_1  # mirror of the encoder
    output_dim = input_dim  # an autoencoder reconstructs its own input

    print("Specifying Layers of the Architecture")
    # shape must be a tuple; "(input_dim)" without the comma is just an int.
    input_layer = tf.keras.layers.Input(shape=(input_dim,))

    hidden_layer_1 = tf.keras.layers.Dense(hidden_dim_1, activation='relu')(input_layer)
    hidden_layer_2 = tf.keras.layers.Dense(hidden_dim_2, activation='relu')(hidden_layer_1)

    bottleneck_layer = tf.keras.layers.Dense(bottleneck_dim, activation='relu')(hidden_layer_2)

    hidden_layer_3 = tf.keras.layers.Dense(hidden_dim_3, activation='relu')(bottleneck_layer)
    hidden_layer_4 = tf.keras.layers.Dense(hidden_dim_4, activation='relu')(hidden_layer_3)

    output_layer = tf.keras.layers.Dense(output_dim, activation= 'sigmoid')(hidden_layer_4)

    return matrix , input_layer , hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4 , output_layer

In [None]:
[9:36 PM] Piumika, Tinul

# Build the training matrix and the layer graph from the inlier rows (df_train).
matrix , input_layer , hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4 , output_layer = autoencoder_prep(df_train)

In [None]:
[9:37 PM] Piumika, Tinul

#Training Process

def autoencoder_train(input_layer, output_layer, matrix,  hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4, e, batch_size=64, percentile=99.5):
    """Build and train the autoencoder, then derive the anomaly threshold.

    Parameters
    ----------
    input_layer, output_layer
        Ends of the Keras layer graph; the model is built from these two.
    matrix : numpy.ndarray
        Training data; used as both input and reconstruction target.
    hidden_layer_1, hidden_layer_2, bottleneck_layer, hidden_layer_3, hidden_layer_4
        Not used inside this function (the graph is reachable from
        ``input_layer``/``output_layer``); kept so existing calls keep working.
    e : int
        Number of training epochs.
    batch_size : int, default 64
        Mini-batch size passed to ``fit``.
    percentile : float, default 99.5
        Percentile of the training reconstruction error used as threshold.

    Returns
    -------
    (tf.keras.Model, float)
        The trained model and the reconstruction-error threshold.
    """
    print("Building the Model")
    autoencoder = tf.keras.models.Model(inputs=input_layer, outputs=output_layer)

    print("Training the Model")
    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(matrix, matrix, epochs=e, batch_size=batch_size)
    print("Successfully Trained the Model")

    print("Calculate Reconstruction Error")
    reconstructed_data = autoencoder.predict(matrix)
    # Per-row mean squared reconstruction error.
    mse = np.mean(np.power(matrix - reconstructed_data, 2), axis=1)
    # Rows whose error exceeds this percentile of the training error are anomalies.
    threshold = np.percentile(mse, percentile)
    print("Threshold:", threshold)

    return autoencoder, threshold

In [None]:
[9:36 PM] Piumika, Tinul

# Train for 50 epochs (the last argument is the epoch count `e`).
autoencoder, threshold = autoencoder_train(input_layer, output_layer, matrix,  hidden_layer_1 , hidden_layer_2 , bottleneck_layer , hidden_layer_3 , hidden_layer_4, 50)

In [None]:
[9:37 PM] Piumika, Tinul

#Identifying Anomalies

def autoencoder_anom(df_act, autoencoder, threshold, scaler):
    """Flag rows whose reconstruction error exceeds ``threshold``.

    Parameters
    ----------
    df_act : pandas.DataFrame
        Already-scaled rows to check.
    autoencoder
        Trained model; only its ``predict`` method is used.
    threshold : float
        Rows with a mean squared reconstruction error strictly above this
        value are reported as anomalies.
    scaler
        Fitted scaler; its ``inverse_transform`` maps the anomalous rows back
        to original units.

    Returns
    -------
    tuple
        ``(anomalies, df_anom, df_mse, normalized_test_data,
        reconstructed_test_data)`` where

        * ``anomalies`` - scaled anomalous rows (ndarray),
        * ``df_anom`` - the same rows after ``scaler.inverse_transform``,
          or an empty ``(0, n_features)`` array when nothing was flagged,
        * ``df_mse`` - DataFrame with one ``"MSE"`` column, row-aligned with
          ``anomalies``,
        * ``normalized_test_data`` / ``reconstructed_test_data`` - the full
          input matrix and its reconstruction.
    """
    print("Test Dataframe --> Matrix")
    X = df_act.to_numpy()

    print("Identifying Anomalies")
    normalized_test_data = X
    reconstructed_test_data = autoencoder.predict(normalized_test_data)

    # Per-row mean squared reconstruction error.
    test_mse = np.mean(np.power(normalized_test_data - reconstructed_test_data, 2), axis=1)

    is_anomaly = test_mse > threshold
    anomalies = X[is_anomaly]
    df_mse = pd.DataFrame({"MSE": test_mse[is_anomaly]})

    if len(anomalies) == 0:
        # Scalers reject zero-sample input, so skip the inverse transform and
        # hand back an empty array with the right number of columns.
        df_anom = anomalies.copy()
    else:
        df_anom = scaler.inverse_transform(
            pd.DataFrame(anomalies, columns=df_act.columns)
        )

    return anomalies, df_anom, df_mse, normalized_test_data, reconstructed_test_data

In [None]:
[9:37 PM] Piumika, Tinul

# Run the autoencoder over the Isolation Forest outliers (df_test).
anomalies, df_anom, df_mse, normalized_test_data, reconstructed_test_data = autoencoder_anom(df_test, autoencoder, threshold, scaler)

# One print call, one item per line: spacer, heading, anomalies, spacer, heading, errors.
print(" ", "df_anom", df_anom, " ", "MSE", df_mse, sep="\n")

In [None]:
[9:37 PM] Piumika, Tinul

#Decoding

def autoencoder_decode(df_anom, df, encoder=None):
    """Turn the inverse-scaled anomaly matrix back into a readable frame.

    Parameters
    ----------
    df_anom : array-like
        Anomalous rows in original units (output of ``scaler.inverse_transform``).
    df : pandas.DataFrame
        Frame whose columns were scaled; supplies the column names. It must
        have as many columns as ``df_anom`` and contain ``'Run_Period'``.
    encoder : fitted label encoder, optional
        Object with ``inverse_transform`` that was fitted on ``Run_Period``.
        Defaults to the module-level ``JOBNAME_enc`` created in the
        label-encoding cell. A freshly constructed encoder cannot be used: it
        has never seen the labels, so it cannot decode them.

    Returns
    -------
    pandas.DataFrame
        Columns named ``'<original name>_<position>'``. All values are cast
        to ``int`` (fractions are truncated), except the Run_Period column,
        which holds the decoded labels.

    Raises
    ------
    KeyError
        If ``df`` has no ``'Run_Period'`` column.
    """
    if encoder is None:
        # Fitted encoder from the notebook's label-encoding cell.
        encoder = JOBNAME_enc

    source_columns = list(df.columns)
    if 'Run_Period' not in source_columns:
        raise KeyError("'Run_Period' column not found in the reference frame")

    new_names = [f'{cname}_{i}' for i, cname in enumerate(source_columns)]
    # Locate the label column by name instead of assuming it sits at position 15.
    label_col = new_names[source_columns.index('Run_Period')]

    decoded = pd.DataFrame(df_anom)
    decoded.columns = new_names

    # Inverse scaling can return e.g. 1.9999999 for class 2; round BEFORE the
    # integer cast so the code is not truncated to the wrong class.
    label_codes = decoded[label_col].round().astype(int)

    decoded = decoded.astype(int)

    #Decoding
    decoded[label_col] = encoder.inverse_transform(label_codes)

    return decoded

In [None]:
[9:38 PM] Piumika, Tinul

# Rebuild a labelled frame from the inverse-scaled anomalies, using the
# column names of the engineered frame `df`.
df_anom = autoencoder_decode(df_anom,df)

# Put each anomaly next to its reconstruction error (column-wise concat).
df_anom_mse = pd.concat([df_anom, df_mse], axis = 1)

In [None]:
[9:38 PM] Piumika, Tinul

#Explainable AI

def autoencoder_insight(anomalies, normalized_test_data, reconstructed_test_data, df_anom):
    """Print, for every anomaly, its features ranked by reconstruction error.

    ``anomalies`` is the subset of ``normalized_test_data`` whose per-row mean
    squared error exceeded the threshold, so anomaly *i* is generally NOT row
    *i* of the test matrix. The matching rows are recovered first: the
    anomalies are exactly the ``len(anomalies)`` rows with the largest error,
    taken in their original order.

    Parameters
    ----------
    anomalies : sequence
        Anomalous rows as returned by ``autoencoder_anom`` (only the count is used).
    normalized_test_data, reconstructed_test_data : numpy.ndarray
        Full test matrix and its reconstruction, same shape.
    df_anom : pandas.DataFrame
        Decoded anomalies, row-aligned with ``anomalies``; printed for context.

    Raises
    ------
    ValueError
        If more anomalies are passed than there are test rows.
    """
    print("Anomaly Identification Complete")
    print("Refer to df_anom for anomalies")
    print(" ")
    print("Explaining each Anomalies")

    difference = normalized_test_data - reconstructed_test_data
    feature_contributions = np.abs(difference)

    n_anomalies = len(anomalies)
    n_rows = feature_contributions.shape[0]
    if n_anomalies > n_rows:
        raise ValueError("more anomalies than rows in the test data")

    # Same error measure that autoencoder_anom compares with the threshold.
    row_mse = np.mean(np.power(difference, 2), axis=1)
    # NaN never exceeds a threshold, so rank it lowest.
    sortable_mse = np.where(np.isnan(row_mse), -np.inf, row_mse)
    ranking = np.argsort(sortable_mse, kind='stable')
    # Top-k rows by error, put back in original order to line up with df_anom.
    anomaly_rows = np.sort(ranking[n_rows - n_anomalies:])

    # Print the most important features for each anomaly along with df_anom information
    for i, row in enumerate(anomaly_rows):
        print(f"Anomaly {i + 1}:")
        print(df_anom.iloc[i])  # Print df_anom information for the current anomaly
        print("Most important features:")

        contributions = feature_contributions[row]
        # Largest absolute reconstruction error first.
        for j, feature in enumerate(np.argsort(contributions)[::-1]):
            print(f"   {j + 1}. Feature {feature}: Contribution = {contributions[feature]}")
        print()