In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping



#Load data
df = pd.read_csv('../merged_dataset/merged_logs20000.csv')

messages = df['message'].astype(str)

#Preprocess the column
max_words = 10000
max_len = 100


tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

#Preprocess numeral features
df['timestamp'] = pd.to_datetime(df['date']).astype(int) /10**9

#Normalise the numerical features 
scaler = StandardScaler()
numerical_features = scaler.fit_transform(df[['message_length', 'timestamp']])

# Concatenate padded sequences with numerical features
X_combined = np.hstack((padded_sequences, numerical_features))

#Binarize the combined_anomaly column
#threshold = df['combined_anomaly'].mean()
threshold = 0.5
labels = (df['combined_anomaly'] > threshold).astype(int)

#Train and split 
X_train, X_test, y_train, y_test = train_test_split(X_combined, labels, test_size=0.2, random_state=42)

#Init LSTM 
embedding_dim = 128

#Model outperformed and hence has faced overfitting with those hyperparameters
'''input_shape = X_train.shape[1]

text_input = Input(shape=(input_shape,))
x = Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len)(text_input)
x = LSTM(256, return_sequences=True)(x)
x = Dropout(0.2)(x)
x = LSTM(128)(x)
x = Dropout(0.2)(x)
output = Dense(1, activation='sigmoid')(x)'''


model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))


#Model outperformed and hence has faced overfitting with those hyperparameters
'''model = Model(inputs=text_input, outputs=output)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

#Train the model
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2, callbacks=[early_stopping])'''

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

#Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

#Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"LSTM Model Accuracy: {accuracy}")

#Detect anomalies
y_pred = model.predict(X_test)
anomalies = (y_pred > 0.5).astype(int)

Epoch 1/10




[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 97ms/step - accuracy: 0.7950 - loss: 0.3925 - val_accuracy: 0.7097 - val_loss: 0.5000
Epoch 2/10
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 96ms/step - accuracy: 0.7166 - loss: 0.4907 - val_accuracy: 0.7097 - val_loss: 0.4927
Epoch 3/10
[1m576/800[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m20s[0m 91ms/step - accuracy: 0.7204 - loss: 0.4844

KeyboardInterrupt: 