In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM, SVR
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error, explained_variance_score




from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec


In [2]:
# Load the merged log dataset from its CSV export into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../merged_dataset/merged_logs20000.csv')

In [3]:
# Word2Vec for feature extraction
def tokenize_message(message):
    """Lower-case ``message`` and split it on runs of whitespace.

    Returns the resulting list of tokens (empty for a blank message).
    """
    lowered = message.lower()
    tokens = lowered.split()
    return tokens

# Train a skip-gram (sg=1) Word2Vec model on the tokenized messages.
tokenized_messages = df['message'].apply(tokenize_message)
word2vec_model = Word2Vec(sentences=tokenized_messages, vector_size=100, window=5, min_count=1, workers=4, sg=1)


def _mean_embedding(tokens):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one message.

    A message with no usable tokens (e.g. an empty or whitespace-only string)
    gets an all-zero vector. Without this fallback, np.mean over an empty list
    yields a scalar NaN, which cannot be stacked with the per-message vectors.
    """
    vectors = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if not vectors:
        return np.zeros(word2vec_model.wv.vector_size, dtype=word2vec_model.wv.vectors.dtype)
    return np.mean(vectors, axis=0)


# One embedding per message; reuse the token lists computed above instead of
# tokenizing every message a second time.
df['word2vec_vector'] = tokenized_messages.apply(_mean_embedding)
word2vec_features = np.vstack(df['word2vec_vector'].values)

# Dimensionality reduction with PCA: keep 10 components of the embeddings.
# random_state is pinned (same seed as the train/test split) so the projection
# is repeatable if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=10, random_state=42)
word2vec_reduced = pca.fit_transform(word2vec_features)

# Turn message_length into a single-column 2-D array so it can be stacked
# side by side with the reduced embeddings.
message_length_reshaped = df['message_length'].to_numpy().reshape(-1, 1)

# Feature matrix: the PCA components followed by the message length.
X = np.hstack([word2vec_reduced, message_length_reshaped])

# Target variable
y = df['combined_anomaly']


In [4]:
# Normalising and Handling Imbalance
# Only feature standardisation is performed in this cell; no resampling step
# is applied here.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split done in the next cell - confirm this is intended.

# Standardize features
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)






In [5]:
# Support-vector regression on the standardized features.
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit a linear-kernel SVR.
# gamma is kept from the original call; per the scikit-learn docs it only
# parameterizes the rbf/poly/sigmoid kernels.
svr = SVR(kernel='linear', gamma='scale')
svr.fit(X_train, y_train)

# Continuous predictions for the held-out rows.
y_pred = svr.predict(X_test)

# Evaluate the model with regression metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R^2 Score: {r2}')
print(f'Explained Variance Score: {explained_variance}')

# Log misclassified instances: held-out rows whose prediction is more than 0.5
# away from the target.
# NOTE(review): the 0.5 cut-off presumably treats combined_anomaly as a 0/1 label - confirm.
# misclassified_indices are positions *within the test split*. The split is
# shuffled, so they must be mapped back through y_test's index labels before
# selecting from df; using them directly with df.iloc would pick unrelated rows.
misclassified_indices = np.where(np.abs(y_pred - y_test.to_numpy()) > 0.5)[0]
misclassified_labels = y_test.index[misclassified_indices]
misclassified_samples = df.loc[misclassified_labels]

# Save misclassified samples to a CSV file for further analysis
misclassified_samples.to_csv('../log/misclassified_samples.csv', index=False)






Mean Squared Error: 0.01452621395822239
Root Mean Squared Error (RMSE): 0.12052474417405909
Mean Absolute Error (MAE): 0.10524192098242793
R^2 Score: 0.7603509296376346
Explained Variance Score: 0.7903426630852783
