## Isolation Forest

In [15]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import IsolationForest
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file
anomaly_ratio = 0.04 # Estimate the ratio of anomaly samples in the data

if __name__ == '__main__':
    (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
                                                                label_file=label_file,
                                                                window='session', 
                                                                train_ratio=0.70,
                                                                split_type='uniform')
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train)
    x_test = feature_extractor.transform(x_test)

    model = IsolationForest(contamination=anomaly_ratio)
    model.fit(x_train)

    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    
    print('Test validation:')
    precision, arecall, f1 = model.evaluate(x_test, y_test)



Loading ../data/HDFS/HDFS_100k.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...
1   blk_7503483334202473044  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
2  blk_-3544583377289625738  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
3  blk_-9073992586687739851  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
4   blk_7854771516489510256  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
219 94
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 5557 instances, 219 anomaly, 5338 normal
Test: 2383 instances, 94 anomaly, 2289 normal

Train data shape: 5557-by-16

Test data shape: 2383-by-16

Train validation:
Precision: 1.000, recall: 0.402, F1-measure: 0.573

Test validation:
Precision: 0.972, recall: 0.372, F1-measure: 0.538



In [11]:
sum(y_train)/len(y_train)

0.039304610733182165

In [28]:
import sys
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from loglizer.models import IsolationForest
from loglizer import dataloader, preprocessing

sys.path.append('../')

# File paths
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # Structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # Anomaly label file

# Load dataset
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session', train_ratio=0.50, split_type='uniform'
)

# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

# Hyperparameter grid
contamination_values = [ 0.04]
n_estimators_values = [100, 200, 300]
max_samples_values = [0.5, 0.75, 1.0]

# Function to evaluate model
def evaluate_model(model, x_data, y_true):
    y_pred = model.predict(x_data)
    y_pred = np.where(y_pred == 1, 0, 1)  # Convert {1: normal, -1: anomaly} to {0: normal, 1: anomaly}

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return precision, recall, f1, auc_roc

# Grid search for best hyperparameters
best_f1 = 0
best_params = {}

for contamination in contamination_values:
    for n_estimators in n_estimators_values:
        for max_samples in max_samples_values:
            print(f"Training with contamination={contamination}, n_estimators={n_estimators}, max_samples={max_samples}...")

            # Train model
            model = IsolationForest(
                contamination=contamination,
                n_estimators=n_estimators,
                max_samples=max_samples,
            )
            model.fit(x_train)

            # Evaluate on test set
            precision, recall, f1, auc_roc = evaluate_model(model, x_test, y_test)

            print(f" F1-score: {f1:.4f}\n")

            # Store best parameters based on AUC-ROC score
            if f1 > best_f1:
                best_f1 = f1
                best_params = {
                    'contamination': contamination,
                    'n_estimators': n_estimators,
                    'max_samples': max_samples,
                }

# Train final model with best hyperparameters
print(f"Best Parameters: {best_params}")
final_model = IsolationForest(**best_params)
final_model.fit(x_train)

# Final Evaluation
print('Final Model Evaluation:')
precision, recall, f1, auc_roc = evaluate_model(final_model, x_test, y_test)
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc_roc:.3f}')


Loading ../data/HDFS/HDFS_100k.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...
1   blk_7503483334202473044  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
2  blk_-3544583377289625738  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
3  blk_-9073992586687739851  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
4   blk_7854771516489510256  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
156 157
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 3969 instances, 156 anomaly, 3813 normal
Test: 3971 instances, 157 anomaly, 3814 normal

Train data shape: 3969-by-14

Test data shape: 3971-by-14

Training with contamination=0.04, n_estimators=100, max_samples=0.5...
 F1-score: 0.0439

Training with contamination=0.04, n_estimators=100, max_samples=0.75...
 F1-score: 0.0439

Training with contamination=0.04, n_estimators=100, max_samples=1.0...
 F1-score: 0.

In [30]:
import sys
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from loglizer import dataloader, preprocessing

sys.path.append('../')

# File paths
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # Structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # Anomaly label file

# Load dataset
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session', train_ratio=0.50, split_type='uniform'
)

# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

# Define model
rf_model = RandomForestClassifier()

# Set up grid search parameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(rf_model, param_grid, scoring='precision', cv=3, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(x_train, y_train)

# Get the best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate best model on test set
y_pred = best_model.predict(x_test)

# Calculate metrics
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc_roc:.3f}")

# Ensure precision is above 90%
if precision >= 0.90:
    print("Model meets precision constraint!")
else:
    print("Precision is below the threshold of 90%")


Loading ../data/HDFS/HDFS_100k.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...
1   blk_7503483334202473044  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
2  blk_-3544583377289625738  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
3  blk_-9073992586687739851  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
4   blk_7854771516489510256  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
156 157
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 3969 instances, 156 anomaly, 3813 normal
Test: 3971 instances, 157 anomaly, 3814 normal

Train data shape: 3969-by-14

Test data shape: 3971-by-14

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Precision: 0.985, Recall: 0.427, F1-score: 0.596, AUC-ROC: 0.713
Model meet

243 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
76 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\naren\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\naren\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\naren\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\naren\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParamet

In [36]:
import sys
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from loglizer import dataloader, preprocessing

sys.path.append('../')

# File paths
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # Structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # Anomaly label file

# Load dataset
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session', train_ratio=0.50, split_type='uniform'
)

# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

# Define model
model = IsolationForest()

# Define the parameter grid
param_grid = {
    'n_estimators': [100,150,50,75],
    'max_samples': [0.5, 1.0, 0.25],
    'contamination': [0.05, 0.03, 0.01, 0.04]
}

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(model, param_grid, scoring='recall', cv=5, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(x_train)

# Get the best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate best model on test set
y_pred = best_model.predict(x_test)

# Calculate metrics
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
auc_roc = roc_auc_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc_roc:.3f}")

Loading ../data/HDFS/HDFS_100k.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...
1   blk_7503483334202473044  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
2  blk_-3544583377289625738  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
3  blk_-9073992586687739851  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
4   blk_7854771516489510256  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
156 157
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 3969 instances, 156 anomaly, 3813 normal
Test: 3971 instances, 157 anomaly, 3814 normal

Train data shape: 3969-by-14

Test data shape: 3971-by-14

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'contamination': 0.05, 'max_samples': 0.5, 'n_estimators': 100}
Precision: 0.986, Recall: 0.433, F1-score: 0.602, AUC-ROC: 0.716


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan]


## Invariants Miner

In [37]:
import sys
import numpy as np
from loglizer.models import InvariantsMiner
from loglizer import dataloader, preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

sys.path.append('../')

# File paths
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # Structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # Anomaly label file

# Load dataset
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session', train_ratio=0.6, split_type='sequential'
)

# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

# Hyperparameter grid (refined for better tuning)
epsilon_values = np.arange(0.05, 0.91, 0.05)  # More fine-grained search
percentage_values = [0.85, 0.9, 0.95, 0.98, 1.0]

# Function to evaluate model
def evaluate_model(model, x_data, y_true):
    y_pred = model.predict(x_data)  # Get predictions (0 or 1)
    
    # Compute scores
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    # Compute AUC-ROC Score
    auc_roc = roc_auc_score(y_true, y_pred)
    
    return precision, recall, f1, auc_roc

# Grid search for best hyperparameters (maximize recall + F1)
best_f1 = 0
best_recall = 0
best_auc = 0
best_params = {}

for epsilon in epsilon_values:
    for percentage in percentage_values:
        print(f"Training with epsilon={epsilon:.2f}, percentage={percentage}...")

        # Train model
        model = InvariantsMiner(epsilon=epsilon, percentage=percentage)
        model.fit(x_train)

        # Evaluate on test set
        precision, recall, f1, auc_roc = evaluate_model(model, x_test, y_test)

        print(f"Test Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc_roc:.3f}\n")

        # Optimize for high recall and F1-score (since anomalies are rare)
        if f1 > best_f1 or recall > best_recall:
            best_f1 = f1
            best_recall = recall
            best_auc = auc_roc
            best_params = {'epsilon': epsilon, 'percentage': percentage}

# Train final model with best hyperparameters
print(f"Best Parameters: {best_params}")
final_model = InvariantsMiner(**best_params)
final_model.fit(x_train)

# Final Evaluation
print('Final Model Evaluation:')
precision, recall, f1, auc_roc = evaluate_model(final_model, x_test, y_test)
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}, AUC-ROC: {auc_roc:.3f}')


Loading ../data/HDFS/HDFS_100k.log_structured.csv
                    BlockId                                      EventSequence
0  blk_-1608999687919862906  [E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...
1   blk_7503483334202473044  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
2  blk_-3544583377289625738  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
3  blk_-9073992586687739851  [E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...
4   blk_7854771516489510256  [E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...
154 159
Total: 7940 instances, 313 anomaly, 7627 normal
Train: 4764 instances, 154 anomaly, 4610 normal
Test: 3176 instances, 159 anomaly, 3017 normal

Train data shape: 4764-by-14

Test data shape: 3176-by-14

Training with epsilon=0.05, percentage=0.85...
Invariant space dimension: 11
Mined 10 invariants: {(0, 1): [-3, 1], (0, 2): [-3, 1], (0, 3): [-3, 1], (0, 4): [-3, 1], (6, 7): [1, -15], (6, 8): [1, -15], (9, 10): [1, -2], (9, 11): [1, -2], (9, 12): [1, -2], (9, 13):

## PCA model

In [15]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_2k.log_structured.csv' # The structured log file
label_file = '../data/HDFS/anomaly_label.csv' # The anomaly label file

pkl_path = "../../proceeded_data/BGL"



(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(struct_log,
                                                            label_file=label_file,
                                                            window='session', 
                                                            train_ratio=0.8,
                                                            split_type='uniform')
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', 
                                            normalization='zero-mean')
x_test = feature_extractor.transform(x_test)

model = PCA()
model.fit(x_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)

print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)


Loading ../data/HDFS/HDFS_2k.log_structured.csv
                    BlockId EventSequence
0     blk_38865049064139660         [E10]
1  blk_-6952295868487656571         [E10]
2   blk_7128370237687728475          [E6]
3   blk_8229193803249955061         [E10]
4  blk_-6670958622368987959         [E10]
55 14
Total: 2200 instances, 69 anomaly, 2131 normal
Train: 1759 instances, 55 anomaly, 1704 normal
Test: 441 instances, 14 anomaly, 427 normal

Train data shape: 1759-by-13

Test data shape: 441-by-13

n_components: 10
Project matrix shape: 13-by-13
SPE threshold: 0.5023636527823024

Train validation:
Precision: 0.500, recall: 0.036, F1-measure: 0.068

Test validation:
Precision: 0.500, recall: 0.143, F1-measure: 0.222



In [39]:
import sys
import numpy as np
from loglizer.models import PCA
from loglizer import dataloader, preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score

sys.path.append('../')

# File paths
struct_log = '../data/HDFS/HDFS_2k.log_structured.csv'  # Structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # Anomaly label file

# Load dataset
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session', train_ratio=0.8, split_type='uniform'
)

# Feature extraction
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
x_test = feature_extractor.transform(x_test)

# Hyperparameter grid
n_components_values = [0.95, 0.90, 0.85, 0.80]  # PCA explained variance
threshold_values = [0.97, 0.99, 1.0]  # Anomaly threshold

# Function to evaluate model
def evaluate_model(model, x_data, y_true):
    y_pred = model.predict(x_data)  # Get predictions (0 or 1)
    
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    
    return precision, recall, f1

# Grid search for best hyperparameters
best_f1 = 0
best_params = {}

for n_components in n_components_values:
    for threshold in threshold_values:
        print(f"Training with n_components={n_components}, threshold={threshold}...")

        # Train PCA model
        model = PCA(n_components=n_components, threshold=threshold)
        model.fit(x_train)

        # Evaluate model
        precision, recall, f1 = evaluate_model(model, x_test, y_test)

        print(f"Test F1-score: {f1:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}\n")

        # Store best hyperparameters based on F1-score
        if f1 > best_f1:
            best_f1 = f1
            best_params = {'n_components': n_components, 'threshold': threshold}

# Train final model with best hyperparameters
print(f"Best Parameters: {best_params}")
final_model = PCA(**best_params)
final_model.fit(x_train)

# Final Evaluation
print('Final Model Evaluation:')
precision, recall, f1 = evaluate_model(final_model, x_test, y_test)
print(f'Precision: {precision:.3f}, Recall: {recall:.3f}, F1-score: {f1:.3f}')


Loading ../data/HDFS/HDFS_2k.log_structured.csv
                    BlockId EventSequence
0     blk_38865049064139660         [E10]
1  blk_-6952295868487656571         [E10]
2   blk_7128370237687728475          [E6]
3   blk_8229193803249955061         [E10]
4  blk_-6670958622368987959         [E10]
55 14
Total: 2200 instances, 69 anomaly, 2131 normal
Train: 1759 instances, 55 anomaly, 1704 normal
Test: 441 instances, 14 anomaly, 427 normal

Train data shape: 1759-by-13

Test data shape: 441-by-13

Training with n_components=0.95, threshold=0.97...
n_components: 10
Project matrix shape: 13-by-13
SPE threshold: 0.97

Test F1-score: 0.1333, Recall: 0.0714, Precision: 1.0000

Training with n_components=0.95, threshold=0.99...
n_components: 10
Project matrix shape: 13-by-13
SPE threshold: 0.99

Test F1-score: 0.1333, Recall: 0.0714, Precision: 1.0000

Training with n_components=0.95, threshold=1.0...
n_components: 10
Project matrix shape: 13-by-13
SPE threshold: 1.0

Test F1-score: 0.1333, 