In [46]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, accuracy_score
import joblib
import numpy as np
import matplotlib.pyplot as plt

In [47]:
# Load the parsed log records from CSV into the working DataFrame used by
# every cell below.
labeled_logs_path = 'jboss_parsed_logs_with_labels.csv'
df = pd.read_csv(filepath_or_buffer=labeled_logs_path)

In [48]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,timestamp,host,component,pid,message,proxy_anomaly_label
0,2025-09-28T07:48:03.512301+00:00,ubuntu-intern-vm,sshd-session,721017,pam_unix(sshd:auth): check pass; user unknown,0
1,2025-09-28T07:48:03.512710+00:00,ubuntu-intern-vm,sshd-session,721017,pam_unix(sshd:auth): authentication failure; l...,0
2,2025-09-28T07:48:05.733891+00:00,ubuntu-intern-vm,sshd-session,721017,Failed password for invalid user steam from 50...,1
3,2025-09-28T07:48:07.271393+00:00,ubuntu-intern-vm,sshd-session,721017,Connection closed by invalid user steam 50.6.2...,1
4,2025-09-28T07:48:10.484451+00:00,ubuntu-intern-vm,filebeat,671379,2025-09-28T07:48:10.483Z#011INFO#011[monitorin...,0


In [49]:
# Parse the raw timestamp strings into datetimes.
# - errors='coerce' turns unparseable values into NaT instead of raising.
# - utc=True converts every value to timezone-aware UTC, so the column always
#   ends up as a single datetime64 dtype and the `.dt` accessors used for the
#   time-based features keep working even if the log mixes UTC offsets.
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)

In [50]:
# Time-based features: calendar components read off the parsed timestamp.
timestamp_parts = df['timestamp'].dt
for part_name in ('hour', 'dayofweek', 'day'):
    # Each new column is named after the datetime attribute it is read from.
    df[part_name] = getattr(timestamp_parts, part_name)

In [51]:
# Log pattern features (binary flags): 1 when the message contains the
# keyword (case-insensitive), 0 otherwise. Missing messages count as 0.
keyword_flags = {
    'auth_failure': 'authentication failure',
    'failed_password': 'failed password',
    'error': 'error',
    'exception': 'exception',
    'timeout': 'timeout',
    'critical': 'critical',
}
for flag_column, keyword in keyword_flags.items():
    keyword_hits = df['message'].str.contains(keyword, case=False, na=False)
    df[flag_column] = keyword_hits.astype(int)

In [52]:
# PID as a numeric column: values that cannot be parsed become NaN first and
# are then replaced with 0.
pid_as_number = pd.to_numeric(df['pid'], errors='coerce')
df['pid'] = pid_as_number.fillna(0)

In [53]:
# Text-based features
# A missing message is treated as an empty string so both features are 0 for
# that row. Without this, `.str.len()` yields NaN for a missing message and
# `.apply(len)` raises TypeError on the NaN that `.str.split()` passes through.
message_text = df['message'].fillna('')
# Number of characters in each message.
df['msg_len'] = message_text.str.len()
# Number of whitespace-separated tokens in each message.
df['num_tokens'] = message_text.str.split().str.len()

In [54]:
# Columns that form the model's input matrix, in the order they are passed
# to the estimator.
feature_columns = [
    'hour', 'dayofweek', 'day',
    'auth_failure', 'failed_password', 'error',
    'exception', 'timeout', 'critical',
    'pid', 'msg_len', 'num_tokens',
]
features = df[feature_columns]

In [55]:
# The proxy labels are required for the evaluation step; stop early with a
# clear message when the input file does not carry them.
label_column = 'proxy_anomaly_label'
if label_column in df.columns:
    true_labels = df[label_column]
else:
    raise ValueError("Missing 'proxy_anomaly_label' column. Please run create_proxy_labels.py first.")

In [56]:
# Preview the frame again now that the feature columns have been added.
df.head(n=5)

Unnamed: 0,timestamp,host,component,pid,message,proxy_anomaly_label,hour,dayofweek,day,auth_failure,failed_password,error,exception,timeout,critical,msg_len,num_tokens
0,2025-09-28 07:48:03.512301+00:00,ubuntu-intern-vm,sshd-session,721017,pam_unix(sshd:auth): check pass; user unknown,0,7,6,28,0,0,0,0,0,0,45,5
1,2025-09-28 07:48:03.512710+00:00,ubuntu-intern-vm,sshd-session,721017,pam_unix(sshd:auth): authentication failure; l...,0,7,6,28,1,0,0,0,0,0,100,9
2,2025-09-28 07:48:05.733891+00:00,ubuntu-intern-vm,sshd-session,721017,Failed password for invalid user steam from 50...,1,7,6,28,0,1,0,0,0,0,71,11
3,2025-09-28 07:48:07.271393+00:00,ubuntu-intern-vm,sshd-session,721017,Connection closed by invalid user steam 50.6.2...,1,7,6,28,0,0,0,0,0,0,72,10
4,2025-09-28 07:48:10.484451+00:00,ubuntu-intern-vm,filebeat,671379,2025-09-28T07:48:10.483Z#011INFO#011[monitorin...,0,7,6,28,0,0,0,0,0,0,1107,8


In [57]:
# Train Isolation Forest
# The model is fitted on the engineered feature matrix only; the proxy labels
# are not passed to it.
# NOTE(review): the evaluation output recorded in this notebook reports 2730
# of 10800 rows with proxy label 1 (~25%), far above the 1% contamination
# assumed here — TODO confirm the contamination value is intended.
clf = IsolationForest(
    n_estimators=200,
    contamination=0.01,   # assume ~1% anomalies
    random_state=42,      # fixed seed so repeated runs give the same model
    n_jobs=-1
)

clf.fit(features)

0,1,2
,n_estimators,200
,max_samples,'auto'
,contamination,0.01
,max_features,1.0
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [58]:
# Predictions: the model returns -1 for anomalies and 1 for normal points;
# re-encode that as 1 = anomaly, 0 = normal.
preds = clf.predict(features)
preds_binary = np.equal(preds, -1).astype(int)

In [59]:
# Attach the binary prediction and the continuous anomaly score to each row
# (decision_function: lower = more anomalous).
df['anomaly_pred'] = preds_binary
anomaly_scores = clf.decision_function(features)
df['anomaly_score'] = anomaly_scores

In [60]:
# Evaluation: compare the binary predictions against the proxy labels.
print("Model Performance (compared to proxy labels):")
accuracy = accuracy_score(y_true=true_labels, y_pred=preds_binary)
print("Accuracy:", accuracy)
report_text = classification_report(y_true=true_labels, y_pred=preds_binary, digits=4)
print(report_text)

Model Performance (compared to proxy labels):
Accuracy: 0.7405555555555555
              precision    recall  f1-score   support

           0     0.7464    0.9888    0.8507      8070
           1     0.1667    0.0066    0.0127      2730

    accuracy                         0.7406     10800
   macro avg     0.4565    0.4977    0.4317     10800
weighted avg     0.5998    0.7406    0.6388     10800



In [61]:
# Save Model & Results: persist the fitted estimator and the frame with its
# prediction columns (row index not written to the CSV).
model_path = 'isolation_forest_model.joblib'
results_path = 'jboss_logs_with_predictions.csv'
joblib.dump(clf, model_path)
df.to_csv(results_path, index=False)

print("✅ Training complete. Model and results saved as 'isolation_forest_model.joblib' and 'jboss_logs_with_predictions.csv'.")

✅ Training complete. Model and results saved as 'isolation_forest_model.joblib' and 'jboss_logs_with_predictions.csv'.
