In [176]:
import pandas as pd

In [177]:
# Load the log dataset from CSV and preview the first rows.
df = pd.read_csv('data/logs.csv')
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [178]:
# Summary statistics of the loaded frame.
df.describe()

Unnamed: 0,timestamp,source,log_message,target_label
count,2410,2410,2410,2410
unique,2407,6,2265,9
top,11/19/2025 13:14,ThirdPartyAPI,Backup completed successfully.,HTTP Status
freq,2,496,60,1017


In [179]:
# Distinct values of the 'source' column.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [180]:
# Distinct values of the 'target_label' column (used as the classifier target further down).
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

# Clustering
#### Train DBSCAN on log-message embeddings to find clusters (patterns) that can be turned into regular expressions.

In [67]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

  from .autonotebook import tqdm as notebook_tqdm


In [182]:
# Load the pretrained sentence-embedding model by name.
print("Loading sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')


# Encode every log message in df; the last expression displays the shape of the result.
print(f"Generating embeddings for {len(df)} log messages...")
embeddings = model.encode(df['log_message'].tolist(), show_progress_bar=True)
embeddings.shape

Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Batches: 100%|██████████| 76/76 [00:02<00:00, 36.60it/s]
Batches: 100%|██████████| 76/76 [00:02<00:00, 36.60it/s]


Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Batches: 100%|██████████| 76/76 [00:02<00:00, 36.60it/s]
Batches: 100%|██████████| 76/76 [00:02<00:00, 36.60it/s]


(2410, 384)

In [183]:
# Cluster the message embeddings with DBSCAN (cosine metric) and record each
# row's cluster id on df so similar messages can be inspected together.
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clustering.fit(embeddings)
df['cluster'] = clustering.labels_

Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Batches: 100%|██████████| 76/76 [00:02<00:00, 36.55it/s]
Batches: 100%|██████████| 76/76 [00:02<00:00, 36.55it/s]


Loading sentence transformer model...
Generating embeddings for 2410 log messages...
Generating embeddings for 2410 log messages...


Batches: 100%|██████████| 76/76 [00:02<00:00, 36.55it/s]
Batches: 100%|██████████| 76/76 [00:02<00:00, 36.55it/s]


(2410, 384)

In [184]:
import re
# Ordered (compiled pattern, label) rules; the first rule that matches wins.
# Compiled once here instead of on every call to regex_classification.
# NOTE: the "." characters inside the patterns are unescaped regex wildcards
# (they match any single character); they are kept exactly as written.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
        (r"User \d+ made multiple incorrect login attempts", "Security Alert"),
        (r"Data replication task (for|failed) .*", "Error"),
        (r"Account Account(\d+).*login.*", "Security Alert"),
        (r"Server \d+ experienced potential security incident, review required", "Security Alert"),
    )
)


def regex_classification(log_message):
    """Classify a log message with hand-written regex rules.

    Rules are tried in order and matched case-insensitively against the
    start of the message (``match`` semantics, not ``search``).

    Args:
        log_message: The raw log text. Non-string values (e.g. ``None`` or
            the float NaN pandas uses for missing text) are treated as
            unclassifiable instead of raising ``TypeError``.

    Returns:
        The label of the first matching rule, or ``None`` when no rule
        matches or the input is not a string.
    """
    # Guard against missing values so DataFrame.apply does not crash on them.
    if not isinstance(log_message, str):
        return None
    for compiled, label in _REGEX_RULES:
        if compiled.match(log_message):
            return label
    return None

In [185]:
# Label each message with the first matching regex rule (None when no rule matches),
# then display only the rows that received a regex label.
df['regex_label'] = df['log_message'].apply(regex_classification)
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [193]:
# Drop the LegacyCRM rows and show which sources remain.
# NOTE(review): df_non_regex is not defined in the cells visible here — presumably
# the rows of df whose regex_label is null; confirm it is created before this cell
# so the notebook survives Restart & Run All.
df_non_legacy = df_non_regex[df_non_regex['source'] != 'LegacyCRM']
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [195]:
# Embed only the rows kept in df_non_legacy. The progress message reports the
# size of that subset (it previously printed len(df), the full dataset size,
# which did not match the number of messages actually being encoded).
print(f"Generating embeddings for {len(df_non_legacy)} log messages...")
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist(), show_progress_bar=True)
filtered_embeddings.shape

Generating embeddings for 2410 log messages...


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Generating embeddings for 2410 log messages...


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Batches: 100%|██████████| 59/59 [00:01<00:00, 29.98it/s]
Batches: 100%|██████████| 59/59 [00:01<00:00, 29.98it/s]


Generating embeddings for 2410 log messages...


Batches:   0%|          | 0/59 [00:00<?, ?it/s]

Batches: 100%|██████████| 59/59 [00:01<00:00, 29.98it/s]
Batches: 100%|██████████| 59/59 [00:01<00:00, 29.98it/s]


(1876, 384)

In [198]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features are the embeddings computed from df_non_legacy's messages; targets are
# the labels of those same rows, so X and y line up row-for-row by position.
X = filtered_embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
# Predict on the held-out rows and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

Critical Error       0.96      0.98      0.97        48
         Error       0.96      0.96      0.96        56
   HTTP Status       1.00      1.00      1.00       299
Resource Usage       1.00      1.00      1.00        45
Security Alert       0.99      0.98      0.99       115

      accuracy                           0.99       563
     macro avg       0.98      0.99      0.98       563
  weighted avg       0.99      0.99      0.99       563



In [None]:
import pickle