In [2]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
# Read the synthetic log dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='dataset/synthetic_logs.csv')

In [4]:
# Discard the two columns that the rest of the notebook does not use,
# then preview the first rows of the trimmed frame.
df = df.drop(["complexity", "timestamp"], axis="columns")

df.head()

Unnamed: 0,source,log_message,target_label
0,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,ModernCRM,Email service experiencing issues with sending,Critical Error
2,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [5]:
# Distinct values of the `source` column.
df["source"].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [6]:
# Distinct values of the `target_label` column.
df["target_label"].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained Sentence Transformers model and encode every log
# message into an embedding vector (a progress bar is shown while encoding).
model = SentenceTransformer('all-MiniLM-L12-v2')
embeddings = model.encode(
    df["log_message"].tolist(),
    show_progress_bar=True,
)

# Peek at the first two embeddings only; printing all of them would flood the output.
print(embeddings[0:2])

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._mps_is_available()
Batches: 100%|██████████| 76/76 [02:31<00:00,  1.99s/it]


[[-1.06847817e-02  2.62709334e-02  3.14305797e-02  7.36195073e-02
   4.19753194e-02 -6.38545081e-02 -1.62427314e-02  4.18685414e-02
   5.71526922e-02  1.07434697e-01 -2.88750697e-02 -1.56388909e-01
   5.41996434e-02  9.45324358e-03 -9.60125551e-02  8.10302868e-02
  -2.89856177e-02 -5.53457476e-02  1.32359955e-02 -3.47629236e-03
   3.22376974e-02 -3.60625610e-02  5.85770514e-03 -3.72246001e-03
  -2.89225448e-02  6.16006516e-02 -2.71365214e-02  1.03628546e-01
   5.43873273e-02 -1.03216562e-02  8.98893699e-02 -3.95845948e-03
  -8.23875740e-02  8.25562850e-02  2.15567462e-02  1.01610990e-02
   3.29374187e-02 -5.47897890e-02 -4.71227728e-02  5.68042658e-02
   6.90639690e-02 -3.76313515e-02 -1.85300838e-02  6.11200705e-02
  -7.91109800e-02 -7.31293187e-02 -4.10521924e-02 -2.89517753e-02
   1.59848258e-02 -8.03534389e-02 -5.70986457e-02 -5.70563227e-02
  -4.63480577e-02 -9.08232387e-03  3.48885655e-02 -5.82006164e-02
  -2.52451394e-02  4.24719043e-02 -2.52153687e-02 -6.69790506e-02
  -2.42394

In [9]:
# Cluster the embeddings with DBSCAN using cosine distance.
# eps=0.2 is the neighbourhood radius; with min_samples=1 a single message
# is enough to seed a cluster, so an isolated message gets its own label.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
clusters = dbscan.fit(embeddings).labels_

# Store each row's cluster label next to its log message.
df["cluster"] = clusters

df.head(20)

Unnamed: 0,source,log_message,target_label,cluster
0,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
5,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,0
6,ModernHR,Shard 6 replication task ended in failure,Error,3
7,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4
8,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,5
9,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,0


In [10]:
# Show every row that was assigned to cluster 16.
df.loc[df["cluster"] == 16]

Unnamed: 0,source,log_message,target_label,cluster
36,BillingSystem,System reboot initiated by user User243.,System Notification,16
92,BillingSystem,System reboot initiated by user User471.,System Notification,16
139,ModernHR,System reboot initiated by user User216.,System Notification,16
140,AnalyticsEngine,System reboot initiated by user User639.,System Notification,16
161,BillingSystem,System reboot initiated by user User819.,System Notification,16
163,BillingSystem,System reboot initiated by user User938.,System Notification,16
307,BillingSystem,System reboot initiated by user User929.,System Notification,16
365,ModernHR,System reboot initiated by user User533.,System Notification,16
508,ThirdPartyAPI,System reboot initiated by user User591.,System Notification,16
552,ModernHR,System reboot initiated by user User421.,System Notification,16


In [14]:
# Count how many messages landed in each cluster and keep the labels of
# clusters holding more than 10 messages.
cluster_counts = df["cluster"].value_counts()
large_clusters = cluster_counts.loc[cluster_counts.gt(10)].index

# For each large cluster, print a header, its first five messages and a blank line.
for cluster in large_clusters:
    print(
        f"Cluster {cluster}:",
        df.loc[df["cluster"] == cluster, "log_message"].head(5).to_string(index=False),
        "",
        sep="\n",
    )

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 26:
        Privilege elevation detected for user 5038
  Detection of admin privilege misuse by user 1449
Elevat

In [None]:
import re
def classify_with_regex(log_message):
    """Classify a log message using an ordered table of regex patterns.

    Patterns are tried in the order listed below, case-insensitively, with
    ``re.search`` (so a pattern may match anywhere in the message). The label
    of the first matching pattern wins.

    Args:
        log_message: The log line to classify (a string).

    Returns:
        The label paired with the first matching pattern, or ``None`` when no
        pattern matches.
    """
    regex_patterns = {
        r"User User\d+ logged (out|in)." : "User Action",
        r"Account with ID .* created by .*" : "User Action",
        r"System reboot initiated by user .*" : "System Notification",
        r"System updated to version .*" : "System Notification",
        r"Backup started at .*" : "System Notification",
        r"Backup ended at .*": "System Notification",
        r"File .* uploaded successfully by user .*" : "System Notification",
        r"Disk cleanup completed successfully." : "System Notification",
        r"Server .* suffered an abrupt restart during .*" : "System Diagnostic",
        r"Server .* .* potential security .*" :    "Security Notification",
        r"Security breach suspected from IP address .*" : "Security Notification",
        r"Possible hacking attempt identified from IP .*" : "Security Notification",
        r"Unauthorised access attempt from .*": "Security Notification",
        r"Unusual access attempt from .*" : "Security Notification",
        r"Account Account.* .* login" : "Login Issues",
        r".* RAID .*" : "Hard Disk Issues",
        # Matching is case-insensitive, so this single entry also covers the
        # lowercase "critical system" spelling.
        r".*Critical system" : "Critical Error",
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    # Fall through only after every pattern has been tried; returning inside
    # the loop would stop after the first pattern.
    return None

In [36]:
# Spot-check the classifier on a sample account-creation message.
classify_with_regex(log_message="Account with ID 2346 created by Os")