In [4]:
import pandas as pd

# Load the synthetic log dataset. on_bad_lines="skip" drops malformed rows
# silently instead of raising, so parse problems will not surface here.
df = pd.read_csv("dataset/synthetic_logs.csv", sep=",", on_bad_lines="skip")
# Preview the first three rows.
df.head(3)

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert


In [5]:
print(df.shape[0])  # Total number of rows in the loaded frame


2203


In [6]:
# Distinct values of the `source` column.
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [7]:
# Distinct values of the `target_label` column (the classes to predict).
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'User Action', 'Resource Usage',

In [8]:
# Read the raw CSV text so the on-disk quoting can be compared with what pandas parsed.
with open("dataset/synthetic_logs.csv", "r") as file:
    lines = file.readlines()

# Everything is already in memory, so the preview does not need the open handle.
# Show the first 15 physical lines, numbered from 1.
for i, line in enumerate(lines[:15]):
    print(f"Line {i+1}: {line.strip()}")


Line 1: timestamp,source,log_message,target_label
Line 2: 2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status
Line 3: 1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
Line 4: 1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
Line 5: 2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code -  200 len: 211 time: 0.0968180",HTTP Status
Line 6: 2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 

In [9]:
# Inspect ten random 'System Notification' rows to spot recurring message templates.
# A fixed random_state makes the same ten rows appear on every Restart & Run All.
df[df.target_label=='System Notification'].sample(10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label
2133,8/28/2025 2:09,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification
990,4/17/2025 2:50,AnalyticsEngine,File data_5038.csv uploaded successfully by us...,System Notification
1291,1/14/2025 3:37,AnalyticsEngine,System updated to version 2.1.3.,System Notification
2004,11/23/2025 1:08,BillingSystem,File data_5237.csv uploaded successfully by us...,System Notification
907,5/4/2025 12:18,AnalyticsEngine,Backup ended at 2025-01-24 09:07:54.,System Notification
1642,6/21/2025 6:48,BillingSystem,Backup completed successfully.,System Notification
609,9/5/2025 7:14,ModernHR,System reboot initiated by user User297.,System Notification
1823,2/20/2025 11:45,ModernHR,Backup ended at 2025-06-02 10:47:12.,System Notification
1805,3/15/2025 14:18,BillingSystem,Backup ended at 2025-02-12 08:23:09.,System Notification
118,8/3/2025 11:43,BillingSystem,File data_1206.csv uploaded successfully by us...,System Notification


In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from sklearn.cluster import DBSCAN

In [12]:
# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
# Encode every log message in df; the result has one embedding vector per message.
embeddings = model.encode(df['log_message'].tolist())

100%|██████████| 83.5M/83.5M [00:25<00:00, 3.28MB/s]


In [14]:
# Peek at the first two embedding vectors.
embeddings[:2]

array([[-1.02939621e-01,  3.35459523e-02, -2.20260415e-02,
         1.55102124e-03, -9.86917224e-03, -1.78956211e-01,
        -6.34410456e-02, -6.01761676e-02,  2.81108972e-02,
         5.99620380e-02, -1.72618628e-02,  1.43363688e-03,
        -1.49560019e-01,  3.15283425e-03, -5.66031337e-02,
         2.71685850e-02, -1.49890371e-02, -3.54038067e-02,
        -3.62936184e-02, -1.45410309e-02, -5.61491214e-03,
         8.75538662e-02,  4.55120690e-02,  2.50963680e-02,
         1.00187594e-02,  1.24267275e-02, -1.39923573e-01,
         7.68696368e-02,  3.14095393e-02, -4.15245816e-03,
         4.36902456e-02,  1.71250906e-02, -8.00951123e-02,
         5.74006103e-02,  1.89091768e-02,  8.55261758e-02,
         3.96398529e-02, -1.34371787e-01, -1.44367362e-03,
         3.06701986e-03,  1.76854044e-01,  4.44884412e-03,
        -1.69275030e-02,  2.24266835e-02, -4.35048826e-02,
         6.09030016e-03, -9.98164527e-03, -6.23973049e-02,
         1.07372673e-02, -6.04892522e-03, -7.14661330e-0

In [15]:
# Density-based clustering of the embeddings: two messages are neighbours when
# their cosine distance is at most eps=0.2.
# With min_samples=1 every point is a core point, so no row gets the noise label (-1).
dbscan = DBSCAN(eps=0.2,min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)
# Attach the cluster id to each row (adds a column to df in place).
df['cluster'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [16]:
# Show every row assigned to cluster 1 to judge how coherent the cluster is.
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
9,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,1
199,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,1
229,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,1
246,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,1
333,11/19/2025 23:06,BillingSystem,Email service experienced a sending issue,Error,1
413,10/27/2025 5:59,ThirdPartyAPI,Email delivery system encountered an error,Error,1
433,12/2/2025 10:30,AnalyticsEngine,Email transmission error caused service impact,Critical Error,1
519,11/7/2025 18:08,ThirdPartyAPI,Email service impacted by sending failure,Critical Error,1
619,4/28/2025 15:13,AnalyticsEngine,Email delivery problem affected system,Critical Error,1


In [17]:
# Number of rows per cluster id, largest cluster first.
cluster_counts = df['cluster'].value_counts()
# Keep only the ids of clusters with more than 10 members.
large_clusters = cluster_counts.loc[cluster_counts > 10].index

# For each large cluster, print a header, up to five example messages, and a blank line.
for cluster in large_clusters:
    sample_messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5)
    print(f"Cluster {cluster}:", sample_messages.to_string(index=False), "", sep="\n")

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 10:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 12:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 6:
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 7153 made multiple incorrect login attempts
  User 8300 made multiple incorrect login attempts
Multiple login failures were detected for user ...

Cluster 7:
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.
Backup completed

Classification Stage 1: Regex

In [31]:
import re
def classify_with_regex(log_message):
    """Classify a log message using hand-written regular-expression rules.

    Parameters
    ----------
    log_message : str
        Raw log text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as unclassifiable.

    Returns
    -------
    str or None
        The label of the first rule, in declaration order, that matches
        anywhere in the message (case-insensitively), or None when no rule
        matches or the input is not a string.
    """
    # Sentence-ending full stops are escaped: an unescaped "." matches ANY
    # character, which would accept e.g. "User User1 logged in!" as a match.
    # The ".*" parts are intentional wildcards for the variable tail.
    regex_patterns = {
        r"User User\d+ logged (in|out)\.": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully\.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully\.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    # re.search raises TypeError on non-strings; report "no match" instead so
    # the function can be applied to a column containing missing values.
    if not isinstance(log_message, str):
        return None
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None

In [19]:
# Sanity check: a login message should be labelled 'User Action'.
classify_with_regex("User User123 logged in.")

'User Action'

In [30]:
# Sanity check: matching ignores case, so "OUT" is still labelled 'User Action'.
classify_with_regex("User User494 logged OUT.")

'User Action'

In [20]:
# Sanity check: a reboot message should be labelled 'System Notification'.
classify_with_regex("System reboot initiated by user User179.")

'System Notification'

In [21]:
# Sanity check: text matching no rule returns None, so the cell shows no output.
classify_with_regex("Hey you, chill bro")

In [34]:
# Run the regex stage on every message; rows that match no rule get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)
# Display only the rows the regex stage managed to label.
df.loc[df['regex_label'].notna()]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
13,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
14,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,7,System Notification
17,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,8,User Action
25,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,10,User Action
...,...,...,...,...,...,...
2171,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,19,System Notification
2176,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,30,System Notification
2188,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,30,System Notification
2189,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,12,System Notification


Only 500 of the 2203 rows are classified successfully by the regex stage.

In [35]:
# Rows the regex stage left unlabelled; copied so later changes do not touch df.
unmatched_mask = df['regex_label'].isna()
df_non_regex = df.loc[unmatched_mask].copy()
df_non_regex.shape

(1703, 6)

So the remaining 1703 rows must be classified with either an LLM or BERT: if a label has enough samples, use BERT; otherwise use the LLM. How do we know whether there are enough samples? Check programmatically, or manually in the CSV file. Deprecation Warning and Workflow Error have only a few records, so those go to the LLM.

In [36]:
# Count the rows per label once, then list the labels that have at most 5 rows
# among the messages the regex stage could not classify.
label_counts = df_non_regex['target_label'].value_counts()
print(label_counts[label_counts <= 5].index.tolist())



Classification Stage 2: Classification Using Embeddings

In [37]:
# Unlabelled rows that come from the LegacyCRM source.
is_legacy = df_non_regex['source'] == "LegacyCRM"
df_legacy = df_non_regex.loc[is_legacy]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
55,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,22,
236,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,44,
348,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,57,
1205,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,96,
1587,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,109,
1668,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,112,
2030,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,122,


In [38]:
# Unlabelled rows from every source other than LegacyCRM.
is_not_legacy = df_non_regex['source'] != "LegacyCRM"
df_non_legacy = df_non_regex.loc[is_not_legacy]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,
...,...,...,...,...,...,...
2198,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,
2199,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,6,
2200,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,
2201,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


In [39]:
# (rows, columns) of the non-LegacyCRM subset.
df_non_legacy.shape

(1696, 6)

In [40]:
# Confirm that LegacyCRM no longer appears among the sources.
df_non_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [41]:
# Embed only the messages that remain after the regex stage and the LegacyCRM filter.
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
# Peek at the first two vectors.
filtered_embeddings[:2]

array([[-1.02939621e-01,  3.35459523e-02, -2.20260415e-02,
         1.55102124e-03, -9.86917224e-03, -1.78956211e-01,
        -6.34410456e-02, -6.01761676e-02,  2.81108972e-02,
         5.99620380e-02, -1.72618628e-02,  1.43363688e-03,
        -1.49560019e-01,  3.15283425e-03, -5.66031337e-02,
         2.71685850e-02, -1.49890371e-02, -3.54038067e-02,
        -3.62936184e-02, -1.45410309e-02, -5.61491214e-03,
         8.75538662e-02,  4.55120690e-02,  2.50963680e-02,
         1.00187594e-02,  1.24267275e-02, -1.39923573e-01,
         7.68696368e-02,  3.14095393e-02, -4.15245816e-03,
         4.36902456e-02,  1.71250906e-02, -8.00951123e-02,
         5.74006103e-02,  1.89091768e-02,  8.55261758e-02,
         3.96398529e-02, -1.34371787e-01, -1.44367362e-03,
         3.06701986e-03,  1.76854044e-01,  4.44884412e-03,
        -1.69275030e-02,  2.24266835e-02, -4.35048826e-02,
         6.09030016e-03, -9.98164527e-03, -6.23973049e-02,
         1.07372673e-02, -6.04892522e-03, -7.14661330e-0

In [43]:
# Features: one embedding vector per remaining message.
X = filtered_embeddings
# Targets: the label of each of those messages, in the same row order.
y = df_non_legacy['target_label'].values

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified, so rare labels may be unevenly
# represented in train/test — consider stratify=y; TODO confirm every label
# has enough rows for that.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# max_iter raised to 1000 — presumably to give the solver room to converge; TODO confirm.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
# Predict on the held-out rows and print per-label precision/recall/F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.94      1.00      0.97        51
         Error       1.00      0.94      0.97        52
   HTTP Status       1.00      1.00      1.00       311
Resource Usage       1.00      1.00      1.00         4
Security Alert       1.00      1.00      1.00        91

      accuracy                           0.99       509
     macro avg       0.99      0.99      0.99       509
  weighted avg       0.99      0.99      0.99       509



In [47]:
import joblib
# Persist the fitted classifier to disk; the call returns the list of written
# file names, which is what the cell displays.
# NOTE(review): assumes the ../models directory already exists — TODO confirm.
joblib.dump(clf, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']