In [3]:
import pandas as pd

df=pd.read_csv('dataset/synthetic_logs.csv')
df.head(3)


Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert


In [4]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [15]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [16]:
df[df.target_label=='System Notification'].sample(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1954,3/26/2025 22:01,ModernHR,Backup ended at 2025-09-26 13:08:49.,System Notification,regex,8
30,4/26/2025 7:54,AnalyticsEngine,Backup started at 2025-05-14 07:06:55.,System Notification,regex,8
1456,11/14/2025 17:55,AnalyticsEngine,Disk cleanup completed successfully.,System Notification,regex,16
208,11/2/2025 22:20,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,16
1589,4/26/2025 12:16,BillingSystem,System updated to version 1.0.3.,System Notification,regex,13
2126,5/13/2025 5:04,ThirdPartyAPI,File data_2777.csv uploaded successfully by us...,System Notification,regex,2
833,7/20/2025 21:26,ModernHR,Backup started at 2025-09-15 18:27:39.,System Notification,regex,8
324,4/2/2025 12:27,ThirdPartyAPI,Backup completed successfully.,System Notification,regex,4
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification,regex,11
1106,12/28/2025 13:32,ModernHR,System reboot initiated by user User246.,System Notification,regex,11


In [17]:
df[df.log_message.str.startswith("System reboot initiated by user")]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification,regex,11
92,12/4/2025 21:20,BillingSystem,System reboot initiated by user User471.,System Notification,regex,11
139,5/8/2025 16:34,ModernHR,System reboot initiated by user User216.,System Notification,regex,11
140,9/11/2025 8:49,AnalyticsEngine,System reboot initiated by user User639.,System Notification,regex,11
161,3/31/2025 19:40,BillingSystem,System reboot initiated by user User819.,System Notification,regex,11
163,6/6/2025 15:29,BillingSystem,System reboot initiated by user User938.,System Notification,regex,11
307,4/12/2025 0:41,BillingSystem,System reboot initiated by user User929.,System Notification,regex,11
365,10/20/2025 22:32,ModernHR,System reboot initiated by user User533.,System Notification,regex,11
508,4/15/2025 2:04,ThirdPartyAPI,System reboot initiated by user User591.,System Notification,regex,11
552,9/22/2025 20:54,ModernHR,System reboot initiated by user User421.,System Notification,regex,11


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['log_message'].tolist())
embeddings[:2]

(2410, 384)

In [13]:
dbscan= DBSCAN(eps=0.5,min_samples=5,metric='euclidean')
clusters=dbscan.fit_predict(embeddings)

df['cluster']= clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [14]:
cluster_counts=df['cluster'].value_counts()
large_clusters= cluster_counts[cluster_counts>10].index

for cluster in large_clusters:
    print(f"Cluster{cluster}:")
    print(df[df['cluster']==cluster]['log_message'].head(4).to_string(index=False))
    print

Cluster0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
Cluster-1:
    Email service experiencing issues with sending
         Unauthorized access to data was attempted
          Email server encountered a sending fault
Multiple bad login attempts detected on user 85...
Cluster9:
nova.metadata.wsgi.server [-] 10.11.21.138,10.1...
nova.metadata.wsgi.server [req-27e91939-3ba4-4d...
nova.metadata.wsgi.server [-] 10.11.21.143,10.1...
nova.metadata.wsgi.server [req-61196723-e034-48...
Cluster7:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
Cluster8:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Cluster3:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims 

Classification Stage 1:Regex

In [18]:
import re
def classify_with_regex(log_message):
    regex_pattern= {
         r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_pattern.items():
        if re.search(pattern,log_message):
            return label
    return None

In [19]:
classify_with_regex("User User134 logged in." )

'User Action'

In [20]:
classify_with_regex("System reboot initiated by user 123.")

'System Notification'

In [21]:
classify_with_regex("girişe girişelim")

In [22]:
df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,2,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,2,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,4,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,5,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,7,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,13,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,16,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,16,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,8,System Notification


In [None]:
df[df['regex_label'].isnull()].head(5)

In [23]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape


(1910, 7)

In [31]:
df_non_legacy = df_non_regex[df_non_regex.source=="LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,-1,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,-1,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,-1,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,-1,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,-1,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,-1,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,-1,


In [32]:
df_non_legacy.shape

(7, 7)

In [33]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [34]:
len(embeddings_filtered)

7

In [35]:
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics  import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
report = classification_report(y_test,y_pred)
print (report)


                     precision    recall  f1-score   support

     Workflow Error       0.33      1.00      0.50         1

           accuracy                           0.33         3
          macro avg       0.17      0.50      0.25         3
       weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
import joblib
joblib.dump(clf,'../model/log_classifier.joblib')

['../model/log_classifier.joblib']