In [1]:
import pandas as pd
df=pd.read_csv('Dataset/synthetic_logs.csv')
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [2]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [4]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer



# Initialize model, you can choose any other model if needed
model = SentenceTransformer('all-MiniLM-L6-v2')

# Converts sentences to vectors
embeddings = model.encode(df['log_message'].tolist())





In [5]:
# Initialize DBScan
clustering_model = DBSCAN(eps=0.3, min_samples=2,metric='euclidean')
labels = clustering_model.fit_predict(embeddings)
  
df['cluster'] = labels
df[df.cluster==1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,1
1824,1/1/2025 18:53,BillingSystem,File data_3868.csv uploaded successfully by us...,System Notification,regex,1


In [6]:
# Group the DataFrame by 'cluster' and count the number of records in each cluster
cluster_counts = df['cluster'].value_counts()

# Sort by cluster size and filter clusters that have more than 10 records
large_clusters = cluster_counts[cluster_counts > 10].index

# For each large cluster, print 5 log messages
for cluster in large_clusters:
    print(f'Cluster: {cluster}:')
    print(df[df['cluster'] == cluster]['log_message'].head(5).to_string(index=False))
    print("\n")


Cluster: -1:
 Email service experiencing issues with sending
      Unauthorized access to data was attempted
      Shard 6 replication task ended in failure
       Email server encountered a sending fault
Critical system unit error: unit ID Component55


Cluster: 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...


Cluster: 17:
nova.metadata.wsgi.server [req-61196723-e034-48...
nova.metadata.wsgi.server [req-7d3eeb2d-3948-43...
nova.metadata.wsgi.server [-] 10.11.21.137,10.1...
nova.metadata.wsgi.server [req-e0d4ce94-e0cb-41...
nova.metadata.wsgi.server [req-c1795e2c-0a17-4a...


Cluster: 4:
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.
Backup completed successfully.


Cluster: 21:
Backup ended at 2025-08-

In [7]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message):
            return label
    return None

In [8]:
classify_with_regex("Backup started at 2025-05-14 07:06:55")

'System Notification'

In [9]:
# Apply regex classification
df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,1,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,3,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,4,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,-1,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,-1,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,79,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,20,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,20,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,21,System Notification


In [10]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [11]:
df_n_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_n_legacy.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [12]:
n_legacy_embeddings = model.encode(df_n_legacy['log_message'].tolist())
n_legacy_embeddings[:2]

array([[-1.02939717e-01,  3.35459895e-02, -2.20260434e-02,
         1.55100622e-03, -9.86912940e-03, -1.78956345e-01,
        -6.34409934e-02, -6.01761267e-02,  2.81108730e-02,
         5.99619709e-02, -1.72618106e-02,  1.43368833e-03,
        -1.49559975e-01,  3.15280259e-03, -5.66031225e-02,
         2.71685645e-02, -1.49890687e-02, -3.54037769e-02,
        -3.62936147e-02, -1.45410709e-02, -5.61491819e-03,
         8.75539333e-02,  4.55120951e-02,  2.50963438e-02,
         1.00188032e-02,  1.24267070e-02, -1.39923558e-01,
         7.68695921e-02,  3.14095505e-02, -4.15250845e-03,
         4.36903723e-02,  1.71250105e-02, -8.00951421e-02,
         5.74006326e-02,  1.89091861e-02,  8.55261832e-02,
         3.96398939e-02, -1.34371832e-01, -1.44363695e-03,
         3.06711602e-03,  1.76854104e-01,  4.44891676e-03,
        -1.69273838e-02,  2.24266760e-02, -4.35050540e-02,
         6.09031972e-03, -9.98173840e-03, -6.23971745e-02,
         1.07371574e-02, -6.04891405e-03, -7.14659989e-0

In [13]:
X = n_legacy_embeddings
y = df_n_legacy['target_label'].values

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571


In [15]:
import joblib
joblib.dump(clf, '../model/logistic_model.joblib')

['../model/logistic_model.joblib']