In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import string
import re
import joblib
import pandas as pd
import re

from datetime import datetime

In [2]:
log_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) (\[SW_CTX:\[.*?\]\]) (\[.*?\]) (\S+)  (\S+) (.*)'
sw_pattern = r'\[SW_CTX:\[(.*?)\]\]'
 
format_str = "%Y-%m-%d %H:%M:%S.%f"  
# Function to parse the SW_CTX field
def parse_sw_ctx(sw_ctx):
    match = re.match(sw_pattern, sw_ctx)
    if match:
        sw_data = match.groups()[0]
        return sw_data.split(',')

# Function to parse a single line of log data
def parse_log_line(line):
    match = re.match(log_pattern, line)
    if match:
        timestamp, sw_ctx, thread_info, log_level, class_name, message = match.groups()
        service_name, instance_id, trace_id, segment_id, span_id = parse_sw_ctx(sw_ctx)
        
        # Convert the string to a datetime object
        date_time_obj = datetime.strptime(timestamp, format_str)

        # Get the total number of milliseconds since the Unix epoch
        milliseconds = int(date_time_obj.timestamp() * 1000)
        thread_info = thread_info.strip('[]')
        message = message.strip("-")
        return milliseconds, service_name, instance_id, trace_id, segment_id, span_id, thread_info, log_level, class_name, message
    else:
        return None

In [3]:
def parse_log_file(log_file_path):
    print(f"Parsing the logfile at {log_file_path}")
    parsed_lines = []
    count = 0
    with open(log_file_path, 'r') as file:
        for line in file:
            count += 1
            parsed_line = parse_log_line(line)
            if parsed_line:
                parsed_lines.append(parsed_line)
 
    columns = ['timeStamp', 'service', 'instanceId', 'traceId', 'traceSegmentId','spanId', 'threadInfo', 'logLevel', 'className', 'message']
    df = pd.DataFrame(parsed_lines, columns=columns)
    
    return df

In [4]:
no_fault_df = parse_log_file("..\\data-extraction\\parsed_data\\logdata_no_fault.log")
fault_1_df = parse_log_file("..\\data-extraction\\parsed_data\\logdata_f1_fault.log")

Parsing the logfile at ..\data-extraction\parsed_data\logdata_no_fault.log
Parsing the logfile at ..\data-extraction\parsed_data\logdata_f1_fault.log


In [5]:

no_fault_df['isFault'] = 0
fault_1_df['isFault'] = 1


In [6]:

print(no_fault_df['isFault'].value_counts())
print(fault_1_df['isFault'].value_counts())


0    347153
Name: isFault, dtype: int64
1    41141
Name: isFault, dtype: int64


In [7]:
 

## Combining the both logs
combined_log_df = pd.concat([no_fault_df,fault_1_df])
combined_log_df.head()
print(combined_log_df['isFault'].value_counts())


0    347153
1     41141
Name: isFault, dtype: int64


In [8]:
# Load Tracing CSV files
tracing_no_fault_df = pd.read_csv('..\\data-extraction\\parsed_data\\tracedata_no_fault.csv')
tracing_f1_fault_df = pd.read_csv('..\\data-extraction\\parsed_data\\tracedata_f1_fault.csv')

# Includes the fault and no fault column
tracing_no_fault_df['isFault'] = 0
tracing_f1_fault_df['isFault'] = 1

combined_trace_df = pd.concat([tracing_no_fault_df, tracing_f1_fault_df])
print(combined_trace_df['isFault'].value_counts())


0    612892
1     71708
Name: isFault, dtype: int64


In [9]:

combined_trace_df.head()

Unnamed: 0,traceId,traceSegmentId,startTime,parentSpanId,spanId,endTime,operationName,peer,spanType,spanLayer,componentId,isError,service,isFault
0,e0bb359289af4b129d8386ec58e643fa.80056.1701994...,e0bb359289af4b129d8386ec58e643fa.80056.1701994...,1701994076397,-1,0,1701994076398,Mysql/JDBI/Connection/close,tsdb-mysql-leader:3306,Exit,Database,33,False,ts-inside-payment-service,0
1,7366467488d1453a91968b3c6506e81e.75517.1701993...,7366467488d1453a91968b3c6506e81e.75517.1701993...,1701993956502,-1,0,1701993956503,Mysql/JDBI/Connection/close,tsdb-mysql-leader:3306,Exit,Database,33,False,ts-assurance-service,0
2,7366467488d1453a91968b3c6506e81e.75565.1701994...,7366467488d1453a91968b3c6506e81e.75565.1701994...,1701994160659,-1,0,1701994160659,Mysql/JDBI/Connection/close,tsdb-mysql-leader:3306,Exit,Database,33,False,ts-assurance-service,0
3,2e7092b02ad6458a92e8bde3ece6f205.79446.1701994...,2e7092b02ad6458a92e8bde3ece6f205.79446.1701994...,1701994177277,-1,0,1701994177277,Mysql/JDBI/Connection/close,tsdb-mysql-leader:3306,Exit,Database,33,False,ts-security-service,0
4,95f4451659aa490a8fca995d754d080b.75320.1701994...,95f4451659aa490a8fca995d754d080b.75320.1701994...,1701994022718,-1,0,1701994022719,Mysql/JDBI/Connection/close,tsdb-mysql-leader:3306,Exit,Database,33,False,ts-config-service,0


In [10]:
combined_log_df.head()

Unnamed: 0,timeStamp,service,instanceId,traceId,traceSegmentId,spanId,threadInfo,logLevel,className,message,isFault
0,1702068981002,ts-order-service,30f7b547673e4bc5a42613387fe2a0fa@192.168.39.243,8a798882e9cc40e789cf5dd9746e1fad.81.1702011380...,8a798882e9cc40e789cf5dd9746e1fad.81.1702011380...,0,http-nio-12031-exec-10,INFO,order.service.OrderServiceImpl,[deleteOrder][Delete order Success][OrderId: 6...,0
1,1702068980997,ts-order-service,30f7b547673e4bc5a42613387fe2a0fa@192.168.39.243,8a798882e9cc40e789cf5dd9746e1fad.81.1702011380...,8a798882e9cc40e789cf5dd9746e1fad.81.1702011380...,0,http-nio-12031-exec-10,INFO,order.controller.OrderController,[deleteOrder][Delete Order][OrderId: 67f471f6-...,0
2,1702068981272,ts-order-service,30f7b547673e4bc5a42613387fe2a0fa@192.168.39.243,8a798882e9cc40e789cf5dd9746e1fad.76.1702011381...,8a798882e9cc40e789cf5dd9746e1fad.76.1702011381...,0,http-nio-12031-exec-5,INFO,order.service.OrderServiceImpl,[deleteOrder][Delete order Success][OrderId: 9...,0
3,1702068981465,ts-order-service,30f7b547673e4bc5a42613387fe2a0fa@192.168.39.243,8a798882e9cc40e789cf5dd9746e1fad.73.1702011381...,8a798882e9cc40e789cf5dd9746e1fad.73.1702011381...,0,http-nio-12031-exec-2,INFO,order.controller.OrderController,[deleteOrder][Delete Order][OrderId: 4ab91b9e-...,0
4,1702068981679,ts-order-service,30f7b547673e4bc5a42613387fe2a0fa@192.168.39.243,8a798882e9cc40e789cf5dd9746e1fad.74.1702011381...,8a798882e9cc40e789cf5dd9746e1fad.74.1702011381...,0,http-nio-12031-exec-3,INFO,order.controller.OrderController,[deleteOrder][Delete Order][OrderId: ed430868-...,0


In [11]:
## Calculating the duration field from endtime and starttime
combined_trace_df['duration'] = combined_trace_df['endTime'] - combined_trace_df['startTime']

In [12]:
print(f"Trace : {combined_trace_df.columns}")
print(f"Log : {combined_log_df.columns}")
print(f"Log : {combined_log_df.dtypes}")
print(f"Trace : {combined_trace_df.dtypes}")

combined_log_df['spanId'] =combined_log_df['spanId'].astype('int64') 

Trace : Index(['traceId', 'traceSegmentId', 'startTime', 'parentSpanId', 'spanId',
       'endTime', 'operationName', 'peer', 'spanType', 'spanLayer',
       'componentId', 'isError', 'service', 'isFault', 'duration'],
      dtype='object')
Log : Index(['timeStamp', 'service', 'instanceId', 'traceId', 'traceSegmentId',
       'spanId', 'threadInfo', 'logLevel', 'className', 'message', 'isFault'],
      dtype='object')
Log : timeStamp          int64
service           object
instanceId        object
traceId           object
traceSegmentId    object
spanId            object
threadInfo        object
logLevel          object
className         object
message           object
isFault            int64
dtype: object
Trace : traceId           object
traceSegmentId    object
startTime          int64
parentSpanId       int64
spanId             int64
endTime            int64
operationName     object
peer              object
spanType          object
spanLayer         object
componentId        int64


In [13]:
#tmp=merged_df


In [14]:
merged_df = pd.merge(combined_trace_df,combined_log_df,on=['traceId','traceSegmentId'])
rows,columns = merged_df.shape

print("Number of Rows:", rows)
print("Number of Columns:", columns)
print("Columns: ",merged_df.columns)

Number of Rows: 2242009
Number of Columns: 24
Columns:  Index(['traceId', 'traceSegmentId', 'startTime', 'parentSpanId', 'spanId_x',
       'endTime', 'operationName', 'peer', 'spanType', 'spanLayer',
       'componentId', 'isError', 'service_x', 'isFault_x', 'duration',
       'timeStamp', 'service_y', 'instanceId', 'spanId_y', 'threadInfo',
       'logLevel', 'className', 'message', 'isFault_y'],
      dtype='object')


In [15]:
tmp=merged_df

In [16]:
percentage_matching_fault = ((merged_df['isFault_x'] == merged_df['isFault_y']).sum() / len(merged_df)) * 100
percentage_matching_span = ((merged_df['spanId_x'] == merged_df['spanId_y']).sum() / len(merged_df)) * 100
percentage_matching_service = ((merged_df['service_x'] == merged_df['service_x']).sum() / len(merged_df)) * 100
print("Percentage of matching rows in fault:", percentage_matching_fault)
print("Percentage of matching rows in span:", percentage_matching_span)
print("Percentage of matching rows in service:", percentage_matching_service)

Percentage of matching rows in fault: 100.0
Percentage of matching rows in span: 17.20296394885123
Percentage of matching rows in service: 100.0


In [17]:
# Dropping unwanted columns
merged_df.drop('isFault_y', axis=1, inplace=True)
merged_df.drop('spanId_x', axis=1, inplace=True)
merged_df.drop('spanId_y', axis=1, inplace=True)
merged_df.drop('service_x', axis=1, inplace=True)
merged_df.drop('traceId', axis=1, inplace=True)
merged_df.drop('traceSegmentId', axis=1, inplace=True)
merged_df.drop('startTime', axis=1, inplace=True)
merged_df.drop('endTime', axis=1, inplace=True)
merged_df.drop('peer', axis=1, inplace=True)
merged_df.drop('instanceId', axis=1, inplace=True)
merged_df.drop('timeStamp', axis=1, inplace=True)

# Rename 'isFault_x' to 'isFault'
merged_df.rename(columns={'isFault_x': 'isFault'}, inplace=True)
merged_df.rename(columns={'service_y': 'service'}, inplace=True)

merged_df.columns

Index(['parentSpanId', 'operationName', 'spanType', 'spanLayer', 'componentId',
       'isFault', 'duration', 'service', 'threadInfo', 'logLevel', 'className',
       'message'],
      dtype='object')

In [18]:
merged_df.head()

Unnamed: 0,parentSpanId,operationName,spanType,spanLayer,componentId,isFault,duration,service,threadInfo,logLevel,className,message
0,-1,GET:/api/v1/verifycode/generate,Entry,Http,14,0,3,ts-verification-code-service,http-nio-15678-exec-2,WARN,v.service.impl.VerifyCodeServiceImpl,[getImageCode][Get image code warn.Cookie not ...
1,-1,GET:/api/v1/verifycode/generate,Entry,Http,14,0,3,ts-verification-code-service,http-nio-15678-exec-2,INFO,v.controller.VerifyCodeController,[imageCode][Image code]
2,-1,GET:/api/v1/verifycode/generate,Entry,Http,14,0,3,ts-verification-code-service,http-nio-15678-exec-2,INFO,v.service.impl.VerifyCodeServiceImpl,[getImageCode][strEnsure: LT8W]
3,0,HikariCP/Connection/getConnection,Local,Unknown,116,0,2,ts-order-service,http-nio-12031-exec-4,INFO,order.controller.OrderController,[queryOrdersForRefresh][Query Orders][for Logi...
4,0,HikariCP/Connection/getConnection,Local,Unknown,116,0,2,ts-order-service,http-nio-12031-exec-4,WARN,order.service.OrderServiceImpl,[queryOrders][Orders don't fit the requirement...


In [19]:
def nan_empty_percentage(column):
    # Count of NaN/empty values
    count_nan_empty = column.isna().sum() + (column == '').sum()
    # Percentage calculation
    return (count_nan_empty / len(column)) * 100

# Applying the function to each column


In [20]:
nan_empty_percentages = merged_df.apply(nan_empty_percentage)

print(nan_empty_percentages)

parentSpanId     0.0
operationName    0.0
spanType         0.0
spanLayer        0.0
componentId      0.0
isFault          0.0
duration         0.0
service          0.0
threadInfo       0.0
logLevel         0.0
className        0.0
message          0.0
dtype: float64


In [21]:
merged_df['operationName'] = merged_df['operationName'].astype('category').cat.codes
merged_df['spanType'] = merged_df['spanType'].astype('category').cat.codes
merged_df['spanLayer'] = merged_df['spanLayer'].astype('category').cat.codes
merged_df['logLevel'] = merged_df['logLevel'].astype('category').cat.codes
merged_df['threadInfo'] = merged_df['threadInfo'].astype('category').cat.codes
merged_df['service'] = merged_df['service'].astype('category').cat.codes
merged_df['className'] = merged_df['className'].astype('category').cat.codes

In [22]:
merged_df.head()

Unnamed: 0,parentSpanId,operationName,spanType,spanLayer,componentId,isFault,duration,service,threadInfo,logLevel,className,message
0,-1,13161,0,1,14,0,3,31,123,1,60,[getImageCode][Get image code warn.Cookie not ...
1,-1,13161,0,1,14,0,3,31,123,0,59,[imageCode][Image code]
2,-1,13161,0,1,14,0,3,31,123,0,60,[getImageCode][strEnsure: LT8W]
3,0,13164,2,3,116,0,2,16,24,0,38,[queryOrdersForRefresh][Query Orders][for Logi...
4,0,13164,2,3,116,0,2,16,24,1,39,[queryOrders][Orders don't fit the requirement...


In [23]:
merged_df

Unnamed: 0,parentSpanId,operationName,spanType,spanLayer,componentId,isFault,duration,service,threadInfo,logLevel,className,message
0,-1,13161,0,1,14,0,3,31,123,1,60,[getImageCode][Get image code warn.Cookie not ...
1,-1,13161,0,1,14,0,3,31,123,0,59,[imageCode][Image code]
2,-1,13161,0,1,14,0,3,31,123,0,60,[getImageCode][strEnsure: LT8W]
3,0,13164,2,3,116,0,2,16,24,0,38,[queryOrdersForRefresh][Query Orders][for Logi...
4,0,13164,2,3,116,0,2,16,24,1,39,[queryOrders][Orders don't fit the requirement...
...,...,...,...,...,...,...,...,...,...,...,...,...
2242004,0,10788,1,1,13,1,6,11,99,0,23,[executeTicket][Execute][Id: ac33d3f9-8fe9-492...
2242005,0,10788,1,1,13,1,6,11,99,0,24,[Execute Service][Get Order] Getting....
2242006,-1,13138,0,1,14,1,9,11,99,0,24,[Execute Service][Execute Order] Executing....
2242007,-1,13138,0,1,14,1,9,11,99,0,23,[executeTicket][Execute][Id: ac33d3f9-8fe9-492...


In [24]:
import gc


In [25]:
gc.collect()

0

In [26]:
print("Vectorizing... started")
# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(max_features=100)  
vectorized_log_data = vectorizer.fit_transform(merged_df['message'])

# Convert to DataFrame
vectorized_log_df = pd.DataFrame(vectorized_log_data.toarray(), columns=vectorizer.get_feature_names_out())
print("Vectorizing... ended")

Vectorizing... started
Vectorizing... ended


In [27]:
# Assuming alignment is based on the order of records
merged_df = merged_df.reset_index(drop=True)
merged_df.drop('message', axis=1, inplace=True)

fused_df = pd.concat([merged_df, vectorized_log_df], axis=1)
rows,columns = fused_df.shape

print("Number of Rows:", rows)
print("Number of Columns:", columns)

Number of Rows: 2242009
Number of Columns: 111


In [28]:
fused_df.columns

Index(['parentSpanId', 'operationName', 'spanType', 'spanLayer', 'componentId',
       'isFault', 'duration', 'service', 'threadInfo', 'logLevel',
       ...
       'totalnum', 'train', 'trainnumber', 'traintype', 'traintypename',
       'travel', 'traveldate', 'trip', 'tripid', 'type'],
      dtype='object', length=111)

In [29]:
# Splitting the dataset into features and target variable
X = fused_df.drop('isFault', axis=1)
y = fused_df['isFault']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Scalling done!")

Scalling done!


In [30]:
# Train a Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))
joblib.dump(classifier, 'random_forest.pkl')

              precision    recall  f1-score   support

           0       0.91      0.99      0.94    600684
           1       0.57      0.15      0.24     71919

    accuracy                           0.90    672603
   macro avg       0.74      0.57      0.59    672603
weighted avg       0.87      0.90      0.87    672603



['random_forest.pkl']

In [31]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier(max_depth=10, min_samples_split=50, min_samples_leaf=20)
dtree.fit(X_train_scaled, y_train)
y_pred = dtree.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
joblib.dump(dtree, 'dtree.pkl')

              precision    recall  f1-score   support

           0       0.90      1.00      0.95    600684
           1       0.70      0.09      0.16     71919

    accuracy                           0.90    672603
   macro avg       0.80      0.54      0.55    672603
weighted avg       0.88      0.90      0.86    672603



['dtree.pkl']

In [32]:
from xgboost import XGBClassifier

xgb = XGBClassifier(class_weight='balanced')
xgb.fit(X_train_scaled, y_train)
y_pred = xgb.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
joblib.dump(xgb, 'xgb.pkl')

Parameters: { "class_weight" } are not used.



              precision    recall  f1-score   support

           0       0.90      0.99      0.95    600684
           1       0.74      0.12      0.21     71919

    accuracy                           0.90    672603
   macro avg       0.82      0.56      0.58    672603
weighted avg       0.89      0.90      0.87    672603



['xgb.pk1']

In [None]:
from sklearn.neighbors import KNeighborsClassifier

for k in range(1, 4):  # Example: trying k from 1 to 9
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    y_pred = knn.predict(X_test_scaled)
    print(f"Results for k={k}:")
    print(classification_report(y_test, y_pred))
    name = f'knn_k{k}.pkl'
    joblib.dump(knn,name)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
mlp.fit(X_train_scaled, y_train)
y_pred = mlp.predict(X_test_scaled)

print(classification_report(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'hidden_layer_sizes': [(100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    # Add more parameters as needed
}

mlp = MLPClassifier(max_iter=1000)
clf = GridSearchCV(mlp, parameters, scoring='precision')
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print(clf.best_params_)
