# Packages

In [1]:
%pip install pandas==2.0.3
%pip install tqdm==4.66.1
%pip install pm4py==2.7.8.2
# %pip install scikit-learn spacy
# !python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
from tqdm.notebook import tqdm
import concurrent.futures
from collections import defaultdict, Counter
# Process Mining
import pandas as pd
import pm4py
from io import StringIO
import os
import json
from uuid import uuid4

from collections import Counter
# from sklearn.feature_extraction.text import TfidfVectorizer
# import spacy


from collections import defaultdict
from enum import Enum, auto

# Log Mining Functions

In [6]:
# ======================================================================
def _extract_log_data(logs, regex_pattern, service_name):
    """Parse raw log text into a list of per-entry dictionaries.

    Parameters:
        logs (str): Full text of one log file; it is split on ``'\\n'``.
        regex_pattern (str): Pattern with named groups, applied at the start of
            every line. It must define a ``message`` group.
        service_name (str): Stored under the ``'service'`` key of every entry.

    Returns:
        list[dict]: One dict per matching line holding the named groups plus
        ``'service'``, the right-stripped ``'message'`` and a fresh UUID4
        string under ``'id'``.

    Lines that do not match are treated as continuations of the previous
    entry; non-matching lines seen before the first match are dropped.
    """
    compiled = re.compile(regex_pattern)
    raw_lines = logs.split('\n')
    entries = []

    progress = tqdm(raw_lines, total=len(raw_lines), desc=f"Processing log data for {service_name}")
    for raw_line in progress:
        hit = compiled.match(raw_line)

        if hit is None:
            # Continuation line: glue it onto the most recent entry, if any.
            # NOTE(review): the stripped text is appended with no separator, so
            # multi-line messages are fused together - confirm this is intended.
            if entries:
                entries[-1]['message'] += raw_line.strip()
            continue

        fields = hit.groupdict()
        entries.append({
            **fields,
            'service': service_name,
            'message': fields['message'].rstrip(),  # drop trailing whitespace
            'id': str(uuid4()),
        })

    return entries

# ======================================================================
def _extract_logging_statements(log_data):
    """Group log entries by their originating logging statement.

    Entries are grouped on the tuple (service, process, subprocess, level,
    class, method, file, line). Every group yields one summary dict, and each
    entry in ``log_data`` is annotated in place with ``'logging_statement_id'``.

    Parameters:
        log_data (list[dict]): Entries containing all eight grouping fields and
            an ``'id'``.

    Returns:
        tuple: ``(log_data, logging_statements)`` - the same (mutated) list that
        was passed in, and the list of per-group summary dicts.

    NOTE(review): the ID is built from service, file, method and line only, so
    groups that differ in process, subprocess, level or class share one ID.
    """
    key_fields = ("service", "process", "subprocess", "level", "class", "method", "file", "line")

    def statement_key(entry):
        return tuple(entry[field] for field in key_fields)

    # Compute every entry's key once and bucket the entries by it.
    entry_keys = [statement_key(entry) for entry in log_data]
    grouped = defaultdict(list)
    for entry, key in zip(log_data, entry_keys):
        grouped[key].append(entry)

    # One summary record (and one ID) per bucket.
    logging_statements = []
    id_by_key = {}
    for key, members in tqdm(grouped.items(), desc="Assigning logging statement IDs"):
        service, process, subprocess, level, class_name, method, file, line = key
        statement_id = f"{service} - {file} - {method} - {line}"
        id_by_key[key] = statement_id

        logging_statements.append({
            "logging_statement_id": statement_id,
            "service": service,
            "process": process,
            "subprocess": subprocess,
            "level": level,
            "class": class_name,
            "method": method,
            "file": file,
            "line": line,
            "associated_logs": len(members),
            "associated_log_ids": [member['id'] for member in members],
        })

    # Stamp each entry with the ID of the bucket it fell into.
    for position, entry in enumerate(tqdm(log_data, desc="Assigning logging statement IDs to logs")):
        entry["logging_statement_id"] = id_by_key[entry_keys[position]]

    return log_data, logging_statements

class ProcessingMode(Enum):
    """Granularity at which logs are bucketed by ``process_logs_variant``."""
    DEFAULT = auto()                # one bucket per (service, subprocess)
    COMBINED_SUBPROCESSES = auto()  # one bucket per service
    COMBINED_SERVICES = auto()      # a single bucket holding everything


def process_logs_variant(all_logs, logging_statements, regex_pattern, mode: ProcessingMode):
    """Bucket logs and logging statements according to ``mode``.

    Parameters:
        all_logs (list[dict]): Log entries; ``'service'`` and ``'subprocess'``
            are read with ``dict.get``.
        logging_statements (list[dict]): Logging-statement records, bucketed by
            the same two keys.
        regex_pattern: Unused; kept so existing callers keep working.
        mode (ProcessingMode): Bucketing granularity. Any value other than the
            two COMBINED_* members behaves like ``DEFAULT``.

    Returns:
        defaultdict: ``result[service][subprocess] == {"logs": [...],
        "log_statements": [...]}``. In the combined modes the literal labels
        ``"combined services"`` / ``"combined subprocesses"`` replace the real
        names. Buckets exist only for keys that occur in ``all_logs``; lists
        keep input order and hold the original dict objects.
    """
    combine_services = mode == ProcessingMode.COMBINED_SERVICES
    combine_subprocesses = combine_services or mode == ProcessingMode.COMBINED_SUBPROCESSES

    def bucket_for(record):
        service = "combined services" if combine_services else record.get('service')
        subprocess = "combined subprocesses" if combine_subprocesses else record.get('subprocess')
        return service, subprocess

    # Single pass over each input instead of re-filtering both lists per bucket.
    logs_by_bucket = defaultdict(list)
    for log in all_logs:
        logs_by_bucket[bucket_for(log)].append(log)

    statements_by_bucket = defaultdict(list)
    for statement in logging_statements:
        statements_by_bucket[bucket_for(statement)].append(statement)

    processed_logs = defaultdict(lambda: defaultdict(dict))
    # Only buckets that contain at least one log are emitted; statements whose
    # bucket has no logs are left out.
    for (service, subprocess), bucket_logs in logs_by_bucket.items():
        processed_logs[service][subprocess] = {
            "logs": bucket_logs,
            "log_statements": statements_by_bucket.get((service, subprocess), []),
        }

    return processed_logs

# Main

In [23]:
def read_file(file_path, encoding=None):
    """Return the entire text content of ``file_path``.

    Parameters:
        file_path (str): Path of the file to read.
        encoding (str | None): Text encoding passed to ``open``. ``None`` (the
            default) keeps the platform's default encoding; pass e.g.
            ``"utf-8"`` to read the file the same way on every machine.

    Returns:
        str: The file's content.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()
    
# Pattern for a single log line. The adjacent raw strings are concatenated into
# one regex; every fragment ends with a literal space, so the fields must be
# space-separated in this order:
#   timestamp LEVEL pid --- [process] [subprocess] --- [thread] Class.method(File.java:line) - user : message
regex_pattern = (  
    r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6}) '  # date and time with 6 fractional digits
    r'(?P<level>[A-Z]+) '                                          # upper-case letters only
    r'(?P<pid>\d+) '                                               # digits only
    r'\s*---\s* '                                                  # "---" separator
    r'(?:\[(?P<process>[^\]]+)\]) '                                # first bracketed field
    r'(?:\[(?P<subprocess>[^\]]+)\]) '                             # second bracketed field
    r'\s*---\s* '                                                  # "---" separator
    r'\[(?P<thread>[^\]]+)\] '                                     # third bracketed field
    # class = text up to the first '.', method = text up to '(', then "<file>.java:<line>)"
    r'(?P<class>[^\.]+)\.(?P<method>[^\(]+)\((?P<file>.*\.java):(?P<line>\d+)\) '
    r'\s*-\s* '                                                    # "-" separator
    r'(?P<user>.*) '                                               # greedy: anything up to the ":" separator
    r'\s*:\s* '                                                    # ":" separator
    r'(?P<message>.*)'                                             # remainder of the line
)

def process_multiple_models(relative_path, process_name, use_contrast_mode=False):
    """Parse ``*.log`` files and write one processed JSON file per ProcessingMode.

    Parameters:
        relative_path (str): Directory holding the ``.log`` files, or - in
            contrast mode - the parent of the ``Model A`` / ``Model B`` folders.
        process_name (str): Only entries whose ``process`` field equals this
            value are kept.
        use_contrast_mode (bool): When True, ``Model A`` and ``Model B`` under
            ``relative_path`` are processed separately.

    Side effects:
        Prints progress and every removed ``[FAULT]`` entry, and writes
        ``processed_logs_mode_<MODE>.json`` under
        ``Log Data/<last component of relative_path>[/<model>]``.

    Lines are parsed with the module-level ``regex_pattern``.
    """
    model_dirs = ["Model A", "Model B"] if use_contrast_mode else [relative_path]
    base_name = os.path.basename(os.path.normpath(relative_path))

    for model in model_dirs:
        print(f"=== Processing logs for {model} ===")

        if use_contrast_mode:
            log_dir = os.path.join(relative_path, model)
            output_dir = os.path.join('Log Data', base_name, model)
        else:
            log_dir = relative_path
            output_dir = os.path.join('Log Data', base_name)

        all_logs = []
        # os.listdir returns names in arbitrary order; sort them so the order of
        # the emitted logs is reproducible between runs and machines.
        for file_name in sorted(os.listdir(log_dir)):
            if not file_name.endswith('.log'):
                continue
            service_name = file_name[:-4]  # file name without the ".log" suffix
            raw_text = read_file(os.path.join(log_dir, file_name))
            # _extract_log_data already stores service_name on every entry.
            log_data = _extract_log_data(raw_text, regex_pattern, service_name)
            all_logs.extend(log for log in log_data if log.get('process') == process_name)

        # Drop (and report) every entry whose message contains "[FAULT]".
        kept_logs = []
        for log in all_logs:
            if '[FAULT]' in log['message']:
                print(f"Removed log: {log}")
            else:
                kept_logs.append(log)

        # Extract logging statements once; every mode reuses them.
        all_logs, logging_statements = _extract_logging_statements(kept_logs)

        os.makedirs(output_dir, exist_ok=True)

        # One JSON file per processing mode.
        for mode in ProcessingMode:
            processed_logs = process_logs_variant(all_logs, logging_statements, regex_pattern, mode)
            with open(os.path.join(output_dir, f'processed_logs_mode_{mode.name}.json'), 'w') as f:
                json.dump(processed_logs, f, indent=2)


if __name__ == "__main__":

    # Alternative runs - uncomment the one to execute instead of the active call.
    #
    # Contrast graph:
    # process_multiple_models("./Raw Logs/Contrast", "Email Researcher", True)
    #
    # Complex graph (PCM Data Export):
    # process_multiple_models("./Raw Logs/Single", "PCM Data Export", False)

    # Complex graph (Participant Enrolment)
    process_multiple_models(
        relative_path="./Raw Logs/Single",
        process_name="Participant Enrolment",
        use_contrast_mode=False,
    )

    print("\nDone\n")

=== Processing logs for ./Raw Logs/Single ===


Processing log data for experiment-service:   0%|          | 0/60200 [00:00<?, ?it/s]

Processing log data for pcm-service:   0%|          | 0/10675 [00:00<?, ?it/s]

Assigning logging statement IDs:   0%|          | 0/234 [00:00<?, ?it/s]

Assigning logging statement IDs to logs:   0%|          | 0/4046 [00:00<?, ?it/s]


Done



In [21]:
import os
import json

def fill_missing_keys(base_dir, model_names):
    """Give every model file the same service/subprocess key structure.

    Each file is expected to hold ``{service: {subprocess: {...}}}``. Services
    missing from a file are added as ``{}`` and then filled; subprocesses
    missing from a service are added as ``{"logs": [], "log_statements": []}``.
    Existing entries are left untouched. The files are rewritten in place.

    Parameters:
        base_dir (str): Directory that the entries of ``model_names`` are
            relative to.
        model_names (list[str]): Relative paths of the JSON files to align.
    """
    # Load all data first
    all_data = {}
    for model_name in model_names:
        model_path = f'{base_dir}/{model_name}'
        with open(model_path, 'r') as f:
            all_data[model_name] = json.load(f)

    # Union of subprocess names per service, computed once across all models.
    subprocesses_by_service = {}
    for model_data in all_data.values():
        for service, subprocesses in model_data.items():
            subprocesses_by_service.setdefault(service, set()).update(subprocesses.keys())

    # Insert what is missing. Iterating in sorted order makes the position of
    # newly added keys reproducible; plain set iteration order for strings can
    # change from one interpreter run to the next.
    for model_data in all_data.values():
        for service in sorted(subprocesses_by_service):
            service_data = model_data.setdefault(service, {})
            for subprocess in sorted(subprocesses_by_service[service]):
                service_data.setdefault(subprocess, {"logs": [], "log_statements": []})

    # Write the modified data back to the JSON files
    for model_name in model_names:
        model_path = f'{base_dir}/{model_name}'
        with open(model_path, 'w') as f:
            json.dump(all_data[model_name], f, indent=2)

# Align the DEFAULT-mode outputs of the two contrast models.
# NOTE(review): these files must already exist under ./Log Data/Contrast -
# confirm the contrast run has been executed before this cell.
base_dir = './Log Data/Contrast'
model_names = [
    'Model A/processed_logs_mode_DEFAULT.json',
    'Model B/processed_logs_mode_DEFAULT.json',
]

fill_missing_keys(base_dir=base_dir, model_names=model_names)


# Test

In [5]:
import json
import os

def test_saved_json_files(model: str, output_dir: str):
    """
    Check that a model's per-mode JSON files agree on their logging statement IDs.

    For every ProcessingMode the file
    ``<output_dir>/<model>/processed_logs_mode_<MODE>.json`` is loaded and the
    ``logging_statement_id`` values of all ``log_statements`` are collected.
    IDs seen more than once within a file are reported on stdout.

    Parameters:
        model (str): The name of the model (e.g., "Model A", "Model B").
        output_dir (str): The directory where the JSON files are saved.

    Returns:
        None

    Raises:
        AssertionError: If a mode's ID count (Test 1) or ID set (Test 2)
            differs from that of the first mode.
    """
    ids_by_mode = {}

    for mode in ProcessingMode:
        json_path = os.path.join(output_dir, model, f'processed_logs_mode_{mode.name}.json')
        with open(json_path, 'r') as handle:
            processed_logs = json.load(handle)

        seen_ids = set()
        for subprocess_data in processed_logs.values():
            for group in subprocess_data.values():
                statements = group["log_statements"]
                for statement in statements:
                    statement_id = statement["logging_statement_id"]
                    if statement_id in seen_ids:
                        print(f"Duplicate log_statement_id found: {statement_id}")
                    else:
                        seen_ids.add(statement_id)

                # Size of this group vs. running total of distinct IDs in the file.
                print(f"{len(statements)} vs {len(seen_ids)}")

        ids_by_mode[mode] = seen_ids
        print(f"Number of unique logging statement IDs for {mode.name}: {len(seen_ids)}")

    # The first mode is the baseline that all later modes are compared against.
    modes_and_ids = list(ids_by_mode.items())
    baseline_ids = modes_and_ids[0][1] if modes_and_ids else set()
    later_modes = modes_and_ids[1:]

    # Test 1: every mode has the same number of unique IDs
    for mode, ids in later_modes:
        assert len(baseline_ids) == len(ids), f"Number of logging statement IDs mismatch in mode {mode.name}."

    print(f"Test 1 Passed: All JSON files for {model} have the same number of unique logging statement IDs.")

    # Test 2: every mode has exactly the same IDs
    for mode, ids in later_modes:
        assert baseline_ids == ids, f"Logging statement IDs mismatch in mode {mode.name}."

    print(f"Test 2 Passed: All JSON files for {model} contain the same unique logging statement IDs.")

# Run the consistency checks for both contrast models.
# NOTE(review): this looks for "Log Data/<model>/..."; confirm that matches where
# the JSON files were written (the alignment cell reads "./Log Data/Contrast").
output_dir = 'Log Data'
for model in ("Model A", "Model B"):
    test_saved_json_files(model=model, output_dir=output_dir)


25 vs 25
Duplicate log_statement_id found: experiment-service_SqlStatementLogger.java_logStatement_144
Duplicate log_statement_id found: experiment-service_TwoPhaseLoad.java_initializeEntityEntryLoadedState_171
Duplicate log_statement_id found: experiment-service_TwoPhaseLoad.java_initializeEntityEntryLoadedState_184
Duplicate log_statement_id found: experiment-service_TwoPhaseLoad.java_initializeEntityEntryLoadedState_215
Duplicate log_statement_id found: experiment-service_TwoPhaseLoad.java_initializeEntityFromEntityEntryLoadedState_352
Duplicate log_statement_id found: experiment-service_SharedEntityManagerCreator.java_invoke_302
Duplicate log_statement_id found: experiment-service_CriteriaQueryImpl.java_interpret_303
Duplicate log_statement_id found: experiment-service_QueryTranslatorImpl.java_parse_292
Duplicate log_statement_id found: experiment-service_ErrorTracker.java_throwQueryException_97
Duplicate log_statement_id found: experiment-service_QueryTranslatorImpl.java_showHqlAs