In [None]:
# The following code is based on the example code from the Hugging Face model page of Phi-3-mini-4k-instruct.
# Keep in mind that it was designed to run on the iisys servers of Hof University.

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
import logging

# Seed torch's random number generator before anything else runs
torch.random.manual_seed(0)

# Load the Phi-3 mini (4k context, instruct) checkpoint onto the GPU
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)

# Tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Text-generation pipeline combining the model and its tokenizer
pipe = pipeline("text-generation", tokenizer=tokenizer, model=model)

# Keyword arguments handed to every pipeline call:
# at most 500 new tokens, only the newly generated text is returned,
# and sampling is switched off.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

# During development the iisys servers sometimes stopped the process, which is a problem because processing 2k files usually takes about 2 hours.
# To work around this, a log system was created so that files which were already processed are not processed a second time.
def create_log_from_output(output_directory, log_file):
    """Rebuild the processed-files log from the contents of the output directory.

    Every entry in ``output_directory`` named ``processed_<name>`` that ends in
    ``.txt`` counts as a finished result for the input file ``<name>``. The log
    file is overwritten with one original filename per line, in sorted order.

    Args:
        output_directory: Directory holding the generated ``processed_*.txt``
            files. If it does not exist or is not a directory, no file counts
            as processed.
        log_file: Path of the log file to (re)write.

    Returns:
        A set with the original filenames (``processed_`` prefix removed) that
        already have an output file.
    """
    prefix = "processed_"
    processed_files = set()

    # isdir instead of exists: a regular file at this path would otherwise
    # make os.listdir raise NotADirectoryError.
    if os.path.isdir(output_directory):
        processed_files = {
            filename[len(prefix):]
            for filename in os.listdir(output_directory)
            if filename.startswith(prefix) and filename.endswith(".txt")
        }

    # Sorted so the log content is the same on every run (set iteration order
    # is not); explicit encoding so non-ASCII filenames do not depend on the
    # locale of the machine.
    with open(log_file, "w", encoding="utf-8") as log:
        log.writelines(f"{filename}\n" for filename in sorted(processed_files))

    return processed_files

# Processes all text files in the chosen directory
def _write_text_atomically(path, text):
    """Write ``text`` to ``path`` without ever leaving a partial file at ``path``."""
    # The temporary name ends in ".tmp", so an interrupted write is never
    # mistaken for a finished "processed_*.txt" result on the next run.
    # A leftover temporary file is simply overwritten on the next attempt.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as tmp_file:
        tmp_file.write(text)
    os.replace(tmp_path, path)


def process_text_files(input_directory, output_directory, log_file):
    """Generate a description for every not-yet-processed ``.txt`` input file.

    For each ``.txt`` file in ``input_directory`` that has no result yet, the
    file content is embedded into the prompt, sent through the module-level
    ``pipe`` with ``generation_args``, and the generated text is stored as
    ``processed_<filename>`` in ``output_directory``. The filename is then
    appended to ``log_file``.

    Files that already have a result (as reported by
    ``create_log_from_output``) are skipped, so an interrupted run can simply
    be started again. If a file fails, the error is logged with its traceback
    and the loop continues; since no output file is created for it, the file
    is picked up again on the next run.

    Args:
        input_directory: Directory containing the parsed BPMN ``.txt`` files.
        output_directory: Directory for the generated files; created if missing.
        log_file: Path of the log that lists the processed filenames.
    """
    os.makedirs(output_directory, exist_ok=True)

    processed_files = create_log_from_output(output_directory, log_file)

    all_files = [f for f in os.listdir(input_directory) if f.endswith(".txt")]
    files_to_process = [f for f in all_files if f not in processed_files]

    # Count only inputs that are really done. The output directory may hold
    # results for files that are no longer in the input directory, which would
    # otherwise push the progress bar past its total.
    already_done = len(all_files) - len(files_to_process)

    with open(log_file, 'a', encoding="utf-8") as log, tqdm(
        total=len(all_files), initial=already_done, desc="Processing files"
    ) as pbar:
        for filename in files_to_process:
            input_file_path = os.path.join(input_directory, filename)
            output_file_path = os.path.join(output_directory, f"processed_{filename}")

            try:
                # Read the parsed BPMN file that is inserted into the prompt
                with open(input_file_path, 'r', encoding="utf-8") as file:
                    content = file.read()

                # The prompt that worked best for turning our data into a description
                prompt = f"Can you describe the following into a more natural and understandable flow description using names? Please fit it into one paragraph and not separate tables. The following: {content}"

                messages = [
                    {"role": "user", "content": prompt}
                ]

                output = pipe(messages, **generation_args)
                generated_text = output[0]['generated_text']

                # Store the result; the final file only appears once it is complete
                _write_text_atomically(output_file_path, generated_text)

                # Update the log and flush right away so the entry survives
                # a sudden stop of the process
                log.write(f"{filename}\n")
                log.flush()
                pbar.update(1)

                print(f"Processed {filename} and saved to {output_file_path}")

            except Exception as e:
                # Best effort per file: record the failure (with traceback)
                # and carry on with the remaining files.
                logging.exception("Error processing %s: %s", filename, e)

# Paths used by this run
input_directory = './input/output/'    # parsed bpmn files that serve as input
output_directory = 'finished_output/'  # generated descriptions are saved here as txt files
log_file = 'finished log.log'          # keeps track of the files that are already processed

# Log records of level ERROR and above go to this file
logging.basicConfig(level=logging.ERROR, filename='bpmn_processing.log')

# Start processing
process_text_files(
    input_directory=input_directory,
    output_directory=output_directory,
    log_file=log_file,
)
