In [1]:
import pandas as pd
import json
import xml.etree.ElementTree as ET
import numpy as np
import os 
import re
import random


In [2]:
# Define folder and output paths
# NOTE(review): both values are absolute paths on one specific Windows
# machine; anyone running this notebook elsewhere has to edit them first.
# `folder` is the directory whose entries are listed and read as raw inputs.
folder = r"C:\Users\moham\Desktop\SwarmChat_github\SwarmChat\Dataset generator\raw_data"
# `output_path` is the combined JSONL file: it is (re)written by the parsing
# step and read back by the train/validation/test split.
output_path = r"C:\Users\moham\Desktop\SwarmChat_github\SwarmChat\Dataset generator\combined_datasets.jsonl"

In [3]:
# Read every regular file in `folder` into memory, one string per file.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would otherwise make the order of the combined dataset vary
# between runs and machines.
dataset = []
input_paths = sorted(os.listdir(folder))
for file_name in input_paths:
    file_path = os.path.join(folder, file_name)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints); calling
    # open() on a directory raises instead of returning text.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        dataset.append(file.read())

In [4]:
# Define the parsing function first
def parse_generated_text_with_proper_bt(sections):
    """
    Convert raw example sections into structured instruction records.

    Each section is expected to contain the markers 'SYSTEM:', 'INSTRUCTIONS:',
    'USER COMMAND:' and 'OUTPUT:' in that order. The text between consecutive
    markers (and after 'OUTPUT:' up to the end of the section) becomes the
    value of the corresponding key, stripped of surrounding whitespace.

    A section is kept only when its output either contains the refusal phrase
    "Sorry, I can't do the task" (stored unchanged) or contains both '<root'
    and '</root>' (stored with a newline inserted between adjacent tags).
    Sections with a missing or out-of-order marker, or with any other kind of
    output, are skipped.

    Args:
        sections: iterable of strings, one per example (already split on "###").

    Returns:
        A list of dicts with the keys 'SYSTEM', 'INSTRUCTIONS', 'USER COMMAND'
        and 'OUTPUT', in input order.
    """
    markers = ('SYSTEM:', 'INSTRUCTIONS:', 'USER COMMAND:', 'OUTPUT:')
    instructions = []

    for section in sections:
        # Locate the markers one after another. Searching each marker only
        # after the previous one keeps a marker word that merely appears inside
        # an earlier block (e.g. "OUTPUT:" mentioned in the system prompt) from
        # being mistaken for the real block boundary.
        spans = []
        cursor = 0
        for marker in markers:
            start = section.find(marker, cursor)
            if start == -1:
                break
            cursor = start + len(marker)
            spans.append((start, cursor))
        if len(spans) != len(markers):
            continue

        # Every block runs from the end of its own marker to the start of the
        # next marker; the last block runs to the end of the section.
        block_ends = [start for start, _ in spans[1:]] + [len(section)]
        system_text, instructions_text, user_command_text, output_text = (
            section[body_start:body_end].strip()
            for (_, body_start), body_end in zip(spans, block_ends)
        )

        if "Sorry, I can't do the task" in output_text:
            # Refusal messages are stored exactly as written.
            pass
        elif '<root' in output_text and '</root>' in output_text:
            # Behavior tree XML: put adjacent tags on separate lines for readability.
            output_text = output_text.replace('><', '>\n<')
        else:
            # Neither a refusal nor a behavior tree: not a usable example.
            continue

        instructions.append({
            'SYSTEM': system_text,
            'INSTRUCTIONS': instructions_text,
            'USER COMMAND': user_command_text,
            'OUTPUT': output_text
        })

    return instructions


def extract_behavior_tree(response: str) -> str:
    """
    Return the first XML behavior tree found in `response`.

    The tree is the shortest span that starts at '<root' and ends at
    '</root>' (it may span several lines). When no such span exists, the
    whole response is returned instead. Either way the result is stripped
    of leading and trailing whitespace.
    """
    found = re.search(r'(<root.*?</root>)', response, flags=re.DOTALL)
    extracted = found.group(1) if found else response
    return extracted.strip()
    
    

In [5]:
# Parse every raw file and write all records to one JSONL file.
# The output file is opened in write mode ('w'), so each run starts from a fresh file.
all_instructions = []
with open(output_path, 'w', encoding='utf-8') as jsonl_file:
    for file_content in dataset:
        # Examples inside a file are separated by "###"; drop empty pieces
        pieces = [piece.strip() for piece in file_content.split("###") if piece.strip()]
        # Parse the sections into structured records
        instructions = parse_generated_text_with_proper_bt(pieces)
        all_instructions.extend(instructions)

        # Write each parsed instruction as a JSON line
        for instruction in instructions:
            jsonl_file.write(json.dumps(instruction) + "\n")

# (Optional) DataFrame for inspection. It is built once, after the loop, so it
# covers the records of every file (not just the last one processed) and is
# defined even when no input files were read.
df_instructions = pd.DataFrame(all_instructions)

print("Processing complete. Data saved to JSONL.")

Processing complete. Data saved to JSONL.


### Train, validation, and test split

In [11]:

# Read the combined records back from the JSONL file (blank lines are ignored)
with open(output_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle with a fixed seed so that re-running the cell on the same combined
# file reproduces exactly the same split. A dedicated Random instance is used
# so the global random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# Splits: 70% training, 20% validation, 10% test
n = len(data)
train_end = int(0.7 * n)
val_end = int(0.9 * n)

train_data = data[:train_end]
val_data = data[train_end:val_end]
test_data = data[val_end:]

# The three slices must partition the data: nothing lost, nothing duplicated
assert len(train_data) + len(val_data) + len(test_data) == n

# Define paths for the training, validation, and test files
train_path = r"C:\Users\moham\Desktop\SwarmChat_github\SwarmChat\Dataset generator\train_data.jsonl"
val_path = r"C:\Users\moham\Desktop\SwarmChat_github\SwarmChat\Dataset generator\validation_data.jsonl"
test_path = r"C:\Users\moham\Desktop\SwarmChat_github\SwarmChat\Dataset generator\test_data.jsonl"


def write_jsonl(path, entries):
    """Write each item of `entries` to `path` as one JSON object per line, replacing any existing file."""
    with open(path, "w", encoding="utf-8") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")


# Write each subset to its own JSONL file
write_jsonl(train_path, train_data)
write_jsonl(val_path, val_data)
write_jsonl(test_path, test_data)

print("Data split complete! Training, validation, and test files have been generated.")


Data split complete! Training, validation, and test files have been generated.
