In [4]:
# Directory handed to create_dataset(); each of its immediate subfolders is
# expected to hold a `text.txt` and a `uml.txt`.
# Left empty here -- set it to a real path before running the notebook.
ROOT_FOLDER = ""

In [7]:
from dotenv import load_dotenv
# Load environment variables via python-dotenv and stop the notebook right here
# when load_dotenv() returns a falsy value.
# NOTE(review): the call sits inside `assert`, so it is not executed at all when
# Python runs with -O; confirm that is acceptable.
assert load_dotenv()

In [23]:
# Prompt stored as the `instruction` of every example and as the system message
# of every chat record written by create_dataset().
# It appears to be written as a str.format template: `{text}` marks where the
# specification would go and `{{}}` is an escaped literal `{}`.
# NOTE(review): create_dataset() uses TASK verbatim (no .format call is visible
# in this notebook), so the doubled braces and the `{text}` placeholder reach
# the dataset literally -- confirm this is intended.
# NOTE(review): the prompt contains typos ("plant UMl", "molteplicity",
# "form text", "futher", "explaination"). They are left untouched because
# editing the literal changes the generated training data.
TASK = """You will be asked by the user to create a plant UMl model from specification text. Do so in the most
clear way possible, avoid class properties and assign molteplicity. 

Do not include attributes for classes. For example the class Book would be:

class Book{{}}

Use only bi-directional arc for relations and no description. For example a relation between
the class Book and the class Page, if the Book can have from one to many pages and the 
pages could have exactly one book, would be:

Book "1..1" -- "1..*" Page

Adapt the cardinality to each case. If the cardinality would be "0..*", the default one, omit it.

The plantuml has to be the class diagram. In generating the diagram perform this steps in order

1. Extract class from text
2. Extract relations form text
3. Assign the relation to the corresponding class
4. Add cardinality to the relations

Put everything in this order: first all classes and then all relations. In our example would be:

@startuml

class Book{{}}
class Page{{}}

Book "1..1" -- "1..*" Page

@enduml

Output plantuml without futher text or explaination.

##############

The specification text is:

{text}

##############

The uml output is:
"""

In [2]:
import json
import os
import re

import pandas as pd
from tqdm import tqdm

In [45]:
def create_dataset(root_folder_path, jsonl_path="train_f.jsonl", flatten_newlines=True):
    """
    Build an instruction dataset from the depth-1 subfolders of a root folder.

    Every immediate subfolder that contains both `text.txt` and `uml.txt`
    contributes one example. Plain files in the root and subfolders missing
    either file are skipped silently.

    Besides returning the examples as a DataFrame, every example is appended to
    `jsonl_path` as one chat-format JSON line (system = TASK, user = contents of
    `text.txt`, assistant = contents of `uml.txt`). The file is opened in append
    mode, so calling this function again adds the same records a second time;
    delete the file first for a clean export. Nothing is written (and the file
    is not created) when no subfolder qualifies.

    Args:
        root_folder_path (str): Path to the root folder.
        jsonl_path (str): Destination of the chat-format JSONL export.
        flatten_newlines (bool): When True (the default), newlines inside each
            chat message are replaced by single spaces before serialisation.
            When False the messages keep their line structure; the JSON encoder
            escapes the newlines so each record still occupies one line.

    Returns:
        pandas.DataFrame: Columns `name` (subfolder name), `instruction` (TASK,
        verbatim), `input` (contents of `text.txt`) and `output` (contents of
        `uml.txt`), one row per usable subfolder, in os.listdir() order.
    """
    columns = ["name", "instruction", "input", "output"]
    rows = []
    jsonl_lines = []

    def _as_message(content):
        # The DataFrame keeps the raw text; only the chat export is flattened.
        return content.replace("\n", " ") if flatten_newlines else content

    for subfolder_name in tqdm(os.listdir(root_folder_path)):
        subfolder_path = os.path.join(root_folder_path, subfolder_name)

        # Only depth-1 directories are candidates.
        if not os.path.isdir(subfolder_path):
            continue

        text_file_path = os.path.join(subfolder_path, "text.txt")
        uml_file_path = os.path.join(subfolder_path, "uml.txt")

        # Both files are required; incomplete subfolders are ignored.
        if not (os.path.isfile(text_file_path) and os.path.isfile(uml_file_path)):
            continue

        # Read each file exactly once and close the handle deterministically.
        with open(text_file_path) as text_file:
            text = text_file.read()
        with open(uml_file_path) as uml_file:
            uml = uml_file.read()

        # os.listdir() already yields the bare entry name, so no path splitting
        # is needed (splitting on "/" is wrong on Windows).
        rows.append(
            {"name": subfolder_name, "instruction": TASK, "input": text, "output": uml}
        )

        record = {
            "messages": [
                {"role": "system", "content": _as_message(TASK)},
                {"role": "user", "content": _as_message(text)},
                {"role": "assistant", "content": _as_message(uml)},
            ]
        }
        # json.dumps escapes quotes, backslashes and control characters, which
        # manual quote replacement does not. ensure_ascii=False keeps
        # non-ASCII characters as written in the source files.
        jsonl_lines.append(json.dumps(record, ensure_ascii=False))

    if jsonl_lines:
        with open(jsonl_path, "a") as jsonl_file:
            jsonl_file.writelines(line + "\n" for line in jsonl_lines)

    # Passing `columns` keeps the four-column schema even when no row was found.
    return pd.DataFrame(rows, columns=columns)
                

In [46]:
# Build the instruction DataFrame from ROOT_FOLDER; as a side effect the call
# also appends one chat record per example to train_f.jsonl.
df = create_dataset(ROOT_FOLDER)

100%|█████████████████████████████████████████| 44/44 [00:00<00:00, 2186.79it/s]


In [26]:
# Preview the first rows of the frame built from the local folders.
df.head()

Unnamed: 0,name,instruction,input,output
0,EmployeesAndLeaders,You will be asked by the user to create a plan...,"For each project, a number of employees are as...",@startuml\n\nclass Project{}\nclass MemberAssi...
1,EmployeesAndDepartment,You will be asked by the user to create a plan...,A department can have several employees. At an...,@startuml\n\nclass Department{} \nclass Employ...
2,ProductsAndSuppliers,You will be asked by the user to create a plan...,"A supplier can supply various products, each a...",@startuml\n\nclass Supplier{}\nclass Product{}...
3,Menso,You will be asked by the user to create a plan...,"Flander's bus company ""De Lijn"" has a number o...",@startuml\n\nclass BusLine {\n}\n\nclass Categ...
4,FactoriesAndProducts,You will be asked by the user to create a plan...,Each individual product is produced by exactly...,@startuml\n\nclass Factory{}\nclass Product{}\...


In [27]:
# Number of examples collected into df.
len(df)

27

In [3]:
from datasets import load_dataset

In [4]:
# Load the "LaserOverrider/text2uml" dataset through datasets.load_dataset.
# NOTE(review): presumably fetched from the Hugging Face Hub, which needs
# network access -- confirm.
dataset = load_dataset("LaserOverrider/text2uml")

In [6]:
# Show the splits, features and row counts of the loaded dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'instruction', 'input', 'output'],
        num_rows: 27
    })
})

In [8]:
# Convert the `train` split into a pandas DataFrame.
# NOTE(review): this rebinds `df`, discarding the frame returned earlier by
# create_dataset(); later cells see only this version.
df = pd.DataFrame(dataset['train'])

In [9]:
# Preview the first rows of the frame built from the loaded `train` split.
df.head()

Unnamed: 0,name,instruction,input,output
0,EmployeesAndLeaders,You will be asked by the user to create a plan...,"For each project, a number of employees are as...",@startuml\n\nclass Project{}\nclass MemberAssi...
1,EmployeesAndDepartment,You will be asked by the user to create a plan...,A department can have several employees. At an...,@startuml\n\nclass Department{} \nclass Employ...
2,ProductsAndSuppliers,You will be asked by the user to create a plan...,"A supplier can supply various products, each a...",@startuml\n\nclass Supplier{}\nclass Product{}...
3,Menso,You will be asked by the user to create a plan...,"Flander's bus company ""De Lijn"" has a number o...",@startuml\n\nclass BusLine {\n}\n\nclass Categ...
4,FactoriesAndProducts,You will be asked by the user to create a plan...,Each individual product is produced by exactly...,@startuml\n\nclass Factory{}\nclass Product{}\...


In [32]:
from datasets import Dataset, DatasetDict

In [33]:
# Wrap the DataFrame in a datasets.Dataset.
# NOTE(review): this rebinds `dataset`, which previously held the object
# returned by load_dataset() -- a different kind of object under the same name.
dataset = Dataset.from_pandas(df)

In [39]:

def format_row_as_instruction_prompt(example):
    """Render one dataset row as a single instruction-prompt string.

    The prompt is a primer sentence followed by an `### Instruction:` section,
    an optional `### Input:` section and an optional `### Response:` section.
    Every section ends with a blank line.

    A missing, None or empty `input` selects the primer that does not mention
    an input and drops the `### Input:` section, so an empty string can no
    longer produce an empty section announced as "further context". `output`
    follows the same rule: a missing, None or empty value drops the
    `### Response:` section.

    Args:
        example: Mapping with a required `instruction` key and optional
            `input` and `output` keys. It must provide `.get()`.

    Returns:
        str: The assembled prompt.

    Raises:
        KeyError: If `example` has no `instruction` key.
    """
    instruction_section = f"### Instruction: \n{example['instruction']}\n\n"

    # Truthiness (not `is not None`) so that "" counts as "no input", matching
    # the check applied to `output` below.
    if example.get('input'):
        primer_prompt = ("Below is an instruction that describes a task, paired with an input "
                         "that provides further context. Write a response that appropriately completes the request.")
        input_section = f"### Input: \n{example['input']}\n\n"
    else:
        primer_prompt = ("Below is an instruction that describes a task. "
                         "Write a response that appropriately completes the request.")
        input_section = ""

    # The response is omitted when there is no output to show.
    if example.get('output'):
        response_section = f"### Response: \n{example['output']}\n\n"
    else:
        response_section = ""

    return f"{primer_prompt}\n\n{instruction_section}{input_section}{response_section}"

In [40]:
# Smoke-check the prompt formatter on a small hand-written record that has all
# three fields, and print the rendered prompt for visual inspection.
test_example = dict(
    instruction="Open the door.",
    input="The door is locked.",
    output="Use the key to unlock and then open the door.",
)

print(format_row_as_instruction_prompt(test_example))

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: 
Open the door.

### Input: 
The door is locked.

### Response: 
Use the key to unlock and then open the door.




In [41]:
# Export df to temp.json in JSON Lines form: one record (row) per line.
df.to_json('temp.json', orient='records', lines=True)