In [1]:
import os

import pandas as pd
from docx import Document

## Convert scenarios from the 'docx' format to 'csv' format

In [2]:
# Names of the six per-scenario CSV files: scenario1.csv ... scenario6.csv.
files = [f"scenario{number}.csv" for number in range(1, 7)]

In [114]:
def docx_to_csv(docx_path, csv_path):
    """Export the non-empty paragraphs of a DOCX file to a one-column CSV.

    Every non-blank paragraph (whitespace-stripped) except the first one
    becomes one row of the ``Scenario`` column.  The first non-blank
    paragraph is dropped -- presumably it is a title line; TODO confirm.
    The row count, and any duplicated rows, are printed.

    Args:
        docx_path: Path of the DOCX file to read.
        csv_path: Path of the CSV file to write (written without the index).

    Returns:
        The ``pandas.DataFrame`` that was written to ``csv_path``.
    """
    document = Document(docx_path)

    # Collect the stripped text of each paragraph, skipping blank ones.
    texts = []
    for para in document.paragraphs:
        stripped = para.text.strip()
        if stripped != "":
            texts.append(stripped)

    # One paragraph per row; index 0 is intentionally left out.
    df = pd.DataFrame(texts[1:], columns=['Scenario'])
    print(f"{docx_path} :  size: {len(df)}")

    # Report rows that repeat an earlier row (the first occurrence is not listed).
    repeated = df[df.duplicated()]
    if len(repeated) > 0:
        print("Duplicated rows:")
        print(repeated)

    df.to_csv(csv_path, index=False)
    return df

In [115]:
def remove_matching_extensions_str(file_name, extensions):
    """Strip matching extensions from the end of a file name.

    The extensions are tried in the given order and matching is
    case-insensitive.  Each one that matches the current end of the name is
    cut off, so ``"a.tar.gz"`` with ``[".gz", ".tar"]`` yields ``"a"``.

    Args:
        file_name: File name to process.
        extensions: A single extension string (e.g. ``".docx"``) or an
            iterable of extension strings.

    Returns:
        ``file_name`` with the matching trailing extensions removed.
    """
    # A bare string would otherwise be iterated character by character,
    # turning "a.docx" into "a.doc" for extensions=".docx".
    if isinstance(extensions, str):
        extensions = (extensions,)

    base_name = file_name
    for ext in extensions:
        # Skip empty extensions: every string ends with "", and
        # base_name[:-0] would evaluate to the empty string.
        if ext and base_name.lower().endswith(ext.lower()):
            base_name = base_name[: -len(ext)]
    return base_name

In [116]:
def remove_versions_suffix(filename):
    """Return ``filename`` with every ``'_versions'`` substring removed.

    Despite the name, removal is not restricted to the end of the string:
    all occurrences are dropped, wherever they appear.
    """
    pieces = filename.split('_versions')
    return "".join(pieces)

In [117]:
def convert_files_to_csv(directory, extensions, output_directory):
    """Convert every matching DOCX file under ``directory`` to a CSV file.

    The directory tree is walked recursively.  Each file whose name ends
    with one of ``extensions`` (case-sensitive) is passed to
    ``docx_to_csv``; the CSV is written into ``output_directory`` under the
    source base name with ``'_versions'`` removed and ``'.csv'`` appended.
    Files with the same name in different sub-directories map to the same
    CSV path, so the later one overwrites the earlier one.

    Args:
        directory: Root directory to search.
        extensions: A single extension string or an iterable of extension
            strings (e.g. ``".docx"`` or ``[".docx"]``).
        output_directory: Directory receiving the CSV files; it is created
            on demand when the first matching file is found.
    """
    # str.endswith accepts a string or a tuple of strings, but not a list.
    if not isinstance(extensions, str):
        extensions = tuple(extensions)

    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(extensions):
                continue

            docx_path = os.path.join(root, file_name)
            print(f" File: {docx_path}")

            os.makedirs(output_directory, exist_ok=True)

            base_name, _ext = os.path.splitext(file_name)
            csv_name = remove_versions_suffix(base_name) + ".csv"
            # os.path.join (not string concatenation) so output_directory
            # works with or without a trailing path separator.
            csv_path = os.path.join(output_directory, csv_name)
            docx_to_csv(docx_path, csv_path)

In [118]:
# Convert every ".docx" file found under ./scenarios into a CSV file
# inside ./scenarios_versions_csv_format/.
directory_path = './scenarios'
# The trailing "/" matters if convert_files_to_csv builds the CSV path by
# plain string concatenation of this value and the file name.
output_directory = "./scenarios_versions_csv_format/"
extensions = ".docx"
convert_files_to_csv(directory_path, extensions, output_directory)

 File: ./scenarios/scenario3_versions.docx
./scenarios/scenario3_versions.docx :  size: 40
 File: ./scenarios/scenario4_versions.docx
./scenarios/scenario4_versions.docx :  size: 40
 File: ./scenarios/scenario5_versions.docx
./scenarios/scenario5_versions.docx :  size: 40
 File: ./scenarios/scenario1_versions.docx
./scenarios/scenario1_versions.docx :  size: 40
 File: ./scenarios/scenario6_versions.docx
./scenarios/scenario6_versions.docx :  size: 40
 File: ./scenarios/scenario2_versions.docx
./scenarios/scenario2_versions.docx :  size: 40


## Convert document table to csv files

In [119]:
def read_tables_from_docx(docx_path):
    """Read all tables of a DOCX file as nested lists of cell texts.

    The path is printed before the document is opened.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        A list with one entry per table; each table is a list of rows and
        each row is a list of whitespace-stripped cell strings.
    """
    print(docx_path)
    document = Document(docx_path)

    # tables -> rows -> stripped cell texts
    return [
        [[cell.text.strip() for cell in row.cells] for row in table.rows]
        for table in document.tables
    ]

def convert_tables_to_dataframes(tables):
    """Wrap each table (a list of rows) in its own ``pandas.DataFrame``.

    No row is treated as a header, so the resulting frames have default
    integer column labels.

    Args:
        tables: Iterable of tables, each a list of row lists.

    Returns:
        A list of DataFrames in the same order as ``tables``.
    """
    return [pd.DataFrame(rows) for rows in tables]

In [120]:
# DOCX file containing the task/class table; the CSV is written next to it
# under the same base name.
docx_path = "multi_tasks_output_table.docx"
output_csv_path = os.path.splitext(docx_path)[0]+".csv"

# Read every table of the document as nested lists of cell texts
tables = read_tables_from_docx(docx_path)

# Convert tables to DataFrames (one DataFrame per table)
dataframes = convert_tables_to_dataframes(tables)

# Only the first table is exported.  Its first two positional columns are
# given readable names; any header row of the DOCX table stays in the data
# as an ordinary row, because no row is promoted to column labels here.
df = dataframes[0]
df = df.rename(columns={0: 'Tasks', 1: 'Classes Corresponding to Task'})
df.to_csv(output_csv_path, index = False)

multi_tasks_output_table.docx


### Dataset column names: Scenarios (X), Task Names (Labels) (Y)

In [121]:
# The values of the "Tasks" column become the label column names
# (set_index + transpose turns them into column labels).
columns_name = df.set_index("Tasks").T.columns

# The input text column "Scenario" comes first, followed by the task names.
columns_name = ["Scenario"] + list(columns_name)
output_path = "dataset_columns_names.csv"
# Write a header-only CSV: the DataFrame has the columns but no data rows.
pd.DataFrame(columns = columns_name).to_csv(output_path, index=False)

#### Correct or Match the column name of the new dataset with the old one

In [122]:
def correction_in_column_names(path1):
    """Normalise the columns of every CSV file directly inside ``path1``.

    Each file is rewritten in place with:

    * a ``'Sub - mission'`` column, inserted at position 2 when missing (or
      at the end if the file has fewer than two columns) and set to the
      empty string in every row;
    * ``'Hard Constraints'`` renamed to ``'Hard Constrains'`` and
      ``'Soft Constraints'`` renamed to ``'Soft Constrains (Preferences)'``.

    The list of processed file paths is printed.

    Args:
        path1: Directory whose ``.csv`` files are corrected (not recursive).
    """
    csv_files1 = [os.path.join(path1, file) for file in os.listdir(path1) if file.endswith('.csv')]
    print(csv_files1)

    # Target names keep the spelling used by the existing dataset columns.
    rename_column_names = {
        'Hard Constraints': 'Hard Constrains',
        'Soft Constraints': 'Soft Constrains (Preferences)',
    }

    for file_path in csv_files1:
        df1 = pd.read_csv(file_path)
        if "Sub - mission" not in df1.columns:
            # Clamp the position: DataFrame.insert raises when loc exceeds
            # the current number of columns.
            insert_at = min(2, len(df1.columns))
            df1.insert(loc=insert_at, column='Sub - mission', value="")
        # NOTE(review): this blanks the column even when the file already
        # contained 'Sub - mission' values -- confirm that this is intended.
        df1['Sub - mission'] = ""
        df1 = df1.rename(columns=rename_column_names)

        df1.to_csv(file_path, index=False)

In [123]:
# Directories whose CSV files are rewritten in place by
# correction_in_column_names.
path= "scenarios_examples_multi_tasks_output_labels"
path_old_labelled = "old_dataset/labelled/"
path_old_unlabelled = "old_dataset/unlabelled/"

# Not used by any statement in this cell.
path_correct = "dataset_columns_names.csv"

correction_in_column_names(path_old_unlabelled)
correction_in_column_names(path_old_labelled)
correction_in_column_names(path)

['old_dataset/unlabelled/scenario4.csv', 'old_dataset/unlabelled/scenario5.csv', 'old_dataset/unlabelled/scenario6.csv', 'old_dataset/unlabelled/scenario2.csv', 'old_dataset/unlabelled/scenario3.csv', 'old_dataset/unlabelled/scenario1.csv']
['old_dataset/labelled/scenario4.csv', 'old_dataset/labelled/scenario5.csv', 'old_dataset/labelled/scenario6.csv', 'old_dataset/labelled/scenario2.csv', 'old_dataset/labelled/scenario3.csv', 'old_dataset/labelled/scenario1.csv']
['scenarios_examples_multi_tasks_output_labels/scenario4.csv', 'scenarios_examples_multi_tasks_output_labels/scenario5.csv', 'scenarios_examples_multi_tasks_output_labels/scenario6.csv', 'scenarios_examples_multi_tasks_output_labels/scenario2.csv', 'scenarios_examples_multi_tasks_output_labels/scenario3.csv', 'scenarios_examples_multi_tasks_output_labels/scenario1.csv']


##  Assign labels from scenarios examples to scenarios versions

In [3]:
def combine_scenarios_examples_and_version(df1,df2):
    """Copy the label values of ``df1``'s first row onto every row of ``df2``.

    All columns of ``df1`` except the first one are treated as labels.  Only
    the first row of ``df1`` is read; each of its values is broadcast to a
    column of the same name in ``df2``.

    Args:
        df1: Frame whose first row supplies the label values.
        df2: Frame that receives the label columns; it is modified in place.

    Returns:
        ``df2`` itself, with the label columns added or overwritten.
    """
    # First row of df1 without its leading column, indexed by column name.
    first_row_labels = df1.iloc[0, 1:]

    for label in first_row_labels.index:
        df2[label] = first_row_labels[label]
    return df2

In [4]:
def assign_labels_from_scenarios_examples(path1, path2, output_directory):
    """Label every scenario-version CSV with the labels of its example CSV.

    For each ``.csv`` file name present in both ``path1`` and ``path2``, the
    two files are read, combined with
    ``combine_scenarios_examples_and_version`` and written to
    ``output_directory`` under the same file name.  File names found in only
    one of the directories are skipped.  The CSV names found in ``path1``
    are printed at the end.

    Args:
        path1: Directory with the example CSV files that carry the labels.
        path2: Directory with the CSV files that receive the labels.
        output_directory: Directory for the labelled CSV files (created if
            it does not exist).
    """
    os.makedirs(output_directory, exist_ok=True)

    csv_files1 = [name for name in os.listdir(path1) if name.endswith('.csv')]
    version_names = {name for name in os.listdir(path2) if name.endswith('.csv')}

    for name in csv_files1:
        # Only file names available in both directories are processed.
        if name not in version_names:
            continue
        examples = pd.read_csv(os.path.join(path1, name))
        versions = pd.read_csv(os.path.join(path2, name))
        labelled = combine_scenarios_examples_and_version(examples, versions)
        labelled.to_csv(os.path.join(output_directory, name), index = False)

    print(csv_files1)

In [129]:
# Label the converted scenario versions: source of labels, files to label,
# and destination directory for the labelled CSV files.
scenarios_examples_path = "./scenarios_examples_multi_tasks_output_labels"
scenarios_versions_path = "./scenarios_versions_csv_format"

scenarios_with_labels = "./scenarios_with_labels"

assign_labels_from_scenarios_examples(scenarios_examples_path, scenarios_versions_path, scenarios_with_labels)

['scenario4.csv', 'scenario5.csv', 'scenario6.csv', 'scenario2.csv', 'scenario3.csv', 'scenario1.csv']


## Combine all the scenarios

In [16]:
def combine_scenarios(path):
    """Concatenate all CSV files directly inside ``path`` into one DataFrame.

    Files are read in sorted file-name order so that the row order of the
    result is reproducible; ``os.listdir`` alone returns entries in
    arbitrary order.

    Args:
        path: Directory containing the ``.csv`` files (not recursive).

    Returns:
        A ``pandas.DataFrame`` with the rows of all files and a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``path`` contains no ``.csv`` file.
    """
    csv_files = [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if name.endswith('.csv')
    ]
    if not csv_files:
        # pd.concat would raise a ValueError as well, but without naming
        # the directory that was empty.
        raise ValueError(f"No CSV files found in {path!r}")

    frames = [pd.read_csv(file_path) for file_path in csv_files]
    return pd.concat(frames, ignore_index=True)

In [131]:
# Input directories with per-scenario CSV files.
path_new = "./scenarios_with_labels"
path_old_labelled = "./old_dataset/labelled/"
path_old_unlabelled = "./old_dataset/unlabelled/"

# Output files for the combined datasets.
new_dataset_path = './dataset_new.csv'
old_dataset_path = './dataset_old.csv'

# Merge all labelled scenario-version files into a single "new" dataset.
df_new = combine_scenarios(path_new)
df_new.to_csv(new_dataset_path, index = False)


In [132]:
# Build the "old" dataset from the labelled and unlabelled per-scenario files.
df_labelled = combine_scenarios(path_old_labelled)
df_unlabelled = combine_scenarios(path_old_unlabelled)

# The index is not reset here, so index values repeat across the two parts;
# this does not reach the CSV because it is written with index=False.
df_old = pd.concat([df_labelled , df_unlabelled])

# df_old.to_csv(old_dataset_path, index = False)

df_old.to_csv(old_dataset_path ,index = False)

#### Assign label to old data set

In [332]:
# Label the old scenario versions; note that the three path variables are
# re-bound here and overwrite the values set by other labelling cells.
scenarios_examples_path = "./scenarios_examples_multi_tasks_output_labels"
scenarios_versions_path = "./scenarios_versions_csv_format_old"

scenarios_with_labels = "./scenarios_with_labels_old"

assign_labels_from_scenarios_examples(scenarios_examples_path, scenarios_versions_path, scenarios_with_labels)

['scenario4.csv', 'scenario5.csv', 'scenario6.csv', 'scenario2.csv', 'scenario3.csv', 'scenario1.csv']


#### Assign label to generated dataset

In [5]:
# Label the generated scenarios stored in paragraph_wise_2; results go to
# ./scenarios_with_labels_generated_split/.
scenarios_examples_path = "./scenarios_examples_multi_tasks_output_labels"
scenarios_versions_path = "./scenarios_generated/paragraph_wise_2/"

scenarios_with_labels = "./scenarios_with_labels_generated_split/"

assign_labels_from_scenarios_examples(scenarios_examples_path, scenarios_versions_path, scenarios_with_labels)

['scenario4.csv', 'scenario5.csv', 'scenario6.csv', 'scenario2.csv', 'scenario3.csv', 'scenario1.csv']


In [None]:
# Label the generated scenarios stored in paragraph_wise; results go to
# ./scenarios_with_labels_generated/.
scenarios_examples_path = "./scenarios_examples_multi_tasks_output_labels"
scenarios_versions_path = "./scenarios_generated/paragraph_wise/"

scenarios_with_labels = "./scenarios_with_labels_generated/"

assign_labels_from_scenarios_examples(scenarios_examples_path, scenarios_versions_path, scenarios_with_labels)

In [358]:
# Label the CSV files under ./experiment/test/; results go to
# ./experiment/test_label.
scenarios_examples_path = "./scenarios_examples_multi_tasks_output_labels"
scenarios_versions_path = "./experiment/test/"

scenarios_with_labels = "./experiment/test_label"

assign_labels_from_scenarios_examples(scenarios_examples_path, scenarios_versions_path, scenarios_with_labels)

['scenario4.csv', 'scenario5.csv', 'scenario6.csv', 'scenario2.csv', 'scenario3.csv', 'scenario1.csv']


## Combine different datasets and find the duplicates

In [133]:
def duplicates_in_each_scenarios(path):
    """Print row counts before and after de-duplication for each CSV file.

    For every ``.csv`` file directly inside ``path`` the file path is
    printed, followed by a line with the total number of rows and the
    number of rows left after dropping fully identical rows.

    Args:
        path: Directory containing the CSV files (not recursive).
    """
    entries = os.listdir(path)

    for entry in entries:
        if not entry.endswith('.csv'):
            continue
        file_path = os.path.join(path, entry)
        frame = pd.read_csv(file_path)
        total_rows = len(frame)
        unique_rows = len(frame.drop_duplicates())
        print(file_path)
        print(total_rows, unique_rows)

In [107]:
# Per-file duplicate report for the old unlabelled scenarios
# (path_old_unlabelled is defined in an earlier cell).
duplicates_in_each_scenarios(path_old_unlabelled)

./old_dataset/unlabelled/scenario4.csv
20 5
./old_dataset/unlabelled/scenario5.csv
20 11
./old_dataset/unlabelled/scenario6.csv
20 11
./old_dataset/unlabelled/scenario2.csv
20 10
./old_dataset/unlabelled/scenario3.csv
20 12
./old_dataset/unlabelled/scenario1.csv
20 18


In [169]:
def combine_scenarios_from_different_sources(path1, path2, output_directory):
    """Merge the ``Scenario`` columns of same-named CSV files from two folders.

    For each ``.csv`` file name present in both ``path1`` and ``path2``, the
    ``Scenario`` columns are stacked (``path1`` rows first), duplicates are
    dropped, and the result is written as a one-column CSV into
    ``output_directory`` under the same file name.  The file name pair and
    the row counts before/after de-duplication are printed.

    Args:
        path1: First source directory.
        path2: Second source directory.
        output_directory: Destination directory (created if missing).
    """
    os.makedirs(output_directory, exist_ok=True)

    csv_files1 = [name for name in os.listdir(path1) if name.endswith('.csv')]
    names_in_path2 = {name for name in os.listdir(path2) if name.endswith('.csv')}

    for name in csv_files1:
        # Files without a same-named counterpart in path2 are skipped.
        if name not in names_in_path2:
            continue
        print(name, name)

        first = pd.read_csv(os.path.join(path1, name))
        second = pd.read_csv(os.path.join(path2, name))
        merged = pd.concat([first['Scenario'], second['Scenario']], ignore_index=True)
        unique = merged.drop_duplicates()

        print(len(merged), len(unique))

        unique.to_csv(os.path.join(output_directory, name), index = False)


In [335]:
# output_directory = "./scenarios_versions_csv_format_old/"
# combine_scenarios_from_different_sources(path_old_labelled,path_old_unlabelled, output_directory)

In [326]:
def remove_duplicates_from_old_scenarios(path1 , path2):
    """Drop from the CSV files in ``path2`` the scenarios also found in ``path1``.

    For each ``.csv`` file name present in both directories, every row of
    the ``path2`` file whose ``Scenario`` value also occurs in the ``path1``
    file is removed, and the ``path2`` file is overwritten in place with
    the remaining rows.  Counts for the combined scenarios and for the
    remaining rows are printed.

    Args:
        path1: Directory with the reference CSV files (only read).
        path2: Directory with the CSV files to filter (overwritten).
    """
    csv_files1 = [name for name in os.listdir(path1) if name.endswith('.csv')]
    names_in_path2 = {name for name in os.listdir(path2) if name.endswith('.csv')}

    for name in csv_files1:
        if name not in names_in_path2:
            continue
        print(name, name)

        target_path = os.path.join(path2, name)
        reference = pd.read_csv(os.path.join(path1, name))
        old = pd.read_csv(target_path)

        # Statistics over both sources together (informational only).
        merged = pd.concat([reference['Scenario'], old['Scenario']], ignore_index=True)
        print("Combined datapoint: ")
        print(f"Original: {len(merged)}, Unique: {len(merged.drop_duplicates())}")

        # Keep only the rows whose scenario is absent from the reference file.
        already_known = old.Scenario.isin(reference.Scenario)
        remaining = old[~already_known]

        remaining.to_csv(target_path, index = False)

        print("Old datapoint")
        print(f"Original: {len(remaining)}, Unique: {len(remaining.drop_duplicates())}")

        print()


In [327]:
# path1 holds the reference scenario versions; the files in path2 (the old
# versions) are filtered and rewritten by the call below.
path1 = "./scenarios_versions_csv_format"
path2 = "./scenarios_versions_csv_format_old/"
remove_duplicates_from_old_scenarios(path1 , path2)

scenario4.csv scenario4.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20

scenario5.csv scenario5.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20

scenario6.csv scenario6.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20

scenario2.csv scenario2.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20

scenario3.csv scenario3.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20

scenario1.csv scenario1.csv
Combined datapoint: 
Original: 60, Unique: 60
Old datapoint
Original: 20, Unique: 20



In [21]:
# Combine the per-scenario training CSV files into ./train_new.csv.
# Note: this re-binds the notebook-level name `df`.
path_old_extracted = './scenarios_with_labels_split/train'

df = combine_scenarios(path_old_extracted)

df.to_csv("./train_new.csv", index = False)

In [23]:
# Base directory used by the following cells as a path prefix; the trailing
# "/" is required because file names are appended by string concatenation.
path_generated = './scenarios_with_labels_split/'


# df = combine_scenarios(path_generated)

# df.to_csv("train_generated.csv", index = False)

In [24]:
# Load the three datasets that are merged into the final training set.
# NOTE(review): earlier cells write './dataset_old.csv' and './train_new.csv'
# to the working directory, whereas these files are read from
# path_generated -- presumably they were copied there by hand; confirm.
df_old = pd.read_csv(path_generated+"dataset_old.csv")
df_new = pd.read_csv(path_generated+"train_new.csv")
df_generated = pd.read_csv(path_generated+"train_generated.csv")

In [26]:
# Total number of scenarios in the old dataset vs. number of distinct ones.
# Both tuple elements used to be the same expression, so the comparison
# could never reveal a duplicate.
len(df_old["Scenario"]), len(df_old["Scenario"].drop_duplicates())

(120, 120)

In [27]:
# Rows in the new dataset vs. rows left after dropping fully identical rows.
print(len(df_new), len(df_new.drop_duplicates()))

168 168


In [28]:
# Rows in the old dataset vs. rows left after dropping fully identical rows.
print(len(df_old), len(df_old.drop_duplicates()))

120 120


In [29]:
# Rows in the generated dataset vs. rows left after dropping fully identical rows.
print(len(df_generated), len(df_generated.drop_duplicates()))

1680 1680


In [33]:
# Stack the old, new and generated datasets (in that order) with a fresh
# 0..n-1 index and save the result as train.csv inside path_generated.
df_combined = pd.concat([df_old, df_new], ignore_index=True)
df_combined = pd.concat([df_combined, df_generated], ignore_index=True)
df_combined.to_csv(path_generated+"train.csv", index = False)

In [31]:
# Rows in the combined dataset vs. rows left after dropping fully identical rows.
print(len(df_combined), len(df_combined.drop_duplicates()))


1968 1968


In [32]:
# Boolean mask of rows that repeat an earlier row; its sum is the number of
# duplicated rows in the combined dataset.
dup = df_combined.duplicated()
dup.sum()

0

In [11]:
def train_test_split(path1, path2, path3, train = 28):
    """Split every CSV file in ``path1`` into a leading and a trailing part.

    For each ``.csv`` file directly inside ``path1``, the first ``train``
    rows are written to ``path2`` and the remaining rows to ``path3``, both
    under the original file name.  Rows are split by position; no shuffling
    takes place.

    Args:
        path1: Directory with the source CSV files.
        path2: Output directory for the first ``train`` rows of each file.
        path3: Output directory for the remaining rows of each file.
        train: Number of leading rows that go to ``path2``.
    """
    for out_dir in (path2, path3):
        os.makedirs(out_dir, exist_ok=True)

    source_names = [name for name in os.listdir(path1) if name.endswith('.csv')]

    for name in source_names:
        frame = pd.read_csv(os.path.join(path1, name))
        head_part = frame.iloc[:train]
        tail_part = frame.iloc[train:]
        head_part.to_csv(os.path.join(path2, name), index= False)
        tail_part.to_csv(os.path.join(path3, name), index= False)

In [14]:
# Carve a validation set out of the test files: the first 6 rows of each
# file go to val/, the remaining rows are written to test/.
# NOTE: path1 and path3 are the same directory, so the source files are
# overwritten with their remainder; running this cell again would move a
# further 6 rows out of test/ and overwrite the files in val/.
path1 = "./scenarios_with_labels_split/test/"
path2 = "./scenarios_with_labels_split/val/"
path3 = "./scenarios_with_labels_split/test/"
train_test_split(path1, path2, path3, train = 6)


In [372]:
# Combine the per-scenario CSV files of the "first32" training folder into
# ./dataset_new_first32.csv.  Re-binds `path_old_extracted` and `df`.
path_old_extracted = './scenarios_with_labels_first32/train/'


df = combine_scenarios(path_old_extracted)

df.to_csv("./dataset_new_first32.csv", index = False)

#### Remove the labels from the dataset

In [355]:
def remove_label(path1, path2):
    """Write label-free copies of the CSV files in ``path1`` to ``path2``.

    For each ``.csv`` file directly inside ``path1`` only the ``Scenario``
    column is kept and saved under the same file name in ``path2``.

    Args:
        path1: Directory with the labelled CSV files.
        path2: Output directory (created if it does not exist).
    """
    os.makedirs(path2, exist_ok=True)

    for name in os.listdir(path1):
        if not name.endswith('.csv'):
            continue
        labelled = pd.read_csv(os.path.join(path1, name))
        scenarios_only = labelled["Scenario"]
        scenarios_only.to_csv(os.path.join(path2, name), index= False)

In [356]:
# Write Scenario-only copies of the old labelled files into
# ./scenarios_versions_csv_format_old/ (same-named files there are replaced).
path1 = "./scenarios_with_labels_old/"
path2 = "./scenarios_versions_csv_format_old/"
remove_label(path1 , path2)

## Appendix Code

#### Print all directories and files under the given path

In [None]:
def list_files_and_folders(directory):
    """Print the directory tree below ``directory``.

    Walks the tree with ``os.walk`` and prints, for every visited folder,
    its path followed by the names of its sub-directories and files.

    Args:
        directory: Root directory to list.
    """
    for current_root, subdirs, filenames in os.walk(directory):
        print(f"Root: {current_root}")
        for subdir in subdirs:
            print(f" Directory: {subdir}")
        for filename in filenames:
            print(f" File: {filename}")

# List everything below the current working directory.
# Note: this re-binds `directory_path`, which another cell also assigns.
directory_path = './'
list_files_and_folders(directory_path)