In [1]:
#utils
import pandas as pd
import os
import numpy as np
from nltk.tokenize import word_tokenize

In [10]:
# Names of the datasets to process. The main loop reads one
# '<name>.csv' file per entry and writes a file with the same name.
dataset_names = [
    # WebNLG-based sets
    "D2T-1-CFA_WebNLG_CounterFactual",
    "D2T-1-FA_WebNLG_Factual",
    "D2T-1-FI_WebNLG_Fictional",
    # Wikidata-based sets
    "D2T-2-CFA_Wikidata_CounterFactual",
    "D2T-2-FA_Wikidata_Factual",
    "D2T-2-FI_Wikidata_Fictional",
    # Development / training splits
    "dev",
    "train",
]

## FUNCTIONS

In [3]:
def merge_dicts(dict1, dict2):
    """Merge two ``{key: [item, ...]}`` groupings into one renumbered dict.

    Every group of ``dict1`` is kept, in ``dict1``'s iteration order. When
    ``dict2`` has an entry under the same key, each of its items that is not
    already present in any group collected so far is inserted at the front of
    that group. Keys that only exist in ``dict2`` are ignored.

    The previous implementation iterated over ``dict1``'s own items inside the
    ``key in dict2`` branch (and rebound ``key``/``value`` in the inner loop),
    so nothing from ``dict2`` was ever merged; it also aliased ``dict1``'s
    lists. Both are fixed here: ``dict2[key]`` is what gets merged, and the
    input dictionaries and their lists are left unmodified.

    Args:
        dict1: Mapping of key -> list of items (the primary groups).
        dict2: Mapping of key -> list of items to prepend to the group
            stored under the same key in ``dict1``.

    Returns:
        dict: The merged groups, re-keyed as ``'0'``, ``'1'``, ... following
        the order of ``dict1``.
    """
    merged_dict = {}
    for key, group in dict1.items():
        # Copy so that the inserts below never mutate the caller's list.
        merged_group = list(group)
        merged_dict[key] = merged_group

        for triple in dict2.get(key, []):
            # Skip items that some group (including this one) already holds.
            already_grouped = any(
                triple in existing for existing in merged_dict.values()
            )
            if not already_grouped:
                merged_group.insert(0, triple)

    # Replace the original keys with consecutive string indices.
    merged_dict = {
        str(index): group for index, group in enumerate(merged_dict.values())
    }
    print(f'merged_dict: {merged_dict}')

    return merged_dict

def split_mixed(triples):
    """Group the triples of a mixed-shape entry and merge the groups.

    Each string in ``triples`` is split on single spaces. Entries that yield
    only one token (for example empty strings) are ignored. Two groupings are
    built and handed to ``merge_dicts``:

    * ``subject_dict`` ("sibling"): first token -> every token list that
      starts with that token.
    * ``object_dict`` ("chain"): first token -> every token list whose third
      token equals it.

    Compared with the previous version, entries with fewer than three tokens
    no longer raise ``IndexError`` in the chain scan, the chain scan runs once
    per distinct first token (so ``object_dict`` no longer accumulates
    duplicates), and neither the loop variable nor the ``object`` builtin is
    shadowed.

    Args:
        triples: Iterable of space-separated triple strings.

    Returns:
        dict: The result of ``merge_dicts(subject_dict, object_dict)``.
    """
    # Tokenise once; keep the same "more than one token" filter as before.
    tokenised = [
        tokens
        for tokens in (text.split(' ') for text in triples)
        if len(tokens) > 1
    ]

    subject_dict = {}
    object_dict = {}
    for tokens in tokenised:
        subject = tokens[0]
        first_occurrence = subject not in subject_dict

        # sibling: collect every triple that shares this first token
        subject_dict.setdefault(subject, []).append(tokens)

        if not first_occurrence:
            # The chain scan for this subject has already been done.
            continue

        # chain: triples whose third token is this subject
        parents = [
            candidate
            for candidate in tokenised
            if len(candidate) > 2 and candidate[2] == subject
        ]
        if parents:
            object_dict[subject] = parents

    print(f'subject_dict: {subject_dict}')
    print(f'object_dict: {object_dict}')

    return merge_dicts(subject_dict, object_dict)

In [4]:
def split_dict(dict_to_split):
    """Split every value list into chunks of at most three items.

    The first chunk keeps the original key; each further chunk is stored
    under ``f"{key}_{i}"`` where ``i`` is the chunk number (1, 2, ...).
    Keys whose value is empty produce no entry.

    Args:
        dict_to_split: Mapping of key -> sliceable sequence.

    Returns:
        dict: New mapping holding the chunks.
    """
    chunk_size = 3
    chunks = {}
    for key, items in dict_to_split.items():
        offsets = range(0, len(items), chunk_size)
        for chunk_no, offset in enumerate(offsets):
            # Only chunks after the first one get an index suffix.
            label = key if chunk_no == 0 else f"{key}_{chunk_no}"
            chunks[label] = items[offset:offset + chunk_size]
    return chunks

In [5]:
def create_dict_with_max_values(array, max_values_per_key):
    """Group triple strings into consecutively numbered buckets.

    Blank strings are dropped, every remaining string is split on single
    spaces, and the resulting token lists are packed, in order, into
    buckets that are closed as soon as they hold ``max_values_per_key``
    items. A final, partially filled bucket is kept as well.

    Args:
        array: Iterable of triple strings.
        max_values_per_key: Number of items at which a bucket is closed.

    Returns:
        dict: Integer keys 0, 1, ... mapped to lists of token lists.
    """
    # Drop empty / whitespace-only strings, then tokenise.
    tokenised = [entry.split(' ') for entry in array if entry.strip()]

    buckets = {}
    pending = []
    for tokens in tokenised:
        pending.append(tokens)
        if len(pending) == max_values_per_key:
            # Keys are consecutive, so the next key equals the bucket count.
            buckets[len(buckets)] = pending
            pending = []

    # Keep whatever is left over after the last full bucket.
    if pending:
        buckets[len(buckets)] = pending
    return buckets

## MAIN

In [11]:
# Settings for the splitting run.
SAMPLE_SIZE = 1000            # rows sampled from each dataset
RANDOM_SEED = 42              # seed for the row sample
MAX_TRIPLES_PER_GROUP = 3     # group size used for non-mixed entries
INPUT_DIR = 'pre-processed-data'
OUTPUT_DIR = 'splitted-data'
# shape_type values whose entries are regrouped with split_mixed.
MIXED_SHAPE_TYPES = {'mixed', 'NA', 'unknown'}
OUTPUT_COLUMNS = [
    'id', 'triples', 'data_unit', 'actual',
    'shape', 'shape_type', 'local_size', 'size',
]

# to_csv does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for dataset_name in dataset_names:
    print(f'Processing {dataset_name}')
    # os.path.join keeps the paths valid on non-Windows systems as well.
    full_dataset = pd.read_csv(os.path.join(INPUT_DIR, f'{dataset_name}.csv'))

    # Sample SAMPLE_SIZE rows. Passing n (capped at the dataset length)
    # instead of frac avoids the error pandas raises for frac > 1 when a
    # dataset has fewer rows than SAMPLE_SIZE, and the division by zero on
    # an empty dataset.
    n_rows = min(SAMPLE_SIZE, len(full_dataset))
    dataset = (
        full_dataset
        .sample(n=n_rows, random_state=RANDOM_SEED)
        .reset_index(drop=True)
    )

    records = []
    for i, row in enumerate(dataset.to_dict('records')):
        # NOTE(review): the saved output of this cell shows entries whose
        # whole `triple` string ends up as a single list element, which
        # suggests the ' | ' separator may be absent from the data - confirm.
        triples = row['triple'].split(' | ')

        # read_csv parses the text "NA" as a missing value by default, so a
        # plain comparison with 'NA' never matches; check for NaN explicitly.
        shape_type = row['shape_type']
        is_mixed = pd.isna(shape_type) or shape_type in MIXED_SHAPE_TYPES

        if is_mixed and row['size'] > 2:
            print('1', triples)
            dictionary = split_dict(split_mixed(triples))
        else:
            dictionary = create_dict_with_max_values(triples, MAX_TRIPLES_PER_GROUP)

        # One output row per group of triples.
        for value in dictionary.values():
            value_text = ''.join(' '.join(triple) + ' ' for triple in value)
            records.append({
                'id': i,
                'triples': value_text,
                'data_unit': row['triple'],
                'actual': row['sentence'],
                'shape': row['shape'],
                'shape_type': shape_type,
                'local_size': len(value),
                'size': row['size'],
            })

    # Explicit columns keep the CSV header even when no rows were produced.
    triples_df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
    triples_df.to_csv(os.path.join(OUTPUT_DIR, f'{dataset_name}.csv'), index=False)

# Preview of the last dataset processed.
triples_df.head()

Processing D2T-1-CFA_WebNLG_CounterFactual
1 ['Larry_Sanger birthPlace Białystok Białystok timeZone Moscow_Time Larry_Sanger weight 20378.5 ']
subject_dict: {'Larry_Sanger': [['Larry_Sanger', 'birthPlace', 'Białystok', 'Białystok', 'timeZone', 'Moscow_Time', 'Larry_Sanger', 'weight', '20378.5', '']]}
object_dict: {}
merged_dict: {'0': [['Larry_Sanger', 'birthPlace', 'Białystok', 'Białystok', 'timeZone', 'Moscow_Time', 'Larry_Sanger', 'weight', '20378.5', '']]}
1 ['Moritzbastei address 200_Public_Square Vlerick_Business_School country "United_States" Moritzbastei currentTenants Vlerick_Business_School Moritzbastei location New_Hampshire ']
subject_dict: {'Moritzbastei': [['Moritzbastei', 'address', '200_Public_Square', 'Vlerick_Business_School', 'country', '"United_States"', 'Moritzbastei', 'currentTenants', 'Vlerick_Business_School', 'Moritzbastei', 'location', 'New_Hampshire', '']]}
object_dict: {}
merged_dict: {'0': [['Moritzbastei', 'address', '200_Public_Square', 'Vlerick_Business_

Unnamed: 0,id,triples,data_unit,actual,shape,shape_type,local_size,size
0,0,A-Rosa_Luna shipOrdered 2004-01-22,A-Rosa_Luna shipOrdered 2004-01-22,The A-Rosa Luna ship was ordered on January 22...,(X (X)),,1,1
1,1,"Aaron_S._Daggett birthPlace Greene,_Maine","Aaron_S._Daggett birthPlace Greene,_Maine","The birth place of Aaron S. Daggett is Greene,...",(X (X)),,1,1
2,2,Houston_Texans city Texas Houston_Texans city ...,Houston_Texans city Texas Houston_Texans city ...,"Akeem Dent, who debuted with the Atlanta Falco...",(X (X (X)) (X (X) (X))),mixed,1,5
3,3,"Al_Asad_Airbase location ""Al Anbar Province, I...","Al_Asad_Airbase location ""Al Anbar Province, I...",The Al Asad Airbase is situated in the Al Anba...,(X (X) (X) (X)),sibling,1,3
4,4,Antwerp_International_Airport cityServed Antwe...,Antwerp_International_Airport cityServed Antwe...,"Bart De Wever is the mayor of Antwerp, which i...",(X (X (X))),chain,1,2


In [12]:
# NOTE(review): this cell defines split_dict a second time; an earlier cell
# already defines a function with the same name, and once this cell runs the
# definition below is the one in effect. Consider keeping a single copy.
def split_dict(dict_to_split):
    """Split every value list into chunks of at most three items.

    The first chunk keeps the original key; each further chunk is stored
    under ``f"{key}_{i}"`` where ``i`` is the chunk number (1, 2, ...).
    Keys whose value is empty produce no entry.
    """
    chunk_size = 3
    chunks = {}
    for key, items in dict_to_split.items():
        offsets = range(0, len(items), chunk_size)
        for chunk_no, offset in enumerate(offsets):
            # Only chunks after the first one get an index suffix.
            label = key if chunk_no == 0 else f"{key}_{chunk_no}"
            chunks[label] = items[offset:offset + chunk_size]
    return chunks

# Worked example: one group of four triples is split into 3 + 1.
dict_to_split = {
    '0': [
        ['William_Anders', 'mission', 'Apollo_8'],
        ['Apollo_8', 'crew1Up', 'Frank_Borman'],
        ['Apollo_8', 'crew2Up', 'Buzz_Aldrin'],
        ['Apollo_8', 'operator', 'NASA'],
    ],
}

split_dict_result = split_dict(dict_to_split)

for key, value in split_dict_result.items():
    print(f"{key}: {value}")

0: [['William_Anders', 'mission', 'Apollo_8'], ['Apollo_8', 'crew1Up', 'Frank_Borman'], ['Apollo_8', 'crew2Up', 'Buzz_Aldrin']]
0_1: [['Apollo_8', 'operator', 'NASA']]
