# Dataset 

## setup

In [4]:
import os
import pandas as pd
from script.dataset_utils import cleanup, triplet_to_text

In [5]:
# Directory layout: raw TSV input, cleaned TSV copies, and derived JSON files.
dataset_dir = 'dataset/'
clean_dataset_dir = 'clean_dataset/'
modified_dataset_dir = 'modified_dataset/'

# Names of the dataset partitions; each one is read from '<split>.tsv'.
splits = ('train', 'test', 'dev')

# Relation labels grouped by category. The category name (dict key) is also
# used as the prefix of the per-category output file names.
relations = {
    'Physical': [
        'ObjectUse', 'CapableOf', 'MadeUpOf', 'HasProperty',
        'Desires', 'NotDesires', 'AtLocation',
    ],
    'Event': [
        'Causes', 'HinderedBy', 'xReason', 'isAfter',
        'isBefore', 'HasSubEvent', 'isFilledBy',
    ],
    'Intent': ['xIntent', 'xNeed', 'xWant', 'oWant'],
    'Reaction': ['xReact', 'oReact', 'xAttr', 'xEffect', 'oEffect'],
}

# Create every working directory up front so later cells can read and write freely.
for data_dir in (dataset_dir, clean_dataset_dir, modified_dataset_dir):
    os.makedirs(data_dir, exist_ok=True)


## make clean dataset

In [6]:
# For every split: read the header-less raw TSV as (head, relation, tail)
# triplets, pass it through cleanup(), and write the result (with a header
# row, without the index) under the same file name in the clean directory.
for split_name in splits:
    raw_triplets = pd.read_csv(
        f'{dataset_dir}{split_name}.tsv',
        sep='\t',
        names=['head', 'relation', 'tail'],
    )
    clean_triplets = cleanup(raw_triplets)
    clean_triplets.to_csv(
        f'{clean_dataset_dir}{split_name}.tsv',
        sep='\t',
        index=False,
    )

old size (1076880, 3), after removing None new size (956451, 3)
old size (152209, 3), after removing None new size (137281, 3)
old size (102024, 3), after removing None new size (88849, 3)


## make the full supervised training set (all relations)

In [7]:
# Build one shuffled JSON file per listed split that covers every relation.
# The shuffle is seeded so that Restart & Run All writes the same row order
# on every run instead of a different permutation each time.
SHUFFLE_SEED = 42

for split_name in ['train']:
    df = pd.read_csv(clean_dataset_dir + split_name + '.tsv', sep='\t')
    # NOTE(review): the value returned by apply() is discarded, so this line
    # only has an effect if triplet_to_text edits each row in place AND pandas
    # propagates that edit back into df. pandas documents mutating the object
    # passed to apply() as unsupported -- confirm against script/dataset_utils.py.
    df.apply(triplet_to_text, axis='columns')
    df = df.drop(columns='relation')
    # frac=1 draws every row, i.e. a full shuffle; the old index is discarded.
    df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
    df.to_json(f'{modified_dataset_dir}{split_name}.json', orient='records')

## make modified dataset for each relation type

In [8]:
# For every split and every relation category, write a shuffled JSON file named
# '<category> <split>.json' containing only the rows of that category.
# The shuffle is seeded so that Restart & Run All writes the same row order
# on every run instead of a different permutation each time.
SHUFFLE_SEED = 42

for split_name in splits:
    df = pd.read_csv(clean_dataset_dir + split_name + '.tsv', sep='\t')
    # Evaluation splits get one record per head with all of its tails collected.
    group_tails = split_name in ('test', 'dev')
    for key, value in relations.items():
        # Keep only the triplets whose relation belongs to this category.
        new_df = df.loc[df['relation'].isin(value)]
        # NOTE(review): the value returned by apply() is discarded, so this
        # line only has an effect if triplet_to_text edits each row in place
        # AND pandas propagates that edit back into new_df. pandas documents
        # mutating the object passed to apply() as unsupported -- confirm
        # against script/dataset_utils.py.
        new_df.apply(triplet_to_text, axis='columns')
        new_df = new_df.drop(columns='relation')
        if group_tails:
            # Collapse rows that share a head into a single row whose 'tail'
            # column holds the list of all tails seen for that head.
            new_df = new_df.groupby('head')['tail'].apply(list).reset_index(name='tail')
        # frac=1 draws every row, i.e. a full shuffle; the old index is discarded.
        new_df = new_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
        new_df.to_json(f'{modified_dataset_dir}{key} {split_name}.json', orient='records')