In [16]:
import csv
import os
from typing import List, Dict, Iterable

import numpy as np
import pandas as pd
from kbc_pul.project_info import data_dir as kbc_pul_data_dir
from kbc_pul.experiments_utils.datasets.data_cleaning import clean_triples
from kbc_pul.experiments_utils.file_utils import print_file_exists


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# YAGO3-10 data cleaning

In [17]:
# Benchmark identifier; it doubles as the sub-directory name under the project data dir.
dataset_name: str = 'yago3_10'

# Both directories live under <project data dir>/<dataset name>/.
dataset_root_dir = os.path.join(kbc_pul_data_dir, dataset_name)
cleaned_data_dir = os.path.join(dataset_root_dir, 'cleaned_csv')
original_data_dir = os.path.join(dataset_root_dir, 'original')

# Report on each directory: cleaned first, then original.
for checked_dir in (cleaned_data_dir, original_data_dir):
    print_file_exists(checked_dir)

? file exists: /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/cleaned_csv
-> True
? file exists: /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/original
-> True


In [18]:
# Names of the dataset partitions handled by this notebook, in processing order.
data_partition_sets: List[str] = ['train', 'valid', 'test']

# Column labels assigned to the header-less triple files.
column_names: List[str] = ["E1", "Rel", "E2"]

# Partition name -> DataFrame of raw triples; starts empty and is filled by the loading cell.
original_data_map: Dict[str, pd.DataFrame] = {}

In [19]:
# Read each partition's header-less, tab-separated triple file into a DataFrame
# and store it in original_data_map under the partition name.
for dataset_part_name in data_partition_sets:
    raw_triples_path = os.path.join(original_data_dir, f"{dataset_part_name}.txt")
    print_file_exists(raw_triples_path)

    raw_triples_df = pd.read_csv(
        raw_triples_path,
        sep="\t",
        header=None,
        names=column_names,
    )
    original_data_map[dataset_part_name] = raw_triples_df

? file exists: /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/original/train.txt
-> True
? file exists: /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/original/valid.txt
-> True
? file exists: /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/original/test.txt
-> True


In [20]:
# Peek at the first five raw training triples.
original_data_map['train'].head(n=5)


Unnamed: 0,E1,Rel,E2
0,Chatou,isLocatedIn,France
1,Boo_Young-tae,playsFor,Yangju_Citizen_FC
2,Toni_Kuivasto,isAffiliatedTo,Helsingin_Jalkapalloklubi
3,Josh_Smith_(soccer),playsFor,Trinity_University_(Texas)
4,Albrecht_Dürer,diedIn,Nuremberg


In [21]:
# Show (rows, columns) for every loaded partition.
for data_part in data_partition_sets:
    part_shape = original_data_map[data_part].shape
    print(f"{data_part} - {part_shape}")

train - (1079040, 3)
valid - (5000, 3)
test - (5000, 3)


## Duplicate detection on the original data

In [22]:
# For each raw partition, print the shape of the rows that exactly repeat an earlier row.
for dataset_part_name in data_partition_sets:
    original_data_df: pd.DataFrame = original_data_map[dataset_part_name]
    repeated_row_mask = original_data_df.duplicated()
    duplicate_rows = original_data_df.loc[repeated_row_mask]
    print(f"{dataset_part_name} - {duplicate_rows.shape}")

train - (0, 3)
valid - (0, 3)
test - (0, 3)


## Cleaning data, PER dataset part

In [23]:
# Settings for the cleaning pass.
delimiter = "\t"          # separator handed to clean_triples for the output files
reclean = True            # set to False to skip rewriting the cleaned files
should_sort: bool = True  # forwarded to clean_triples

if reclean:
    # Handle train, valid and test one after the other.
    for dataset_part_name in data_partition_sets:
        print(f"cleaning {dataset_part_name}")

        dataset_part_output_csv_filename: str = os.path.join(cleaned_data_dir, f"{dataset_part_name}.csv")
        print(f"Writing {dataset_part_name} to {dataset_part_output_csv_filename}")

        # Report the exact duplicate rows in the raw partition before it is cleaned.
        dataset_part_df = original_data_map[dataset_part_name]
        repeated_row_mask = dataset_part_df.duplicated()
        duplicate_rows = dataset_part_df.loc[repeated_row_mask]
        print(f"{dataset_part_name} - {duplicate_rows.shape}")

        # clean_triples receives the triples as a plain array and writes the output file itself.
        clean_triples(
            dataset_part=dataset_part_df.values,
            dataset_part_output_csv_filename=dataset_part_output_csv_filename,
            should_sort=should_sort,
            separator=delimiter,
        )
print("DONE")

cleaning train
Writing train to /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/cleaned_csv/train.csv
train - (0, 3)
Sorting on Object...
Sorting on Subject...
Sorting on Rel...


100%|██████████| 1079040/1079040 [00:18<00:00, 56934.77it/s]


cleaning valid
Writing valid to /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/cleaned_csv/valid.csv
valid - (0, 3)
Sorting on Object...
Sorting on Subject...
Sorting on Rel...


100%|██████████| 5000/5000 [00:00<00:00, 61156.61it/s]


cleaning test
Writing test to /home/joschout/Documents/Repos/KBC-e-metrics/data/yago3_10/cleaned_csv/test.csv
test - (0, 3)
Sorting on Object...
Sorting on Subject...
Sorting on Rel...


100%|██████████| 5000/5000 [00:00<00:00, 50031.54it/s]

DONE





## Drop any duplicates

In [24]:
should_drop_duplicates = True
if should_drop_duplicates:
    # Column labels for the header-less cleaned files.
    header = ["E1", "Rel", "E2"]

    for dataset_part_name in data_partition_sets:
        print(f"Cleaning {dataset_part_name}")
        dataset_part_output_csv_filename: str = os.path.join(cleaned_data_dir, f"{dataset_part_name}.csv")

        # NOTE: filtering of inverted relations is disabled in this notebook;
        # only exact duplicate rows are removed here.
        triples_df: pd.DataFrame = pd.read_csv(
            dataset_part_output_csv_filename,
            sep=delimiter,
            header=None,
            names=header,
        )

        print(f"Length before dropping duplicates: {len(triples_df)}")
        triples_df = triples_df.drop_duplicates()
        print(f"Length after dropping duplicates: {len(triples_df)}")

        # Overwrite the cleaned file with the de-duplicated triples (no header, no index column).
        triples_df.to_csv(
            dataset_part_output_csv_filename,
            sep=delimiter,
            header=False,
            index=False,
        )
        print("---")
print("DONE")



Cleaning train
Length before dropping duplicates: 1079040
Length after dropping duplicates: 1078898
---
Cleaning valid
Length before dropping duplicates: 5000
Length after dropping duplicates: 5000
---
Cleaning test
Length before dropping duplicates: 5000
Length after dropping duplicates: 5000
---
DONE


In [25]:
# Re-load the cleaned training triples from disk so the result of the
# de-duplication step can be checked in the cells below.
dataset_part_output_csv_filename: str = os.path.join(
    cleaned_data_dir,
    "train.csv"
)

# Column labels for the header-less cleaned file.
header = ["E1", "Rel", "E2"]
# The cleaned CSVs are written with `delimiter` as field separator, so the same
# separator has to be passed when reading them back. With pandas' default comma
# separator the rows are not split into the E1/Rel/E2 columns, and the duplicate
# check that follows would run on mis-parsed data.
triples_df: pd.DataFrame = pd.read_csv(
    dataset_part_output_csv_filename,
    header=None,
    names=header,
    sep=delimiter,
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
# Rows of the re-loaded training triples that exactly repeat an earlier row.
repeated_row_mask = triples_df.duplicated()
duplicate_rows = triples_df.loc[repeated_row_mask]
duplicate_rows.shape

(0, 3)

In [27]:
# Show up to five of the remaining duplicate rows, if any.
duplicate_rows.head(n=5)

Unnamed: 0,E1,Rel,E2
