### Data exploration

This notebook explores the datasets, looking at issues such as entities with a missing description or a missing `wn_id`.

In [None]:
# autoreload mode 2: re-import all modules before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# jupyter_black formats code cells with Black (the extension must be installed).
%load_ext jupyter_black

In [None]:
# Work from the parent directory so that the `src` package imported below and
# the relative `data/...` paths resolve.
# Assumes `src` is a directory located one level above this notebook's folder
# -- TODO confirm against the repository layout.
# The guard makes the cell idempotent: re-running it no longer climbs one
# directory higher each time. os.chdir is used instead of the bare `cd ..`
# because automagic is only applied to single-line cells and can be disabled.
import os
from pathlib import Path

if not Path("src").is_dir():
    os.chdir("..")

# Display the resulting working directory (the original `cd` printed it too).
os.getcwd()

In [None]:
from src.utils import load_fb15k237, load_wn18rr, get_hist

# Dataset locations, relative to the current working directory.
# PATH_FB15k237 is passed to load_fb15k237 and PATH_WN18RR to load_wn18rr below.
PATH_FB15k237 = "data/datasets_knowledge_embedding/FB15k-237"
PATH_WN18RR = "data/datasets_knowledge_embedding/WN18RR/text"

In [None]:
# Load FB15k-237: three splits plus `entity2wikidata`, a mapping whose keys are
# used further down as the set of entities that have Wikidata info.
# Presumably each split is a DataFrame with at least "head" and "tail" columns
# (that is how they are used below) -- verify against src.utils.
train, valid, test, entity2wikidata = load_fb15k237(PATH_FB15k237)

### FB15k237 Exploration

1 - Explore entities with missing descriptions. \
2 - Plot histograms of entities and relations to evaluate how balanced the data is.

In [None]:
import pandas as pd

# Stack the three FB15k-237 splits into a single frame. The original row
# indices of each split are kept, so index labels may repeat.
fb_splits = [train, valid, test]
all_data_fb = pd.concat(fb_splits)

In [None]:
# Preview the first rows of the combined FB15k-237 splits.
all_data_fb.head()

In [None]:
# Size of the combined FB15k-237 data (train + valid + test) before filtering.
all_data_fb.shape

### Filtering only data with wikidata info

In [None]:
# Entities that have an entry in entity2wikidata, as a one-column frame.
df_entity = pd.DataFrame(entity2wikidata.keys(), columns=["head"])

# Keep a row only when BOTH of its endpoints are known entities.
known_entities = df_entity["head"]
head_is_known = all_data_fb["head"].isin(known_entities)
tail_is_known = all_data_fb["tail"].isin(known_entities)
all_data_fb_filtered = all_data_fb[head_is_known & tail_is_known]

In [None]:
# Size after keeping only rows whose head and tail both appear in entity2wikidata.
all_data_fb_filtered.shape

In [None]:
# Number of rows in which each entity appears as "head", rarest entities first.
fb_head_column = all_data_fb_filtered[["head"]]
fb_rows_per_head = fb_head_column.groupby(["head"], as_index=False).value_counts()
df_count = fb_rows_per_head.sort_values(by="count", ascending=True)

df_count.head()

In [None]:
# For every distinct per-head row count, how many head entities have that count.
fb_head_counts = df_count["count"]
fb_head_counts.groupby(fb_head_counts).count()

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.hist(df_count["count"], bins=1000)
plt.ylabel("Entity frequency")
plt.xlabel("Data")

In [None]:
# Friedman Diaconis Rule - https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
%matplotlib inline

import numpy as np

x = df_count["count"]

q25, q75 = np.percentile(x, [25, 75])
bin_width = 2 * (q75 - q25) * len(x) ** (-1 / 3)
bins = round((x.max() - x.min()) / bin_width)
print("Freedman–Diaconis number of bins:", bins)
plt.hist(x, bins=bins)
plt.show()

In [None]:
# Plot the FB15k-237 per-head counts with the project helper.
# The series is passed explicitly instead of the scratch variable `x`, which a
# later cell rebinds to the WN18RR counts; this cell therefore always shows
# FB15k-237 data regardless of execution order.
get_hist(df_count["count"])

### WN18RR Exploration

1 - Plot histograms of entities and relations to evaluate how balanced the data is.

In [None]:
# Load the WN18RR splits and stack them into one frame.
# NOTE: this rebinds `train`, `valid` and `test`, which held the FB15k-237
# splits until now.
train, valid, test = load_wn18rr(PATH_WN18RR)
wn_splits = [train, valid, test]
all_data_wn = pd.concat(wn_splits)

In [None]:
# Preview the first rows of the combined WN18RR splits.
all_data_wn.head()

In [None]:
# Size of the combined WN18RR data (train + valid + test).
all_data_wn.shape

In [None]:
# Number of rows in which each WN18RR entity appears as "head", rarest first.
wn_head_column = all_data_wn[["head"]]
wn_rows_per_head = wn_head_column.groupby(["head"], as_index=False).value_counts()
_df = wn_rows_per_head.sort_values(by="count", ascending=True)

In [None]:
# Freedman–Diaconis binning for the WN18RR per-head counts:
# bin width = 2 * IQR * n^(-1/3); number of bins = data range / bin width.
x = _df["count"]

q25, q75 = np.percentile(x, [25, 75])
bin_width = 2 * (q75 - q25) * len(x) ** (-1 / 3)
if bin_width > 0:
    rule = "Freedman–Diaconis"
    # Never fewer than one bin, even when the data range is below half a bin width.
    bins = max(1, round((x.max() - x.min()) / bin_width))
else:
    # A zero IQR makes the bin width zero, so the division below would yield
    # inf/nan and round() would raise. Fall back to Sturges' formula, as
    # numpy's "auto" estimator does in the same situation.
    rule = "Sturges (fallback)"
    bins = max(1, int(np.ceil(np.log2(max(len(x), 1)) + 1)))
print(f"{rule} number of bins:", bins)
# Plot with the bin count reported above. The previous hard-coded bins=500
# meant the figure did not correspond to the printed number.
plt.hist(x, bins=bins)
plt.show()

In [None]:
# Recompute the WN18RR per-head row counts (sorted ascending) and hand the
# "count" column to the project histogram helper.
wn_counts_sorted = (
    all_data_wn[["head"]]
    .groupby(["head"], as_index=False)
    .value_counts()
    .sort_values(by="count", ascending=True)
)
get_hist(wn_counts_sorted["count"])