In [None]:
import pandas as pd
import requests
import numpy as np
import torch
from tqdm import tqdm

from pykeen import predict
from pykeen.triples import TriplesFactory

In [None]:
def get_italian_labels_batch(qids, batch_size = 50, timeout = 30):
    """Fetch the Italian label of each Wikidata ID through the ``wbgetentities`` API.

    Parameters
    ----------
    qids : iterable of str
        Wikidata identifiers to look up. Any iterable is accepted
        (list, NumPy array, ...).
    batch_size : int, default 50
        Number of IDs sent in one request.
        NOTE(review): the public API is believed to cap this at 50 IDs per
        request for regular clients -- confirm before raising it.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    dict
        Maps every requested ID to its Italian label, or to the sentinel string
        ``"No Italian label found"`` when the response holds no Italian label
        for it.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so that arbitrary iterables can be sliced into batches.
    qids = list(qids)
    labels = {}
    if not qids:
        return labels

    url = "https://www.wikidata.org/w/api.php"

    # A single session reuses the underlying connection across batches.
    with requests.Session() as session:
        for i in tqdm(range(0, len(qids), batch_size), desc="Fetching labels"):
            batch_qids = qids[i:i + batch_size]
            params = {
                "action": "wbgetentities",
                "ids": "|".join(batch_qids),
                "languages": "it",
                "format": "json",
                "props": "labels",
            }
            response = session.get(url, params=params, timeout=timeout)
            # Fail loudly on HTTP errors instead of trying to parse an error page.
            response.raise_for_status()
            entities = response.json().get('entities', {})

            for qid in batch_qids:
                italian = entities.get(qid, {}).get('labels', {}).get('it')
                labels[qid] = italian['value'] if italian else "No Italian label found"

    return labels

In [None]:
# Load the trained knowledge-graph embedding model from disk.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
# NOTE(review): recent PyTorch releases may default to weights_only=True and
# reject a fully pickled model -- confirm against the installed version.
model_path = 'Models/RotatE.pkl'
pykeen_model = torch.load(model_path)
pykeen_model

In [None]:
# Read the triples table the quiz questions are built from.
csv_path = "data/df_most_clicked_reduced.csv"
dataframe = pd.read_csv(csv_path)
# For a quick dry run, a small de-duplicated sample can be taken instead:
#dataframe_test = dataframe.drop_duplicates().head(100)
dataframe

In [None]:
# Number of distinct heads, relations and tails in the dataset
tuple(len(np.unique(dataframe[column])) for column in ('head', 'rel', 'tail'))

In [None]:
# Labeled triples as a 2-D array with one (head, rel, tail) row per triple
triples = dataframe[['head', 'rel', 'tail']].to_numpy()
# The factory built from these triples is later passed to the prediction call
tf = TriplesFactory.from_labeled_triples(triples)

In [None]:
# Sanity check: no triple may have a number of components other than 3
assert not any(len(triple) != 3 for triple in triples)

In [None]:
# Build, for every (head, rel, tail) row, the top-scoring wrong tails ("distractors").
#
# Changes compared with the previous version of this cell:
#   * the progress-bar total no longer relies on `dataframe_test`, which is not
#     defined anywhere in a fresh kernel (NameError);
#   * the known-correct tails are looked up in a dictionary built once, instead
#     of merging against the whole dataframe for every row;
#   * predictions are computed once per distinct (head, rel) query and reused.
n_distractors = 3  # later cells expect exactly three distractors per question

triples_df = dataframe[['head', 'rel', 'tail']]

# (head, rel) -> set of every tail that is a correct answer for that query
true_tails_by_query = {}
for head, rel, tail in triples_df.itertuples(index=False, name=None):
    true_tails_by_query.setdefault((head, rel), set()).add(tail)

# (head, rel) -> DataFrame with the top distractors (columns: score, tail_label)
top_distractors_by_query = {}

# One small DataFrame per input row, concatenated at the end
results = []

for head, rel, tail_true in tqdm(triples_df.itertuples(index=False, name=None), total=len(triples_df)):
    query = (head, rel)

    if query not in top_distractors_by_query:
        scored = predict.predict_target(
            model=pykeen_model,
            head=head,
            relation=rel,
            triples_factory=tf,
        ).df

        # Remove every candidate that is itself a correct answer, so that a
        # question never ends up with two correct options.
        is_correct = scored['tail_label'].isin(true_tails_by_query[query])
        top_distractors_by_query[query] = (
            scored.loc[~is_correct, ['score', 'tail_label']]
            .sort_values("score", ascending=False)
            .head(n_distractors)
        )

    # Column order (score, tail_label, head, rel, tail_true) is relied upon by
    # the positional column rename performed in a later cell.
    distractors = top_distractors_by_query[query].assign(
        head=head,
        rel=rel,
        tail_true=tail_true,
    )
    results.append(distractors)

# Combine all the results into a single dataframe
distactors_dataframes = pd.concat(results, ignore_index=True)

In [None]:
# Display the combined distractor table
distactors_dataframes

In [None]:
# Preparation for adding the Italian names:
# collect every distinct identifier that will need a label.

# Entities can appear as head, predicted tail or true tail
unique_entities = set()
for column in ["head", "tail_label", "tail_true"]:
    unique_entities |= set(np.unique(distactors_dataframes[column]))
list_unique_object = list(unique_entities)

# Relations only appear in the "rel" column
list_unique_relation = np.unique(distactors_dataframes["rel"])

In [None]:
# How many distinct entities and relations need a label
tuple(map(len, (list_unique_object, list_unique_relation)))

In [None]:
# Fetch the Italian labels as {identifier: label} dictionaries,
# first for the entities and then for the relations.
desc_Q_list = get_italian_labels_batch(qids=list_unique_object)
desc_P_list = get_italian_labels_batch(qids=list_unique_relation)

In [None]:
# One-column frame ("name") of relation labels, indexed by relation identifier
rel_italian = pd.Series(desc_P_list).to_frame(name="name")
rel_italian

In [None]:
# Keep only the relations whose label is not the "missing" sentinel
rel_italian = rel_italian.loc[rel_italian["name"] != "No Italian label found"]

In [None]:
# One-column frame ("name") of entity labels, indexed by entity identifier
object_italian = pd.Series(desc_Q_list).to_frame(name="name")
object_italian

In [None]:
# Keep only the entities whose label is not the "missing" sentinel
object_italian = object_italian.loc[object_italian["name"] != "No Italian label found"]

In [None]:
# Replace identifiers by their Italian label. Identifiers that have no entry in
# the lookup frame become NaN, exactly as a left join on the lookup index would.
entity_label_by_id = object_italian["name"]
for column in ["head", "tail_label", "tail_true"]:
    distactors_dataframes[column] = distactors_dataframes[column].map(entity_label_by_id)

# Same replacement for the relation column
relation_label_by_id = rel_italian["name"]
distactors_dataframes["rel"] = distactors_dataframes["rel"].map(relation_label_by_id)

In [None]:
# Positional rename of the five columns.
# NOTE(review): assumes the current order is score, tail_label, head, rel,
# tail_true -- confirm if the upstream cell changes.
distactors_dataframes = distactors_dataframes.set_axis(
    ["Score", "Distractor", "Object", "Relationship", "Answer"],
    axis=1,
)

In [None]:
# Running counter (0, 1, 2, ...) of the rows inside each
# (Object, Relationship, Answer) group; used as the pivot column later on.
distactors_dataframes['count'] = (
    distactors_dataframes
    .groupby(['Object', 'Relationship', 'Answer'])
    .cumcount()
)

In [None]:
# Display the distractor table again to inspect its current columns
distactors_dataframes

In [None]:
# Reshape to one row per question, with the distractors spread across columns
df_pivoted_distractor = (
    distactors_dataframes
    .pivot_table(
        index=['Object', 'Relationship', 'Answer'],
        columns='count',
        values='Distractor',
        aggfunc='first',
    )
    .reset_index()
)

# Positional rename: three key columns followed by three distractor columns
df_pivoted_distractor.columns = ['Object', 'Relationship', 'Correct', 'Answer_1', 'Answer_2', 'Answer_3']

# The correct answer becomes the fourth option
df_pivoted_distractor["Answer_4"] = df_pivoted_distractor["Correct"]

In [None]:
# Same reshape as for the distractors, this time carrying their scores
df_pivoted_score = (
    distactors_dataframes
    .pivot_table(
        index=['Object', 'Relationship', 'Answer'],
        columns='count',
        values='Score',
        aggfunc='first',
    )
    .reset_index()
)

# Counter labels start at 0, score columns are numbered from 1
score_names = [f'Score_{int(counter + 1)}' for counter in df_pivoted_score.columns[3:]]
df_pivoted_score.columns = ['Object', 'Relationship', 'Correct'] + score_names

In [None]:
# Put answers and scores of each question side by side
df_pivoted = df_pivoted_distractor.merge(
    df_pivoted_score,
    on=['Object', 'Relationship', 'Correct'],
)

# Total score of the three distractors (NaN if any of them is missing)
df_pivoted["Score_sum"] = (
    df_pivoted["Score_1"]
    .add(df_pivoted["Score_2"])
    .add(df_pivoted["Score_3"])
)
df_pivoted

In [None]:
# Drop every question that contains a missing value, e.g. an answer whose page
# has no Italian name.
df_pivoted = df_pivoted.dropna()
# reset_index() without drop=True keeps the previous row labels as an "index" column
df_pivoted = df_pivoted.reset_index()

In [None]:
# Seed NumPy's global random generator so the shuffle below is reproducible.
# The outcome depends on the order in which permutations are drawn, so the
# statements in this cell should not be reordered.
np.random.seed(42)

# The four answer options of each question (three distractors + the correct one)
columns_to_shuffle = ['Answer_1', 'Answer_2', 'Answer_3', 'Answer_4']

# Shuffle the four options independently within each row, so the correct answer
# does not always sit in Answer_4; result_type='expand' turns each permuted
# array back into columns.
df_pivoted[columns_to_shuffle] = df_pivoted[columns_to_shuffle].apply(lambda x: np.random.permutation(x), axis=1, result_type='expand')

# Display only: the sorted frame is not assigned back to df_pivoted
df_pivoted.sort_values("Score_sum")

In [None]:
# Write the final quiz dataset to disk (to_csv also writes the row index by default)
output_path = "datasetItaliano.csv"
df_pivoted.to_csv(output_path)