In [1]:
import pandas as pd
import requests
import numpy as np

In [2]:
def get_italian_label(qid):
    url = f"https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "languages": "it",
        "format": "json",
        "props": "labels"
    }
    response = requests.get(url, params=params)
    data = response.json()
    italian_name = data['entities'][qid]['labels']['it']['value']
    return italian_name

def get_italian_label_rel(qid):
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "languages": "it",
        "format": "json",
        "props": "labels"
    }
    response = requests.get(url, params=params)
    data = response.json()
    
    # Check if the property and the Italian label exist in the response
    if 'entities' in data and qid in data['entities'] and 'labels' in data['entities'][qid] and 'it' in data['entities'][qid]['labels']:
        italian_name = data['entities'][qid]['labels']['it']['value']
    else:
        italian_name = "No Italian label found"
    
    return italian_name

In [3]:
finalPrediction = pd.read_csv("data/final_pred.csv")
finalPrediction

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,261,Q30,25,P47,261,Q30,2.172914,False
1,120,Q159,25,P47,120,Q159,1.884166,False
2,171,Q183,25,P47,171,Q183,1.816020,False
3,445,Q865,25,P47,120,Q159,1.809572,False
4,124,Q16,25,P47,124,Q16,1.806055,False
...,...,...,...,...,...,...,...,...
8430282,261,Q30,8,P150,322,Q41,-1.607602,False
8430283,124,Q16,8,P150,120,Q159,-1.609319,False
8430284,261,Q30,8,P150,419,Q801,-1.615608,False
8430285,261,Q30,8,P150,124,Q16,-1.638939,False


In [4]:
wikidata_train = pd.read_csv("data/wikidata5m_transductive_train_red.tsv", delimiter="\t")
question = wikidata_train.sample(100, random_state=42)
question

Unnamed: 0,head,rel,tail
3612,Q423,P530,Q881
393,Q1370,P17,Q30
6998,Q865,P530,Q28
5216,Q800,P530,Q77
23,Q184,P530,Q55
...,...,...,...
472,Q801,P530,Q148
5380,Q217,P530,Q183
6752,Q224,P530,Q865
1084,Q183,P530,Q945


In [5]:
merged = question.merge(finalPrediction, left_on=["head", "rel"], right_on=["head_label","relation_label"], how = "left")
merged.head = merged["head"].str.strip()
merged.rel = merged.rel.str.strip()

In [6]:
dataframe = merged.groupby(['head', 'rel', 'tail']).apply(lambda x: x.nlargest(3, 'score')).reset_index(drop=True)
dataframe

Unnamed: 0,head,rel,tail,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,Q1016,P530,Q159,3,Q1016,28,P530,477,Q96,1.348786,False
1,Q1016,P530,Q159,3,Q1016,28,P530,320,Q408,1.334645,False
2,Q1016,P530,Q159,3,Q1016,28,P530,124,Q16,1.324010,False
3,Q1045,P530,Q148,11,Q1045,28,P530,120,Q159,1.214741,False
4,Q1045,P530,Q148,11,Q1045,28,P530,124,Q16,1.054364,False
...,...,...,...,...,...,...,...,...,...,...,...
295,Q987,P190,Q1297,486,Q987,15,P190,475,Q956,0.897657,False
296,Q987,P190,Q1297,486,Q987,15,P190,375,Q656,0.893429,False
297,Q99,P150,Q62,487,Q99,8,P150,120,Q159,0.519113,False
298,Q99,P150,Q62,487,Q99,8,P150,95,Q148,0.497206,False


In [7]:
list_unique_object = []

for column in ["head", "tail", "tail_label"]:
    current_list = list(np.unique(dataframe[column]))
    list_unique_object = list(set(list_unique_object) | set(current_list))

list_unique_relation = np.unique(dataframe.rel)

In [8]:
len(list_unique_object), len(list_unique_relation)

(175, 12)

In [9]:
desc_Q_list=[]
desc_P_list=[]

for Q in list_unique_object:
    desc_Q_list.append(get_italian_label(Q))

for P in list_unique_relation:
    desc_P_list.append(get_italian_label_rel(P))

In [10]:
rel_italiano = pd.DataFrame({
    "ID" : list_unique_relation,
    "Nome" : desc_P_list,
})

In [11]:
object_italiano = pd.DataFrame({
    "ID" : list_unique_object,
    "Nome" : desc_Q_list,
})

In [12]:
for column in ["head", "tail", "tail_label"]:
    dataframe = dataframe.merge(object_italiano, left_on=column, right_on='ID', how='left')
    dataframe[column] = dataframe["Nome"]
    dataframe = dataframe.drop(columns=["ID", "Nome"])

dataframe = dataframe.merge(rel_italiano, left_on="rel", right_on='ID', how='left')
dataframe["rel"] = dataframe["Nome"]
dataframe = dataframe.drop(columns=["ID", "Nome"])

In [14]:
dataframe = dataframe.drop(columns=["head_id", "head_label", "relation_id", "relation_label", "tail_id","score", "in_training"])
dataframe.columns = ["Object", "Relationship", "Answer", "Distractor"]

In [15]:
dataframe

Unnamed: 0,Object,Relationship,Answer,Distractor
0,Libia,relazione diplomatica,Russia,Messico
1,Libia,relazione diplomatica,Russia,Australia
2,Libia,relazione diplomatica,Russia,Canada
3,Somalia,relazione diplomatica,Cina,Russia
4,Somalia,relazione diplomatica,Cina,Canada
...,...,...,...,...
295,Nuova Delhi,città gemellata,Chicago,Pechino
296,Nuova Delhi,città gemellata,Chicago,San Pietroburgo
297,California,sottodivisioni amministrative,San Francisco,Russia
298,California,sottodivisioni amministrative,San Francisco,Cina


In [16]:
dataframe['count'] = dataframe.groupby(['Object', 'Relationship', 'Answer']).cumcount()

In [17]:
# Pivoting the DataFrame
df_pivoted = dataframe.pivot_table(index=['Object', 'Relationship', 'Answer'], columns='count', values='Distractor', aggfunc='first').reset_index()
df_pivoted.columns = ['Object', 'Relationship', 'Correct', 'Answer_1', 'Answer_2', 'Answer_3']  # Renaming the columns
df_pivoted["Answer_4"] = df_pivoted.Correct

In [18]:
df_pivoted

Unnamed: 0,Object,Relationship,Correct,Answer_1,Answer_2,Answer_3,Answer_4
0,Adelaide,unità amministrativa in cui è situato,Australia Meridionale,Nebraska,Illinois,Colorado,Australia Meridionale
1,Albania,relazione diplomatica,Francia,Canada,Messico,India,Francia
2,Angola,relazione diplomatica,Australia,Bangladesh,Canada,Danimarca,Australia
3,Australia,relazione diplomatica,Birmania,Kazakistan,Ungheria,Nuova Zelanda,Birmania
4,Australia Meridionale,Paese,Australia,Stati Uniti d'America,Francia,Taiwan,Australia
...,...,...,...,...,...,...,...
95,United States Marine Corps,luogo di fondazione,Filadelfia,San Pietroburgo,Vilnius,New York,Filadelfia
96,Vietnam,relazione diplomatica,Ucraina,Cina,Canada,Israele,Ucraina
97,Virginia,Paese,Stati Uniti d'America,Canada,Virginia,Francia,Stati Uniti d'America
98,Virginia,confina con,Carolina del Nord,Missouri,Virginia,Illinois,Carolina del Nord


In [19]:
# Set a random seed for reproducibility
np.random.seed(42)

# Define the columns to shuffle
columns_to_shuffle = ['Answer_1', 'Answer_2', 'Answer_3', 'Answer_4']

# Shuffle each row
df_pivoted[columns_to_shuffle] = df_pivoted[columns_to_shuffle].apply(lambda x: np.random.permutation(x), axis=1, result_type='expand')

df_pivoted

Unnamed: 0,Object,Relationship,Correct,Answer_1,Answer_2,Answer_3,Answer_4
0,Adelaide,unità amministrativa in cui è situato,Australia Meridionale,Illinois,Australia Meridionale,Nebraska,Colorado
1,Albania,relazione diplomatica,Francia,Messico,Francia,Canada,India
2,Angola,relazione diplomatica,Australia,Australia,Bangladesh,Canada,Danimarca
3,Australia,relazione diplomatica,Birmania,Ungheria,Kazakistan,Birmania,Nuova Zelanda
4,Australia Meridionale,Paese,Australia,Taiwan,Francia,Stati Uniti d'America,Australia
...,...,...,...,...,...,...,...
95,United States Marine Corps,luogo di fondazione,Filadelfia,San Pietroburgo,New York,Filadelfia,Vilnius
96,Vietnam,relazione diplomatica,Ucraina,Cina,Canada,Ucraina,Israele
97,Virginia,Paese,Stati Uniti d'America,Virginia,Francia,Canada,Stati Uniti d'America
98,Virginia,confina con,Carolina del Nord,Missouri,Virginia,Illinois,Carolina del Nord


In [21]:
df_pivoted.to_csv("datasetItaliano.csv")