In [52]:
import pandas as pd
import requests
import numpy as np

In [65]:
def get_italian_label(qid, timeout=10):
    """Return the Italian label of a Wikidata entity.

    Calls the Wikidata ``wbgetentities`` API for ``qid`` and asks only for
    its Italian (``it``) label.

    Args:
        qid: Wikidata identifier to look up (e.g. ``"Q183"``).
        timeout: Seconds to wait for the HTTP request before giving up;
            forwarded to ``requests.get``.

    Returns:
        The Italian label, or the string ``"No Italian label found"`` when
        the response carries no Italian label for ``qid`` (unknown id, API
        error payload, or entity without an ``it`` label). This mirrors
        ``get_italian_label_rel`` so one bad id does not abort a whole
        batch of lookups.

    Raises:
        Whatever ``requests.get`` / ``response.json()`` raise on transport
        failures, timeouts or non-JSON bodies.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "languages": "it",
        "format": "json",
        "props": "labels"
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)
    data = response.json()

    try:
        return data['entities'][qid]['labels']['it']['value']
    except KeyError:
        # Any missing level (entities / qid / labels / it / value) means there
        # is no usable Italian label in this response.
        return "No Italian label found"

def get_italian_label_rel(qid, timeout=10):
    """Return the Italian label of a Wikidata property (relation).

    Calls the Wikidata ``wbgetentities`` API for ``qid`` and asks only for
    its Italian (``it``) label.

    Args:
        qid: Wikidata identifier to look up (e.g. ``"P530"``).
        timeout: Seconds to wait for the HTTP request before giving up;
            forwarded to ``requests.get``.

    Returns:
        The Italian label, or the string ``"No Italian label found"`` when
        the response has no ``entities -> qid -> labels -> it`` entry.

    Raises:
        KeyError: if the ``it`` entry exists but has no ``value`` field.
        Whatever ``requests.get`` / ``response.json()`` raise on transport
        failures, timeouts or non-JSON bodies.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "languages": "it",
        "format": "json",
        "props": "labels"
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, params=params, timeout=timeout)
    node = response.json()

    # Walk down the response one level at a time; a missing level means the
    # property or its Italian label is absent from the response.
    for key in ('entities', qid, 'labels', 'it'):
        if key not in node:
            return "No Italian label found"
        node = node[key]

    return node['value']

In [4]:
# Load the scored candidate triples (head / relation / tail labels plus a
# model score and an in_training flag) and preview them.
predictions_path = "final_pred.csv"
finalPrediction = pd.read_csv(predictions_path)
finalPrediction

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,171,Q183,34,rel,171,Q183,-0.392535,False
1,171,Q183,19,P355,171,Q183,-0.392904,False
2,171,Q183,34,rel,445,Q865,-0.395508,False
3,171,Q183,15,P190,171,Q183,-0.396271,False
4,171,Q183,16,P276,171,Q183,-0.397741,False
...,...,...,...,...,...,...,...,...
8430282,263,Q309331,20,P36,276,Q3357,-3.732768,False
8430283,490,tail,5,P1376,489,head,-3.768142,False
8430284,248,Q2793400,10,P155,490,tail,-3.791963,False
8430285,344,Q49117,12,P159,330,Q432637,-3.793376,False


In [26]:
# Read the tab-separated training triples and draw a fixed, reproducible
# sample of 100 of them to turn into questions.
wikidata_train = pd.read_csv("wikidata5m_transductive_train_red.tsv", sep="\t")
question = wikidata_train.sample(n=100, random_state=42)
question

Unnamed: 0,head,rel,tail
3612,Q423,P530,Q881
393,Q1370,P17,Q30
6998,Q865,P530,Q28
5216,Q800,P530,Q77
23,Q184,P530,Q55
...,...,...,...
472,Q801,P530,Q148
5380,Q217,P530,Q183
6752,Q224,P530,Q865
1084,Q183,P530,Q945


In [42]:
# Attach every scored prediction that shares the question's (head, relation)
# pair; questions without any prediction are kept with NaN prediction columns.
merged = question.merge(
    finalPrediction,
    left_on=["head", "rel"],
    right_on=["head_label", "relation_label"],
    how="left",
)

# Trim stray whitespace from the id columns. Bracket assignment is required:
# `merged.head = ...` would only shadow the DataFrame.head() method on this
# object and leave the "head" column untouched.
merged["head"] = merged["head"].str.strip()
merged["rel"] = merged["rel"].str.strip()

In [91]:
# Pick, for every question triple, the three highest-scoring predicted tails
# to serve as distractors.
triple_keys = ["head", "rel", "tail"]

# A usable distractor needs an actual prediction (left-join misses carry a NaN
# score) and must differ from the true tail, otherwise the correct answer
# could be listed twice among the options.
is_candidate = merged["score"].notna() & (merged["tail_label"] != merged["tail"])

# Sorting followed by GroupBy.head(3) selects the top three rows per group in
# the same order as groupby(...).apply(nlargest) did (keys ascending, score
# descending), without depending on apply() receiving the grouping columns,
# which pandas has deprecated since 2.2.
dataframe = (
    merged[is_candidate]
    .sort_values(triple_keys + ["score"], ascending=[True, True, True, False])
    .groupby(triple_keys, sort=False)
    .head(3)
    .reset_index(drop=True)
)
dataframe

Unnamed: 0,head,rel,tail,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,Q1016,P530,Q159,3,Q1016,28,P530,124,Q16,-0.984883,False
1,Q1016,P530,Q159,3,Q1016,28,P530,320,Q408,-0.998975,False
2,Q1016,P530,Q159,3,Q1016,28,P530,242,Q252,-1.004103,False
3,Q1045,P530,Q148,11,Q1045,28,P530,432,Q833,-1.150194,False
4,Q1045,P530,Q148,11,Q1045,28,P530,320,Q408,-1.179237,False
...,...,...,...,...,...,...,...,...,...,...,...
295,Q987,P190,Q1297,486,Q987,15,P190,255,Q29,-1.597987,False
296,Q987,P190,Q1297,486,Q987,15,P190,477,Q96,-1.612138,False
297,Q99,P150,Q62,487,Q99,8,P150,171,Q183,-1.925779,False
298,Q99,P150,Q62,487,Q99,8,P150,82,Q142,-1.927541,False


In [92]:
# Every distinct entity id appearing as question head, true tail or predicted
# tail. Sorting makes the order (and so the order of the label lookups and of
# the lookup table built from it) reproducible across kernel restarts; a bare
# list(set(...)) of strings is ordered by randomized hashes.
object_columns = ["head", "tail", "tail_label"]
list_unique_object = sorted(set().union(*(dataframe[column] for column in object_columns)))

# Distinct relation ids used by the questions.
list_unique_relation = np.unique(dataframe.rel)

In [93]:
# Sanity check: how many distinct entity ids and relation ids were collected.
len(list_unique_object), len(list_unique_relation)

(137, 12)

In [66]:
# Fetch the Italian label of every entity and every relation, one API call per
# id, keeping each result list aligned with its id list.
desc_Q_list = [get_italian_label(Q) for Q in list_unique_object]
desc_P_list = [get_italian_label_rel(P) for P in list_unique_relation]

In [81]:
# Lookup table: relation id -> Italian label.
rel_italiano = pd.DataFrame({"ID": list_unique_relation, "Nome": desc_P_list})

In [82]:
# Lookup table: entity id -> Italian label.
object_italiano = pd.DataFrame({"ID": list_unique_object, "Nome": desc_Q_list})

In [94]:
# Replace Wikidata ids by their Italian labels. Mapping through plain dicts
# gives the same result as left-merging each lookup table and copying "Nome"
# over the id column (ids without a label become NaN), but needs no temporary
# "ID"/"Nome" columns and cannot change the number of rows.
object_names = dict(zip(object_italiano["ID"], object_italiano["Nome"]))
relation_names = dict(zip(rel_italiano["ID"], rel_italiano["Nome"]))

dataframe = dataframe.assign(
    head=dataframe["head"].map(object_names),
    tail=dataframe["tail"].map(object_names),
    tail_label=dataframe["tail_label"].map(object_names),
    rel=dataframe["rel"].map(relation_names),
)

In [101]:
# Keep only the four columns the quiz needs and give them their final names.
# Selecting by name (instead of dropping everything else) does not depend on
# a "count" column that is only created in a later cell, so the notebook
# survives Restart & Run All; renaming by name does not depend on column order.
dataframe = dataframe[["head", "rel", "tail", "tail_label"]].rename(
    columns={
        "head": "Object",
        "rel": "Relationship",
        "tail": "Answer",
        "tail_label": "Distractor",
    }
)

In [102]:
# Inspect the long-format quiz table (one row per question/distractor pair).
dataframe

Unnamed: 0,Object,Relationship,Answer,Distractor
0,Libia,relazione diplomatica,Russia,Canada
1,Libia,relazione diplomatica,Russia,Australia
2,Libia,relazione diplomatica,Russia,Indonesia
3,Somalia,relazione diplomatica,Cina,Malaysia
4,Somalia,relazione diplomatica,Cina,Australia
...,...,...,...,...
295,Nuova Delhi,città gemellata,Chicago,Spagna
296,Nuova Delhi,città gemellata,Chicago,Messico
297,California,sottodivisioni amministrative,San Francisco,Germania
298,California,sottodivisioni amministrative,San Francisco,Francia


In [103]:
# Number the distractors of each question 0, 1, 2, ... in row order; the
# pivot below uses this running index as its column key.
quiz_keys = ['Object', 'Relationship', 'Answer']
dataframe['count'] = dataframe.groupby(quiz_keys).cumcount()

In [109]:
# Go from long to wide: one row per question, one column per distractor slot.
pivot_index = ['Object', 'Relationship', 'Answer']
df_pivoted = dataframe.pivot_table(
    index=pivot_index,
    columns='count',
    values='Distractor',
    aggfunc='first',
)
df_pivoted = df_pivoted.reset_index()

# Positional rename: the three index columns followed by the three distractor
# slots. The true answer is kept in "Correct" and also offered as "Answer_4".
df_pivoted.columns = ['Object', 'Relationship', 'Correct', 'Answer_1', 'Answer_2', 'Answer_3']
df_pivoted["Answer_4"] = df_pivoted["Correct"]

In [110]:
# Inspect the wide-format quiz table (one row per question).
df_pivoted

Unnamed: 0,Object,Relationship,Correct,Answer_1,Answer_2,Answer_3,Answer_4
0,Adelaide,unità amministrativa in cui è situato,Australia Meridionale,Mosca,Taiwan,Messico,Australia Meridionale
1,Albania,relazione diplomatica,Francia,Canada,Grecia,Brasile,Francia
2,Angola,relazione diplomatica,Australia,Canada,Italia,Regno Unito,Australia
3,Australia,relazione diplomatica,Birmania,Australia,Indonesia,Georgia,Birmania
4,Australia Meridionale,Paese,Australia,Austria,Danimarca,Grecia,Australia
...,...,...,...,...,...,...,...
95,United States Marine Corps,luogo di fondazione,Filadelfia,Indonesia,Taiwan,Russia,Filadelfia
96,Vietnam,relazione diplomatica,Ucraina,Turchia,Cina,Canada,Ucraina
97,Virginia,Paese,Stati Uniti d'America,Indonesia,Iran,Danimarca,Stati Uniti d'America
98,Virginia,confina con,Carolina del Nord,Turchia,Germania,Taiwan,Carolina del Nord


In [None]:
# Fix the global NumPy seed so the shuffle below is repeatable.
np.random.seed(42)

# The four answer options whose positions are mixed within each question.
columns_to_shuffle = ['Answer_1', 'Answer_2', 'Answer_3', 'Answer_4']


def _shuffle_row(row):
    """Return the values of one row in random order."""
    return np.random.permutation(row)


# Permute the options row by row and write them back into the same columns,
# so the correct answer no longer always sits in Answer_4.
shuffled_options = df_pivoted[columns_to_shuffle].apply(_shuffle_row, axis=1, result_type='expand')
df_pivoted[columns_to_shuffle] = shuffled_options

df_pivoted

In [96]:
# Write `dataframe` to CSV (with pandas' defaults the index is written as the
# first, unnamed column).
# NOTE(review): this exports `dataframe`, not the pivoted and shuffled
# `df_pivoted` built in the preceding cells — confirm this is the table that
# is meant to be saved.
dataframe.to_csv("datasetItaliano.csv")