In [7]:
import pandas as pd
import requests
import numpy as np
import torch

from pykeen import predict
from pykeen.triples import TriplesFactory

In [4]:
# Restore the trained model object from disk and show its architecture.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model_path = 'Models/TransE.pkl'
pykeen_model = torch.load(model_path)
pykeen_model

TransE(
  (loss): MarginRankingLoss(
    (margin_activation): ReLU()
  )
  (interaction): TransEInteraction()
  (entity_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(5902, 50)
    )
  )
  (relation_representations): ModuleList(
    (0): Embedding(
      (_embeddings): Embedding(335, 50)
    )
  )
  (weight_regularizers): ModuleList()
)

In [14]:
# Read the triple table and discard every row that has a missing value.
triples_csv = "data/df_most_clicked_reduced.csv"
dataframe = pd.read_csv(triples_csv)
dataframe = dataframe.dropna()
dataframe

Unnamed: 0,head,rel,tail
0,Q122921105,P31,Q11424
1,Q122921105,P495,Q668
2,Q122921105,P364,Q1568
3,Q122921105,P86,Q7489036
4,Q122921105,P7573,Q74434526
...,...,...,...
115736,Q55720,P793,Q23702848
115737,Q55720,P166,Q55697
115738,Q55720,P97,Q579431
115739,Q55720,P1884,Q152357


In [19]:
# Number of distinct values in the head, rel and tail columns, in that order.
tuple(len(np.unique(dataframe[name])) for name in ('head', 'rel', 'tail'))

(7164, 411, 4111)

In [15]:
# Build a PyKEEN triples factory from the (head, rel, tail) label columns.
triples = dataframe[['head', 'rel', 'tail']].to_numpy()
tf = TriplesFactory.from_labeled_triples(triples)

# `from_labeled_triples` derives a brand-new label-to-id vocabulary from these
# triples alone. Prediction helpers pair the model's scores (one per entity the
# model was trained on) with the factory's labels, so a vocabulary of a
# different size only surfaces later as the opaque
# "All arrays must be of the same length" error. Fail here with the real cause.
# Equal sizes are necessary, not sufficient: the mapping itself must be the
# one used for training.
if (
    tf.num_entities != pykeen_model.num_entities
    or tf.num_relations != pykeen_model.num_real_relations
):
    raise ValueError(
        f"Triples factory vocabulary ({tf.num_entities} entities, "
        f"{tf.num_relations} relations) does not match the loaded model "
        f"({pykeen_model.num_entities} entities, "
        f"{pykeen_model.num_real_relations} relations). Reuse the training "
        "TriplesFactory, or pass its entity_to_id / relation_to_id mappings "
        "to TriplesFactory.from_labeled_triples."
    )

In [17]:
# Sanity check: no row may have anything other than three entries
assert not any(len(triple) != 3 for triple in triples)

In [18]:
# Ask the model to complete the triple (Q122921105, P31, ?) and show the
# resulting prediction table.
tail_predictions = predict.predict_target(
    model=pykeen_model,
    head="Q122921105",
    relation="P31",
    triples_factory=tf,
)
tail_predictions.df

ValueError: All arrays must be of the same length

In [5]:
# Attach the model's predictions to every question. The left join keeps
# questions for which no prediction row matches.
merged = question.merge(
    finalPrediction,
    left_on=["head", "rel"],
    right_on=["head_label", "relation_label"],
    how="left",
)

# Strip surrounding whitespace from the key columns. Columns are assigned with
# [] on purpose: `merged.head = ...` does not touch the "head" column, it only
# shadows the DataFrame.head method on this instance (leaving the column
# unstripped and breaking later `merged.head()` calls).
merged["head"] = merged["head"].str.strip()
merged["rel"] = merged["rel"].str.strip()

In [6]:
# A predicted tail that equals the question's known correct tail cannot be a
# distractor: it would put the right answer among the wrong ones and duplicate
# it in the final option list. Remove such candidates before ranking so the
# next-best prediction takes its place.
candidates = merged[merged["tail_label"] != merged["tail"]]

# Keep the three highest-scoring remaining candidates of every question
# (head, rel, tail) and renumber the rows from 0.
dataframe = (
    candidates.groupby(['head', 'rel', 'tail'])
    .apply(lambda group: group.nlargest(3, 'score'))
    .reset_index(drop=True)
)
dataframe

Unnamed: 0,head,rel,tail,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,Q1016,P530,Q159,3,Q1016,28,P530,477,Q96,1.348786,False
1,Q1016,P530,Q159,3,Q1016,28,P530,320,Q408,1.334645,False
2,Q1016,P530,Q159,3,Q1016,28,P530,124,Q16,1.324010,False
3,Q1045,P530,Q148,11,Q1045,28,P530,120,Q159,1.214741,False
4,Q1045,P530,Q148,11,Q1045,28,P530,124,Q16,1.054364,False
...,...,...,...,...,...,...,...,...,...,...,...
295,Q987,P190,Q1297,486,Q987,15,P190,475,Q956,0.897657,False
296,Q987,P190,Q1297,486,Q987,15,P190,375,Q656,0.893429,False
297,Q99,P150,Q62,487,Q99,8,P150,120,Q159,0.519113,False
298,Q99,P150,Q62,487,Q99,8,P150,95,Q148,0.497206,False


In [7]:
# Collect every ID that occurs as a subject, a correct answer or a distractor.
unique_objects = set()
for column in ["head", "tail", "tail_label"]:
    # Missing values are dropped first: a column that mixes strings with NaN
    # (tail_label can be NaN for left-join rows without a prediction) cannot
    # be sorted by np.unique and is not an ID to look up anyway.
    unique_objects.update(dataframe[column].dropna().unique())

# Sorted so the order is the same on every run; plain set iteration order
# changes between interpreter sessions because string hashing is randomized.
list_unique_object = sorted(unique_objects)

# Distinct relation IDs as a sorted NumPy array.
list_unique_relation = np.unique(dataframe["rel"])

In [8]:
# Sizes of the two ID collections: (objects, relations).
tuple(map(len, (list_unique_object, list_unique_relation)))

(175, 12)

In [9]:
# Resolve a label for every object ID and every relation ID, in the same order
# as the ID collections, using get_italian_label / get_italian_label_rel
# (both defined elsewhere in the notebook).
desc_Q_list = [get_italian_label(Q) for Q in list_unique_object]
desc_P_list = [get_italian_label_rel(P) for P in list_unique_relation]

In [10]:
# Lookup table pairing each relation ID with the label fetched for it.
rel_italiano = pd.DataFrame(dict(ID=list_unique_relation, Nome=desc_P_list))

In [11]:
# Lookup table pairing each object ID with the label fetched for it.
object_italiano = pd.DataFrame(dict(ID=list_unique_object, Nome=desc_Q_list))

In [12]:
def _replace_ids_with_names(frame, column, lookup):
    """Return ``frame`` with the IDs in ``column`` swapped for ``lookup``'s ``Nome``.

    ``lookup`` must provide the columns ``ID`` and ``Nome``. Rows are matched
    with a left join, so IDs missing from ``lookup`` end up as NaN.
    """
    joined = frame.merge(lookup, left_on=column, right_on='ID', how='left')
    joined[column] = joined["Nome"]
    return joined.drop(columns=["ID", "Nome"])


# Object-like columns share one lookup table ...
for column in ["head", "tail", "tail_label"]:
    dataframe = _replace_ids_with_names(dataframe, column, object_italiano)

# ... and the relation column uses its own.
dataframe = _replace_ids_with_names(dataframe, "rel", rel_italiano)

In [14]:
# Keep only the four columns the quiz needs and rename them by name. The
# explicit old-name -> new-name mapping cannot mislabel a column if the column
# order ever changes, unlike assigning a positional list to `.columns`.
dataframe = dataframe[["head", "rel", "tail", "tail_label"]].rename(
    columns={
        "head": "Object",
        "rel": "Relationship",
        "tail": "Answer",
        "tail_label": "Distractor",
    }
)

In [15]:
# Display the table after its columns were renamed to
# Object / Relationship / Answer / Distractor.
dataframe

Unnamed: 0,Object,Relationship,Answer,Distractor
0,Libia,relazione diplomatica,Russia,Messico
1,Libia,relazione diplomatica,Russia,Australia
2,Libia,relazione diplomatica,Russia,Canada
3,Somalia,relazione diplomatica,Cina,Russia
4,Somalia,relazione diplomatica,Cina,Canada
...,...,...,...,...
295,Nuova Delhi,città gemellata,Chicago,Pechino
296,Nuova Delhi,città gemellata,Chicago,San Pietroburgo
297,California,sottodivisioni amministrative,San Francisco,Russia
298,California,sottodivisioni amministrative,San Francisco,Cina


In [16]:
# Number the rows inside every (Object, Relationship, Answer) group 0, 1, 2, ...
# in their current order; the numbers become the pivot columns later on.
question_key = ['Object', 'Relationship', 'Answer']
dataframe['count'] = dataframe.groupby(by=question_key).cumcount()

In [17]:
# Go from one row per distractor to one row per question: the running number
# in `count` selects the column each distractor lands in.
distractors_wide = dataframe.pivot_table(
    index=['Object', 'Relationship', 'Answer'],
    columns='count',
    values='Distractor',
    aggfunc='first',
)
df_pivoted = distractors_wide.reset_index()

# Positional relabeling: three key columns followed by three distractor columns
df_pivoted.columns = ['Object', 'Relationship', 'Correct', 'Answer_1', 'Answer_2', 'Answer_3']

# The correct answer becomes the fourth option
df_pivoted["Answer_4"] = df_pivoted["Correct"]

In [18]:
# Display the pivoted table: one row per question, with the correct answer
# copied into Answer_4.
df_pivoted

Unnamed: 0,Object,Relationship,Correct,Answer_1,Answer_2,Answer_3,Answer_4
0,Adelaide,unità amministrativa in cui è situato,Australia Meridionale,Nebraska,Illinois,Colorado,Australia Meridionale
1,Albania,relazione diplomatica,Francia,Canada,Messico,India,Francia
2,Angola,relazione diplomatica,Australia,Bangladesh,Canada,Danimarca,Australia
3,Australia,relazione diplomatica,Birmania,Kazakistan,Ungheria,Nuova Zelanda,Birmania
4,Australia Meridionale,Paese,Australia,Stati Uniti d'America,Francia,Taiwan,Australia
...,...,...,...,...,...,...,...
95,United States Marine Corps,luogo di fondazione,Filadelfia,San Pietroburgo,Vilnius,New York,Filadelfia
96,Vietnam,relazione diplomatica,Ucraina,Cina,Canada,Israele,Ucraina
97,Virginia,Paese,Stati Uniti d'America,Canada,Virginia,Francia,Stati Uniti d'America
98,Virginia,confina con,Carolina del Nord,Missouri,Virginia,Illinois,Carolina del Nord


In [19]:
# Seed NumPy's global RNG so the shuffled option order is repeatable
np.random.seed(42)

# The four option slots whose contents get shuffled within each row
columns_to_shuffle = ['Answer_1', 'Answer_2', 'Answer_3', 'Answer_4']


def _permute_row(row):
    """Return the values of one row in a random order."""
    return np.random.permutation(row)


# Permute row by row, then write the result back into the same four columns
shuffled_answers = df_pivoted[columns_to_shuffle].apply(_permute_row, axis=1, result_type='expand')
df_pivoted[columns_to_shuffle] = shuffled_answers

df_pivoted

Unnamed: 0,Object,Relationship,Correct,Answer_1,Answer_2,Answer_3,Answer_4
0,Adelaide,unità amministrativa in cui è situato,Australia Meridionale,Illinois,Australia Meridionale,Nebraska,Colorado
1,Albania,relazione diplomatica,Francia,Messico,Francia,Canada,India
2,Angola,relazione diplomatica,Australia,Australia,Bangladesh,Canada,Danimarca
3,Australia,relazione diplomatica,Birmania,Ungheria,Kazakistan,Birmania,Nuova Zelanda
4,Australia Meridionale,Paese,Australia,Taiwan,Francia,Stati Uniti d'America,Australia
...,...,...,...,...,...,...,...
95,United States Marine Corps,luogo di fondazione,Filadelfia,San Pietroburgo,New York,Filadelfia,Vilnius
96,Vietnam,relazione diplomatica,Ucraina,Cina,Canada,Ucraina,Israele
97,Virginia,Paese,Stati Uniti d'America,Virginia,Francia,Canada,Stati Uniti d'America
98,Virginia,confina con,Carolina del Nord,Missouri,Virginia,Illinois,Carolina del Nord


In [21]:
# Write the finished table to disk. With the default settings the row index is
# written as an unnamed first column.
output_csv = "datasetItaliano.csv"
df_pivoted.to_csv(output_csv)