## Install

In [2]:
!pip install pykeen[tensorboard]



## Librerie

In [3]:
import json
import requests
import os
import pandas as pd
import tarfile
import tqdm
from pykeen.pipeline import pipeline
from pykeen.predict import predict_all
import torch
from pykeen.evaluation import RankBasedEvaluator
from pykeen.triples import TriplesFactory

IQS = False

INFO:pykeen.utils:Using opt_einsum


## Moduli

In [3]:
def download_file(url, destination):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(destination, 'wb') as file:
                file.write(response.content)
            print("Download completato con successo.")
        else:
            print(f"Errore durante il download del file. Codice di stato: {response.status_code}")
    except Exception as e:
        print(f"Si è verificato un errore: {e}")


def decompress_tar_gz(file_path, destination_folder):
    try:

        with tarfile.open(file_path, 'r:gz') as tar:
            if not os.path.exists(destination_folder):
              os.mkdir(destination_folder)

            tar.extractall(destination_folder)
        print("Decompressione completata con successo.")
        return True
    except Exception as e:
        print(f"Si è verificato un errore durante la decompressione: {e}")
        return False


def read_firstlines(file_path, nlines):
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()[:nlines]
            for line in lines:
                print(line.rstrip())
    except FileNotFoundError:
        print(f"Il file '{file_path}' non esiste.")
    except Exception as e:
        print(f"Si è verificato un errore durante la lettura del file: {e}")


def itemQualityScore(item2search):
    inference_url = 'https://api.wikimedia.org/service/lw/inference/v1/models/wikidatawiki-itemquality:predict'

    try:
        req = requests.post(f"https://www.wikidata.org/w/api.php?action=query&format=json&formatversion=2&prop=revisions|entityterms&titles={item2search}&origin=*")
        jsonReq = req.json()

        data = {"rev_id": jsonReq['query']['pages'][0]['revisions'][0]['revid'] }
        response = requests.post(inference_url, headers=headers, data=json.dumps(data))
        prob = response.json()
        prediction = prob['wikidatawiki']['scores'][str(data['rev_id'])]['itemquality']['score']['prediction']
        probabilityFromModel = prob['wikidatawiki']['scores'][str(data['rev_id'])]['itemquality']['score']['probability'][prediction]
    except Exception as e:
        prediction = "NotFound"
        probabilityFromModel = 0


    return prediction, probabilityFromModel


## Dowload file wikidata

### Triple codificate

In [5]:
url = "https://www.dropbox.com/s/6sbhm0rwo4l73jq/wikidata5m_transductive.tar.gz?dl=1"
destination = "wiki_tran.tar.gz"
download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./wiki_tran.tar.gz")

Download completato con successo.
Decompressione completata con successo.


### Triple con sinonimi

In [6]:
url = "https://www.dropbox.com/s/7jp4ib8zo3i6m10/wikidata5m_text.txt.gz?dl=1"
# Sempre triple ma con i nomi
destination = "corpus.tar.gz"

download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./corpus.tar.gz")

Download completato con successo.
Si è verificato un errore durante la decompressione: invalid header


### Alias delle entità

In [7]:
url = "https://www.dropbox.com/s/lnbhc8yuhit4wm5/wikidata5m_alias.tar.gz?dl=1"
destination = "alias.tar.gz"
download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./alias.tar.gz")

Download completato con successo.
Decompressione completata con successo.


### Stampa delle prime righe dei file

In [8]:
for file in os.listdir("./data"):
    print(f"Prime righe di {file}")
    read_firstlines(f"./data/{file}", 3)
    print("\n")

Prime righe di wikidata5m_relation.txt
P489	currency symbol description
P834	train depot	railway depot	depot	rail yard
P2629	BBFC rating	BBFC certificate


Prime righe di wikidata5m_transductive_valid_red.tsv
head	rel	tail
Q3576734	P495	Q30
Q641724	P1412	Q1860


Prime righe di wikidata5m_transductive_train.txt
Q29387131	P31	Q5
Q326660	P1412	Q652
Q7339549	P57	Q1365729


Prime righe di wikidata5m_transductive_test_red.tsv
head	rel	tail
Q7965079	P27	Q16
Q6719921	P31	Q11446


Prime righe di wikidata5m_entity.txt
Q5196650	Cut Your Hair	cut your hair
Q912600	Straumur-Burðarás	Straumur	straumur–burðarás investment bank	straumur	Straumur-Burðarás Investment Bank	straumur-burðarás investment bank	straumur investment bank	straumur-burðarás fjárf.banki	Straumur-Burðarás Fjárf.banki	straumur-burðarás	Straumur Investment Bank	Straumur–Burðarás Investment Bank
Q47551	ditiano	tipciano	titiaen geovene	Tizzianello	Called, Titian Tiziano Vecelli Cavaliere	called titian veccellio	genannt Vecelli Titian	V

## API wikidata

In [9]:
headers = {
    'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI1ZDEzYzQ4ZTg3YzQ4YzJhYWRmZGFlYzA1MjhkYTU0MSIsImp0aSI6IjgzZWE4MmVkMjI3ZDgzZDkwMzBlNGUyNzJiNTBmOTViYjhlODA3NDI4NTI5YjVhMzBlYjRkNTJmZDNhY2JkNTZmMmU4ZTZiNzI3ZDMxNTQ2IiwiaWF0IjoxNzEyODMzOTg0Ljk0NTI2NiwibmJmIjoxNzEyODMzOTg0Ljk0NTI3MSwiZXhwIjozMzI2OTc0Mjc4NC45NDM3NTIsInN1YiI6Ijc1NDA2OTAyIiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.NnXfiaq4StcjEq9fqjVRUrJnn5m-eIDy6jQtLTwprH_huGrzm8Z6HduKUiE93a3zwQ3T3t2c8EV4VzDHWv-fPSfozhNVVhwzBhNI80qyMYJQb2ieSu0Jq0mobtY443ygnPDWhADV0QhMpgLpafUPA8QpjnfzmSpXzfWkiEj1oWnuMJ25cr4Y7jqTUjlLQ_wHHSS43DWX1ZURmC0tQb2fZJ0o1NdoiC5MTtcaEqB3t2zF_C1BNGB_jLcrqkYKMZwjDL4SSaph5vIVS9mpYJHaoPlkroKnsc66_75541eSrt5D5YHwkYNavNrvFYpDWVnzUCp5kfX5POaqMquHYEYIaTNJImVuY-2hXIvwkF4FG9rh6rt8Y7WY67Sf_wFhVN9LYqMZVlR3JnZzeI-wfPy_MZ2Wh-WeTb6FHJDow66hwBNX8ua5EfOEF3XOj1N4k5iwYSRSCW6Ko1we-MlExWRMRKFS9t1Rfaw1C8wJCWJRfm0TIZkl-1nP5ACr5w9d1n9S1qjp95HEMnb25aUG3N1q0zwg9nS7xoAqWHXoojoLg31ocSpvpe6Qb1lrolKJwI6GnSjJJAYG03tTejgTjLfBON4uJlEaHFi5rk867L5Hiz25HY3h-OlS5DabwMdH9hZnvipPPeslG9e4X3TgqtidVlPWSk3DSH3Fm_ZeHuZ-7EY',
    'Content-type': 'Content-type',
    'User-Agent': 'tumnus7@gmail.com'
}

### Test API prediction

In [10]:
item2search = "Q35610"

In [11]:
pred, prob = itemQualityScore(item2search)

In [12]:
print(pred, prob)

A 0.930850565566538


## Data Loading e scrematura

In [4]:
df_org_train = pd.read_csv('data/wikidata5m_transductive_train.txt', sep='\t', header=None, names=["head", "rel", "tail"])
df_org_valid = pd.read_csv('data/wikidata5m_transductive_valid.txt', sep='\t', header=None, names=["head", "rel", "tail"])
df_org_test = pd.read_csv('data/wikidata5m_transductive_test.txt', sep='\t', header=None, names=["head", "rel", "tail"])

In [5]:
df_org_train_most_head = df_org_train.groupby("head").count().sort_values("rel", ascending=False).head(100000)
df_org_train_most_tail = df_org_train.groupby("tail").count().sort_values("rel", ascending=False).head(100000)
df_org_train_most_rel = df_org_train.groupby("rel").count().sort_values("tail", ascending=False).head(300)

In [6]:
df_org_train_most = df_org_train_most_head.join(df_org_train_most_tail, how="inner", lsuffix='_l', rsuffix='_r')
df_org_train_most["occurencies"] = df_org_train_most.rel_l + df_org_train_most.rel_r
df_org_train_most = df_org_train_most.sort_values("occurencies", ascending=False).head(500)

In [7]:
df_org_train_reduced = df_org_train[(df_org_train["head"].isin(df_org_train_most.index)) & (df_org_train["rel"].isin(df_org_train_most_rel.index)) & (df_org_train["tail"].isin(df_org_train_most.index))]
df_org_valid_reduced = df_org_valid[(df_org_valid["head"].isin(df_org_train_most.index)) & (df_org_valid["rel"].isin(df_org_train_most.index)) & (df_org_valid["tail"].isin(df_org_train.index))]
df_org_test_reduced = df_org_test[(df_org_test["head"].isin(df_org_train_most.index)) & (df_org_test["rel"].isin(df_org_train_most.index)) & (df_org_test["tail"].isin(df_org_train.index))]

In [17]:
df_org_train_reduced

Unnamed: 0,head,rel,tail
5307,Q365,P17,Q7318
6157,Q586,P190,Q33935
9143,Q148,P530,Q750
11578,Q408,P530,Q183
14878,Q837,P530,Q843
...,...,...,...
20605130,Q1731,P190,Q34370
20607958,Q1019,P530,Q159
20608891,Q159,P530,Q222
20610226,Q954,P530,Q408


In [8]:
wiki_train_path = 'data/wikidata5m_transductive_train_red.tsv'
df_org_train_reduced.to_csv(wiki_train_path, sep='\t', index=False)

wiki_validation_path = 'data/wikidata5m_transductive_valid_red.tsv'
#df_org_valid.to_csv(wiki_validation_path, sep='\t', index=False)

wiki_test_path = 'data/wikidata5m_transductive_test_red.tsv'
#df_org_test.to_csv(wiki_test_path, sep='\t', index=False)

### Aggiungo ItemQualityScore

In [19]:
if IQS:
  parole_Q = []
  pred_Q = []

  checkpoint_interval = 500
  contatore_iterazioni = 0

  for pathData in [wiki_train_path, wiki_validation_path, wiki_test_path]:
    with open(pathData, 'r') as file:
        for riga in tqdm.tqdm(file):
            item2search = riga.split()[0]
            pred, prob = itemQualityScore(item2search)
            if pred == 'A' or pred == 'B':
                parole_Q.append(item2search)
                pred_Q.append(pred)

            contatore_iterazioni += 1

            if contatore_iterazioni % checkpoint_interval == 0:
                df_checkpoint = pd.DataFrame({'Entity': parole_Q, 'Score': pred_Q})

                df_checkpoint.to_csv(f"./data/checkpoint_{contatore_iterazioni}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

                parole_Q = []
                pred_Q = []

    if parole_Q:
        df_checkpoint = pd.DataFrame({'Entity': parole_Q, 'Score': pred_Q})
        df_checkpoint.to_csv(f"./data/checkpoint_{contatore_iterazioni}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

    df_finale = pd.concat([pd.read_csv(f"checkpoint_{i * checkpoint_interval}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv") for i in range(1, (contatore_iterazioni // checkpoint_interval) + 1)], ignore_index=True)
    df_finale.to_csv(f"./data/{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

## Embedding

Spiegazione delle metriche [*qui*](https://docs.ampligraph.org/en/2.0.0/ampligraph.evaluation.html#metrics)

In [6]:
if df_org_train_reduced not in globals():
    df_org_train_reduced = pd.read_csv('data/wikidata5m_transductive_train_red.tsv', sep='\t', header=None, names=["head", "rel", "tail"])

In [7]:
df4emb = df_org_train_reduced #pd.concat([df_org_train_reduced, df_org_valid, df_org_test], ignore_index=True)

triples_factory = TriplesFactory.from_labeled_triples(triples=df4emb[['head', 'rel', 'tail']].values)

training = triples_factory
validation = triples_factory
testing = triples_factory

d=training
id_to_entity={v: k for k, v in d.entity_to_id.items()}
id_to_relation={v: k for k, v in d.relation_to_id.items()}

triples_factory.triples



array([['Q100', 'P1376', 'Q771'],
       ['Q100', 'P17', 'Q30'],
       ['Q100', 'P190', 'Q1492'],
       ...,
       ['Q994', 'P190', 'Q656'],
       ['Q994', 'P30', 'Q46'],
       ['head', 'rel', 'tail']], dtype='<U9')

In [8]:
evaluator = RankBasedEvaluator()

emb_list = ["RotatE"]
for emb in emb_list:
    print(f"Start with {emb}\n")
    try:
      result = pipeline(
        training=training,
        validation=validation,
        testing=testing,
        model=emb,
        device='gpu',
        random_seed=42,
        result_tracker='tensorboard',
        result_tracker_kwargs=dict(
        experiment_name=f'./log/wiki_{emb}',
        ),
      )
      result.save_to_directory(f'./evalModel/wiki_{emb}')
      metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])

      print(f"Hits@1: {metrics.get_metric('hits@1')}")
      print(f"Hits@3: {metrics.get_metric('hits@3')}")
      print(f"Hits@5: {metrics.get_metric('hits@5')}")
      print(f"Hits@10: {metrics.get_metric('hits@10')}")
      print(f"Mean Reciprocal Rank: {metrics.get_metric('mean_reciprocal_rank')}")
    except Exception as e:
        print(f"Si è verificato un errore: {e}")
        continue

Start with RotatE


INFO:pykeen.pipeline.api:Using device: gpu


Training epochs on cuda:0:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/30 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/30 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/30 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/30 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/30 [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/7.55k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.48s seconds
INFO:pykeen.triples.triples_factory:Stored TriplesFactory(num_entities=491, num_relations=35, create_inverse_triples=False, num_triples=7548) to file:///content/data/evalModel/wiki_RotatE/training_triples
INFO:pykeen.pipeline.api:Saved to directory: /content/data/evalModel/wiki_RotatE


Evaluating on cuda:0:   0%|          | 0.00/7.55k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.28s seconds


Hits@1: 0.09684684684684684
Hits@3: 0.20084790673025968
Hits@5: 0.2658320084790673
Hits@10: 0.3523449920508744
Mean Reciprocal Rank: 0.18241891264915466


### Loading del modello salvato

In [None]:
model = torch.load('/content/evalModel/wiki_RotatE/trained_model.pkl')

### Metriche

In [9]:
pack = predict_all(model=result.model)



scoring:   0%|          | 0.00/17.2k [00:00<?, ?batch/s]

In [10]:
pred = pack.process(factory=result.training)
pred_annotated = pred.add_membership_columns(training=result.training)
pred_annotated.df

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,171,Q183,34,rel,171,Q183,-0.392535,False
1,171,Q183,19,P355,171,Q183,-0.392904,False
2,445,Q865,28,P530,171,Q183,-0.394712,True
3,171,Q183,34,rel,445,Q865,-0.395508,False
4,171,Q183,15,P190,171,Q183,-0.396271,False
...,...,...,...,...,...,...,...,...
8437830,263,Q309331,20,P36,276,Q3357,-3.732768,False
8437831,490,tail,5,P1376,489,head,-3.768142,False
8437832,248,Q2793400,10,P155,490,tail,-3.791963,False
8437833,344,Q49117,12,P159,330,Q432637,-3.793376,False


In [11]:
print(f"Punteggio massimo raggiunto {pred_annotated.df['score'].max()}")
print(f"Punteggio massimo raggiunto {pred_annotated.df['score'].min()}")

Punteggio massimo raggiunto -0.3925352990627289
Punteggio massimo raggiunto -3.8312225341796875


In [12]:
triple_not_in_train = pred_annotated.df[pred_annotated.df['in_training']==False]

triple_not_in_train.to_csv("./data/final_pred.csv", index=False)

## Generazione dei distrattori

In [13]:
scored_triple = pd.read_csv("./data/final_pred.csv")
scored_triple = scored_triple.sort_values(by='score', ascending=False)

In [4]:
scored_triple.head(10)

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,171,Q183,34,rel,171,Q183,-0.392535,False
1,171,Q183,19,P355,171,Q183,-0.392904,False
2,171,Q183,34,rel,445,Q865,-0.395508,False
3,171,Q183,15,P190,171,Q183,-0.396271,False
4,171,Q183,16,P276,171,Q183,-0.397741,False
5,171,Q183,28,P530,171,Q183,-0.402134,False
6,171,Q183,5,P1376,171,Q183,-0.402757,False
7,171,Q183,6,P138,171,Q183,-0.403304,False
8,445,Q865,34,rel,445,Q865,-0.40347,False
9,171,Q183,5,P1376,320,Q408,-0.406509,False


In [28]:
scored_triple.sort_values(by='score', ascending=False).groupby(['head_label', 'relation_label']).head(4)['head_label'].value_counts()

head_label
Q183        140
Q24639      140
Q1989       140
Q220546     140
Q7026       140
           ... 
Q1741       140
Q3711       140
Q1762022    140
Q2044       140
tail        140
Name: count, Length: 491, dtype: int64

In [25]:
top_scores_df = scored_triple.sort_values(by='score', ascending=False).groupby(['head_label', 'relation_label']).head(4)
top_scores_df = top_scores_df.drop(['head_id', 'relation_id', 'tail_id'], axis=1)
top_scores_df = top_scores_df[top_scores_df['head_label'] != top_scores_df['tail_label']]
top_scores_df = top_scores_df[top_scores_df['head_label'] != 'head']
top_scores_df = top_scores_df[top_scores_df['head_label'] != 'tail']
top_scores_df = top_scores_df[top_scores_df['relation_label'] != 'rel']
top_scores_df = top_scores_df[top_scores_df['tail_label'] != 'tail']
top_scores_df = top_scores_df.sort_values(by='score', ascending=False)
top_scores_df.head(40)['head_label'].value_counts()

  top_scores_df = scored_triple.groupby(['head_label', 'relation_label']).apply(lambda x: x.nlargest(4, 'score')).reset_index(drop=True)


head_label
Q183    22
Q865    15
Q408     3
Name: count, dtype: int64