## Install

In [1]:
!pip install pykeen[tensorboard]

Collecting pykeen[tensorboard]
  Downloading pykeen-1.10.2-py3-none-any.whl.metadata (83 kB)
     ---------------------------------------- 0.0/83.8 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/83.8 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/83.8 kB ? eta -:--:--
     ------------------ ------------------- 41.0/83.8 kB 281.8 kB/s eta 0:00:01
     -------------------------------------  81.9/83.8 kB 459.5 kB/s eta 0:00:01
     -------------------------------------- 83.8/83.8 kB 430.3 kB/s eta 0:00:00
Collecting dataclasses-json (from pykeen[tensorboard])
  Downloading dataclasses_json-0.6.4-py3-none-any.whl.metadata (25 kB)
Collecting scipy>=1.7.0 (from pykeen[tensorboard])
  Downloading scipy-1.13.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 60.6/60.6 kB 1.1 MB/s eta 0:00:00
Collecting click (from pykeen[tensorboard])
  Down

## Librerie

In [2]:
import json
import requests
import os
import pandas as pd
import tarfile
import tqdm
from pykeen.pipeline import pipeline
from pykeen.predict import predict_all
import torch
from pykeen.evaluation import RankBasedEvaluator
from pykeen.triples import TriplesFactory

IQS = False

## Moduli

In [3]:
def download_file(url, destination):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            with open(destination, 'wb') as file:
                file.write(response.content)
            print("Download completato con successo.")
        else:
            print(f"Errore durante il download del file. Codice di stato: {response.status_code}")
    except Exception as e:
        print(f"Si è verificato un errore: {e}")


def decompress_tar_gz(file_path, destination_folder):
    try:

        with tarfile.open(file_path, 'r:gz') as tar:
            if not os.path.exists(destination_folder):
              os.mkdir(destination_folder)

            tar.extractall(destination_folder)
        print("Decompressione completata con successo.")
        return True
    except Exception as e:
        print(f"Si è verificato un errore durante la decompressione: {e}")
        return False


def read_firstlines(file_path, nlines):
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()[:nlines]
            for line in lines:
                print(line.rstrip())
    except FileNotFoundError:
        print(f"Il file '{file_path}' non esiste.")
    except Exception as e:
        print(f"Si è verificato un errore durante la lettura del file: {e}")


def itemQualityScore(item2search):
    inference_url = 'https://api.wikimedia.org/service/lw/inference/v1/models/wikidatawiki-itemquality:predict'

    try:
        req = requests.post(f"https://www.wikidata.org/w/api.php?action=query&format=json&formatversion=2&prop=revisions|entityterms&titles={item2search}&origin=*")
        jsonReq = req.json()

        data = {"rev_id": jsonReq['query']['pages'][0]['revisions'][0]['revid'] }
        response = requests.post(inference_url, headers=headers, data=json.dumps(data))
        prob = response.json()
        prediction = prob['wikidatawiki']['scores'][str(data['rev_id'])]['itemquality']['score']['prediction']
        probabilityFromModel = prob['wikidatawiki']['scores'][str(data['rev_id'])]['itemquality']['score']['probability'][prediction]
    except Exception as e:
        prediction = "NotFound"
        probabilityFromModel = 0


    return prediction, probabilityFromModel


## Dowload file wikidata

### Triple codificate

In [4]:
url = "https://www.dropbox.com/s/6sbhm0rwo4l73jq/wikidata5m_transductive.tar.gz?dl=1"
destination = "wiki_tran.tar.gz"
download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./wiki_tran.tar.gz")

Download completato con successo.
Decompressione completata con successo.


### Triple con sinonimi

In [5]:
url = "https://www.dropbox.com/s/7jp4ib8zo3i6m10/wikidata5m_text.txt.gz?dl=1"
# Sempre triple ma con i nomi
destination = "corpus.tar.gz"

download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./corpus.tar.gz")

Download completato con successo.
Si è verificato un errore durante la decompressione: invalid header


### Alias delle entità

In [6]:
url = "https://www.dropbox.com/s/lnbhc8yuhit4wm5/wikidata5m_alias.tar.gz?dl=1"
destination = "alias.tar.gz"
download_file(url, destination)
decompress_tar_gz(destination, "./data")
os.remove("./alias.tar.gz")

Download completato con successo.
Decompressione completata con successo.


### Stampa delle prime righe dei file

In [7]:
for file in os.listdir("./data"):
    print(f"Prime righe di {file}")
    read_firstlines(f"./data/{file}", 3)
    print("\n")

Prime righe di wikidata5m_entity.txt
Si è verificato un errore durante la lettura del file: 'charmap' codec can't decode byte 0x8d in position 5945: character maps to <undefined>


Prime righe di wikidata5m_relation.txt
P489	currency symbol description
P834	train depot	railway depot	depot	rail yard
P2629	BBFC rating	BBFC certificate


Prime righe di wikidata5m_transductive_test.txt
Q7965079	P27	Q16
Q6719921	P31	Q11446
Q4925109	P175	Q5165801


Prime righe di wikidata5m_transductive_train.txt
Q29387131	P31	Q5
Q326660	P1412	Q652
Q7339549	P57	Q1365729


Prime righe di wikidata5m_transductive_valid.txt
Q3576734	P495	Q30
Q641724	P1412	Q1860
Q959357	P39	Q49476




## API wikidata

In [8]:
headers = {
    'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiI1ZDEzYzQ4ZTg3YzQ4YzJhYWRmZGFlYzA1MjhkYTU0MSIsImp0aSI6IjgzZWE4MmVkMjI3ZDgzZDkwMzBlNGUyNzJiNTBmOTViYjhlODA3NDI4NTI5YjVhMzBlYjRkNTJmZDNhY2JkNTZmMmU4ZTZiNzI3ZDMxNTQ2IiwiaWF0IjoxNzEyODMzOTg0Ljk0NTI2NiwibmJmIjoxNzEyODMzOTg0Ljk0NTI3MSwiZXhwIjozMzI2OTc0Mjc4NC45NDM3NTIsInN1YiI6Ijc1NDA2OTAyIiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.NnXfiaq4StcjEq9fqjVRUrJnn5m-eIDy6jQtLTwprH_huGrzm8Z6HduKUiE93a3zwQ3T3t2c8EV4VzDHWv-fPSfozhNVVhwzBhNI80qyMYJQb2ieSu0Jq0mobtY443ygnPDWhADV0QhMpgLpafUPA8QpjnfzmSpXzfWkiEj1oWnuMJ25cr4Y7jqTUjlLQ_wHHSS43DWX1ZURmC0tQb2fZJ0o1NdoiC5MTtcaEqB3t2zF_C1BNGB_jLcrqkYKMZwjDL4SSaph5vIVS9mpYJHaoPlkroKnsc66_75541eSrt5D5YHwkYNavNrvFYpDWVnzUCp5kfX5POaqMquHYEYIaTNJImVuY-2hXIvwkF4FG9rh6rt8Y7WY67Sf_wFhVN9LYqMZVlR3JnZzeI-wfPy_MZ2Wh-WeTb6FHJDow66hwBNX8ua5EfOEF3XOj1N4k5iwYSRSCW6Ko1we-MlExWRMRKFS9t1Rfaw1C8wJCWJRfm0TIZkl-1nP5ACr5w9d1n9S1qjp95HEMnb25aUG3N1q0zwg9nS7xoAqWHXoojoLg31ocSpvpe6Qb1lrolKJwI6GnSjJJAYG03tTejgTjLfBON4uJlEaHFi5rk867L5Hiz25HY3h-OlS5DabwMdH9hZnvipPPeslG9e4X3TgqtidVlPWSk3DSH3Fm_ZeHuZ-7EY',
    'Content-type': 'Content-type',
    'User-Agent': 'tumnus7@gmail.com'
}

### Test API prediction

In [9]:
item2search = "Q35610"

In [10]:
pred, prob = itemQualityScore(item2search)

In [11]:
print(pred, prob)

A 0.9308966801692856


## Data Loading e scrematura

In [12]:
df_org_train = pd.read_csv('data/wikidata5m_transductive_train.txt', sep='\t', header=None, names=["head", "rel", "tail"])
df_org_valid = pd.read_csv('data/wikidata5m_transductive_valid.txt', sep='\t', header=None, names=["head", "rel", "tail"])
df_org_test = pd.read_csv('data/wikidata5m_transductive_test.txt', sep='\t', header=None, names=["head", "rel", "tail"])

In [13]:
df_org_train_most_head = df_org_train.groupby("head").count().sort_values("rel", ascending=False).head(100000)
df_org_train_most_tail = df_org_train.groupby("tail").count().sort_values("rel", ascending=False).head(100000)
df_org_train_most_rel = df_org_train.groupby("rel").count().sort_values("tail", ascending=False).head(300)

In [14]:
df_org_train_most = df_org_train_most_head.join(df_org_train_most_tail, how="inner", lsuffix='_l', rsuffix='_r')
df_org_train_most["occurencies"] = df_org_train_most.rel_l + df_org_train_most.rel_r
df_org_train_most = df_org_train_most.sort_values("occurencies", ascending=False).head(500)

In [15]:
df_org_train_reduced = df_org_train[(df_org_train["head"].isin(df_org_train_most.index)) & (df_org_train["rel"].isin(df_org_train_most_rel.index)) & (df_org_train["tail"].isin(df_org_train_most.index))]
df_org_valid_reduced = df_org_valid[(df_org_valid["head"].isin(df_org_train_most.index)) & (df_org_valid["rel"].isin(df_org_train_most.index)) & (df_org_valid["tail"].isin(df_org_train.index))]
df_org_test_reduced = df_org_test[(df_org_test["head"].isin(df_org_train_most.index)) & (df_org_test["rel"].isin(df_org_train_most.index)) & (df_org_test["tail"].isin(df_org_train.index))]

In [16]:
df_org_train_reduced

Unnamed: 0,head,rel,tail
5307,Q365,P17,Q7318
6157,Q586,P190,Q33935
9143,Q148,P530,Q750
11578,Q408,P530,Q183
14878,Q837,P530,Q843
...,...,...,...
20605130,Q1731,P190,Q34370
20607958,Q1019,P530,Q159
20608891,Q159,P530,Q222
20610226,Q954,P530,Q408


In [17]:
wiki_train_path = 'data/wikidata5m_transductive_train_red.tsv'
df_org_train_reduced.to_csv(wiki_train_path, sep='\t', index=False)

wiki_validation_path = 'data/wikidata5m_transductive_valid_red.tsv'
#df_org_valid.to_csv(wiki_validation_path, sep='\t', index=False)

wiki_test_path = 'data/wikidata5m_transductive_test_red.tsv'
#df_org_test.to_csv(wiki_test_path, sep='\t', index=False)

### Aggiungo ItemQualityScore

In [19]:
if IQS:
  parole_Q = []
  pred_Q = []

  checkpoint_interval = 500
  contatore_iterazioni = 0

  for pathData in [wiki_train_path, wiki_validation_path, wiki_test_path]:
    with open(pathData, 'r') as file:
        for riga in tqdm.tqdm(file):
            item2search = riga.split()[0]
            pred, prob = itemQualityScore(item2search)
            if pred == 'A' or pred == 'B':
                parole_Q.append(item2search)
                pred_Q.append(pred)

            contatore_iterazioni += 1

            if contatore_iterazioni % checkpoint_interval == 0:
                df_checkpoint = pd.DataFrame({'Entity': parole_Q, 'Score': pred_Q})

                df_checkpoint.to_csv(f"./data/checkpoint_{contatore_iterazioni}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

                parole_Q = []
                pred_Q = []

    if parole_Q:
        df_checkpoint = pd.DataFrame({'Entity': parole_Q, 'Score': pred_Q})
        df_checkpoint.to_csv(f"./data/checkpoint_{contatore_iterazioni}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

    df_finale = pd.concat([pd.read_csv(f"checkpoint_{i * checkpoint_interval}_{pathData.split('/')[1].split('.')[0]}_entity_score.csv") for i in range(1, (contatore_iterazioni // checkpoint_interval) + 1)], ignore_index=True)
    df_finale.to_csv(f"./data/{pathData.split('/')[1].split('.')[0]}_entity_score.csv", index=False)

## Embedding

Spiegazione delle metriche [*qui*](https://docs.ampligraph.org/en/2.0.0/ampligraph.evaluation.html#metrics)

In [6]:
if 'df_org_train_reduced' not in globals():
    df_org_train_reduced = pd.read_csv('data/wikidata5m_transductive_train_red.tsv', sep='\t', header=None, names=["head", "rel", "tail"])

In [7]:
df4emb = df_org_train_reduced #pd.concat([df_org_train_reduced, df_org_valid, df_org_test], ignore_index=True)

triples_factory = TriplesFactory.from_labeled_triples(triples=df4emb[['head', 'rel', 'tail']].values)

training = triples_factory
validation = triples_factory
testing = triples_factory

d=training
id_to_entity={v: k for k, v in d.entity_to_id.items()}
id_to_relation={v: k for k, v in d.relation_to_id.items()}

triples_factory.triples



array([['Q100', 'P1376', 'Q771'],
       ['Q100', 'P17', 'Q30'],
       ['Q100', 'P190', 'Q1492'],
       ...,
       ['Q994', 'P190', 'Q656'],
       ['Q994', 'P30', 'Q46'],
       ['head', 'rel', 'tail']], dtype='<U9')

In [None]:
evaluator = RankBasedEvaluator()
emb_dict={}
emb_list = ["PairRE", "ConvE", "QuatE"]
df_metrics=pd.DataFrame(columns=["Model", "Hits@1", "Hits@3", "Hits@5", "Hits@10", "MRR"])
for emb in emb_list:
    print(f"\nStart with {emb}\n")
    try:
        result = pipeline(
            training=training,
            validation=validation,
            testing=testing,
            model=emb,
            device='gpu',
            random_seed=42,
            result_tracker='tensorboard',
            result_tracker_kwargs=dict(
                experiment_name=f'./log/wiki_{emb}',
            ),
        )
        result.save_to_directory(f'./evalModel/wiki_{emb}')
        metrics = evaluator.evaluate(result.model, testing.mapped_triples, additional_filter_triples=[training.mapped_triples, validation.mapped_triples])
        df_metrics['Model'] = emb
        df_metrics['Hits@1'] = metrics.get_metric('hits@1')
        df_metrics['Hits@3'] = metrics.get_metric('hits@3')
        df_metrics['Hits@5'] = metrics.get_metric('hits@5')
        df_metrics['Hits@10'] = metrics.get_metric('hits@10')
        df_metrics['MRR'] = metrics.get_metric('mean_reciprocal_rank')
        
        # Prediction
        pack = predict_all(model=result.model)
        prediction_all_triple = pack.process(factory=result.training)
        prediction_all_annotated = prediction_all_triple.add_membership_columns(training=result.training)

        df_metrics['MaxScore'] = prediction_all_annotated.df['score'].max()
        df_metrics['MinScore'] = prediction_all_annotated.df['score'].min()
        emb_dict[emb] = prediction_all_annotated
        
        triple_not_in_train = prediction_all_annotated.df[prediction_all_annotated.df['in_training']==False]

        triple_not_in_train.to_csv(f"./data/scored_predicted_triple_{emb}_notInTrain.csv", index=False)

    except Exception as e:
        print(f"Si è verificato un errore: {e}")
        continue