In [18]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
import time
import requests
from scipy.spatial.distance import euclidean
import json
import joblib

## Load Test set 


In [19]:
# Read the unlabeled test split from CSV into a DataFrame.
ds_test_set=pd.read_csv("test_unlabeled.csv")
# Raw NumPy array of the rows; later cells index it positionally
# (column 0 is treated as the Wikidata item URL, and columns 0 and 1 are
# written to the output file as "item" and "name").
X_test_set=ds_test_set.values

## Load Vocabulary

In [20]:
# Load the stored property vocabulary (UTF-8 encoded JSON file).
with open("vocabulary_category.json","r",encoding="utf-8") as f:
    vocabulary_j=json.load(f) # parsed JSON; accessed by position (vocab[v]) when the embedding is built

## Load the TF-IDF vectorizer applied to embedded samples

In [21]:
# Restore the TF-IDF transformer whose .transform() is applied to each embedded
# test sample at prediction time (presumably fitted in the training notebook).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
tfidf=joblib.load('tfidf_vectorizer.pkl')

## Load Centroids 

In [22]:
# Read every centroid array into memory once and close the archive.
# np.load on an .npz file returns a lazy NpzFile: it keeps the file handle open
# and reads the array from disk again on every key access, so a plain dict of
# arrays is both leak-free and cheaper to index repeatedly.
with np.load("centroids_every_label.npz") as npz_file:
    centroids={name: npz_file[name] for name in npz_file.files}

## Function to return the original (textual) predicted label

In [23]:
# map numerical labels [0,1,2] back to the original textual class names
def return_original_label(Y):
    """Translate an iterable of numeric class codes into label strings.

    0 -> "cultural agnostic", 1 -> "cultural representative", and every
    other value -> "cultural exclusive". Returns a new list with one string
    per element of ``Y``, in the same order.
    """
    def _label_of(code):
        if code==0:
            return "cultural agnostic"
        return "cultural representative" if code==1 else "cultural exclusive"

    return [_label_of(code) for code in Y]

## Support Functions to predict samples

In [24]:
# turn the distance between a sample X1 and a centroid X2 into a weight (smaller distance -> larger weight)
def weight_distance_estimation(X1,X2):
    """Return exp(-d), where d is the Euclidean distance between X1 and X2.

    The weight is 1.0 when the two vectors coincide and decays exponentially
    towards 0 as they move apart.
    """
    return np.exp(-euclidean(X1,X2))

In [25]:
# extract the Wikidata item id (Qxxx) from an entity URL
def extract_entity_id(url):
    """Return the last path segment of ``url`` (e.g. ``"Q42"``).

    Surrounding whitespace and trailing slashes are ignored, so
    ``".../entity/Q42/"`` still yields ``"Q42"`` instead of an empty string
    (which callers would discard because it does not start with "Q").
    """
    return url.strip().rstrip("/").rsplit("/",1)[-1]
# collect the first field of every row whose label field matches the requested class
def extract_sample_from_class(X,label):
    """Return ``row[0]`` for each row of ``X`` where ``row[6] == label``.

    Rows are visited in order, so the result keeps the order of ``X``.
    """
    return [row[0] for row in X if row[6]==label]
# Fetch the Wikidata claims of many entities with batched requests (one HTTP call per batch, not per entity)
def handle_get_request(entities_list,labeled,batch_size=40,max_attempts=3,timeout=20):
    """Download the claims of every entity in ``entities_list`` from Wikidata.

    Parameters
    ----------
    entities_list : sequence of str
        Entity ids ("Qxxx") to look up.
    labeled : object
        Only used in the progress-bar description.
    batch_size : int, default 40
        Number of ids sent per request (must be positive; the wbgetentities
        endpoint limits how many ids one request may carry).
    max_attempts : int, default 3
        How many times a batch is tried before it is given up.
    timeout : float, default 20
        Per-request timeout in seconds.

    Returns
    -------
    dict
        Maps each entity id found in the API response to its claims dict.
        The lookup is best-effort: a batch that keeps failing is reported
        on stdout and left out of the result instead of raising.
    """
    url_base = "https://www.wikidata.org/w/api.php" # Wikidata API endpoint
    results = {}
    total_batches = (len(entities_list) + batch_size - 1) // batch_size # ceil division

    with tqdm(total=total_batches, desc=f"Downloading batch of class {labeled}") as pbar:
        for start in range(0, len(entities_list), batch_size):
            batch = entities_list[start:start + batch_size]
            params = {
                "action": "wbgetentities",
                "ids": "|".join(batch), # "Q1|Q2|..." for this batch
                "props": "claims", # only claims are used below, so skip labels/sitelinks/...
                "format": "json",
            }

            for attempt in range(max_attempts):
                try:
                    response = requests.get(url_base, params=params, timeout=timeout)
                    response.raise_for_status() # turn HTTP error statuses into exceptions
                    data = response.json()
                except (requests.exceptions.RequestException, ValueError) as e:
                    # ValueError covers a body that is not valid JSON
                    print(f"Batch Error {batch}: {e}", flush=True)
                    if attempt + 1 < max_attempts:
                        wait_time = 2 ** attempt # exponential back-off: 1s, 2s, 4s, ...
                        print(f"Retry waiting  {wait_time}s...", flush=True)
                        time.sleep(wait_time)
                    continue

                if "error" in data:
                    # The API reports some failures inside a normal 200 response;
                    # retrying the same request would not help, so just surface it.
                    print(f"Batch Error {batch}: {data['error']}", flush=True)

                for entity_id, entity_data in data.get("entities", {}).items():
                    results[entity_id] = entity_data.get("claims", {})
                break
            else:
                # every attempt failed: make the data loss visible
                print(f"Batch skipped after {max_attempts} failed attempts: {batch}", flush=True)

            pbar.update(1)
    return results
def extraction_identities_from_sample(X,C=None):
    """Download the Wikidata property ids of the items selected from ``X``.

    Parameters
    ----------
    X : 2-D array-like
        Dataset rows; field 0 of each row is used as the item URL.
    C : str, iterable of str, or None
        * a single class name -> only the rows of that class,
        * an iterable of class names (list, tuple, NumPy array, ...) -> the
          rows of each listed class, class after class,
        * None -> every row of ``X``.

    Returns
    -------
    list of set of str
        One set of property ids ("Pxx") per entity present in the API
        response, in the order of that response. The list is therefore NOT
        guaranteed to line up with the selected rows: URLs whose last
        segment does not start with "Q" are skipped, and entities absent
        from the response produce no entry.
    """
    # isinstance also recognises str subclasses such as numpy.str_, which the
    # previous exact-type check sent to the "iterate over C" branch.
    if isinstance(C,str):
        list_sample_cat=extract_sample_from_class(X,C)
    elif C is not None: # identity test: `C != None` is element-wise (and not a valid condition) for arrays
        list_sample_cat=[]
        for elem in C:
            list_sample_cat.extend(extract_sample_from_class(X,elem))
    else:
        list_sample_cat=X[:,0]

    # keep only well-formed entity ids (last URL segment starting with "Q")
    list_identities=[]
    for url in list_sample_cat:
        entity_id=extract_entity_id(url)
        if entity_id.startswith("Q"):
            list_identities.append(entity_id)

    # {entity id: claims}; C is forwarded only to label the progress bar
    claims_identity=handle_get_request(list_identities,C)

    # the keys of a claims dict are the property ids of that entity
    return [{str(prop_id) for prop_id in claims} for claims in claims_identity.values()]
# build the binary property embedding (rows = samples, columns = vocabulary entries)
def embedding_sample(X_data,vocabulary,sample_properties=None):
    """Encode every row of ``X_data`` as a 0/1 vector over ``vocabulary``.

    Parameters
    ----------
    X_data : 2-D NumPy array
        Dataset rows; column 0 holds the Wikidata item URL.
    vocabulary : sequence of str
        Property ids; entry ``v`` defines column ``v`` of the result.
    sample_properties : unused
        Kept only so existing calls keep working.

    Returns
    -------
    numpy.ndarray of int32, shape (len(X_data), len(vocabulary))
        Cell (i, v) is 1 when the entity of row i has property
        ``vocabulary[v]``. Rows whose URL does not end in a "Qxxx" id, or
        whose id is not a key of the download result, stay all-zero.

    Each row is filled from the claims looked up under that row's own entity
    id. Filling row i from the i-th entry of the download result would shift
    every following row as soon as one entity is skipped, duplicated,
    missing or returned in a different order.
    """
    row_ids=[extract_entity_id(url) for url in X_data[:,0]]

    # request each valid id once, preserving first-seen order
    wanted_ids=list(dict.fromkeys(eid for eid in row_ids if eid.startswith("Q")))
    claims_by_id=handle_get_request(wanted_ids,None)

    sample=np.zeros((X_data.shape[0],len(vocabulary)),dtype=np.int32)
    for row,eid in enumerate(row_ids):
        props={str(prop_id) for prop_id in claims_by_id.get(eid,{})}
        if not props:
            continue # nothing known about this row's entity
        for col,prop in enumerate(vocabulary):
            if prop in props:
                sample[row,col]=1
    return sample

## Function to predict a sample


In [26]:
# predict the class index of one sample from its distance to the three class centroids
def predict_entity_score(x_sample,centroid_CA,centroid_CR,centroid_CE,T,tfidf):
    """Assign ``x_sample`` to the class of its nearest centroid.

    Parameters
    ----------
    x_sample : 1-D array
        Sample vector. When ``T == 'test'`` it is first passed through
        ``tfidf.transform``.
    centroid_CA, centroid_CR, centroid_CE : 1-D arrays
        Centroids matched to class indices 0, 1 and 2 respectively.
    T : str
        ``'test'`` enables the TF-IDF transformation and the tuple return.
    tfidf : object with ``transform``
        Only used when ``T == 'test'``.

    Returns
    -------
    class_pred, or (class_pred, list_sample) when ``T == 'test'``
        ``class_pred`` is the index (0, 1 or 2) of the closest centroid;
        ``list_sample`` is a one-element list holding the transformed sample.

    The class is chosen with argmin over the Euclidean distances. That is
    the same decision as argmax over the normalised ``exp(-distance)``
    scores (exp is monotone and the normalisation divides by one common
    positive sum; ties resolve to the first class in both forms). Unlike the
    scores, it cannot break down when every ``exp(-distance)`` underflows to
    0, which would give NaN scores and always select class 0.
    """
    list_sample=[]

    # weight the raw sample the same way the training samples were weighted
    if T=='test':
        x_sample=tfidf.transform([x_sample]).toarray()[0]
        list_sample.append(x_sample)

    # order defines the class indices: 0 = CA, 1 = CR, 2 = CE
    distances=[
        euclidean(x_sample,centroid_CA),
        euclidean(x_sample,centroid_CR),
        euclidean(x_sample,centroid_CE),
    ]
    class_pred=np.argmin(distances)

    if T=='test':
        return class_pred,list_sample
    return class_pred

## Test Step

In [27]:
# Embed every test item as a binary property vector (this downloads the claims from Wikidata).
X_test_embed=embedding_sample(X_test_set,vocabulary_j)
Y_test_pred=np.zeros(X_test_embed.shape[0],dtype=np.int32)
list_x_test_weight=[]

# Look the centroids up once instead of on every iteration: a key lookup on an
# archive returned by np.load reads the array from the file again each time.
centroid_CA=centroids["centroid_CA"]
centroid_CR=centroids["centroid_CR"]
centroid_CE=centroids["centroid_CE"]

for i in range(0,X_test_embed.shape[0]):

    # predict the class of test sample i; the TF-IDF-weighted vector comes back as a one-element list
    index_class_valid,list_sample_i=predict_entity_score(X_test_embed[i],centroid_CA,centroid_CR,centroid_CE,'test',tfidf=tfidf)

    # collect the predicted class index
    Y_test_pred[i]=index_class_valid

    # collect the weighted test vectors
    list_x_test_weight.extend(list_sample_i)

# Output table: item and name (columns 0 and 1 of the test set) plus the textual predicted label.
y_test=pd.DataFrame(np.column_stack((X_test_set[:,0],X_test_set[:,1],return_original_label(Y_test_pred))),columns=["item","name","label"])
# NOTE(review): to_csv keeps its default index=True, so the file starts with an
# unnamed row-number column -- confirm the expected submission format.
y_test.to_csv("Salmonators_output_model_NONLM.csv")

Downloading batch of class None:   0%|          | 0/8 [00:00<?, ?it/s]