In [None]:
# RUN it if you are on COLAB
#!pip install datasets
#!pip install SPARQLWrapper

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import time
from datasets import load_from_disk

In [None]:
# Mount drive if you are on google colab
#from google.colab import drive
#drive.mount('/content/drive')

### Exploiting the wikidata knowledge-graph based structure

To enrich the dataset we identified some relevant properties for the 19 categories that we are dealing with. The idea is to exploit the knowledge-graph based wikidata structure and to extract such relevant information using **SPARQL** as query language.

We identified a total of 29 relevant properties:
- **P495**: country of origin
- **P2596**: culture
- **P172**: ethnic group
- **P37**: official language
- **P407**: language of work or name
- **P135**: Movement(Art, Literature, philosophy)
- **P136**: Genre
- **P921**: Main subject
- **P547**: Memorialized by
- **P793**: Significant event
- **P840**: Narrative location
- **P17**: Country
- **P1843**: Taxon common name
- **P1001**: Applies to jurisdiction
- **P144**: Based on
- **P361**: Part of
- **P1705**: Native label
- **P2012**: cuisine
- **P2541**: Operating area
- **P1535**: Used by
- **P366**: Use
- **P1142**: Political ideology
- **P140**: Religion
- **P102**: Member of political party
- **P1344**: Participant in
- **P183**: Endemic to
- **P2341**: Indigenous to
- **P1532**: Country for sport
- **P279**: Subclass of


In [2]:
# Names of the label variables selected by the SPARQL query, in query order.
# The trailing comment on each entry is the Wikidata property it is bound to.
properties = [
    'originLabel',           # P495
    'cultureLabel',          # P2596
    'ethnic_groupLabel',     # P172
    'off_languageLabel',     # P37
    'nameLabel',             # P407
    'movementLabel',         # P135
    'genreLabel',            # P136
    'main_subjectLabel',     # P921
    'memorializedLabel',     # P547
    'sign_eventLabel',       # P793
    'narrative_locLabel',    # P840
    'countryLabel',          # P17
    'taxonLabel',            # P1843
    'jurisdictionLabel',     # P1001
    'based_onLabel',         # P144
    'part_ofLabel',          # P361
    'native_labelLabel',     # P1705
    'cuisineLabel',          # P2012
    'areaLabel',             # P2541
    'used_byLabel',          # P1535
    'useLabel',              # P366
    'political_ideoLabel',   # P1142
    'religionLabel',         # P140
    'political_partyLabel',  # P102
    'participant_inLabel',   # P1344
    'endemicLabel',          # P183
    'indigenousLabel',       # P2341
    'country_sportLabel',    # P1532
    'subclass_ofLabel',      # P279
]

# NOTE: we add "Label" at the end of each name because we are interested in the
# human-readable label, and not in the wikidata identifier (e.g. Q177, the
# identifier for pizza)

In [3]:
# To interact with wikidata we are going to use the SPARQLWrapper library.
# `sparql` is the client object that retrieve_info uses for every request: it
# targets the Wikidata SPARQL endpoint, identifies itself with a custom user
# agent, and asks for the responses in JSON format.
sparql = SPARQLWrapper('https://query.wikidata.org/sparql', agent='GjWikidataBot/1.0')
sparql.setReturnFormat(JSON)

In [4]:
# To extract information from wikidata we are going to need only the wikidata
# identifier. So now we define two different functions to extract them

def extract_test_IDs(dataset):
    """Return the wikidata identifiers found in ``dataset['item']``.

    Every entry is stripped of surrounding whitespace and reduced to the text
    that follows its last ``/`` (an entry without ``/`` is kept whole).
    The identifiers are returned as a list, in the same order as the entries.
    """
    return [entry.strip().rsplit('/', 1)[-1] for entry in dataset['item']]

def extract_IDs(dataset):
    """Return the wikidata identifiers of the 'train' and 'validation' splits.

    Each split is read row by row; the ``'item'`` field of a row is stripped of
    surrounding whitespace and reduced to the text after its last ``/``.

    Returns a tuple ``(silver_IDs, gold_IDs)``: the identifiers of the 'train'
    split followed by those of the 'validation' split, both as lists.
    """
    def ids_of(split_name):
        # Rows are accessed by position, exactly one lookup per row.
        split = dataset[split_name]
        return [
            split[row]['item'].strip().rsplit('/', 1)[-1]
            for row in range(len(split))
        ]

    silver_IDs = ids_of('train')
    gold_IDs = ids_of('validation')
    return silver_IDs, gold_IDs


In [5]:
# Load the datasets
# - `datasets` is read from disk; extract_IDs reads its 'train' and
#   'validation' splits
# - `test_set` is the test CSV; extract_test_IDs reads its 'item' column
datasets = load_from_disk('../datasets/train_and_val')
test_set = pd.read_csv('../datasets/test/testset.csv')

In [6]:
# Wikidata identifiers of the items in the 'train' split (silver), in the
# 'validation' split (gold) and in the test set
silver_IDs, gold_IDs = extract_IDs(datasets)
test_IDs = extract_test_IDs(test_set)

In [7]:
# Auxiliary function used to handle the response from the server
def get_query_result(data, properties):
    """Extract the requested property values from a SPARQL JSON response.

    Only the first row of ``data['results']['bindings']`` is inspected.

    Parameters
    ----------
    data : dict
        Response converted from JSON; must contain ``data['results']['bindings']``.
    properties : iterable of str
        Names of the variables to look up in the first result row.

    Returns
    -------
    dict
        Maps every name in ``properties`` to the ``'value'`` of the matching
        binding, or to ``None`` when the first row has no binding for it.
        When the response contains no rows at all, every name maps to ``None``.

    Raises
    ------
    KeyError
        If ``data`` lacks the ``'results'`` / ``'bindings'`` keys.
    """
    bindings = data['results']['bindings']
    # An empty result set used to raise IndexError here; treat it as
    # "nothing known about this item" instead.
    first_row = bindings[0] if bindings else {}

    return {
        prop: first_row[prop]['value'] if prop in first_row else None
        for prop in properties
    }

In the following we are going to define the function that extracts the data from wikidata. The main part of the function is the `SPARQL` query. In the `SELECT` clause we insert the variables in which we are interested. In the `WHERE` clause we insert conditions of the following form:
`OPTIONAL { wd:item_id wdt:PROP_id ?var . }`

where:
- OPTIONAL is inserted to keep the result also in case of missing values (without it, an item lacking even one of the properties would return no result at all; with it, the missing variable is simply left unbound)
- item_id: is the identifier of the item
- PROP_id is the identifier of the property (e.g. country of origin has identifier **P495**)
- ?var: is the value we want to retrieve (`?` indicates that var is a variable)


In [8]:
def retrieve_info(items, max_retries=10, base_delay=10):
    """Query wikidata for the selected properties of every item.

    One SPARQL request is sent per item through the module-level ``sparql``
    client. The query asks for the label of each property listed in
    ``properties`` and is limited to a single result row (``LIMIT 1``), so at
    most one value per property is kept for each item.

    Parameters
    ----------
    items : sequence of str
        Wikidata identifiers (e.g. ``'Q177'``) to look up.
    max_retries : int, optional
        Maximum number of attempts per item before giving up (default 10).
    base_delay : int or float, optional
        Seconds used for the backoff: after the n-th failed attempt for an
        item we sleep ``base_delay * n`` seconds (default 10).

    Returns
    -------
    dict
        Maps each item identifier to the dictionary produced by
        ``get_query_result`` for it.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1.
    RuntimeError
        If an item still fails after ``max_retries`` attempts; the last
        error is attached as the cause. Previously such an item was retried
        forever, with an ever-growing sleep.
    """
    if max_retries < 1:
        raise ValueError('max_retries must be at least 1')

    dict_result = {}     # dictionary storing query results
    for item in items:
        sparql.setQuery(
            f'''
            SELECT ?originLabel ?cultureLabel ?ethnic_groupLabel ?off_languageLabel
                    ?nameLabel ?movementLabel ?genreLabel ?main_subjectLabel
                    ?memorializedLabel ?sign_eventLabel ?narrative_locLabel ?countryLabel
                    ?taxonLabel ?jurisdictionLabel ?based_onLabel ?part_ofLabel
                    ?native_labelLabel ?cuisineLabel ?areaLabel ?used_byLabel
                    ?useLabel ?political_ideoLabel ?religionLabel ?political_partyLabel
                    ?participant_inLabel ?endemicLabel ?indigenousLabel
                    ?country_sportLabel ?subclass_ofLabel
            WHERE {{
                OPTIONAL {{ wd:{item} wdt:P495 ?origin .}}
                OPTIONAL {{ wd:{item} wdt:P2596 ?culture .}}
                OPTIONAL {{ wd:{item} wdt:P172 ?ethnic_group .}}
                OPTIONAL {{ wd:{item} wdt:P37 ?off_language .}}
                OPTIONAL {{ wd:{item} wdt:P407 ?name .}}
                OPTIONAL {{ wd:{item} wdt:P135 ?movement .}}
                OPTIONAL {{ wd:{item} wdt:P136 ?genre .}}
                OPTIONAL {{ wd:{item} wdt:P921 ?main_subject .}}
                OPTIONAL {{ wd:{item} wdt:P547 ?memorialized .}}
                OPTIONAL {{ wd:{item} wdt:P793 ?sign_event .}}
                OPTIONAL {{ wd:{item} wdt:P840 ?narrative_loc .}}
                OPTIONAL {{ wd:{item} wdt:P17 ?country .}}
                OPTIONAL {{ wd:{item} wdt:P1843 ?taxon .}}
                OPTIONAL {{ wd:{item} wdt:P1001 ?jurisdiction .}}
                OPTIONAL {{ wd:{item} wdt:P144 ?based_on .}}
                OPTIONAL {{ wd:{item} wdt:P361 ?part_of .}}
                OPTIONAL {{ wd:{item} wdt:P1705 ?native_label .}}
                OPTIONAL {{ wd:{item} wdt:P2012 ?cuisine .}}
                OPTIONAL {{ wd:{item} wdt:P2541 ?area .}}
                OPTIONAL {{ wd:{item} wdt:P1535 ?used_by .}}
                OPTIONAL {{ wd:{item} wdt:P366 ?use .}}
                OPTIONAL {{ wd:{item} wdt:P1142 ?political_ideo .}}
                OPTIONAL {{ wd:{item} wdt:P140 ?religion .}}
                OPTIONAL {{ wd:{item} wdt:P102 ?political_party .}}
                OPTIONAL {{ wd:{item} wdt:P1344 ?participant_in .}}
                OPTIONAL {{ wd:{item} wdt:P183 ?endemic .}}
                OPTIONAL {{ wd:{item} wdt:P2341 ?indigenous .}}
                OPTIONAL {{ wd:{item} wdt:P1532 ?country_sport .}}
                OPTIONAL {{ wd:{item} wdt:P279 ?subclass_of .}}

                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],mul,en". }}

            }}

            LIMIT 1
        '''
        )

        # The attempt counter restarts for every item, so a few early failures
        # do not inflate the waiting time for the rest of the run.
        for attempt in range(1, max_retries + 1):
            try:
                data = sparql.queryAndConvert()
                dict_result[item] = get_query_result(data, properties)
                break
            except Exception as e:
                if attempt == max_retries:
                    # A persistent failure (e.g. a malformed identifier) would
                    # otherwise keep the loop spinning forever.
                    raise RuntimeError(
                        f'Wikidata query for item {item!r} failed after '
                        f'{max_retries} attempts'
                    ) from e
                # We increase the sleeping time when wikidata
                # complains because we are sending too many requests
                time.sleep(base_delay * attempt)

    return dict_result



Now we can retrieve the information from wikidata.

In [9]:
# retrieve info for the silver dataset
# (one SPARQL request is sent per identifier in silver_IDs)
dict_training = retrieve_info(silver_IDs)

In [11]:
# retrieve info for the dev dataset
# (one SPARQL request is sent per identifier in gold_IDs)
dict_validation = retrieve_info(gold_IDs)

In [15]:
# retrieve info for the test dataset
# (one SPARQL request is sent per identifier in test_IDs)
dict_test = retrieve_info(test_IDs)

We convert the dictionaries to pandas DataFrame and then save everything as a csv file.

In [21]:
# One row per item: the dictionary keys (wikidata identifiers) become the
# 'item' column, the retrieved properties become the remaining columns.
training_table = (
    pd.DataFrame.from_dict(dict_training, orient='index')
    .reset_index()
    .rename(columns={'index': 'item'})
)
training_table.to_csv('../datasets/properties/training_props.csv', index=True)

In [22]:
# One row per item: the dictionary keys (wikidata identifiers) become the
# 'item' column, the retrieved properties become the remaining columns.
validation_table = (
    pd.DataFrame.from_dict(dict_validation, orient='index')
    .reset_index()
    .rename(columns={'index': 'item'})
)
validation_table.to_csv('../datasets/properties/validation_props.csv', index=True)

In [24]:
# One row per item: the dictionary keys (wikidata identifiers) become the
# 'item' column, the retrieved properties become the remaining columns.
test_table = (
    pd.DataFrame.from_dict(dict_test, orient='index')
    .reset_index()
    .rename(columns={'index': 'item'})
)
test_table.to_csv('../datasets/properties/test_props.csv', index=True)