In [1]:
import json
import requests
import os
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt

In [2]:
def fetch_labels_in_batches(ids, batch_size=50, session=None, timeout=30):
    """Fetch the English label of each Wikidata entity ID via ``wbgetentities``.

    Parameters
    ----------
    ids : sequence of str
        Entity IDs such as ``"Q42"`` or ``"P31"``. Any sliceable sequence with
        a length works (list, NumPy array, ...).
    batch_size : int, default 50
        Number of IDs sent per request.
        NOTE(review): 50 is believed to be the per-request ID cap of the API
        for regular clients — confirm before raising it.
    session : object with a ``get(url, params=..., timeout=...)`` method, optional
        Typically a ``requests.Session``; lets callers reuse a connection or
        inject a stub. Defaults to the module-level ``requests`` API.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    dict
        Maps every input ID to its English label, or to the placeholder
        ``'Label not found'`` when the entity has no English label.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    RuntimeError
        If the API answers with an error payload instead of entities.
    Exception
        Whatever ``response.raise_for_status()`` raises on an HTTP error
        status (``requests.HTTPError`` with the default client).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Only touch the global `requests` module when no client was injected.
    http = requests if session is None else session
    api_url = "https://www.wikidata.org/w/api.php"

    labels_dict = {}

    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]

        # Let the HTTP client encode the query string (the '|' separator in
        # particular) instead of interpolating raw IDs into the URL.
        params = {
            "action": "wbgetentities",
            "ids": "|".join(str(entity_id) for entity_id in batch),
            "format": "json",
            "languages": "en",
            "props": "labels",
        }
        response = http.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # An API-level failure carries an 'error' key and no 'entities'.
        # Without this check the whole batch would silently be recorded as
        # 'Label not found'.
        if "error" in data:
            raise RuntimeError(f"Wikidata API returned an error: {data['error']}")

        entities = data.get("entities", {})
        for wikidata_id in batch:
            try:
                labels_dict[wikidata_id] = entities[wikidata_id]["labels"]["en"]["value"]
            except KeyError:
                # Entity missing from the response or without an English label.
                labels_dict[wikidata_id] = 'Label not found'

    return labels_dict

## Data Loading

In [3]:
# Occurrence threshold for tail entities: keeps the dataset small and avoids
# learning embeddings for entities that appear too rarely to be meaningful.
# The filter further down uses `tail > min_occ` (strict comparison), so a tail
# is kept only when it occurs at least min_occ + 1 times.
min_occ = 5

In [18]:
# Load the (head, rel, tail) triples. header=0 together with names= discards
# the file's own header row in favour of the column names given here. Exact
# duplicate rows are removed and the row index is renumbered from 0.
df_most_clicked = (
    pd.read_csv(
        'data/most_clicked_pages.csv',
        header=0,
        names=["head", "rel", "tail"],
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
df_most_clicked

Unnamed: 0,head,rel,tail
0,Q122921105,P31,Q11424
1,Q122921105,P495,Q668
2,Q122921105,P57,Q1607373
3,Q122921105,P364,Q1568
4,Q122921105,P86,Q7489036
...,...,...,...
285019,Q20683904,P27,Q145
285020,Q20683904,P735,Q18131042
285021,Q20683904,P734,Q37471755
285022,Q20683904,P7763,Q73555012


In [19]:
# Number of triples (non-null heads) pointing at each distinct tail entity.
tail = df_most_clicked.groupby('tail')['head'].count()
# Keep the tails that occur strictly more than min_occ times.
tail_repeated = tail.loc[tail.gt(min_occ)]

In [20]:
# How many tail entities pass the frequency filter.
tail_repeated.shape[0]

4833

In [21]:
# Keep only the triples whose tail is one of the frequent tails selected above.
# `@tail_repeated.index` makes query() read the notebook variable of that name.
# Heads are not filtered here.
df_most_clicked_reduced = df_most_clicked.query(' tail in @tail_repeated.index')

In [22]:
# Sorted array of the distinct relation IDs left after the filter.
unique_rel = np.unique(df_most_clicked_reduced['rel'])
unique_rel.size

450

In [23]:
# Sorted array of every distinct entity ID, whether it appears as a head or
# as a tail in the reduced triples.
unique_object = np.unique(
    df_most_clicked_reduced['head'].tolist()
    + df_most_clicked_reduced['tail'].tolist()
)
unique_object.size

13412

In [12]:
# Resolve every relation ID to its English label (network call to Wikidata,
# one request per batch of IDs).
rel_labels = fetch_labels_in_batches(ids=unique_rel)
rel_labels

{'P1001': 'applies to jurisdiction',
 'P101': 'field of work',
 'P102': 'member of political party',
 'P1027': 'conferred by',
 'P103': 'native language',
 'P1035': 'honorific suffix',
 'P1037': 'director / manager',
 'P1038': 'relative',
 'P1040': 'film editor',
 'P1049': 'worshipped by',
 'P105': 'taxon rank',
 'P1050': 'medical condition',
 'P10527': 'documentation files at',
 'P1056': 'product or material produced or service provided',
 'P10588': 'academic calendar type',
 'P106': 'occupation',
 'P10606': 'notable role',
 'P10624': 'official observer status in organisation',
 'P1066': 'student of',
 'P1071': 'location of creation',
 'P10741': 'dance style',
 'P108': 'employer',
 'P1080': 'from narrative universe',
 'P11108': 'recorded participant',
 'P112': 'founded by',
 'P1142': 'political ideology',
 'P115': 'home venue',
 'P1158': 'location of landing',
 'P11611': 'Norwegian media rating',
 'P1165': 'home world',
 'P11747': 'holds diplomatic passport of',
 'P118': 'league',
 'P

In [13]:
# Resolve every head/tail entity ID to its English label (network call to
# Wikidata, one request per batch of IDs). IDs without an English label get
# the 'Label not found' placeholder.
entity_labels = fetch_labels_in_batches(ids=unique_object)
entity_labels

{'L252247-F2': 'Label not found',
 'L371': 'Label not found',
 'L484': 'Label not found',
 'L485': 'Label not found',
 'L615860-F1': 'Label not found',
 'Q100': 'Boston',
 'Q1000': 'Gabon',
 'Q1000219': 'Kapoor family',
 'Q1000592': 'Tyson Fury',
 'Q1001': 'Mahatma Gandhi',
 'Q100137722': 'Wikiproject Nuremberg Trials',
 'Q100144350': 'Nicolas Jackson',
 'Q100193610': 'Jahmyr Gibbs',
 'Q100268160': 'The Mole – Undercover in North Korea',
 'Q100292174': 'Books of Blood',
 'Q100361125': 'Furiosa: A Mad Max Saga',
 'Q1004037': 'Frederik X of Denmark',
 'Q100408299': 'Kyren Williams',
 'Q100437698': 'murder of Samuel Paty',
 'Q1005': 'The Gambia',
 'Q1005887': 'Fórum Hungary',
 'Q1005953': 'Midland',
 'Q100598166': 'Saurabh Sachdeva',
 'Q100598223': 'Kathleen Biden',
 'Q100598230': 'Melissa Cohen',
 'Q1006': 'Guinea',
 'Q100604534': 'Greg Brockman',
 'Q100605126': 'Naomi Biden',
 'Q10068': 'Lindsey Vonn',
 'Q1007': 'Guinea-Bissau',
 'Q100704907': 'Tiger 3',
 'Q100711980': 'Category I',
 'Q

In [14]:
# Relation lookup table: one row per relation ID, indexed by 'ID', with the
# English label in the 'name' column.
rel_dataframe = pd.DataFrame(
    list(rel_labels.items()),
    columns=["ID", "name"],
).set_index('ID')
rel_dataframe

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
P1001,applies to jurisdiction
P101,field of work
P102,member of political party
P1027,conferred by
P103,native language
...,...
P97,noble title
P9714,taxon range
P9866,GRAC rating
P991,successful candidate


In [15]:
# Entity lookup table: one row per entity ID, indexed by 'ID', with the
# English label (or the 'Label not found' placeholder) in the 'name' column.
entity_dataframe = pd.DataFrame(
    list(entity_labels.items()),
    columns=["ID", "name"],
).set_index('ID')
entity_dataframe

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
L252247-F2,Label not found
L371,Label not found
L484,Label not found
L485,Label not found
L615860-F1,Label not found
...,...
Q99774629,Iman Vellani
Q99925317,Arch Manning
Q99943130,Gypsy Rose
Q99951011,International Space Hall of Fame


In [16]:
# Drop the entities for which no English label was obtained, i.e. the rows
# whose name is the 'Label not found' placeholder written by
# fetch_labels_in_batches.
# NOTE(review): df_most_clicked_reduced is not filtered to match, so its
# triples may still reference IDs removed here — confirm this is intended.
entity_dataframe = entity_dataframe.query(' name != "Label not found"')

In [24]:
# Both label tables carry the Wikidata ID as their index (set_index('ID')).
# Writing them with index=False would save only the 'name' column and lose
# the ID -> label mapping, so the index is written out as the 'ID' column.
entity_dataframe.to_csv("data/common_entity.csv", index=True)
# The triples frame only has a positional row index, which holds no data.
df_most_clicked_reduced.to_csv("data/df_most_clicked_reduced.csv", index=False)
rel_dataframe.to_csv("data/common_relations.csv", index=True)