In [30]:
import json
import requests
import os
import pandas as pd
import numpy as np
import tqdm
import matplotlib.pyplot as plt

#os.mkdir("./data")

In [31]:
def fetch_labels_in_batches(ids, batch_size=50):
    """Fetch English labels for Wikidata IDs through the `wbgetentities` API.

    The IDs are sent in groups of `batch_size`, one HTTP GET per group.

    Args:
        ids: Iterable of Wikidata identifiers (e.g. "Q42", "P31"). Lists,
            NumPy arrays and generators are all accepted.
        batch_size: Number of IDs sent per request. Must be >= 1.

    Returns:
        dict mapping every requested ID to its English label, or to the
        sentinel string 'Label not found' when the response carries no
        English label for that ID.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
        requests.HTTPError: if the API answers with an HTTP error status.
            Failing loudly avoids recording a whole failed batch as
            'Label not found'.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so generators and other non-sliceable iterables work too.
    ids = list(ids)
    api_url = "https://www.wikidata.org/w/api.php"

    # Dictionary to hold the results
    labels_dict = {}

    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        params = {
            "action": "wbgetentities",
            # The API expects the IDs of one request joined with '|'.
            "ids": "|".join(str(wikidata_id) for wikidata_id in batch),
            "format": "json",
            "languages": "en",
            "props": "labels",
        }
        # `requests` has no default timeout; without one a stalled connection
        # would hang the notebook indefinitely.
        response = requests.get(api_url, params=params, timeout=30)
        response.raise_for_status()
        # An API-level error payload has no 'entities' key; treat it as empty.
        entities = response.json().get("entities", {})

        # Extract labels for each entity in the batch
        for wikidata_id in batch:
            try:
                label = entities[str(wikidata_id)]["labels"]["en"]["value"]
            except KeyError:
                label = 'Label not found'  # Handle missing labels
            labels_dict[wikidata_id] = label

    return labels_dict

## Data Loading

In [32]:
# Occurrence threshold used to shrink the dataset and avoid non-meaningful embeddings.
# NOTE: the filter that consumes it is `tail > min_occ`, i.e. a tail entity must
# appear strictly more than `min_occ` times (at least min_occ + 1) to be kept.
min_occ = 5

In [33]:
# Load the triples as (head, rel, tail), drop exact duplicate rows and
# renumber the index so it is contiguous again.
df_most_clicked = (
    pd.read_csv('data/most_clicked_pages.csv', header=0, names=["head", "rel", "tail"])
    .drop_duplicates()
    .reset_index(drop=True)
)
df_most_clicked

Unnamed: 0,head,rel,tail
0,Q122921105,P31,Q11424
1,Q122921105,P495,Q668
2,Q122921105,P57,Q1607373
3,Q122921105,P364,Q1568
4,Q122921105,P86,Q7489036
...,...,...,...
49115,Q2599,P10527,Q50920401
49116,Q2599,P5008,Q6173448
49117,Q123465873,P31,Q5398426
49118,Q123465873,P161,Q9016731


## Removing meaningful relations

In [34]:
# Property IDs whose triples are dropped from the dataset.
codes = [
    "P2061", "P3156", "P9086", "P2629", "P2545", "P4312", "P2643", "P6452",
    "P9028", "P853", "P3216", "P2758", "P3402", "P3403", "P3931", "P10527",
    "P2756", "P2853", "P501", "P852", "P500", "P734", "P2747", "P9139",
    "P4437", "P1981", "P735", "P9866", "P916", "P1884", "P8874", "P511",
    "P1035", "P3306", "P7573", "P5150", "P5201", "P3428", "P31", "P141",
    "P3650", "P2684", "P3818", "P87", "P1408", "P3842", "P9929", "P6104",
    "P5970", "P241", "P1657", "P8889", "P3730", "P2363", "P11611", "P1308",
    "P7327", "P5008", "P2637", "P6658", "P3834", "P6978", "P1950", "P21",
    "P91", "P423", "P787", "P3279", "P9714", "P105", "P914",
]

# Keep only the rows whose relation is NOT in the exclusion list.
df_most_clicked = df_most_clicked.loc[~df_most_clicked['rel'].isin(codes)]

In [35]:
# Number of non-null `head` values per distinct `tail`, then only the tails
# whose count is strictly greater than `min_occ`.
tail = df_most_clicked.groupby('tail')['head'].count()
tail_repeated = tail.loc[tail.gt(min_occ)]

In [36]:
# Number of distinct tail entities that passed the `tail > min_occ` filter.
len(tail_repeated)

825

In [37]:
# Keep only the triples whose tail is one of the frequent tails
# (`tail_repeated` is indexed by tail ID; `@` references the Python variable inside query()).
df_most_clicked_reduced = df_most_clicked.query(' tail in @tail_repeated.index')

In [38]:
# Distinct relation IDs still present in the reduced triple set.
unique_rel = np.unique(df_most_clicked_reduced['rel'])
len(unique_rel)

188

In [39]:
# Distinct IDs appearing on either side of a triple (heads and tails pooled).
unique_object = np.unique(
    df_most_clicked_reduced['head'].tolist() + df_most_clicked_reduced['tail'].tolist()
)
len(unique_object)

1684

In [40]:
# Resolve every remaining relation ID to its English label (network call to the Wikidata API).
rel_labels = fetch_labels_in_batches(unique_rel)
rel_labels

{'P1001': 'applies to jurisdiction',
 'P101': 'field of work',
 'P102': 'member of political party',
 'P103': 'native language',
 'P1038': 'relative',
 'P1049': 'worshipped by',
 'P1050': 'medical condition',
 'P1056': 'product or material produced or service provided',
 'P106': 'occupation',
 'P10606': 'notable role',
 'P108': 'employer',
 'P112': 'founded by',
 'P1142': 'political ideology',
 'P118': 'league',
 'P119': 'place of burial',
 'P1196': 'manner of death',
 'P122': 'basic form of government',
 'P1269': 'facet of',
 'P127': 'owned by',
 'P1290': 'godparent',
 'P1303': 'instrument',
 'P131': 'located in the administrative territorial entity',
 'P1313': 'office held by head of government',
 'P1336': 'territory claimed by',
 'P1340': 'eye color',
 'P1343': 'described by source',
 'P1344': 'participant in',
 'P1346': 'winner',
 'P135': 'movement',
 'P136': 'genre',
 'P1365': 'replaces',
 'P1376': 'capital of',
 'P138': 'named after',
 'P140': 'religion or worldview',
 'P1411': '

In [41]:
# Resolve every head/tail ID to its English label (network call to the Wikidata API).
# IDs for which no English label comes back are mapped to 'Label not found'.
entity_labels = fetch_labels_in_batches(unique_object)
entity_labels

{'L484': 'Label not found',
 'L485': 'Label not found',
 'Q1000': 'Gabon',
 'Q1004037': 'Frederik X of Denmark',
 'Q1005': 'The Gambia',
 'Q1005887': 'Fórum Hungary',
 'Q100704907': 'Tiger 3',
 'Q100795875': 'Blake Corum',
 'Q1008': 'Ivory Coast',
 'Q1009': 'Cameroon',
 'Q1011': 'Cape Verde',
 'Q101110072': '2024 United States presidential election',
 'Q1011547': 'Golden Globe Awards',
 'Q1011564': 'Golden Globe Award for Best Actress in a Motion Picture – Musical or Comedy',
 'Q1013': 'Lesotho',
 'Q1014': 'Liberia',
 'Q1016': 'Libya',
 'Q1019': 'Madagascar',
 'Q1020': 'Malawi',
 'Q102139': 'Margrethe II of Denmark',
 'Q102180036': 'Riz Test',
 'Q102338984': 'Matt Rogers',
 'Q102427': 'Academy Award for Best Picture',
 'Q1025': 'Mauritania',
 'Q1027': 'Mauritius',
 'Q1027880': 'Grammy Award for Best Pop Duo/Group Performance',
 'Q1027891': 'Grammy Award for Best Pop Vocal Album',
 'Q1027904': 'Grammy Award for Song of the Year',
 'Q1028': 'Morocco',
 'Q102813': 'Yul Brynner',
 'Q102818

In [42]:
# Relation lookup table: indexed by relation ID, with the label in `name`.
rel_dataframe = pd.DataFrame(
    list(rel_labels.items()),
    columns=["ID", "name"],
).set_index('ID')
rel_dataframe

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
P1001,applies to jurisdiction
P101,field of work
P102,member of political party
P103,native language
P1038,relative
...,...
P921,main subject
P937,work location
P945,allegiance
P9493,artist files at


In [43]:
# Entity lookup table: indexed by entity ID, with the label in `name`.
entity_dataframe = pd.DataFrame(
    list(entity_labels.items()),
    columns=["ID", "name"],
).set_index('ID')
entity_dataframe

Unnamed: 0_level_0,name
ID,Unnamed: 1_level_1
L484,Label not found
L485,Label not found
Q1000,Gabon
Q1004037,Frederik X of Denmark
Q1005,The Gambia
...,...
Q989447,Primetime Emmy Award for Outstanding Lead Actr...
Q989453,Primetime Emmy Award for Outstanding Lead Acto...
Q99,California
Q99413897,Medvik


In [44]:
# Drop the entities whose label lookup fell back to the 'Label not found' sentinel.
# NOTE(review): `df_most_clicked_reduced` is not filtered to match, so its triples may
# still reference IDs removed here — confirm downstream consumers handle that.
entity_dataframe = entity_dataframe.query(' name != "Label not found"')

In [46]:
# Persist the results under data/.
# The two lookup tables keep their `ID` index as the first CSV column;
# the triples are written without the DataFrame index.
entity_dataframe.to_csv("data/common_entity.csv")
df_most_clicked_reduced.to_csv("data/df_most_clicked_reduced.csv", index = False)
rel_dataframe.to_csv("data/common_relations.csv")