In [None]:
import requests
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive/.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
def process_batch(titles_batch, api_url, timeout=30):
    """Resolve a batch of Wikipedia page titles to their Wikidata item IDs.

    Sends a single ``action=query`` / ``prop=pageprops`` request for all
    titles (joined with ``|``) and reads the ``wikibase_item`` page property
    of every page in the response.

    Args:
        titles_batch: Page titles to look up in one request.
        api_url: URL of the MediaWiki API endpoint to query.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        dict mapping each page title (as reported in the response) to its
        Wikidata ID. Pages without a ``wikibase_item`` property are omitted.
        An empty dict is returned when the batch is empty or the request
        fails.
    """
    # Nothing to look up: skip the HTTP round trip entirely.
    if not titles_batch:
        return {}

    params = {
        "action": "query",
        "format": "json",
        "titles": "|".join(titles_batch),
        "prop": "pageprops"
    }
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()
        pages = response.json().get('query', {}).get('pages', {})
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Failed to fetch data: {e}")
        return {}  # Return an empty dict in case of failure

    batch_entities = {}
    for page_info in pages.values():
        wikidata_id = page_info.get('pageprops', {}).get('wikibase_item')
        page_title = page_info.get('title')
        if wikidata_id and page_title is not None:
            batch_entities[page_title] = wikidata_id
    return batch_entities

In [None]:
def fetch_wikidata_ids(most_viewed_pages, batch_size=50):
    """Map page titles to Wikidata IDs, querying Wikipedia in batches.

    Titles are read from the ``'title'`` key of each item in
    ``most_viewed_pages`` and handed to ``process_batch`` in groups of
    ``batch_size``; a final, smaller group is sent for any leftover titles.

    Args:
        most_viewed_pages: Iterable of dicts that each carry a ``'title'`` key.
        batch_size: Number of titles sent per ``process_batch`` call.

    Returns:
        dict merging the results of every ``process_batch`` call.
    """
    api_endpoint = 'https://en.wikipedia.org/w/api.php'
    resolved = {}
    pending_titles = []

    for entry in most_viewed_pages:
        pending_titles.append(entry['title'])
        if len(pending_titles) < batch_size:
            continue
        # Batch is full: resolve it and start collecting a new one.
        resolved.update(process_batch(pending_titles, api_endpoint))
        pending_titles = []

    # Flush whatever is left over after the last full batch.
    if pending_titles:
        resolved.update(process_batch(pending_titles, api_endpoint))

    return resolved

In [None]:
# MediaWiki API endpoint of the English Wikipedia, used by the cells below.
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"

In [None]:
def fetch_most_viewed_pages(total_pages, url, timeout=30):
    """Fetch up to ``total_pages`` entries from the ``mostviewed`` query list.

    Results are requested in pages of at most 500 entries. After every
    response, the API's ``continue`` object is merged into the next request so
    that each request returns the *next* slice of results instead of the same
    first slice again.

    Args:
        total_pages: Maximum number of entries to return.
        url: URL of the MediaWiki API endpoint to query.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        list of the ``mostviewed`` entries from the API responses, in the
        order they were received. It holds fewer than ``total_pages`` items
        when the API runs out of results.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    max_per_request = 500
    fetched_pages = []
    continuation = {}  # continuation parameters returned by the previous response

    while len(fetched_pages) < total_pages:
        params = {
            "action": "query",
            "format": "json",
            "list": "mostviewed",
            "pvimlimit": min(max_per_request, total_pages - len(fetched_pages)),
        }
        params.update(continuation)

        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        response_data = response.json()

        batch = response_data.get('query', {}).get('mostviewed')
        if not batch:
            break  # Exit loop if no more data is available
        fetched_pages.extend(batch)

        # Without a continuation token there is nothing further to request.
        continuation = response_data.get('continue')
        if not continuation:
            break

    # Guard against a server returning more entries than were asked for.
    return fetched_pages[:total_pages]

# Number of most-viewed pages to request from the Wikipedia API.
nentities = 10_000
most_viewed_pages = fetch_most_viewed_pages(
    total_pages=nentities,
    url=WIKIPEDIA_API_URL,
)

# Report how many entries actually came back.
print(f"Fetched {len(most_viewed_pages)} pages")

Fetched 10000 pages


In [None]:
# Resolve every most-viewed page title to its Wikidata item ID.
# The batched helper sends many titles per request instead of one request per
# title, and tolerates responses that lack a 'query'/'pages' section.
entities = fetch_wikidata_ids(most_viewed_pages)
print(f"Resolved {len(entities)} of {len(most_viewed_pages)} pages to Wikidata IDs")




KeyboardInterrupt: 

In [None]:
# Number of page titles that were resolved to a Wikidata ID.
len(entities)

In [None]:
# Step 3: Get the relations between these entities from Wikidata
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
WIKIDATA_BATCH_SIZE = 50  # ids sent per wbgetentities request

relations = {}
# De-duplicate while keeping first-seen order: several titles may share an ID.
wikidata_ids = list(dict.fromkeys(entities.values()))

for start in range(0, len(wikidata_ids), WIKIDATA_BATCH_SIZE):
    # '\r' keeps the progress counter on a single, self-overwriting line.
    print(f"{start} out of {len(wikidata_ids)}", end='\r')
    batch_ids = wikidata_ids[start:start + WIKIDATA_BATCH_SIZE]
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": "|".join(batch_ids),
        "props": "claims"
    }

    response = requests.get(WIKIDATA_API_URL, params=params, timeout=30)
    response.raise_for_status()
    fetched_entities = response.json().get('entities', {})

    for wikidata_id in batch_ids:
        # Skip ids the response does not contain (or that carry no claims)
        # instead of aborting the whole run with a KeyError.
        claims = fetched_entities.get(wikidata_id, {}).get('claims')
        if claims is not None:
            relations[wikidata_id] = claims

In [None]:
# Print out the relations for a small sample of entities
limit = 1  # number of entities to display

# Slice to exactly `limit` entities (the previous counter printed limit + 1).
for wikidata_id, claims in list(relations.items())[:limit]:
    print(f"Entity: {wikidata_id}")
    for property_id, claim_list in claims.items():
        print(f"  Property: {property_id}")
        for claim in claim_list:
            mainsnak = claim['mainsnak']
            # Only claims whose main snak carries a 'datavalue' are printed.
            if 'datavalue' in mainsnak:
                value = mainsnak['datavalue']
                print(f"    Value: {value}")

In [None]:
# Flatten the claims into (entity, relation, object) triples, keeping only
# statements whose value is itself a Wikidata entity.
TRIPLE_COLUMNS = ['entity', 'rel', 'objt']
triples = []

for wikidata_id, claims in relations.items():
    for property_id, claim_list in claims.items():
        for claim in claim_list:
            mainsnak = claim['mainsnak']
            if 'datavalue' not in mainsnak:
                continue  # no value attached to this claim
            value = mainsnak['datavalue']
            if value['type'] != 'wikibase-entityid':
                continue  # literal value (string, date, quantity, ...)
            triples.append({
                'entity': wikidata_id,
                'rel': property_id,
                'objt': value['value']['id'],
            })

# Build the DataFrame from the triples. Passing the columns explicitly keeps
# the schema even when no triples were collected, so the column accesses in
# later cells do not fail on an empty result.
df = pd.DataFrame(triples, columns=TRIPLE_COLUMNS)
df.head(3)

In [None]:
# Summary of the triple table: distinct head entities, relations and tail entities.
n_heads = df["entity"].nunique()
n_relations = df["rel"].nunique()
n_tails = df["objt"].nunique()

print(f"Numero di entità (head) presenti: {n_heads}")
print(f"Numero di relazioni presenti: {n_relations}")
print(f"Numero di entità (tail) presenti: {n_tails}")

In [None]:
# Persist the triples to Google Drive. The file has to go inside the user's
# drive folder ("MyDrive"); the mount point itself is only a container for the
# mounted drives, so the path below targets MyDrive rather than the mount root.
df.to_csv("/content/drive/MyDrive/df_triple.csv")