## Pacchetti

In [1]:
import requests
import pandas as pd
#from google.colab import drive
#drive.mount('/content/drive/')



## Variabili globali

In [2]:
## Variabili globali
WIKIPEDIA_API_URL = "https://en.wikipedia.org/w/api.php"

nentities = 10000
BATCH_SIZE = 50

## Moduli

In [3]:
def process_batch(titles_batch, api_url="https://en.wikipedia.org/w/api.php"):
    params = {
        "action": "query",
        "format": "json",
        "titles": "|".join(titles_batch),
        "prop": "pageprops"
    }
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        pages = response.json().get('query', {}).get('pages', {})
        batch_entities = {}
        for page_id, page_info in pages.items():
            pageprops = page_info.get('pageprops', {})
            wikidata_id = pageprops.get('wikibase_item')
            if wikidata_id:
                batch_entities[page_info['title']] = wikidata_id
        return batch_entities
    except requests.RequestException as e:
        print(f"Failed to fetch data: {e}")
        return {}  # Return an empty dict in case of failure
    
def fetch_wikidata_ids(most_viewed_pages, batch_size=50, api_url="https://en.wikipedia.org/w/api.php"):
    entities = {}
    titles_batch = []

    for page in most_viewed_pages:
        titles_batch.append(page['title'])
        if len(titles_batch) >= batch_size:
            entities.update(process_batch(titles_batch, api_url))
            titles_batch = []  # Reset the batch

    if titles_batch:  # Process any remaining titles
        entities.update(process_batch(titles_batch, api_url))
    return entities

def fetch_most_viewed_pages(total_pages, api_url="https://en.wikipedia.org/w/api.php"):
    limit = 500
    fetched_pages = []

    for i in range(0, total_pages, limit):
        params = {
            "action": "query",
            "format": "json",
            "list": "mostviewed",
            "pvimlimit": min(limit, total_pages - i)
        }

        response = requests.get(api_url, params=params)
        response_data = response.json()

        if 'query' in response_data and 'mostviewed' in response_data['query']:
            fetched_pages.extend(response_data['query']['mostviewed'])
        else:
            break  # Exit loop if no more data is available

    return fetched_pages

def get_relations_batch(wikidata_ids, batch_size=50, api_url="https://en.wikipedia.org/w/api.php"):
    relations = {}
    for start in range(0, len(wikidata_ids), batch_size):
        end = start + batch_size
        batch = wikidata_ids[start:end]
        wikidata_ids_str = "|".join(batch)

        print(f"Processing batch {start // batch_size + 1} out of {len(wikidata_ids) // batch_size + 1}")

        params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": wikidata_ids_str,
            "props": "claims"
        }

        response = requests.get(api_url, params=params)
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            continue

        try:
            data = response.json().get('entities', {})
        except ValueError:
            print("Error: Unable to parse JSON response")
            continue

        for wikidata_id in batch:
            if wikidata_id in data:
                claims = data[wikidata_id].get('claims', {})
                relations[wikidata_id] = claims
            else:
                print(f"Warning: No data found for Wikidata ID {wikidata_id}")

    return relations

## Calcolo paginie più visitate

In [9]:
most_viewed_pages = fetch_most_viewed_pages(nentities, WIKIPEDIA_API_URL)
print(f"Fetched {len(most_viewed_pages)} pages")

entities = fetch_wikidata_ids(most_viewed_pages)
entities_list = [str(element) for element in list(entities.values())]

Fetched 10000 pages


In [14]:
relations = get_relations_batch(entities_list)
print("Processing complete.")

Processing batch 1 out of 10
Processing batch 2 out of 10
Processing batch 3 out of 10
Processing batch 4 out of 10
Processing batch 5 out of 10
Processing batch 6 out of 10
Processing batch 7 out of 10
Processing batch 8 out of 10
Processing batch 9 out of 10
Processing batch 10 out of 10
Processing complete.


In [5]:
# Print out the relations
limit = 1
count = 0
for wikidata_id, claims in relations.items():
    if count<=limit:
        count+=1
        print(f"Entity: {wikidata_id}")
        for property_id, claim_list in claims.items():
            print(f"  Property: {property_id}")
            for claim in claim_list:
                mainsnak = claim['mainsnak']
                if 'datavalue' in mainsnak:
                    value = mainsnak['datavalue']
                    print(f"    Value: {value}")
    else:
        break

In [6]:
triples = []

for wikidata_id, claims in relations.items():
    for property_id, claim_list in claims.items():
        for claim in claim_list:
            mainsnak = claim['mainsnak']
            if 'datavalue' in mainsnak:
                value = mainsnak['datavalue']
                if value['type'] == 'wikibase-entityid':
                    value_id = value['value']['id']
                    #print(f"Entity: {wikidata_id} Relation: {property_id} Entity: {value_id}")
                    triples.append({'entity': wikidata_id, 'rel': property_id, 'objt': value_id})

# Crea il DataFrame dalle triple
df = pd.DataFrame(triples)
df.head(3)

In [7]:
print(f"Numero di entità (head) presenti: {df.entity.nunique()}")
print(f"Numero di relazioni presenti: {df.rel.nunique()}")
print(f"Numero di entità (tail) presenti: {df.objt.nunique()}")

AttributeError: 'DataFrame' object has no attribute 'entity'

In [None]:
df.to_csv("data/df_triple.csv")