## Pacchetti

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import time
from datetime import datetime, timedelta
#from google.colab import drive
#drive.mount('/content/drive/')



## Moduli

In [2]:
def normalize_title(title):
    return title.strip().lower().replace(' ', '_')

def process_batch(titles_batch, api_url="https://en.wikipedia.org/w/api.php"):
    params = {
        "action": "query",
        "format": "json",
        "titles": "|".join(titles_batch.keys()),
        "prop": "pageprops"
    }
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        pages = response.json().get('query', {}).get('pages', {})
        batch_entities = {}
        for page_id, page_info in pages.items():
            pageprops = page_info.get('pageprops', {})
            wikidata_id = pageprops.get('wikibase_item')
            if wikidata_id:
                title = page_info['title']
                normalized_title = normalize_title(title)
                if normalized_title in titles_batch:
                    batch_entities[title] = {
                        "wikidata_id": wikidata_id,
                        "views": titles_batch[normalized_title]
                    }
        return batch_entities
    except requests.RequestException as e:
        print(f"Failed to fetch data: {e}")
        return {}  # Return an empty dict in case of failure

def fetch_wikidata_ids(most_viewed_pages, batch_size=50, api_url="https://en.wikipedia.org/w/api.php"):
    entities = {}
    titles_batch = {}

    for page in most_viewed_pages:
        normalized_title = normalize_title(page['article'])
        titles_batch[normalized_title] = page['views']
        if len(titles_batch) >= batch_size:
            entities.update(process_batch(titles_batch, api_url))
            titles_batch = {}  # Reset the batch

    if titles_batch:  # Process any remaining titles
        entities.update(process_batch(titles_batch, api_url))

    return entities

def fetch_most_viewed_pages(total_pages, api_url="https://en.wikipedia.org/w/api.php"):
    limit = 500
    fetched_pages = []

    for i in range(0, total_pages, limit):
        params = {
            "action": "query",
            "format": "json",
            "list": "mostviewed",
            "pvimlimit": min(limit, total_pages - i)
        }

        response = requests.get(api_url, params=params)
        response_data = response.json()

        if 'query' in response_data and 'mostviewed' in response_data['query']:
            fetched_pages.extend(response_data['query']['mostviewed'])
        else:
            break  # Exit loop if no more data is available

    return fetched_pages

def get_relations_batch(wikidata_ids, batch_size=50, api_url="https://www.wikidata.org/w/api.php"):
    relations = {}
    for start in range(0, len(wikidata_ids), batch_size):
        end = start + batch_size
        batch = wikidata_ids[start:end]
        wikidata_ids_str = "|".join(batch)

        print(f"Processing batch {start // batch_size + 1} out of {len(wikidata_ids) // batch_size + 1}")

        params = {
            "action": "wbgetentities",
            "format": "json",
            "ids": wikidata_ids_str,
            "props": "claims"
        }

        response = requests.get(api_url, params=params)
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            continue

        try:
            data = response.json().get('entities', {})
        except ValueError:
            print("Error: Unable to parse JSON response")
            continue

        for wikidata_id in batch:
            if wikidata_id in data:
                claims = data[wikidata_id].get('claims', {})
                relations[wikidata_id] = claims
            else:
                print(f"Warning: No data found for Wikidata ID {wikidata_id}")

    return relations


def query(date):
    url = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/en.wikipedia/all-access/{date}'
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    result = requests.get(url, headers={'User-Agent': user_agent}).json()

    if 'error' in result:
        raise Exception(result['error'])
    if 'warnings' in result:
        print(result['warnings'])
    if 'items' in result:
        return result['items'][0]['articles']
    return []

def most_viewed(n):
    unique_articles = {}
    date = datetime.strptime("2024-01-01", "%Y-%m-%d")

    while len(unique_articles) < n:
        formatted_date = date.strftime("%Y/%m/%d")
        articles = query(formatted_date)

        for article in articles:
            unique_articles[article['article']] = article

        date -= timedelta(days=1)  # Passa al giorno precedente

    # Prendi solo i primi n articoli
    most_viewed_articles = list(unique_articles.values())[:n]

    return most_viewed_articles

## Calcolo paginie più visitate

In [3]:
n = 1000  # Numero di pagine uniche richieste
most_viewed_pages = most_viewed(n)
print(f"Fetched {len(most_viewed_pages)} pages")

entities = fetch_wikidata_ids(most_viewed_pages)
entities_id = [str(element['wikidata_id']) for element in list(entities.values())]
entities_views = [str(element['views']) for element in list(entities.values())]
entities_title = entities.keys()

df_ent = pd.DataFrame(columns=['pageTitle', 'views', 'wikidata_id'])
df_ent['pageTitle'] = entities_title
df_ent['views'] = entities_views
df_ent['wikidata_id'] = entities_id

df_ent.head(3)

Fetched 1000 pages


Unnamed: 0,pageTitle,views,wikidata_id
0,Cleopatra,158267,Q635
1,Deaths in 2024,151799,Q123489953
2,Dunki (film),209043,Q122341144


In [4]:
relations = get_relations_batch(df_ent['wikidata_id'].to_list())
print("Processing complete.")

Processing batch 1 out of 3
Processing batch 2 out of 3
Processing batch 3 out of 3
Processing complete.


In [5]:
# Print out the relations
limit = 1
count = 0
for wikidata_id, claims in relations.items():
    if count<=limit:
        count+=1
        print(f"Entity: {wikidata_id}")
        for property_id, claim_list in claims.items():
            print(f"  Property: {property_id}")
            for claim in claim_list:
                mainsnak = claim['mainsnak']
                if 'datavalue' in mainsnak:
                    value = mainsnak['datavalue']
                    print(f"    Value: {value}")
    else:
        break

Entity: Q635
  Property: P268
    Value: {'value': '11938532d', 'type': 'string'}
  Property: P22
    Value: {'value': {'entity-type': 'item', 'numeric-id': 39991, 'id': 'Q39991'}, 'type': 'wikibase-entityid'}
  Property: P21
    Value: {'value': {'entity-type': 'item', 'numeric-id': 6581072, 'id': 'Q6581072'}, 'type': 'wikibase-entityid'}
  Property: P244
    Value: {'value': 'n80067160', 'type': 'string'}
  Property: P214
    Value: {'value': '67762941', 'type': 'string'}
    Value: {'value': '441145857130722922990', 'type': 'string'}
    Value: {'value': '112160307351457741408', 'type': 'string'}
    Value: {'value': '4511162669647955500001', 'type': 'string'}
    Value: {'value': '8089162669699255500002', 'type': 'string'}
    Value: {'value': '97737753', 'type': 'string'}
    Value: {'value': '17156809346545120044', 'type': 'string'}
    Value: {'value': '194159474179027661362', 'type': 'string'}
    Value: {'value': '375144647697614384674', 'type': 'string'}
  Property: P227
    

In [6]:
triples = []

for wikidata_id, claims in relations.items():
    for property_id, claim_list in claims.items():
        for claim in claim_list:
            mainsnak = claim['mainsnak']
            if 'datavalue' in mainsnak:
                value = mainsnak['datavalue']
                if value['type'] == 'wikibase-entityid':
                    value_id = value['value']['id']
                    #print(f"Entity: {wikidata_id} Relation: {property_id} Entity: {value_id}")
                    triples.append({'entity': wikidata_id, 'rel': property_id, 'objt': value_id})

# Crea il DataFrame dalle triple
df = pd.DataFrame(triples)
df.head(3)

Unnamed: 0,entity,rel,objt
0,Q635,P22,Q39991
1,Q635,P21,Q6581072
2,Q635,P509,Q114953


In [7]:
print(f"Numero di entità (head) presenti: {df.entity.nunique()}")
print(f"Numero di relazioni presenti: {df.rel.nunique()}")
print(f"Numero di entità (tail) presenti: {df.objt.nunique()}")

Numero di entità (head) presenti: 139
Numero di relazioni presenti: 388
Numero di entità (tail) presenti: 8093


In [8]:
df.shape

(12312, 3)

In [9]:
df_try = df.merge(df_ent, right_on='wikidata_id', left_on='entity', how='left')
df_try=df_try.drop(['wikidata_id'], axis=1)
df_try=df_try.rename(columns={'pageTitle': 'pageTitle_entity', 'views':'views_entity'})
df_try.head(3)

Unnamed: 0,entity,rel,objt,pageTitle_entity,views_entity
0,Q635,P22,Q39991,Cleopatra,158267
1,Q635,P21,Q6581072,Cleopatra,158267
2,Q635,P509,Q114953,Cleopatra,158267


In [10]:
df_try.to_csv("./data/df_triple.csv", index=None)

## Riduzione del numero di code

In [11]:
df_try = pd.read_csv("./data/df_triple.csv", header=0)
df_try.shape

(12312, 5)

In [None]:
def get_wikipedia_titles(wikidata_ids):
    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'ids': '|'.join(wikidata_ids),
        'props': 'sitelinks',
        'sitefilter': 'enwiki'
    }
    response = requests.get(url, params=params)
    data = response.json()

    titles = []
    ent = []
    for entity in data.get('entities', {}).values():
        sitelinks = entity.get('sitelinks', {})
        if 'enwiki' in sitelinks:
            titles.append(sitelinks['enwiki']['title'])
            ent.append(entity['id'])

    df_titles = pd.DataFrame(columns=['titles', 'id'])
    df_titles['titles'] = titles
    df_titles['ent'] = ent
    return df_titles

def get_pageviews(titles):
    url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{}/daily/20230101/20231231'
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    pageviews = {}

    for title in titles:
        title_formatted = title.replace(' ', '_')
        response = requests.get(url.format(title_formatted), headers={'User-Agent': user_agent})
        if response.status_code == 200:
            data = response.json()
            views = sum(item['views'] for item in data['items'])
            pageviews[title] = views
        else:
            pageviews[title] = "None"

    return pageviews

def process_wikidata_entities(wikidata_ids):
    results = {}
    batch_size = 50

    df = pd.DataFrame(columns=['id', 'title', 'views'])

    for i in tqdm(range(0, len(wikidata_ids), batch_size), "Batch"):
        batch = wikidata_ids[i:i+batch_size]
        titles = get_wikipedia_titles(batch)
        pageviews = get_pageviews(titles.titles.to_list())

        app = pd.DataFrame({'id':titles.ent.to_list(), 'title':titles.titles.to_list(), 'views':pageviews.values()})
        df = pd.concat([df, app], ignore_index=True)
        results.update(pageviews)

    return results, df


# Inizio del cronometraggio
start_time = time.time()

wikidata_ids = df_try['objt'].to_list()
results, df_res_tail = process_wikidata_entities(wikidata_ids)

# Fine del cronometraggio
end_time = time.time()

# Calcolo del tempo di esecuzione
execution_time = end_time - start_time
print(f"Tempo di esecuzione: {execution_time} secondi")

title_tail_list = list()
views_tail_list = list()

for title, views in results.items():
    title_tail_list.append(title)
    views_tail_list.append(views)

Batch:  28%|██▊       | 68/247 [16:06<39:19, 13.18s/it]  

In [None]:
df_unito = pd.merge(df_try, df_res_tail, left_on='objt', right_on='id', how='left')

df_unito = df_unito.drop(['id'], axis=1).drop_duplicates()

df_unito = df_unito.rename(columns={'title': 'pageTitle_objt', 'views':'views_objt'})

df_unito.head(3)

In [ ]:
df_unito.to_csv("./data/df_triple_info.csv", index=None)