In [2]:
# Use the %pip magic so the package is installed into the environment of the running kernel
# (a `!pip` shell escape may resolve to a different Python interpreter).
%pip install requests



You should consider upgrading via the 'C:\Users\utente\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\utente\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [5]:
import pandas as pd
import requests
import re
import time
from json.decoder import JSONDecodeError

# Lookup table from Wikidata item IDs (the values returned for property P21 by the
# SPARQL query in get_demographic_data) to the label written to the output.
# NOTE(review): Q1097630 looks like the Wikidata item for "intersex" rather than
# "unspecified" — confirm that the 'Non specificato' label is intended.
genre_mapping = dict(
    Q6581097='Male',
    Q6581072='Female',
    Q1097630='Non specificato',
)
# Helpers and entry point for looking up an artist's gender, birth date and
# birth place on Wikidata through its public SPARQL endpoint.
def _empty_demographics():
    """Return the result used whenever no demographic data is available."""
    return {
        'genre': None,
        'birth_date': None,
        'birth_place': None
    }


def _clean_artist_name(artist_name):
    """Normalise a raw artist name into the label used for the Wikidata lookup.

    Punctuation other than ``&`` and ``,`` is dropped, whitespace runs are
    collapsed to a single space, and for collaborations joined by ``&`` or by
    the standalone word ``and`` only the first name is kept.
    """
    cleaned = re.sub(r'[^\w\s&,]+', '', artist_name)
    # Tabs/newlines would otherwise end up inside the quoted SPARQL literal
    # and make the query invalid.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    if '&' in cleaned:
        return cleaned.split('&')[0].strip()
    # Split only on "and" as a separate word: a plain substring test would cut
    # names such as "Brandon" or "Sandra" in half.
    return re.split(r'\sand\s', cleaned, maxsplit=1)[0].strip()


def get_demographic_data(artist_name):
    """Fetch gender, birth date and birth place of an artist from Wikidata.

    The lookup matches the cleaned name exactly against English labels and
    requires the item to have all of P21 (sex or gender), P569 (date of birth)
    and P19 (place of birth); items missing any of them produce no match.

    Args:
        artist_name: Name of the artist. Non-string values (e.g. NaN) are
            accepted and yield an all-``None`` result without any request.

    Returns:
        dict with keys ``'genre'`` (label from ``genre_mapping`` or ``None``),
        ``'birth_date'`` and ``'birth_place'`` (strings as returned by the
        endpoint, or ``None``).

    Raises:
        requests.exceptions.Timeout: if every attempt times out.
        requests.exceptions.HTTPError: if the endpoint answers with an error
            status (raised by ``raise_for_status``).
    """
    # Non-string input (missing values in the data) cannot be looked up.
    if not isinstance(artist_name, str):
        return _empty_demographics()

    cleaned_artist_name = _clean_artist_name(artist_name)
    # Nothing searchable is left (e.g. a name made only of punctuation).
    if not cleaned_artist_name:
        return _empty_demographics()

    sparql_query = f"""
    SELECT ?artist ?genre ?birthDate ?birthPlaceLabel
    WHERE {{
      ?artist rdfs:label "{cleaned_artist_name}"@en.
      ?artist wdt:P21 ?genre;
              wdt:P569 ?birthDate;
              wdt:P19 ?birthPlace.

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    LIMIT 1
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    # NOTE(review): this is a generic browser User-Agent; the Wikimedia
    # User-Agent policy asks for a descriptive one — confirm and adjust.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json',
    }

    # Retry on timeouts (120 s per request); the last timeout is re-raised.
    max_retries = 3
    data = None
    for attempt in range(max_retries):
        try:
            response = requests.get(endpoint_url, headers=headers, params={'query': sparql_query, 'format': 'json'}, timeout=120)
            response.raise_for_status()
            data = response.json()
            break
        except requests.exceptions.Timeout:
            if attempt == max_retries - 1:
                raise
            # Pause only when another attempt will actually follow.
            time.sleep(5)
        except JSONDecodeError:
            # Unparseable body: treat as "no data" and try again.
            data = None

    # Use the first binding, if the response contains any.
    bindings = ((data or {}).get('results') or {}).get('bindings') or []
    if not bindings:
        return _empty_demographics()

    first = bindings[0]
    genre_url = first['genre']['value'] if 'genre' in first else None
    birth_date = first['birthDate']['value'] if 'birthDate' in first else None
    birth_place = first['birthPlaceLabel']['value'] if 'birthPlaceLabel' in first else None
    # The gender comes back as an entity URL; its last path segment is the item ID.
    genre_id = genre_url.split('/')[-1] if genre_url else None

    return {
        'genre': genre_mapping.get(genre_id, None),
        'birth_date': birth_date,
        'birth_place': birth_place
    }

# Load the initial sample of ratings
ratings_df = pd.read_csv("sample.csv")  # Replace with the path to your CSV file

# Look up each distinct artist only once: the same artist usually appears in
# many ratings, and every lookup is a remote SPARQL request.
demographics_by_artist = {}
row_demographics = []
for artist_name in ratings_df['artist_name']:
    # All non-string values (e.g. NaN) share one cache slot; they yield an
    # all-None result from get_demographic_data anyway.
    cache_key = artist_name if isinstance(artist_name, str) else None
    if cache_key not in demographics_by_artist:
        demographics_by_artist[cache_key] = get_demographic_data(artist_name)
    row_demographics.append(demographics_by_artist[cache_key])

# Add the demographic columns (object dtype, so missing values stay None)
ratings_df['artist_genre'] = pd.Series(
    [entry['genre'] for entry in row_demographics], index=ratings_df.index, dtype=object
)
ratings_df['artist_birth_date'] = pd.Series(
    [entry['birth_date'] for entry in row_demographics], index=ratings_df.index, dtype=object
)
ratings_df['artist_birth_place'] = pd.Series(
    [entry['birth_place'] for entry in row_demographics], index=ratings_df.index, dtype=object
)

# Save the enriched DataFrame
ratings_df.to_csv("user_augmentation.csv", index=False)