### Imports

In [30]:
########################### DataSet ########################### 
# https://github.com/cvdfoundation/google-landmark?tab=readme-ov-file#release-history
###############################################################

import os
import pandas as pd
import hashlib
import requests
import tarfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

base_dir = Path(os.getcwd()).resolve().parent
base_dir

WindowsPath('C:/Users/diogo/Desktop/APVC/APVC-ProjetoFinal')

In [31]:
data_dir = base_dir / 'data'
data_dir.mkdir(parents=True, exist_ok=True)

## 1. Recolher os dados

In [32]:
dataLandMarkTrain_dir = data_dir / 'land_mark' / 'train'
dataLandMarkTrain_dir.mkdir(parents=True, exist_ok=True)

urls = [
    "https://s3.amazonaws.com/google-landmark/metadata/train.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_clean.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_attribution.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_label_to_category.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_label_to_hierarchical.csv"
]

# Download dos ficheiros apenas se não existirem
for u in urls:
    file_name = Path(u).name
    save_path = dataLandMarkTrain_dir / file_name

    if save_path.exists():
        print(f"Já existe: {save_path}")
        continue

    response = requests.get(u)
    response.raise_for_status()

    with open(save_path, "wb") as f:
        f.write(response.content)

    print(f"Guardado: {save_path}")


Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_clean.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_attribution.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_label_to_category.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_label_to_hierarchical.csv


Vou utilizar o **train_clean.csv** como base, já que este possui imagens mais fiáveis quando comparado com **train.csv**.

## 2. Merge dos DataSets

In [33]:
def colunas_em_comum(df1, df2):
    """
    Recebe dois DataFrames e devolve o conjunto de colunas em comum.
    """
    return set(df1.columns) & set(df2.columns)

### 2.1. Train_clean.csv - DataSet Base

In [34]:
train_clean = pd.read_csv(dataLandMarkTrain_dir / "train_clean.csv")
train_clean["images"] = train_clean["images"].apply(lambda x: x.split() if pd.notnull(x) else [])
train_clean.head()

Unnamed: 0,landmark_id,images
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948..."
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f..."
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f..."
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da8..."
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3..."


A coluna `images` corresponde ao **`id`** do *train.csv*

### 2.2. train_label_to_hierarchical.csv - Filtrar pelas categorias

In [35]:
train_label_to_hierarchical = pd.read_csv(dataLandMarkTrain_dir / "train_label_to_hierarchical.csv")
train_label_to_hierarchical.head()

Unnamed: 0,landmark_id,category,supercategory,hierarchical_label,natural_or_human_made
0,0,http://commons.wikimedia.org/wiki/Category:Hap...,horse racing venue,sports venue,human-made
1,1,http://commons.wikimedia.org/wiki/Category:Lui...,park,parks,natural
2,2,http://commons.wikimedia.org/wiki/Category:Gra...,mountain,mountain,natural
3,5,http://commons.wikimedia.org/wiki/Category:Lak...,motorsport racing track,road,human-made
4,7,http://commons.wikimedia.org/wiki/Category:Spa...,multi-purpose hall,,


In [36]:
mergeWith = colunas_em_comum(train_clean, train_label_to_hierarchical)
train_clean = train_clean.merge(train_label_to_hierarchical, on=list(mergeWith), how='left')
train_clean.head()

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948...",http://commons.wikimedia.org/wiki/Category:Lui...,park,parks,natural
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f...",http://commons.wikimedia.org/wiki/Category:Spa...,multi-purpose hall,,
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f...",,,,
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da8...",http://commons.wikimedia.org/wiki/Category:Mer...,market hall,market,human-made
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,


### 2.3. train.csv - Retirar os links das imagens

In [37]:
train = pd.read_csv(dataLandMarkTrain_dir / "train.csv")
train["url"] = train["url"].apply(lambda x: x.split() if pd.notnull(x) else [])
train.head()

Unnamed: 0,id,url,landmark_id
0,6e158a47eb2ca3f6,[https://upload.wikimedia.org/wikipedia/common...,142820
1,202cd79556f30760,[http://upload.wikimedia.org/wikipedia/commons...,104169
2,3ad87684c99c06e1,[http://upload.wikimedia.org/wikipedia/commons...,37914
3,e7f70e9c61e66af3,[https://upload.wikimedia.org/wikipedia/common...,102140
4,4072182eddd0100e,[https://upload.wikimedia.org/wikipedia/common...,2474


In [38]:
id_to_url = dict(zip(train['id'], train['url']))

def ids_para_urls(lista_ids):
    if not isinstance(lista_ids, list):
        return []
    urls = []
    for i in lista_ids:
        url = id_to_url.get(i)
        if url:
            if isinstance(url, list):
                urls.extend(url)  # adiciona todos os urls da lista
            else:
                urls.append(url)  # adiciona url único
    return urls

# Aplicar à coluna 'images' do train_clean e criar nova coluna 'urls'
train_clean['urls'] = train_clean['images'].apply(ids_para_urls)
train_clean

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948...",http://commons.wikimedia.org/wiki/Category:Lui...,park,parks,natural,[http://upload.wikimedia.org/wikipedia/commons...
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f...",http://commons.wikimedia.org/wiki/Category:Spa...,multi-purpose hall,,,[https://upload.wikimedia.org/wikipedia/common...
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f...",,,,,[https://upload.wikimedia.org/wikipedia/common...
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da8...",http://commons.wikimedia.org/wiki/Category:Mer...,market hall,market,human-made,[https://upload.wikimedia.org/wikipedia/common...
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,,[https://upload.wikimedia.org/wikipedia/common...
...,...,...,...,...,...,...,...
81308,203083,"[1def5ad0872c6303, 23349c63e48a7de2, 2c9d17eeb...",http://commons.wikimedia.org/wiki/Category:St....,parish church,church,human-made,[https://upload.wikimedia.org/wikipedia/common...
81309,203085,"[0926becdb2c92f8a, 22b58ac428531da8, 2c9054957...",http://commons.wikimedia.org/wiki/Category:Chu...,church building,church,human-made,[https://upload.wikimedia.org/wikipedia/common...
81310,203087,"[146cc06310d08ef0, 1ee045e5a3bc9568, 389594711...",http://commons.wikimedia.org/wiki/Category:Jac...,park,parks,natural,[https://upload.wikimedia.org/wikipedia/common...
81311,203091,"[8e219a79ee5eede9, fa7142e44850dbac]",http://commons.wikimedia.org/wiki/Category:Sil...,landscape park of Poland,parks,natural,[https://upload.wikimedia.org/wikipedia/common...


In [39]:
pd.set_option('display.max_colwidth', None)
train_clean["urls"]

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

## 3. Filtrar os dados 

In [40]:
del train_label_to_hierarchical
del train

### 3.1. Classe da foto

In [41]:
pd.reset_option('display.max_colwidth')
train_clean["supercategory"].unique()

array(['park', 'multi-purpose hall', nan, ...,
       'ancient Roman architecture', 'common land',
       'Naturschutzgebiet (NSG HA 191)'], dtype=object)

In [42]:
counts = train_clean["supercategory"].value_counts()
counts_more_than_10 = counts[counts > 200]
print(counts_more_than_10)

supercategory
church building            11024
castle                      2058
mountain                    1520
museum                      1320
building                    1157
château                     1144
lighthouse                   889
lake                         889
architectural structure      771
monastery                    738
palace                       686
parish church                605
monument                     569
square                       565
cathedral                    547
cemetery                     543
park                         540
art museum                   523
house                        468
archaeological site          451
skyscraper                   426
bridge                       397
chapel                       391
island                       388
reservoir                    378
mosque                       368
tower                        355
Buddhist temple              355
sculpture                    332
waterfall                    

In [43]:
monumento_categorias = [
    # Tenho a certeza que são monumentos
    'church building', 'castle', 'monastery', 'palace',
    'parish church', 'monument', 'cathedral', 'chapel',
    'abbey', 'fort', 'city gate', 'Catholic cathedral',
    'castle ruin',

    # Não tenho a certeza que são monumentos
    'building', 'architectural structure', 'historic house museum',
    'cultural property', 'sculpture', 'fountain', 'tower',
    'lighthouse', 'bridge', 'square', 'city hall'
]

df_monumentos = train_clean[train_clean['supercategory'].isin(monumento_categorias)]
df_monumentos

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,,[https://upload.wikimedia.org/wikipedia/common...
6,22,"[0be5d581f54d3116, 121754b8854c9757, 3238a06ee...",http://commons.wikimedia.org/wiki/Category:Cas...,castle,castle / fort,human-made,[http://upload.wikimedia.org/wikipedia/commons...
10,29,"[9a03db05d8fb4850, c612327c67863d9e, c6a1efedc...",http://commons.wikimedia.org/wiki/Category:St_...,church building,church,human-made,[https://upload.wikimedia.org/wikipedia/common...
15,37,"[022d4bc3b72988eb, 0d920307064d9e9d, 0e3d48fc1...",http://commons.wikimedia.org/wiki/Category:Chu...,church building,church,human-made,[http://upload.wikimedia.org/wikipedia/commons...
17,43,"[01d1a2461b033111, 0e2ccd04e9d0dc2e, 1753e0128...",http://commons.wikimedia.org/wiki/Category:Abb...,abbey,church,human-made,[https://upload.wikimedia.org/wikipedia/common...
...,...,...,...,...,...,...,...
81295,203054,"[79713986a0d603f9, a67fda97f7c7eb37, ed0efcd25...",http://commons.wikimedia.org/wiki/Category:1_D...,building,,,[https://upload.wikimedia.org/wikipedia/common...
81297,203057,"[0fccea3669efad6e, 261eb6778621f466, 2e41935f7...",http://commons.wikimedia.org/wiki/Category:Ser...,building,,,[https://upload.wikimedia.org/wikipedia/common...
81300,203063,"[113e2bed03e2a1ae, 57b5da8ecf0f8ed2, 5d4ffb9f3...",http://commons.wikimedia.org/wiki/Category:Zve...,monastery,monastery,human-made,[https://upload.wikimedia.org/wikipedia/common...
81308,203083,"[1def5ad0872c6303, 23349c63e48a7de2, 2c9d17eeb...",http://commons.wikimedia.org/wiki/Category:St....,parish church,church,human-made,[https://upload.wikimedia.org/wikipedia/common...


In [44]:
counts = df_monumentos["hierarchical_label"].value_counts(dropna=False)
print(counts)

hierarchical_label
church                 13100
NaN                     2756
castle / fort           2532
lighthouse               889
monastery                738
palace                   686
square                   565
bridge                   397
tower                    355
sculpture                332
government building      253
house                    241
gate                     207
fountain                 206
Name: count, dtype: int64


### 3.2. Human Made (Não tenho a certeza, não vou alterar)

In [45]:
counts = df_monumentos["natural_or_human_made"].value_counts(dropna=False)
print(counts)

natural_or_human_made
human-made    20501
NaN            2756
Name: count, dtype: int64


In [46]:
# Existem nulos que me parecem ser monumentos
pd.set_option('display.max_colwidth', None)
df_monumentos[df_monumentos['natural_or_human_made'].isna()]["category"]

4                      http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
48                                         http://commons.wikimedia.org/wiki/Category:Shemokmedi_Cathedral
55       http://commons.wikimedia.org/wiki/Category:Est%C3%A1tua_de_D._Pedro_IV_na_Pra%C3%A7a_da_Liberdade
80                                   http://commons.wikimedia.org/wiki/Category:Quedlinburger_Stadtschloss
188                                               http://commons.wikimedia.org/wiki/Category:Valhallabadet
                                                       ...                                                
81165                                   http://commons.wikimedia.org/wiki/Category:Slovansk%C3%BD_d%C5%AFm
81173                                http://commons.wikimedia.org/wiki/Category:High_Roller_(Ferris_wheel)
81222                                                  http://commons.wikimedia.org/wiki/Category:Walden_7
81295                              ht

## 4. API da MediaWiki - *https://m.mediawiki.org/wiki/API:Main_page*

In [47]:
pd.reset_option('display.max_colwidth')
df_monumentos.head()

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,,[https://upload.wikimedia.org/wikipedia/common...
6,22,"[0be5d581f54d3116, 121754b8854c9757, 3238a06ee...",http://commons.wikimedia.org/wiki/Category:Cas...,castle,castle / fort,human-made,[http://upload.wikimedia.org/wikipedia/commons...
10,29,"[9a03db05d8fb4850, c612327c67863d9e, c6a1efedc...",http://commons.wikimedia.org/wiki/Category:St_...,church building,church,human-made,[https://upload.wikimedia.org/wikipedia/common...
15,37,"[022d4bc3b72988eb, 0d920307064d9e9d, 0e3d48fc1...",http://commons.wikimedia.org/wiki/Category:Chu...,church building,church,human-made,[http://upload.wikimedia.org/wikipedia/commons...
17,43,"[01d1a2461b033111, 0e2ccd04e9d0dc2e, 1753e0128...",http://commons.wikimedia.org/wiki/Category:Abb...,abbey,church,human-made,[https://upload.wikimedia.org/wikipedia/common...


In [48]:
pd.set_option('display.max_colwidth', None)
df_monumentos["category"]

4           http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
6                               http://commons.wikimedia.org/wiki/Category:Castle_of_Santa_Cruz
10                                 http://commons.wikimedia.org/wiki/Category:St_Peter,_Claydon
15                   http://commons.wikimedia.org/wiki/Category:Church_of_Saint_Stephen,_Prague
17                        http://commons.wikimedia.org/wiki/Category:Abbaye_de_Caunes-Minervois
                                                  ...                                          
81295                   http://commons.wikimedia.org/wiki/Category:1_Doki_Street_in_Gda%C5%84sk
81297                    http://commons.wikimedia.org/wiki/Category:Serbian_Patriarchy_building
81300                               http://commons.wikimedia.org/wiki/Category:Zverin_Monastery
81308         http://commons.wikimedia.org/wiki/Category:St._Peter's_Parish_Church_(Radovljica)
81309    http://commons.wikimedia.org/wi

In [49]:
import requests
from urllib.parse import urlparse, unquote
pd.reset_option('display.max_colwidth')

def extrair_titulo(url):
    """Extrai o título da página Wikimedia a partir da URL"""
    path = urlparse(url).path
    titulo = path.split('/')[-1]  # Última parte da URL
    return unquote(titulo)


def obter_info_wikimedia(titulo):
    """Consulta a Wikimedia API e retorna info básica sobre o título"""
    endpoint = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "coordinates|pageprops|description",
        "titles": titulo,
        "format": "json",
        "formatversion": 2
    }
    headers = {
        "User-Agent": "ProjetoMonumentosPT/1.0 (teu@email.com)"
    }
    response = requests.get(endpoint, params=params, headers=headers)
    response.raise_for_status()
    return response.json()


def obter_info_wikidata(qid):
    """Consulta a Wikidata API com o QID para obter dados detalhados"""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    return response.json()


def obter_label_por_qid(qid, lang="en"):
    """Consulta o rótulo de um QID (por ex. país, cidade)"""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    entidade = list(data["entities"].values())[0]
    return entidade["labels"].get(lang, {}).get("value", "Desconhecido")


def extrair_detalhes_wikidata(entidade):
    """Extrai dados úteis da estrutura JSON da Wikidata"""
    claims = entidade["claims"]

    def extrair_qid(p):
        if p in claims:
            return claims[p][0]["mainsnak"]["datavalue"]["value"]["id"]
        return None

    def extrair_varios_qids(p):
        if p in claims:
            return [c["mainsnak"]["datavalue"]["value"]["id"] for c in claims[p]]
        return []

    resultado = {
        "pais_qid": extrair_qid("P17"),
        "localizacoes_qid": extrair_varios_qids("P131"),
        "instancias_qid": extrair_varios_qids("P31"),
        "label": entidade.get("labels", {}).get("en", {}).get("value", "Sem título")
    }

    return resultado


# --- Execução (exemplo) ---

url_exemplo = "https://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)"
titulo = extrair_titulo(url_exemplo)

# 1. Obter info da Wikimedia
dados_wiki = obter_info_wikimedia(titulo)
pagina = dados_wiki["query"]["pages"][0]

coordenadas = pagina.get("coordinates", [{}])[0]
wikibase_item = pagina.get("pageprops", {}).get("wikibase_item", None)

print("📌 Exemplo de monumento extraído:")
print("🔗 URL:", url_exemplo)
print("📄 Título Wikimedia:", titulo)
print("🗺️ Coordenadas:", coordenadas)
print("🔗 Wikibase Item:", wikibase_item)

if wikibase_item:
    dados_wikidata = obter_info_wikidata(wikibase_item)
    entidade = dados_wikidata["entities"][wikibase_item]
    detalhes = extrair_detalhes_wikidata(entidade)

    print(f"\n🏛️ Nome do Monumento: {detalhes['label']}")

    nome_pais = obter_label_por_qid(detalhes["pais_qid"]) if detalhes["pais_qid"] else "Desconhecido"
    print("🌍 País:", nome_pais)

    # Mostrar todas as localizações administrativas (P131)
    print("🏘️ Localizações administrativas (P131):")
    for qid in detalhes["localizacoes_qid"]:
        nome_local = obter_label_por_qid(qid)
        print(f"   - {nome_local} (QID: {qid})")

    # Mostrar tipos (P31)
    print("🏷️ Tipos (instância de):")
    for qid in detalhes["instancias_qid"]:
        nome_tipo = obter_label_por_qid(qid)
        print(f"   - {nome_tipo} (QID: {qid})")

else:
    print("❌ Não foi possível obter o Wikidata item.")

📌 Exemplo de monumento extraído:
🔗 URL: https://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
📄 Título Wikimedia: Category:Wasserkunstanlage_Paradies_(Baden-Baden)
🗺️ Coordenadas: {'lat': 48.7625, 'lon': 8.25154, 'primary': True, 'globe': 'earth'}
🔗 Wikibase Item: Q2551242

🏛️ Nome do Monumento: Wasserkunstanlage Paradies (Baden-Baden)
🌍 País: Germany
🏘️ Localizações administrativas (P131):
   - Baden-Baden (QID: Q4100)
🏷️ Tipos (instância de):
   - architectural structure (QID: Q811979)
   - park (QID: Q22698)


In [None]:
import requests
import pandas as pd
from urllib.parse import urlparse, unquote
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# -------- Funções auxiliares --------

def extrair_titulo(url):
    path = urlparse(url).path
    return unquote(path.split('/')[-1])

def obter_info_wikimedia(titulo):
    endpoint = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "coordinates|pageprops|description",
        "titles": titulo,
        "format": "json",
        "formatversion": 2
    }
    headers = {
        "User-Agent": "ProjetoMonumentosPT/1.0 (gothamanalytics7@gmail.com)"
    }
    response = requests.get(endpoint, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

@lru_cache(maxsize=2048)
def obter_info_wikidata(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

@lru_cache(maxsize=2048)
def obter_label_por_qid(qid, lang="en"):
    try:
        data = obter_info_wikidata(qid)
        entidade = list(data["entities"].values())[0]
        return entidade["labels"].get(lang, {}).get("value", "Desconhecido")
    except:
        return "Desconhecido"

def extrair_detalhes_wikidata(entidade):
    claims = entidade.get("claims", {})

    def extrair_qid(p):
        try:
            return claims[p][0]["mainsnak"]["datavalue"]["value"]["id"]
        except:
            return None

    def extrair_varios_qids(p):
        qids = []
        for claim in claims.get(p, []):
            try:
                qids.append(claim["mainsnak"]["datavalue"]["value"]["id"])
            except:
                continue
        return qids

    return {
        "pais_qid": extrair_qid("P17"),
        "localizacoes_qid": extrair_varios_qids("P131"),
        "instancias_qid": extrair_varios_qids("P31"),
        "label": entidade.get("labels", {}).get("en", {}).get("value", "Sem título")
    }

# -------- Processar uma única URL --------

def processar_url(url):
    try:
        titulo = extrair_titulo(url)
        dados_wiki = obter_info_wikimedia(titulo)
        pagina = dados_wiki["query"]["pages"][0]

        coordenadas = pagina.get("coordinates", [{}])[0]
        lat = coordenadas.get("lat", None)
        lon = coordenadas.get("lon", None)
        wikibase_item = pagina.get("pageprops", {}).get("wikibase_item", None)

        if not wikibase_item:
            return [url, lat, lon, None, None, None, None, None]

        dados_wikidata = obter_info_wikidata(wikibase_item)
        entidade = dados_wikidata["entities"][wikibase_item]
        detalhes = extrair_detalhes_wikidata(entidade)

        nome_monumento = detalhes["label"]
        pais = obter_label_por_qid(detalhes["pais_qid"]) if detalhes["pais_qid"] else None
        localizacoes = [obter_label_por_qid(qid) for qid in detalhes["localizacoes_qid"]]
        tipos = [obter_label_por_qid(qid) for qid in detalhes["instancias_qid"]]

        return [url, lat, lon, nome_monumento, pais, localizacoes, tipos, wikibase_item]

    except Exception as e:
        print(f"Erro ao processar {url}: {e}")
        return [url, None, None, None, None, None, None, None]

# -------- Processamento paralelo --------

def processar_urls_em_paralelo(urls, max_workers=10):
    resultados = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tarefas = {executor.submit(processar_url, url): url for url in urls}
        for future in tqdm(as_completed(tarefas), total=len(tarefas), desc="Processando"):
            resultados.append(future.result())
    return resultados

# -------- Executar --------

# Caminho do ficheiro de saída
ficheiro_saida = dataLandMarkTrain_dir / "Monumentos_clean.csv"

if not ficheiro_saida.exists():
    # Exemplo: df_monumentos deve ter uma coluna 'category' com as URLs
    urls = df_monumentos['category'].tolist()
    resultados = processar_urls_em_paralelo(urls)

    # Criar DataFrame a partir dos resultados
    colunas = [
        'category',
        'lat',
        'lon',
        'nome_monumento',
        'pais',
        'localizacoes_administrativas',
        'tipos_instancia_de',
        'wikibase_item'
    ]
    df_resultados = pd.DataFrame(resultados, columns=colunas)

    # Juntar com o DataFrame original (caso queiras manter colunas extra)
    df_monumentos = df_monumentos.drop(columns=['lat', 'lon', 'nome_monumento', 'pais', 'localizacoes_administrativas', 'tipos_instancia_de', 'wikibase_item'], errors='ignore')
    df_monumentos = df_monumentos.merge(df_resultados, on="category", how="left")

    # Guardar em CSV
    df_monumentos.to_csv(ficheiro_saida, index=False)

    # Mostrar primeiras linhas
    df_monumentos.head()

Processando:  47%|████▋     | 10999/23257 [39:33<34:04,  6.00it/s]  

Erro ao processar http://commons.wikimedia.org/wiki/Category:%C3%89glise_Saint-Martin_d'Ammerschwihr: 'Q55269148'


Processando:  61%|██████    | 14167/23257 [50:19<26:03,  5.81it/s]  

Erro ao processar http://commons.wikimedia.org/wiki/Category:Notre-Dame-de-l'Assomption,_Rouffach: 'Q55274180'


Processando: 100%|██████████| 23257/23257 [1:23:12<00:00,  4.66it/s]


Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item
0,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,,[https://upload.wikimedia.org/wikipedia/common...,48.7625,8.25154,Wasserkunstanlage Paradies (Baden-Baden),Germany,[Baden-Baden],"[architectural structure, park]",Q2551242
1,22,"[0be5d581f54d3116, 121754b8854c9757, 3238a06ee...",http://commons.wikimedia.org/wiki/Category:Cas...,castle,castle / fort,human-made,[http://upload.wikimedia.org/wikipedia/commons...,43.3483,-8.35,Castillo de Santa Cruz,Spain,"[Liáns, Oleiros]","[castle, monument]",Q10283898
2,29,"[9a03db05d8fb4850, c612327c67863d9e, c6a1efedc...",http://commons.wikimedia.org/wiki/Category:St_...,church building,church,human-made,[https://upload.wikimedia.org/wikipedia/common...,52.1056,1.1195,"St Peter's Church, Claydon",United Kingdom,[Claydon],[church building],Q7595226
3,37,"[022d4bc3b72988eb, 0d920307064d9e9d, 0e3d48fc1...",http://commons.wikimedia.org/wiki/Category:Chu...,church building,church,human-made,[http://upload.wikimedia.org/wikipedia/commons...,50.076361,14.424889,St. Stephen's Church,Czech Republic,"[Prague 2, New Town]",[church building],Q1742727
4,43,"[01d1a2461b033111, 0e2ccd04e9d0dc2e, 1753e0128...",http://commons.wikimedia.org/wiki/Category:Abb...,abbey,church,human-made,[https://upload.wikimedia.org/wikipedia/common...,43.326056,2.527222,Abbaye de Caunes-Minervois,France,[Caunes-Minervois],[abbey],Q1345467


In [54]:
Pais_escolhido = "Portugal"
df_monumentos_pt = df_monumentos[df_monumentos["pais"] == Pais_escolhido].reset_index(drop=True)

# Guardar em CSV
df_monumentos_pt.to_csv(dataLandMarkTrain_dir / "Monumentos_Portugueses.csv", index=False)
df_monumentos_pt

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item
0,145,"[06d3a8d8d1c97d39, 0b14eee369b61097, 1377ce845...",http://commons.wikimedia.org/wiki/Category:Est...,monument,,,[http://upload.wikimedia.org/wikipedia/commons...,41.146500,-8.611360,Monument to Pedro IV (Porto),Portugal,"[Cedofeita, Santo Ildefonso, Sé, Miragaia, São...","[monument, statue, cultural heritage]",Q11783
1,596,"[0087293d5b4efa86, 0e683185440c3b01, 16aaeed42...",http://commons.wikimedia.org/wiki/Category:Lar...,square,square,human-made,[https://upload.wikimedia.org/wikipedia/common...,38.715472,-9.136583,Praça Martim Moniz,Portugal,[Lisbon],[square],Q20716902
2,2340,"[729bca5e9e6dbe4b, 7f89e5aa3f889c23, 89b048ffa...",http://commons.wikimedia.org/wiki/Category:Pal...,palace,palace,human-made,[https://upload.wikimedia.org/wikipedia/common...,38.741441,-9.143686,Galveias Palace,Portugal,[Avenidas Novas],"[palace, cultural heritage]",Q5519418
3,2471,"[3abcfa8920504706, 73f6407a3e2915a8, cff5f5df7...",http://commons.wikimedia.org/wiki/Category:Cap...,chapel,church,human-made,[https://upload.wikimedia.org/wikipedia/common...,37.083673,-8.864804,Hermitage of Our Lady of Guadalupe,Portugal,[Vila do Bispo e Raposeira],"[chapel, cultural heritage]",Q17125232
4,2527,"[1797136481c351e4, 201637e8cdbe36f3, 2114f88b9...",http://commons.wikimedia.org/wiki/Category:Cap...,lighthouse,lighthouse,human-made,[https://upload.wikimedia.org/wikipedia/common...,38.596756,-28.826233,Ponta dos Capelinhos Lighthouse,Portugal,[Azores],"[lighthouse, cultural heritage]",Q2890500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,199865,"[1c2296d3b9cb71da, 20a3ab09bfdc2a79, 31d069976...",http://commons.wikimedia.org/wiki/Category:Cab...,lighthouse,lighthouse,human-made,[https://upload.wikimedia.org/wikipedia/common...,40.191000,-8.905167,Cabo Mondego Lighthouse,Portugal,[Buarcos e São Julião],"[lighthouse, cultural heritage]",Q11784
302,200832,"[05238b88de0ad5d2, 095dc6c75b6ef701, 173f96d00...",http://commons.wikimedia.org/wiki/Category:Igr...,church building,church,human-made,[https://upload.wikimedia.org/wikipedia/common...,32.680000,-17.104444,Igreja de Nossa Senhora da Luz,Portugal,[Ponta do Sol],"[church building, cultural heritage, parish ch...",Q10300407
303,201945,"[0214fad70905d23e, 037107e6b52e1b60, 0beb6accd...",http://commons.wikimedia.org/wiki/Category:Cas...,castle,castle / fort,human-made,[https://upload.wikimedia.org/wikipedia/common...,37.190989,-8.437906,Castle of Silves,Portugal,[Silves],"[castle, cultural heritage, archaeological site]",Q1971757
304,202667,"[1dfa189ac965d147, 31e77006c11bf1e0, 61c4142fe...",http://commons.wikimedia.org/wiki/Category:Edi...,building,,,[https://upload.wikimedia.org/wikipedia/common...,38.732470,-9.155876,"Edifício sito na Rua Alexandre Herculano, 57",Portugal,[Santo António],"[building, cultural heritage]",Q9698147


### 4.1. Visualização da localização dos "Monumentos"

In [76]:
import folium
from folium import IFrame
from folium.plugins import Fullscreen, MiniMap
import html

# Criar o mapa centrado em Portugal
mapa = folium.Map(location=[39.5, -8.0], zoom_start=7, tiles=None, control_scale=True)

# Adicionar diferentes estilos de mapa
folium.TileLayer('OpenStreetMap', name='Padrão').add_to(mapa)
folium.TileLayer('CartoDB positron', name='Claro').add_to(mapa)
folium.TileLayer('CartoDB dark_matter', name='Escuro').add_to(mapa)

# Plugins extra
Fullscreen(position='topright').add_to(mapa)
MiniMap(toggle_display=True, position='bottomright').add_to(mapa)

# Criar um dicionário para armazenar as camadas por supercategoria
camadas = {}

# Iterar sobre os monumentos
for _, row in df_monumentos_pt.dropna(subset=['lat', 'lon']).iterrows():
    nome = html.escape(str(row.get('nome_monumento', 'Monumento')))
    
    imagem_urls = row.get('urls', [])
    imagem = imagem_urls[0] if isinstance(imagem_urls, list) and imagem_urls else None

    supercat_raw = row.get('supercategory', [])
    supercat_list = supercat_raw if isinstance(supercat_raw, list) else [supercat_raw]
    local = ', '.join(row.get('localizacoes_administrativas', [])) if isinstance(row.get('localizacoes_administrativas'), list) else row.get('localizacoes_administrativas', '')
    tipos = ', '.join(row.get('tipos_instancia_de', [])) if isinstance(row.get('tipos_instancia_de'), list) else row.get('tipos_instancia_de', '')

    # HTML estilizado
    html_popup = f"""
    <div style="width:240px; font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background:white; border-radius:10px; box-shadow:0 4px 12px rgba(0,0,0,0.2); overflow:hidden;">
        {"<img src='"+imagem+"' style='width:100%; height:auto; display:block;'>" if imagem else ""}
        <div style="padding:10px;">
            <h4 style="margin:0 0 10px 0; font-size:16px; text-align:center; color:#2c3e50;">{nome}</h4>
            <p style="margin:4px 0;"><b>Supercategoria:</b> {html.escape(', '.join(supercat_list))}</p>
            <p style="margin:4px 0;"><b>Localização:</b> {html.escape(str(local))}</p>
            <p style="margin:4px 0;"><b>Tipo:</b> {html.escape(str(tipos))}</p>
        </div>
    </div>
    """

    iframe = IFrame(html=html_popup, width=260, height=360)
    popup = folium.Popup(iframe, max_width=300)

    for supercat in supercat_list:
        if not supercat:
            supercat = "Sem Categoria"

        # Criar camada se ainda não existir
        if supercat not in camadas:
            camada = folium.FeatureGroup(name=supercat, show=True)
            camadas[supercat] = camada
            camada.add_to(mapa)

        # Adicionar marcador à camada correspondente
        folium.CircleMarker(
            location=[row['lat'], row['lon']],
            radius=8,
            color='#2c3e50',
            fill=True,
            fill_color='#2980b9',
            fill_opacity=0.9,
            tooltip=nome,
            popup=popup
        ).add_to(camadas[supercat])

# Adicionar controlo de camadas
folium.LayerControl(collapsed=False).add_to(mapa)

# Guardar o mapa
mapa.save("mapa_monumentos.html")

In [81]:
pd.set_option('display.max_colwidth', None)
df_monumentos_pt[["urls", "nome_monumento"]].head(1)

Unnamed: 0,urls,nome_monumento
0,"[http://upload.wikimedia.org/wikipedia/commons/f/fa/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_detalhe_do_pedestal_5.jpg, http://upload.wikimedia.org/wikipedia/commons/3/39/12._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, http://upload.wikimedia.org/wikipedia/commons/5/50/06._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, https://upload.wikimedia.org/wikipedia/commons/4/45/D.Pedro_IV.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d9/Statue_D._Pedro_IV_Porto_2017_2.jpg, https://upload.wikimedia.org/wikipedia/commons/b/bf/Estatua_de_D._Pedro_IV%2C_Oporto%2C_Portugal%2C_2012-05-09%2C_DD_01.JPG, https://upload.wikimedia.org/wikipedia/commons/6/64/Portugal_%2815435887847%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ef/Sto._Ildefonso_-_Praca_da_Liberdade_%289%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/77/D._Pedro_IV_%282331226699%29.jpg, http://upload.wikimedia.org/wikipedia/commons/4/46/11._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, http://upload.wikimedia.org/wikipedia/commons/5/59/Dom_Pedro_IV_P%C3%A7_Liberdade_Porto.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1e/Oporto_%28Portugal%29_%2816273680516%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e9/Porto%2C_Portugal_%2831898694622%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4a/Avenida_dos_Aliados_%285390427904%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e1/Est%C3%A1tua_equestre_de_D._Pedro_IV2.jpg, https://upload.wikimedia.org/wikipedia/commons/8/86/The_Liberal_king_%282333638958%29.jpg, http://upload.wikimedia.org/wikipedia/commons/a/a5/08._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, https://upload.wikimedia.org/wikipedia/commons/3/3f/Porto_08_%2818175173709%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Est%C3%A1tua_equestre_de_D._Pedro_IV5.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8e/Estatua_Equestre_D._Pedro_IV%2C_Aliados.jpg, http://upload.wikimedia.org/wikipedia/commons/0/0a/03._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, https://upload.wikimedia.org/wikipedia/commons/b/b5/Porto-Avenida_dos_Alliados-20142909.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3c/Est%C3%A1tua_Equestre_de_D._Pedro_IV.jpg, https://upload.wikimedia.org/wikipedia/commons/4/42/Aliados.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f1/2011_-_panoramio_%28326%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d5/ID_155834.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6c/Est%C3%A1tuaequestredeD.Pedro_IV.jpg, http://upload.wikimedia.org/wikipedia/commons/3/35/04._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, http://upload.wikimedia.org/wikipedia/commons/7/75/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_detalhe_do_pedestal_2.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f1/Portugalia_Porto_plac_wolno%C5%9Bci_pomnik_Pedra_IV_02.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c8/2014-P72_Porto_%2815522096089%29_%282%29.jpg, http://upload.wikimedia.org/wikipedia/commons/b/b0/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_detalhe_do_pedestal.jpg, https://upload.wikimedia.org/wikipedia/commons/4/46/Sto._Ildefonso_-_Praca_da_Liberdade_%287%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d2/2014-P71_Porto_%2815522576408%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/16/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_detalhe_do_pedestal_3.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b2/A_D._Pedro_IV_a_Cidade_do_Porto%2C_1866.JPG, http://upload.wikimedia.org/wikipedia/commons/d/d1/Est%C3%A1tua_de_D._Pedro_IV_-_Porto.jpg, https://upload.wikimedia.org/wikipedia/commons/9/91/Est%C3%A1tua_equestre_de_D_Pedro_IV.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4d/Portugalia_Porto_plac_wolno%C5%9Bci_pomnik_Pedra_IV_04.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3f/Oporto_%28Portugal%29_%2823413890773%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b6/King_Pedro_IV_-_details_%22in_love%22_%287844270414%29.jpg, http://upload.wikimedia.org/wikipedia/commons/a/a5/Est%C3%A1tua_de_D._Pedro_IV_-_relevo_lado_direito.jpg, http://upload.wikimedia.org/wikipedia/commons/a/aa/Est%C3%A1tua_de_D._Pedro_IV_%284%29_-_Porto.jpg, http://upload.wikimedia.org/wikipedia/commons/3/38/Est%C3%A1tual_de_D._Pedro_IV_-_pedestal_lado_esquerdo.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e8/ID_73484_etatua_D.Pedro_IV.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3f/Oporto_%28Portugal%29_%2816273680736%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6b/Avenida_dos_Aliados_%285389820505%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1c/Banco_Espirito_Santo_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/b/bc/Est%C3%A1tua_equestre_de_D._Pedro_IV4.jpg, https://upload.wikimedia.org/wikipedia/commons/0/0f/Est%C3%A1tua_de_D._Pedro_IV.jpg, http://upload.wikimedia.org/wikipedia/commons/2/2b/Est%C3%A1tua_de_D._Pedro_IV_%283%29_-_Porto.jpg, https://upload.wikimedia.org/wikipedia/commons/6/68/Porto%2C_Portugal_%2810552088176%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5e/Avenida_dos_Aliados2.jpg, https://upload.wikimedia.org/wikipedia/commons/3/30/Estatua_de_D._Pedro_IV%2C_Oporto%2C_Portugal%2C_2012-05-09%2C_DD_03.JPG, https://upload.wikimedia.org/wikipedia/commons/3/3e/Est%C3%A1tua_de_D._Pedro_IV_com_a_C%C3%A2mara_Municipal_ao_fundo.jpg, http://upload.wikimedia.org/wikipedia/commons/4/49/Est%C3%A1tua_de_D._Pedro_IV_%282%29_-_Porto.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5f/Oporto-1_%288610575190%29.jpg, http://upload.wikimedia.org/wikipedia/commons/9/94/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_relevo_lado_esquerdo_2.jpg, http://upload.wikimedia.org/wikipedia/commons/f/f6/05._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, https://upload.wikimedia.org/wikipedia/commons/a/a9/Avenida_dos_Aliados%2C_Porto_-_Mar_2010.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2c/2014-P70%28I%29_Porto_%2815640554438%29.jpg, http://upload.wikimedia.org/wikipedia/commons/1/14/Est%C3%A1tua_de_D._Pedro_IV_%28Av._Aliados%29.JPG, https://upload.wikimedia.org/wikipedia/commons/2/26/Est%C3%A1tua_equestre_de_D._Pedro_IV3.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d6/Pra%C3%A7a_da_Liberdade%2C_Porto._Portugal._%2816478715718%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3e/Porto_-i---i-_%2840950995982%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/ad/Porto_%285760136296%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/01/Aliados_I.jpg, https://upload.wikimedia.org/wikipedia/commons/5/58/Sto._Ildefonso_-_Praca_da_Liberdade_%288%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3b/Porto_-_Est%C3%A1tua_equestre_de_D._Pedro_IV_ao_anoitecer.jpg, https://upload.wikimedia.org/wikipedia/commons/6/67/Estatua_de_D._PedroIV.jpg, https://upload.wikimedia.org/wikipedia/commons/f/fb/Porto_-i---i-_%2840950990442%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/46/20110924_D80_EstatuaDPedroIV-73484_0492.jpg, https://upload.wikimedia.org/wikipedia/commons/9/97/Estatua_de_D._Pedro_IV%2C_Oporto%2C_Portugal%2C_2012-05-09%2C_DD_02.JPG, http://upload.wikimedia.org/wikipedia/commons/5/5f/Est%C3%A1tua_de_D._Pedro_IV_-_relevo_lado_direito_2.jpg, http://upload.wikimedia.org/wikipedia/commons/4/41/01._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG, https://upload.wikimedia.org/wikipedia/commons/e/ee/King_Pedro_IV_statue_%287844270226%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f6/Est%C3%A1tua_equestre_de_D._Pedro_IV.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b0/Avenida_dos_aliados_%282340202743%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/70/Oporto-2_%288610579546%29.jpg, http://upload.wikimedia.org/wikipedia/commons/a/a3/Est%C3%A1tua_de_D._Pedro_IV_-_Porto_-_detalhe_do_pedestal_4.jpg, http://upload.wikimedia.org/wikipedia/commons/b/b0/10._Statue_%C3%A9questre_de_Pierre_IV_%28Porto%29.JPG]",Monument to Pedro IV (Porto)


### 4.2. Guardar as imagens

In [83]:
import unicodedata
import re
pd.reset_option('display.max_colwidth')

# Função para limpar nomes de pastas
def limpar_nome(nome):
    nome = unicodedata.normalize('NFKD', nome).encode('ASCII', 'ignore').decode('utf-8')
    nome = re.sub(r'[\\/*?:"<>|]', "", nome)  # remover caracteres inválidos
    nome = nome.strip().replace(' ', '_')
    return nome

# Pasta Train Base
dataTrain_dir = data_dir / 'train'
dataTrain_dir.mkdir(exist_ok=True)

# Simular browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Função para descarregar uma imagem (com cache e paralelismo)
def descarregar_imagem(monumento_nome, url, i):
    try:
        nome_limpo = limpar_nome(monumento_nome)
        pasta_monumento = dataTrain_dir / nome_limpo
        pasta_monumento.mkdir(exist_ok=True)

        # Geração de nome único por hash da URL
        hash_url = hashlib.md5(url.encode()).hexdigest()
        extensao = url.split('.')[-1].split('?')[0][:4]
        nome_ficheiro = f"{i:03d}_{hash_url}.{extensao}"
        caminho_ficheiro = pasta_monumento / nome_ficheiro

        if caminho_ficheiro.exists():
            return  # já existe

        resposta = requests.get(url, headers=headers, timeout=10)
        if resposta.status_code == 200:
            with open(caminho_ficheiro, 'wb') as f:
                f.write(resposta.content)
        else:
            print(f"Erro ao descarregar {url}: status {resposta.status_code}")
    except Exception as e:
        print(f"Erro com '{monumento_nome}': {e}")

# Criar lista de tarefas
tarefas = []
for _, row in df_monumentos_pt.iterrows():
    nome = str(row.get("nome_monumento", "monumento_desconhecido"))
    urls = row.get("urls", [])
    if isinstance(urls, list):
        for i, url in enumerate(urls):
            tarefas.append((nome, url, i))

# Descarregar em paralelo
with ThreadPoolExecutor(max_workers=16) as executor:
    list(tqdm(executor.map(lambda x: descarregar_imagem(*x), tarefas), total=len(tarefas)))

 12%|█▏        | 856/7210 [01:34<17:47,  5.95it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/07/P8062937_ShiftN2_%287833274928%29.jpg: status 404


 15%|█▌        | 1112/7210 [02:11<07:49, 12.98it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/5/51/Adro_da_Capela_de_Santo_Amaro_-_Lisboa_-_Portuga_%2837669571036%29.jpg: status 404


 21%|██        | 1488/7210 [02:50<07:47, 12.23it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/60/Lisbon-20160210-029_%2825788996065%29.jpg: status 404


 21%|██▏       | 1547/7210 [02:58<10:02,  9.40it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/c/c9/2008_-_panoramio_-_%D0%92%D0%B0%D0%BB%D0%B5%D1%80%D0%B8%D0%B9_%D0%94%D0%B5%D0%B4_%283%29.jpg: status 404


 23%|██▎       | 1636/7210 [03:13<16:42,  5.56it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/2/2f/Plaza_del_Comercio%2C_Lisboa%2C_Portugal%2C_2012-05-12%2C_DD_03.JPG: status 404


 24%|██▍       | 1734/7210 [03:27<21:45,  4.19it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/84/Plaza_del_Comercio%2C_Lisboa%2C_Portugal%2C_2012-05-12%2C_DD_01.JPG: status 404


 27%|██▋       | 1912/7210 [03:44<06:08, 14.37it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/9/94/Le%C3%A7a_da_Palmeira_IMG_3160.JPG_%286104711629%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/fa/Le%C3%A7a_da_Palmeira_IMG_3158.JPG_%286104708437%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/c/c3/Le%C3%A7a_da_Palmeira_IMG_3089.JPG_%286104517397%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/6e/Le%C3%A7a_da_Palmeira_IMG_3165.JPG_%286105266406%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d4/Le%C3%A7a_da_Palmeira_IMG_3079.JPG_%286105048112%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/7f/Le%C3%A7a_da_Palmeira_IMG_3088.JPG_%286104514785%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/77/Le%C3%A7a_da_Palmeira_IMG_3162.JPG_%286105260738%29.jpg: status 404
Erro ao descarregar 

 27%|██▋       | 1915/7210 [03:46<10:44,  8.21it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/71/Le%C3%A7a_da_Palmeira_IMG_3070.JPG_%286105019826%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/73/Le%C3%A7a_da_Palmeira_IMG_3078.JPG_%286105045256%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/8c/Le%C3%A7a_da_Palmeira_IMG_3161.JPG_%286105259034%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/62/Le%C3%A7a_da_Palmeira_IMG_3081.JPG_%286104507201%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/46/Le%C3%A7a_da_Palmeira_IMG_3103.JPG_%286105107532%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/1/10/Le%C3%A7a_da_Palmeira_IMG_3251.JPG_%286105463608%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d1/Le%C3%A7a_da_Palmeira_IMG_3250.JPG_%286104917465%29.jpg: status 404
Erro ao descarregar 

 27%|██▋       | 1956/7210 [03:48<04:34, 19.14it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/2/29/Le%C3%A7a_da_Palmeira_IMG_3239.JPG_%286104893417%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/e/ee/Le%C3%A7a_da_Palmeira_IMG_3071.JPG_%286105023086%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/1/15/Le%C3%A7a_da_Palmeira_IMG_3166.JPG_%286105268324%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/08/Le%C3%A7a_da_Palmeira_IMG_3104.JPG_%286105109796%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/3/36/Restaurante_Boa_Nova._%286070725063%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/a/ae/Le%C3%A7a_da_Palmeira_IMG_3120.JPG_%286105157932%29.jpg: status 404


 27%|██▋       | 1982/7210 [03:54<12:19,  7.07it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/02/Igreja_da_Nossa_Senhora_do_Carmo_das_Carmelitas_-_panoramio_%281%29.jpg: status 404


 48%|████▊     | 3453/7210 [07:02<06:49,  9.18it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/40/Exercise_TRIDENT_JUNCTURE_%2821808370373%29.jpg: status 404


 48%|████▊     | 3477/7210 [07:03<04:25, 14.04it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/fb/Exercise_TRIDENT_JUNCTURE_%2822241527080%29.jpg: status 404


 56%|█████▌    | 4050/7210 [08:04<06:52,  7.67it/s]

Erro ao descarregar http://upload.wikimedia.org/wikipedia/commons/8/8c/Portugal_Cabo_S_Vincence.jpg: status 404


 59%|█████▉    | 4261/7210 [08:43<11:30,  4.27it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/4f/%D0%A4%D0%BE%D1%80%D1%82_%D0%B4%D0%B5_%D0%A1%D0%B0%D0%BD_%D0%91%D1%80%D0%B0%D1%88_%D0%B4%D0%B5_%D0%A1%D0%B0%D0%BD%D0%BA%D1%81%D0%B5%D1%82_%28Forte_de_S%C3%A3o_Br%C3%A1s_de_Sanxete%29_-_panoramio.jpg: status 404


 64%|██████▍   | 4605/7210 [09:23<05:05,  8.52it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/1/19/Igreja_de_S%C3%A3o_Domingos_em_Viana_do_Castelo.jpg: status 404


 64%|██████▍   | 4646/7210 [09:25<03:46, 11.30it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/69/P8093304crw_%287881501342%29.jpg: status 404


 66%|██████▌   | 4750/7210 [09:35<05:01,  8.15it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/8e/Igreja_das_Carmelitas_Porto_01.jpg: status 404


 67%|██████▋   | 4800/7210 [09:46<07:01,  5.71it/s]

Erro ao descarregar http://upload.wikimedia.org/wikipedia/commons/b/be/FortalzaSagres.jpg: status 404


 69%|██████▉   | 4972/7210 [10:10<10:53,  3.42it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/bf/DIMG_7326_%281873279911%29.jpg: status 404


 90%|████████▉ | 6462/7210 [13:15<00:57, 13.01it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/87/Monastery_%2839047661370%29.jpg: status 404


 92%|█████████▏| 6601/7210 [13:34<01:15,  8.04it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/3/38/IMG_2973.JPG_%288071340831%29.jpg: status 404


 95%|█████████▌| 6863/7210 [14:05<00:27, 12.60it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/da/IMG_2967.JPG_%288071332448%29.jpg: status 404


 98%|█████████▊| 7040/7210 [14:36<00:44,  3.86it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d8/Silves_-_Rua_Diogo_Manuel_03.2018.jpg: status 404


 99%|█████████▊| 7103/7210 [14:37<00:11,  9.64it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/f6/Cidade_de_Silves.JPG: status 404


 99%|█████████▉| 7147/7210 [14:40<00:05, 12.38it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/9/91/Cidade_de_Silves12.jpg: status 404


100%|██████████| 7210/7210 [14:47<00:00,  8.12it/s]


In [85]:
# Contagem
total_monumentos = 0
total_imagens = 0
imagens_por_monumento = {}

# Iterar pelas subpastas (cada pasta = um "monumento")
for pasta_monumento in sorted(dataTrain_dir.iterdir()):
    if pasta_monumento.is_dir():
        imagens = list(pasta_monumento.glob('*.*'))  # todas as imagens (qualquer extensão)
        num_imagens = len(imagens)
        imagens_por_monumento[pasta_monumento.name] = num_imagens
        total_monumentos += 1
        total_imagens += num_imagens

# Resultados
print(f"Total de monumentos: {total_monumentos}")
print(f"Total de imagens: {total_imagens}")
print("\nNúmero de imagens por monumento:")
for nome, num in imagens_por_monumento.items():
    print(f"- {nome}: {num}")


Total de monumentos: 306
Total de imagens: 7154

Número de imagens por monumento:
- Alfanzina_Lighthouse: 3
- Arnel_Point_Lighthouse: 12
- Aveiro_Lighthouse: 29
- Basilica_of_Our_Lady_of_the_Martyrs,_Lisboa: 5
- Brejoeira_Palace: 7
- Cabo_Carvoeiro_Lighthouse: 11
- Cabo_da_Roca_Lighthouse: 117
- Cabo_de_Santa_Maria_Lighthouse: 10
- Cabo_de_Sao_Vicente_Lighthouse: 114
- Cabo_Espichel_Lighthouse: 39
- Cabo_Mondego_Lighthouse: 12
- Cabo_Raso_Lighthouse: 4
- Cabo_Sardao_Lighthouse: 7
- Capela_da_Boa_Nova: 9
- Capela_das_Almas: 75
- Capela_de_Nossa_Senhora_da_Encarnacao: 4
- Capela_de_Nossa_Senhora_do_Monte: 9
- Capela_de_Santo_Amaro_(Alcantara): 37
- Capela_de_Sao_Jorge: 7
- Capela_do_Anjo_da_Guarda_(Ponte_de_Lima): 10
- Capela_do_Calvario: 4
- Capela_do_Paco_da_Bemposta: 5
- Capela_do_Senhor_da_Boa_Nova: 5
- Capela_do_Senhor_da_Pedra: 81
- Capela_dos_Alfaiates: 7
- Carrancas_Palace: 7
- Carris_Museum: 4
- Casa_dos_Bicos: 37
- Casa_dos_Pitas: 4
- Casa_Museu_Jose_Regio: 3
- Casa_Nobre_de_La