### Imports

In [61]:
########################### DataSet ########################### 
# https://github.com/cvdfoundation/google-landmark?tab=readme-ov-file#release-history
###############################################################

import os
import pandas as pd
import hashlib
import requests
import tarfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

base_dir = Path(os.getcwd()).resolve().parent
base_dir

WindowsPath('C:/Users/diogo/Desktop/APVC/APVC-ProjetoFinal')

In [62]:
data_dir = base_dir / 'data'
data_dir.mkdir(parents=True, exist_ok=True)

## 1. Recolher os dados

In [63]:
dataLandMarkTrain_dir = data_dir / 'land_mark' / 'train'
dataLandMarkTrain_dir.mkdir(parents=True, exist_ok=True)

urls = [
    "https://s3.amazonaws.com/google-landmark/metadata/train.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_clean.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_attribution.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_label_to_category.csv",
    "https://s3.amazonaws.com/google-landmark/metadata/train_label_to_hierarchical.csv"
]

# Download dos ficheiros apenas se não existirem
for u in urls:
    file_name = Path(u).name
    save_path = dataLandMarkTrain_dir / file_name

    if save_path.exists():
        print(f"Já existe: {save_path}")
        continue

    response = requests.get(u)
    response.raise_for_status()

    with open(save_path, "wb") as f:
        f.write(response.content)

    print(f"Guardado: {save_path}")


Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_clean.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_attribution.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_label_to_category.csv
Já existe: C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\train_label_to_hierarchical.csv


Vou utilizar o **train_clean.csv** como base, já que este possui imagens mais fiáveis quando comparado com **train.csv**.

## 2. Merge dos DataSets

In [64]:
def colunas_em_comum(df1, df2):
    """
    Recebe dois DataFrames e devolve o conjunto de colunas em comum.
    """
    return set(df1.columns) & set(df2.columns)

### 2.1. Train_clean.csv - DataSet Base

In [65]:
train_clean = pd.read_csv(dataLandMarkTrain_dir / "train_clean.csv")
train_clean["images"] = train_clean["images"].apply(lambda x: x.split() if pd.notnull(x) else [])
train_clean.head()

Unnamed: 0,landmark_id,images
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948edc0340, fb09f1e98c6d2f70]"
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f473ba35, 4a7ba9eb16d51bc4, 597353dfbb3df649, a40d00dc4fcc3a10, aff1d42de18d9efe, c87bbcbf35a41875]"
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f1b2321c, 28267d88d4d9ea30, 294c5690ad39a48e, 52ac5040369fc460, 5f849ade1b4fbcb5, 86463b5e23adde46, 899f66ffe9ba3559, 904efd09f3536f0e, a83f150c65ab52fd, b88fc57ed9ee91c8, c671c4809a90ee0e, dd7e7efdace99087, e75cc529d7dc7506, ea2537ff6259b15b]"
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da868c9113, 32652480a7d99c5e, 34533ce2fb47a64f, 3c79cb8374f8ec83, 49c20b7fcf95c10d, 6ad926b79d48e39d, 6ce47c7c47dd8531, 73e5aa8fb1eac238, 8a28e62cfb853e04, 8be314135f27f76a, 9aca8d92c54267d7, b9ebc781b0cfae45, c27520666ce65248, c8a0eef7862fbdb9, c933667cba57b88c, dc0012bbc1004ffd, ddd6991e71d5e25c, e2fe265fade3c806, eb002884433672c3, f2b734f50257264b]"
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd36a6b01d, 2b87d221476447d2, 2d685b1280ba366b, 30a8e693c1dae116, 346204851c3234f5, 39ae9ce73feeaa81, 4ea6aed2ce0b2164, 57175747c275757e, 65b1d023fd3b351a, 6aed1ad6270932e7, 6c18bb0fe1a03eff, 7930846e49898145, 7b944679ec3813ee, 97b35f74c109bd68, 9c7e6430230cc82f, a48926b15f6bd8eb, a9cbe5b398593f4e, abb32a2249a0c6d4, b9fdbe1fd2335305, bac6e5726b5d1695, bb9f93faea6dcbd8, f5aacdd99e5b4966, f61eac519341403b]"


A coluna `images` corresponde ao **`id`** do *train.csv*

### 2.2. train_label_to_hierarchical.csv - Filtrar pelas categorias

In [66]:
train_label_to_hierarchical = pd.read_csv(dataLandMarkTrain_dir / "train_label_to_hierarchical.csv")
train_label_to_hierarchical.head()

Unnamed: 0,landmark_id,category,supercategory,hierarchical_label,natural_or_human_made
0,0,http://commons.wikimedia.org/wiki/Category:Happy_Valley_Racecourse,horse racing venue,sports venue,human-made
1,1,http://commons.wikimedia.org/wiki/Category:Luitpoldpark_in_Munich,park,parks,natural
2,2,http://commons.wikimedia.org/wiki/Category:Grand_Ventron,mountain,mountain,natural
3,5,http://commons.wikimedia.org/wiki/Category:Lakeside_International_Raceway,motorsport racing track,road,human-made
4,7,"http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen",multi-purpose hall,,


In [67]:
mergeWith = colunas_em_comum(train_clean, train_label_to_hierarchical)
train_clean = train_clean.merge(train_label_to_hierarchical, on=list(mergeWith), how='left')
train_clean.head()

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948edc0340, fb09f1e98c6d2f70]",http://commons.wikimedia.org/wiki/Category:Luitpoldpark_in_Munich,park,parks,natural
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f473ba35, 4a7ba9eb16d51bc4, 597353dfbb3df649, a40d00dc4fcc3a10, aff1d42de18d9efe, c87bbcbf35a41875]","http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen",multi-purpose hall,,
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f1b2321c, 28267d88d4d9ea30, 294c5690ad39a48e, 52ac5040369fc460, 5f849ade1b4fbcb5, 86463b5e23adde46, 899f66ffe9ba3559, 904efd09f3536f0e, a83f150c65ab52fd, b88fc57ed9ee91c8, c671c4809a90ee0e, dd7e7efdace99087, e75cc529d7dc7506, ea2537ff6259b15b]",,,,
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da868c9113, 32652480a7d99c5e, 34533ce2fb47a64f, 3c79cb8374f8ec83, 49c20b7fcf95c10d, 6ad926b79d48e39d, 6ce47c7c47dd8531, 73e5aa8fb1eac238, 8a28e62cfb853e04, 8be314135f27f76a, 9aca8d92c54267d7, b9ebc781b0cfae45, c27520666ce65248, c8a0eef7862fbdb9, c933667cba57b88c, dc0012bbc1004ffd, ddd6991e71d5e25c, e2fe265fade3c806, eb002884433672c3, f2b734f50257264b]",http://commons.wikimedia.org/wiki/Category:Mercado_Modelo,market hall,market,human-made
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd36a6b01d, 2b87d221476447d2, 2d685b1280ba366b, 30a8e693c1dae116, 346204851c3234f5, 39ae9ce73feeaa81, 4ea6aed2ce0b2164, 57175747c275757e, 65b1d023fd3b351a, 6aed1ad6270932e7, 6c18bb0fe1a03eff, 7930846e49898145, 7b944679ec3813ee, 97b35f74c109bd68, 9c7e6430230cc82f, a48926b15f6bd8eb, a9cbe5b398593f4e, abb32a2249a0c6d4, b9fdbe1fd2335305, bac6e5726b5d1695, bb9f93faea6dcbd8, f5aacdd99e5b4966, f61eac519341403b]",http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden),architectural structure,,


### 2.3. train.csv - Retirar os links das imagens

In [68]:
train = pd.read_csv(dataLandMarkTrain_dir / "train.csv")
train["url"] = train["url"].apply(lambda x: x.split() if pd.notnull(x) else [])
train.head()

Unnamed: 0,id,url,landmark_id
0,6e158a47eb2ca3f6,[https://upload.wikimedia.org/wikipedia/commons/b/b5/Observatoriet_v%C3%A4derkammer_2013a.jpg],142820
1,202cd79556f30760,[http://upload.wikimedia.org/wikipedia/commons/6/63/Ecosse200996-1.jpg],104169
2,3ad87684c99c06e1,[http://upload.wikimedia.org/wikipedia/commons/2/2c/Pirmasens_Dynamikum.jpg],37914
3,e7f70e9c61e66af3,[https://upload.wikimedia.org/wikipedia/commons/0/02/Occidental_Vertical.jpg],102140
4,4072182eddd0100e,[https://upload.wikimedia.org/wikipedia/commons/5/51/Looking_downstream_from_the_footbridge_over_the_Severn_-_geograph.org.uk_-_532337.jpg],2474


In [69]:
id_to_url = dict(zip(train['id'], train['url']))

def ids_para_urls(lista_ids):
    if not isinstance(lista_ids, list):
        return []
    urls = []
    for i in lista_ids:
        url = id_to_url.get(i)
        if url:
            if isinstance(url, list):
                urls.extend(url)  # adiciona todos os urls da lista
            else:
                urls.append(url)  # adiciona url único
    return urls

# Aplicar à coluna 'images' do train_clean e criar nova coluna 'urls'
train_clean['urls'] = train_clean['images'].apply(ids_para_urls)
train_clean

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948edc0340, fb09f1e98c6d2f70]",http://commons.wikimedia.org/wiki/Category:Luitpoldpark_in_Munich,park,parks,natural,"[http://upload.wikimedia.org/wikipedia/commons/a/ad/Luitpoldpark_Muenchen-13.jpg, http://upload.wikimedia.org/wikipedia/commons/9/9d/Luitpoldpark_Muenchen-9.jpg, http://upload.wikimedia.org/wikipedia/commons/f/f7/Luitpoldpark_Muenchen-8.jpg, http://upload.wikimedia.org/wikipedia/commons/3/3c/Luitpoldpark_Muenchen-11.jpg]"
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f473ba35, 4a7ba9eb16d51bc4, 597353dfbb3df649, a40d00dc4fcc3a10, aff1d42de18d9efe, c87bbcbf35a41875]","http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen",multi-purpose hall,,,"[https://upload.wikimedia.org/wikipedia/commons/b/b8/Sparkassen-Arena_Goettingen_2018-06.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6d/Sparkassen-Arena%2C_G%C3%B6ttingen_06.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5f/Sparkassen-Arena%2C_G%C3%B6ttingen_02.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f4/Sparkassen-_Arena_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/c/cd/Sparkassen-Arena%2C_G%C3%B6ttingen_05.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6a/Sparkassen-Arena%2C_G%C3%B6ttingen_04.jpg, https://upload.wikimedia.org/wikipedia/commons/1/12/Sparkassen-Arena%2C_G%C3%B6ttingen_01.jpg, https://upload.wikimedia.org/wikipedia/commons/6/61/Sparkassen-Arena%2C_G%C3%B6ttingen_03.jpg]"
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f1b2321c, 28267d88d4d9ea30, 294c5690ad39a48e, 52ac5040369fc460, 5f849ade1b4fbcb5, 86463b5e23adde46, 899f66ffe9ba3559, 904efd09f3536f0e, a83f150c65ab52fd, b88fc57ed9ee91c8, c671c4809a90ee0e, dd7e7efdace99087, e75cc529d7dc7506, ea2537ff6259b15b]",,,,,"[https://upload.wikimedia.org/wikipedia/commons/4/4b/13e16.JPG, https://upload.wikimedia.org/wikipedia/commons/5/55/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2809%29.JPG, https://upload.wikimedia.org/wikipedia/commons/0/03/Luce_sull%27antico_Orologio.jpg, https://upload.wikimedia.org/wikipedia/commons/2/29/L%27antico_orologio..JPG, https://upload.wikimedia.org/wikipedia/commons/d/d1/Bosa_Antico_orologio.jpg, https://upload.wikimedia.org/wikipedia/commons/c/cd/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2807%29.JPG, https://upload.wikimedia.org/wikipedia/commons/d/dc/Vecchio_orologio_Bosa.jpg, https://upload.wikimedia.org/wikipedia/commons/8/86/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2802%29.JPG, https://upload.wikimedia.org/wikipedia/commons/5/52/Antico_orologio_a_Bosa.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e8/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2804%29.JPG, https://upload.wikimedia.org/wikipedia/commons/7/7f/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2806%29.JPG, https://upload.wikimedia.org/wikipedia/commons/d/df/Orologiando.jpg, https://upload.wikimedia.org/wikipedia/commons/7/70/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2803%29.JPG, https://upload.wikimedia.org/wikipedia/commons/6/6d/Bosa%2C_chiesa_del_rosario%2C_01.JPG, https://upload.wikimedia.org/wikipedia/commons/c/cc/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2801%29.JPG, https://upload.wikimedia.org/wikipedia/commons/9/94/Bosa_-_Chiesa_della_Madonna_del_Rosario_%2808%29.JPG]"
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da868c9113, 32652480a7d99c5e, 34533ce2fb47a64f, 3c79cb8374f8ec83, 49c20b7fcf95c10d, 6ad926b79d48e39d, 6ce47c7c47dd8531, 73e5aa8fb1eac238, 8a28e62cfb853e04, 8be314135f27f76a, 9aca8d92c54267d7, b9ebc781b0cfae45, c27520666ce65248, c8a0eef7862fbdb9, c933667cba57b88c, dc0012bbc1004ffd, ddd6991e71d5e25c, e2fe265fade3c806, eb002884433672c3, f2b734f50257264b]",http://commons.wikimedia.org/wiki/Category:Mercado_Modelo,market hall,market,human-made,"[https://upload.wikimedia.org/wikipedia/commons/d/d0/Mercado_Modelo_02.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8a/Mercado_Modelo_%28ao_fundo_o_Elevador_Lacerda%29_%284459275737%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/78/Mercado_Square_from_Lacerda_Elevator_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a9/Salvador_da_Bahia%2C_Brasil_-_2010-12-10_-_94234898.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d6/Mercado_modelo1.jpg, https://upload.wikimedia.org/wikipedia/commons/9/98/Mercado_Modelo_-_panoramio.jpg, http://upload.wikimedia.org/wikipedia/commons/0/0a/Mercado_Modelo_-_Salvador.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f6/Mercado_modelo_salvador_de_bahia_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2f/Mercado_Modelo_01.jpg, https://upload.wikimedia.org/wikipedia/commons/3/39/O_Mercado_Modelo_e_o_Forte_S%C3%A3o_Marcelo.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4e/Mercado_modelo_salvador_bahia.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f4/Mercadomodelo.JPG, https://upload.wikimedia.org/wikipedia/commons/d/dc/Mercado_Modelo_%282186684012%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a8/Mercado_Modelo_Salvador_2018-0228.jpg, https://upload.wikimedia.org/wikipedia/commons/9/9d/MERCADO_MODELO_S._DE_BAHIA_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e6/Mercado_Modelo_Foto-_Tatiana_Azeviche-Setur_%2835095618713%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c4/Mercado_Modelo_Foto-_Tatiana_Azeviche_%2835620527520%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/55/SALVADOR_DE_BAHIA_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8f/Webysther_20150907165051_-_Mercado_modelo.jpg, https://upload.wikimedia.org/wikipedia/commons/0/0a/Cidade_bassa%2C_pra%C3%A7a_Cairu%2C_mercado_modelo_01.JPG, https://upload.wikimedia.org/wikipedia/commons/e/e0/Mercado_Modelo_-_Foto_Tereza_Torres073_%2827%29_%2816278599992%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/58/Fundo_do_Mercado_Modelo.jpg]"
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd36a6b01d, 2b87d221476447d2, 2d685b1280ba366b, 30a8e693c1dae116, 346204851c3234f5, 39ae9ce73feeaa81, 4ea6aed2ce0b2164, 57175747c275757e, 65b1d023fd3b351a, 6aed1ad6270932e7, 6c18bb0fe1a03eff, 7930846e49898145, 7b944679ec3813ee, 97b35f74c109bd68, 9c7e6430230cc82f, a48926b15f6bd8eb, a9cbe5b398593f4e, abb32a2249a0c6d4, b9fdbe1fd2335305, bac6e5726b5d1695, bb9f93faea6dcbd8, f5aacdd99e5b4966, f61eac519341403b]",http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden),architectural structure,,,"[https://upload.wikimedia.org/wikipedia/commons/9/99/Sehensw%C3%BCrdigkeit_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a6/Stadtgarten_am_Annaberg_-_panoramio_%281%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/87/Wassertreppe_am_Paradies_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/3/34/Wasserkunst_Paradies_erbaut_1925_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/c/ce/Wasserkunstanlage_Paradies_%281%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/df/Baden-Baden-Wasserkunst_Paradies-18-abwaerts-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/7/71/Stadtgarten_am_Annaberg_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a7/Baden-Baden-Wasserkunst_Paradies-04-abwaerts-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e8/Baden-Baden-Wasserkunst_Paradies-16-aufwaerts-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2d/Baden-Baden_im_Schwarzwald_-_panoramio_%284%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ed/Baden-Baden-Wasserkunst_Paradies-20-abwaerts-beschneit-gje.jpg, http://upload.wikimedia.org/wikipedia/commons/9/9b/Wasserkunstanlage_Paradies.jpg, https://upload.wikimedia.org/wikipedia/commons/c/cb/M%C3%A4rchenhafte_Brunnengrotte_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5a/Baden-Baden-Wasserkunst_Paradies-02-aufwaerts-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d2/Brunnengrotte_-_Baden_-_Baden_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a0/Sehensw%C3%BCrdigkeit_Wassertreppe_und_Brunnengrotte_am_Paradies_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/2/27/Baden_-_Baden_-_Am_Paradies_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/3/38/Baden-Baden-Wasserkunst_Paradies-24-aufwaerts-beschneit-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c8/Wasserkunst_Paradies_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/6/66/Baden-Baden-Wasserkunst_Paradies-06-abwaerts-gje.jpg, https://upload.wikimedia.org/wikipedia/commons/5/57/Wassertreppe_-_Baden_-_Baden_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/1/14/Wassertreppe_Am_Paradies_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/2/26/Baden_-_Baden_-_Wasserkunst_-_Paradies_-_panoramio.jpg, http://upload.wikimedia.org/wikipedia/commons/8/87/Wasserkunstanlage_Paradies_%282%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2f/Baden_-_Baden_-_Am_Paradies_-_panoramio_%281%29.jpg]"
...,...,...,...,...,...,...,...
81308,203083,"[1def5ad0872c6303, 23349c63e48a7de2, 2c9d17eeb31f030c, 359addbaa70e8db9, 35da3f416d6ef6e9, 39f18c01af6059f3, 4f2b546dbf8f5595, 521cb9fec58982b9, 57e9a5e349eb8105, 672d5d2ed4e356bf, 683e814a72a0ddb8, 7258afb9d27f1580, 73dcfe77f97de5aa, 7461d0bccbf7bd31, 77b40994b780e172, 7a550aaee7c2f10e, 8fe356b26ab2f4b9, 96d303ee08691f14, b3d7c9f467f1de13, dc06a16977617465, e05ae0af9b1b1546, fde602950dbcb7e2]",http://commons.wikimedia.org/wiki/Category:St._Peter's_Parish_Church_(Radovljica),parish church,church,human-made,"[https://upload.wikimedia.org/wikipedia/commons/2/27/Radovljica_Linhartov_Trg_Pfarrkirche_hl_Petrus_Hauptportal_24062016_2842.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e2/Radovljica_%2831656398355%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/11/Radovljica_-_cerkev_sv._Petra_%28vhodni_portali%29_2.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8a/Radovljica_-_cerkev_sv._Petra_%28pro%C4%8Delje%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/31/Radovljica_-_Trubarjeva_ulica_%28pogled_na_cerkev_sv._Petra%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/49/Radovljica_Linhartov_Trg_parish_church_sv_Petra_portal_13052015_3683.jpg, https://upload.wikimedia.org/wikipedia/commons/4/47/Radovljica_Trubarjeva_ulica_pogled_na_cerkev_sv_Petra_10042017_7397.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2a/Radovljica_Linhartov_Trg_Pfarrkirche_hl_Petrus_Hauptportal_und_Rosettenfenster_18082017_0428.jpg, https://upload.wikimedia.org/wikipedia/commons/6/66/Radovljica_Linhartov_Trg_Pfarrkirche_hl_Petrus_Westfassade_mit_Portalen_10042017_7415.jpg, https://upload.wikimedia.org/wikipedia/commons/3/30/Radovljica_-_cerkev_sv._Petra_%28zvonik%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1e/Radovljica_Linhartov_trg_Pfarrkirche_Sankt_Peter_19032015_0942.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8a/Radovljica_-_cerkev_sv._Petra_%28vhodni_portali%29_1.jpg, https://upload.wikimedia.org/wikipedia/commons/5/51/St._Peter%27s_Church_%E8%81%96%E4%BC%AF%E5%A4%9A%E7%A5%BF%E5%A0%82_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3d/Radovljica_-_%C5%BEupni%C5%A1%C4%8De_%28dvori%C5%A1%C4%8De%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/60/St._Peter%27s_Parish_Church_%E8%81%96%E4%BC%AF%E5%A4%9A%E7%A5%BF%E5%A0%82_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a1/Radovljica_Linhartov_Trg_Pfarrkirche_hl_Petrus_Westfassade_mit_Portalen_24062016_2841.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Radovljica_Linhartov_Trg_Pfarrkirche_hl_Petrus_rechtes_Nebenportal_24062016_2843.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3d/Radovljica_Cesta_Svobode_1_10042017_7436.jpg, https://upload.wikimedia.org/wikipedia/commons/f/fa/St_Peter%27s_Parish_Church_%E8%81%96%E4%BC%AF%E5%A4%9A%E7%A5%BF%E5%A0%82_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/5/54/Radovljica_Trubarjeva_ulica_pogled_na_cerkev_sv_Petra_10042017_7392.jpg, https://upload.wikimedia.org/wikipedia/commons/9/95/Radovljica_Linhartov_Trg_parish_church_sv_Petra_side_portal_13052015_3684.jpg, https://upload.wikimedia.org/wikipedia/commons/b/bc/Radovljica_-_cerkev_sv._Petra_%28glavni_vhod%29.jpg]"
81309,203085,"[0926becdb2c92f8a, 22b58ac428531da8, 2c90549570d85759, 3c22c13791940790, 46e00f9e282a9683, 5f41b7849e0cb267, 763091ff5c133d5c, 7ff0ecc5fd479aac, 9d443cfa5f18b487, 9f3088033b090b40, a18e237d1f732cbf, ae72842b8a405cf4, b14212a62b4cda4f, d5a7e41aef03ea5c, f3a46c3cc912e81e]","http://commons.wikimedia.org/wiki/Category:Church_of_the_Theotokos_of_Tikhvin,_Bryansk",church building,church,human-made,"[https://upload.wikimedia.org/wikipedia/commons/e/ef/%D0%91%D1%80%D1%8F%D0%BD%D1%81%D0%BA._%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C._%D0%A7%D0%B0%D1%81%D0%BE%D0%B2%D0%BD%D1%8F_%D0%BD%D0%B0%D0%B4_%D1%80%D0%BE%D0%B4%D0%BD%D0%B8%D0%BA%D0%BE%D0%BC..JPG, https://upload.wikimedia.org/wikipedia/commons/8/82/%D0%91%D1%80%D1%8F%D0%BD%D1%81%D0%BA._%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B8%D0%B9_%D1%85%D1%80%D0%B0%D0%BC..JPG, https://upload.wikimedia.org/wikipedia/commons/5/5d/Bryansk_Tixv.JPG, https://upload.wikimedia.org/wikipedia/commons/f/f7/Sovetskiy_rayon%2C_Bryansk%2C_Bryanskaya_oblast%27%2C_Russia_-_panoramio_%28294%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/fc/Sovetskiy_rayon%2C_Bryansk%2C_Bryanskaya_oblast%27%2C_Russia_-_panoramio_%28287%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ec/Sovetskiy_rayon%2C_Bryansk%2C_Bryanskaya_oblast%27%2C_Russia_-_panoramio_%28292%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/38/%D0%91%D1%80%D1%8F%D0%BD%D1%81%D0%BA._%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C..JPG, https://upload.wikimedia.org/wikipedia/commons/7/7b/%D0%98%D1%81%D1%82%D0%BE%D1%87%D0%BD%D0%B8%D0%BA_%D0%BF%D1%80%D0%B8_%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%BE%D0%B9_%D1%86%D0%B5%D1%80%D0%BA%D0%B2%D0%B8.jpg, https://upload.wikimedia.org/wikipedia/commons/9/9e/%D0%91%D1%80%D1%8F%D0%BD%D1%81%D0%BA._%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C_%282%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ef/%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F2.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a8/%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C_2%D0%B0.jpg, https://upload.wikimedia.org/wikipedia/commons/4/46/%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C_1%D0%B0.jpg, https://upload.wikimedia.org/wikipedia/commons/c/ce/%D0%A5%D1%80%D0%B0%D0%BC_%D0%B2_%D1%87%D0%B5%D1%81%D1%82%D1%8C_%D0%A2%D0%B8%CC%81%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%BE%D0%B9_%D0%B8%D0%BA%D0%BE%CC%81%D0%BD%D1%8B_%D0%91%D0%BE%CC%81%D0%B6%D0%B8%D0%B5%D0%B9_%D0%9C%D0%B0%CC%81%D1%82%D0%B5%D1%80%D0%B8_%28%C2%AB%D0%A2%D0%B8%D1%85%D0%B2%D0%B8%D0%BD%D1%81%D0%BA%D0%B0%D1%8F_%D1%86%D0%B5%D1%80%D0%BA%D0%BE%D0%B2%D1%8C%C2%BB%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/80/Sovetskiy_rayon%2C_Bryansk%2C_Bryanskaya_oblast%27%2C_Russia_-_panoramio_%28289%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/24/Sovetskiy_rayon%2C_Bryansk%2C_Bryanskaya_oblast%27%2C_Russia_-_panoramio_%28290%29.jpg]"
81310,203087,"[146cc06310d08ef0, 1ee045e5a3bc9568, 3895947116add663, 3d904df7f6c9f92a, adb2a9e25454e0d1]",http://commons.wikimedia.org/wiki/Category:Jack_Block_Park,park,parks,natural,"[https://upload.wikimedia.org/wikipedia/commons/4/45/Container_transfer_06A.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b0/Container_transfer_01.jpg, https://upload.wikimedia.org/wikipedia/commons/4/44/Container_transfer_06.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8a/Container_transfer_02A.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8d/Container_transfer_08.jpg]"
81311,203091,"[8e219a79ee5eede9, fa7142e44850dbac]",http://commons.wikimedia.org/wiki/Category:Silesian_Beskids_Landscape_Park,landscape park of Poland,parks,natural,"[https://upload.wikimedia.org/wikipedia/commons/0/0a/Beskid_%C5%9Al%C4%85ski_-_szlak_Skrzyczne_-_Malinowska_Ska%C5%82a_07.jpg, https://upload.wikimedia.org/wikipedia/commons/0/0c/Miasto_pomi%C4%99dzy_g%C3%B3rami.JPG]"


In [70]:
pd.set_option('display.max_colwidth', None)
train_clean["urls"]

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [71]:
urls = train_clean['category'].dropna()
urls

0                             http://commons.wikimedia.org/wiki/Category:Luitpoldpark_in_Munich
1                   http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen
3                                     http://commons.wikimedia.org/wiki/Category:Mercado_Modelo
4           http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
6                               http://commons.wikimedia.org/wiki/Category:Castle_of_Santa_Cruz
                                                  ...                                          
81308         http://commons.wikimedia.org/wiki/Category:St._Peter's_Parish_Church_(Radovljica)
81309    http://commons.wikimedia.org/wiki/Category:Church_of_the_Theotokos_of_Tikhvin,_Bryansk
81310                                http://commons.wikimedia.org/wiki/Category:Jack_Block_Park
81311                http://commons.wikimedia.org/wiki/Category:Silesian_Beskids_Landscape_Park
81312                                   

## 3. Filtrar os dados 

### 3.1. Classe da foto

In [72]:
pd.reset_option('display.max_colwidth')
print(train_clean["supercategory"].unique().tolist())

['park', 'multi-purpose hall', nan, 'market hall', 'architectural structure', 'castle', 'extinct volcano', 'tomb (Isa Khan Niazi)', 'church building', 'church ruin', 'mountain range', 'peninsula', 'ranch', 'abbey', 'apartment building', 'opera house', 'hall of fame of a state or province (California)', 'road mountain pass', 'Buddhist temple', 'historic house museum', 'music festival', 'lighthouse', 'Catholic cathedral', 'construction', 'bullring', 'bridge', 'lake', 'cultural heritage', 'parish church', 'benedictine abbey', 'island', 'mountain', 'château', 'mining community', 'cultural property', 'Japantown', 'castle ruin', 'monument', 'national park', 'library', 'street', 'archipelago', 'city museum', 'water tower', 'shinto shrine', 'pumping station', 'tower', 'art museum', 'royal palace', 'building', 'mausoleum', 'Eastern Orthodox church', 'mountain pass', 'prison', 'urban park', 'cathedral', 'waterfall', 'Hindu temple', 'manor', 'nature park', 'war memorial', 'protected heritage monu

In [73]:
counts = train_clean["supercategory"].value_counts(dropna=False)
counts_more_than_10 = counts[counts > 100]
print(counts_more_than_10)

supercategory
NaN                24872
church building    11024
castle              2058
mountain            1520
museum              1320
                   ...  
dam                  109
windmill             106
memorial             105
Hindu temple         103
manor house          101
Name: count, Length: 81, dtype: int64


In [74]:
filtrar = False

if filtrar:
    monumento_categorias = [
        # Tenho a certeza que são monumentos
        'church building', 'castle', 'monastery', 'palace',
        'parish church', 'monument', 'cathedral', 'chapel',
        'abbey', 'fort', 'city gate', 'Catholic cathedral',
        'castle ruin',

        # Não tenho a certeza que são monumentos
        'building', 'architectural structure', 'historic house museum',
        'cultural property', 'sculpture', 'fountain', 'tower',
        'lighthouse', 'bridge', 'square', 'city hall'
    ]

    df_monumentos = train_clean[train_clean['supercategory'].isin(monumento_categorias)]
    df_monumentos
else:
    df_monumentos = train_clean.copy()
    df_monumentos

In [75]:
counts = df_monumentos["hierarchical_label"].value_counts(dropna=False)
print(counts)

hierarchical_label
NaN                   34720
church                14697
castle / fort          4293
museum                 2684
mountain               2115
                      ...  
swimming pool            22
wetland                  22
cliff                    20
air transportation       16
stairs                   12
Name: count, Length: 79, dtype: int64


### 3.2. Human Made (Não tenho a certeza, não vou alterar)

In [76]:
counts = df_monumentos["natural_or_human_made"].value_counts(dropna=False)
print(counts)

natural_or_human_made
human-made    38562
NaN           34720
natural        8031
Name: count, dtype: int64


In [77]:
# Existem nulos que me parecem ser monumentos
pd.set_option('display.max_colwidth', None)
df_monumentos[df_monumentos['natural_or_human_made'].isna()]["category"]

1                http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen
2                                                                                        NaN
4        http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
5                                                                                        NaN
7                                                                                        NaN
                                                ...                                         
81293                             http://commons.wikimedia.org/wiki/Category:Islington_Green
81295                http://commons.wikimedia.org/wiki/Category:1_Doki_Street_in_Gda%C5%84sk
81297                 http://commons.wikimedia.org/wiki/Category:Serbian_Patriarchy_building
81306                                                                                    NaN
81307                                                                 

## 4. API da MediaWiki - *https://m.mediawiki.org/wiki/API:Main_page*

In [78]:
pd.reset_option('display.max_colwidth')
df_monumentos.head()

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls
0,1,"[17660ef415d37059, 92b6290d571448f6, cd41bf948...",http://commons.wikimedia.org/wiki/Category:Lui...,park,parks,natural,[http://upload.wikimedia.org/wikipedia/commons...
1,7,"[25c9dfc7ea69838d, 28b13f94a6f1f3c1, 307d6584f...",http://commons.wikimedia.org/wiki/Category:Spa...,multi-purpose hall,,,[https://upload.wikimedia.org/wikipedia/common...
2,9,"[0193b65bb58d2c77, 1a30a51a287ecf69, 1f4e8ab1f...",,,,,[https://upload.wikimedia.org/wikipedia/common...
3,11,"[1a6cb1deed46bb17, 1cc2c8fbc83e1a0c, 2361b8da8...",http://commons.wikimedia.org/wiki/Category:Mer...,market hall,market,human-made,[https://upload.wikimedia.org/wikipedia/common...
4,12,"[0a199c97c382b1ff, 1492a5d344495391, 290097bd3...",http://commons.wikimedia.org/wiki/Category:Was...,architectural structure,,,[https://upload.wikimedia.org/wikipedia/common...


In [79]:
pd.set_option('display.max_colwidth', None)
df_monumentos["category"]

0                             http://commons.wikimedia.org/wiki/Category:Luitpoldpark_in_Munich
1                   http://commons.wikimedia.org/wiki/Category:Sparkassen-Arena,_G%C3%B6ttingen
2                                                                                           NaN
3                                     http://commons.wikimedia.org/wiki/Category:Mercado_Modelo
4           http://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
                                                  ...                                          
81308         http://commons.wikimedia.org/wiki/Category:St._Peter's_Parish_Church_(Radovljica)
81309    http://commons.wikimedia.org/wiki/Category:Church_of_the_Theotokos_of_Tikhvin,_Bryansk
81310                                http://commons.wikimedia.org/wiki/Category:Jack_Block_Park
81311                http://commons.wikimedia.org/wiki/Category:Silesian_Beskids_Landscape_Park
81312                                   

In [80]:
import requests
from urllib.parse import urlparse, unquote
pd.reset_option('display.max_colwidth')

def extrair_titulo(url):
    """Extrai o título da página Wikimedia a partir da URL"""
    path = urlparse(url).path
    titulo = path.split('/')[-1]  # Última parte da URL
    return unquote(titulo)


def obter_info_wikimedia(titulo):
    """Consulta a Wikimedia API e retorna info básica sobre o título"""
    endpoint = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "coordinates|pageprops|description",
        "titles": titulo,
        "format": "json",
        "formatversion": 2
    }
    headers = {
        "User-Agent": "ProjetoMonumentosPT/1.0 (teu@email.com)"
    }
    response = requests.get(endpoint, params=params, headers=headers)
    response.raise_for_status()
    return response.json()


def obter_info_wikidata(qid):
    """Consulta a Wikidata API com o QID para obter dados detalhados"""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    return response.json()


def obter_label_por_qid(qid, lang="en"):
    """Consulta o rótulo de um QID (por ex. país, cidade)"""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    entidade = list(data["entities"].values())[0]
    return entidade["labels"].get(lang, {}).get("value", "Desconhecido")


def extrair_detalhes_wikidata(entidade):
    """Extrai dados úteis da estrutura JSON da Wikidata"""
    claims = entidade["claims"]

    def extrair_qid(p):
        if p in claims:
            return claims[p][0]["mainsnak"]["datavalue"]["value"]["id"]
        return None

    def extrair_varios_qids(p):
        if p in claims:
            return [c["mainsnak"]["datavalue"]["value"]["id"] for c in claims[p]]
        return []

    resultado = {
        "pais_qid": extrair_qid("P17"),
        "localizacoes_qid": extrair_varios_qids("P131"),
        "instancias_qid": extrair_varios_qids("P31"),
        "label": entidade.get("labels", {}).get("en", {}).get("value", "Sem título")
    }

    return resultado


# --- Execução (exemplo) ---

url_exemplo = "https://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)"
titulo = extrair_titulo(url_exemplo)

# 1. Obter info da Wikimedia
dados_wiki = obter_info_wikimedia(titulo)
pagina = dados_wiki["query"]["pages"][0]

coordenadas = pagina.get("coordinates", [{}])[0]
wikibase_item = pagina.get("pageprops", {}).get("wikibase_item", None)

print("📌 Exemplo de monumento extraído:")
print("🔗 URL:", url_exemplo)
print("📄 Título Wikimedia:", titulo)
print("🗺️ Coordenadas:", coordenadas)
print("🔗 Wikibase Item:", wikibase_item)

if wikibase_item:
    dados_wikidata = obter_info_wikidata(wikibase_item)
    entidade = dados_wikidata["entities"][wikibase_item]
    detalhes = extrair_detalhes_wikidata(entidade)

    print(f"\n🏛️ Nome do Monumento: {detalhes['label']}")

    nome_pais = obter_label_por_qid(detalhes["pais_qid"]) if detalhes["pais_qid"] else "Desconhecido"
    print("🌍 País:", nome_pais)

    # Mostrar todas as localizações administrativas (P131)
    print("🏘️ Localizações administrativas (P131):")
    for qid in detalhes["localizacoes_qid"]:
        nome_local = obter_label_por_qid(qid)
        print(f"   - {nome_local} (QID: {qid})")

    # Mostrar tipos (P31)
    print("🏷️ Tipos (instância de):")
    for qid in detalhes["instancias_qid"]:
        nome_tipo = obter_label_por_qid(qid)
        print(f"   - {nome_tipo} (QID: {qid})")

else:
    print("❌ Não foi possível obter o Wikidata item.")

📌 Exemplo de monumento extraído:
🔗 URL: https://commons.wikimedia.org/wiki/Category:Wasserkunstanlage_Paradies_(Baden-Baden)
📄 Título Wikimedia: Category:Wasserkunstanlage_Paradies_(Baden-Baden)
🗺️ Coordenadas: {'lat': 48.7625, 'lon': 8.25154, 'primary': True, 'globe': 'earth'}
🔗 Wikibase Item: Q2551242

🏛️ Nome do Monumento: Wasserkunstanlage Paradies (Baden-Baden)
🌍 País: Germany
🏘️ Localizações administrativas (P131):
   - Baden-Baden (QID: Q4100)
🏷️ Tipos (instância de):
   - architectural structure (QID: Q811979)
   - park (QID: Q22698)


In [81]:
import requests
import pandas as pd
from urllib.parse import urlparse, unquote
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# -------- Funções auxiliares --------

def extrair_titulo(url):
    path = urlparse(url).path
    return unquote(path.split('/')[-1])

def obter_info_wikimedia(titulo):
    endpoint = "https://commons.wikimedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "coordinates|pageprops|description",
        "titles": titulo,
        "format": "json",
        "formatversion": 2
    }
    headers = {
        "User-Agent": "ProjetoMonumentosPT/1.0 (gothamanalytics7@gmail.com)"
    }
    response = requests.get(endpoint, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

@lru_cache(maxsize=2048)
def obter_info_wikidata(qid):
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    response = requests.get(url)
    response.raise_for_status()
    return response.json()

@lru_cache(maxsize=2048)
def obter_label_por_qid(qid, lang="en"):
    try:
        data = obter_info_wikidata(qid)
        entidade = list(data["entities"].values())[0]
        return entidade["labels"].get(lang, {}).get("value", "Desconhecido")
    except:
        return "Desconhecido"

def extrair_detalhes_wikidata(entidade):
    claims = entidade.get("claims", {})

    def extrair_qid(p):
        try:
            return claims[p][0]["mainsnak"]["datavalue"]["value"]["id"]
        except:
            return None

    def extrair_varios_qids(p):
        qids = []
        for claim in claims.get(p, []):
            try:
                qids.append(claim["mainsnak"]["datavalue"]["value"]["id"])
            except:
                continue
        return qids

    return {
        "pais_qid": extrair_qid("P17"),
        "localizacoes_qid": extrair_varios_qids("P131"),
        "instancias_qid": extrair_varios_qids("P31"),
        "label": entidade.get("labels", {}).get("en", {}).get("value", "Sem título")
    }

# -------- Processar uma única URL --------

def processar_url(url):
    try:
        if pd.isna(url):  # <- ignora nulos
            return [url, None, None, None, None, None, None, None]

        titulo = extrair_titulo(url)
        dados_wiki = obter_info_wikimedia(titulo)
        pagina = dados_wiki["query"]["pages"][0]

        coordenadas = pagina.get("coordinates", [{}])[0]
        lat = coordenadas.get("lat", None)
        lon = coordenadas.get("lon", None)
        wikibase_item = pagina.get("pageprops", {}).get("wikibase_item", None)

        if not wikibase_item:
            return [url, lat, lon, None, None, None, None, None]

        dados_wikidata = obter_info_wikidata(wikibase_item)
        entidade = dados_wikidata["entities"][wikibase_item]
        detalhes = extrair_detalhes_wikidata(entidade)

        nome_monumento = detalhes["label"]
        pais = obter_label_por_qid(detalhes["pais_qid"]) if detalhes["pais_qid"] else None
        localizacoes = [obter_label_por_qid(qid) for qid in detalhes["localizacoes_qid"]]
        tipos = [obter_label_por_qid(qid) for qid in detalhes["instancias_qid"]]

        return [url, lat, lon, nome_monumento, pais, localizacoes, tipos, wikibase_item]

    except Exception as e:
        # print(f"Erro ao processar {url}: {e}")
        return [url, None, None, None, None, None, None, None]

# -------- Processamento paralelo --------

def processar_urls_em_paralelo(urls, max_workers=15):
    resultados = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        tarefas = {executor.submit(processar_url, url): url for url in urls}
        for future in tqdm(as_completed(tarefas), total=len(tarefas), desc="Processando"):
            resultados.append(future.result())
    return resultados

# -------- Executar --------

# Caminho do ficheiro de saída
ficheiro_saida = dataLandMarkTrain_dir / "Monumentos_clean.csv"

if not ficheiro_saida.exists():
    urls = df_monumentos['category'].dropna().tolist()
    resultados = processar_urls_em_paralelo(urls)

    # Criar DataFrame a partir dos resultados
    colunas = [
        'category',
        'lat',
        'lon',
        'nome_monumento',
        'pais',
        'localizacoes_administrativas',
        'tipos_instancia_de',
        'wikibase_item'
    ]
    df_resultados = pd.DataFrame(resultados, columns=colunas)
    df_resultados = df_resultados[df_resultados['pais'].notna()].reset_index(drop=True)


    # Juntar com o DataFrame original (caso queiras manter colunas extra)
    df_monumentos = df_monumentos.drop(columns=['lat', 'lon', 'nome_monumento', 'pais', 'localizacoes_administrativas', 'tipos_instancia_de', 'wikibase_item'], errors='ignore')
    df_monumentos = df_monumentos[df_monumentos['category'].notna()]
    df_monumentos = df_monumentos[df_monumentos['lat'].notna()]
    df_monumentos = df_monumentos.merge(df_resultados, on="category", how="left")
    df_monumentos = df_monumentos[df_monumentos['pais'].notna()].reset_index(drop=True)

    # Guardar em CSV
    df_monumentos.to_csv(ficheiro_saida, index=False)

    # Mostrar primeiras linhas
    df_monumentos.head()
else:
    print(f"O ficheiro {ficheiro_saida} já existe.")
    df_monumentos = pd.read_csv(ficheiro_saida)
    df_monumentos.head()

O ficheiro C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\Monumentos_clean.csv já existe.


In [82]:
pd.reset_option('display.max_colwidth')
Pais_escolhido = "Portugal"
df_monumentos_pt = df_monumentos[df_monumentos["pais"] == Pais_escolhido].reset_index(drop=True)
df_monumentos_pt = df_monumentos_pt[df_monumentos_pt['lat'].notna()]

# Guardar em CSV

df_monumentos_pt.to_csv(dataLandMarkTrain_dir / "Monumentos_Portugueses.csv", index=False)
df_monumentos_pt.head()

Unnamed: 0,landmark_id,images,category,supercategory,hierarchical_label,natural_or_human_made,urls,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item
0,145,"['06d3a8d8d1c97d39', '0b14eee369b61097', '1377...",http://commons.wikimedia.org/wiki/Category:Est...,monument,,,['http://upload.wikimedia.org/wikipedia/common...,41.1465,-8.61136,Monument to Pedro IV (Porto),Portugal,"['Cedofeita, Santo Ildefonso, Sé, Miragaia, Sã...","['monument', 'statue', 'cultural heritage']",Q11783
1,596,"['0087293d5b4efa86', '0e683185440c3b01', '16aa...",http://commons.wikimedia.org/wiki/Category:Lar...,square,square,human-made,['https://upload.wikimedia.org/wikipedia/commo...,38.715472,-9.136583,Praça Martim Moniz,Portugal,['Lisbon'],['square'],Q20716902
2,1090,"['211eb58f648667ae', '2e8be9a98da670ae', '3cdb...",http://commons.wikimedia.org/wiki/Category:Aze...,locality,,,['https://upload.wikimedia.org/wikipedia/commo...,38.840956,-9.461906,Azenhas do Mar,Portugal,['Colares'],['locality'],Q1648341
4,1536,"['1c42fef86fa10ca4', '3511e647b2941813', '4050...",http://commons.wikimedia.org/wiki/Category:Cha...,drinking fountain,fountain,human-made,['https://upload.wikimedia.org/wikipedia/commo...,38.745214,-9.145839,Chafariz de Entrecampos,Portugal,['Alvalade'],"['drinking fountain', 'cultural heritage']",Q9739040
5,1574,"['003a974e9f48612f', '29c254d08cc68198', '3b3f...",http://commons.wikimedia.org/wiki/Category:Mat...,freguesia of Portugal,,,['https://upload.wikimedia.org/wikipedia/commo...,37.823333,-25.520278,Matriz,Portugal,['Ribeira Grande'],['freguesia of Portugal'],Q1999414


In [83]:
df_monumentos_pt.shape

(588, 14)

In [84]:
df_monumentos_pt.dtypes

landmark_id                       int64
images                           object
category                         object
supercategory                    object
hierarchical_label               object
natural_or_human_made            object
urls                             object
lat                             float64
lon                             float64
nome_monumento                   object
pais                             object
localizacoes_administrativas     object
tipos_instancia_de               object
wikibase_item                    object
dtype: object

In [85]:
import ast

def garantir_lista(valor):
    if isinstance(valor, list):
        return valor
    try:
        return ast.literal_eval(valor)
    except (ValueError, SyntaxError):
        return []  # ou None, conforme a tua lógica

df_monumentos_pt['urls'] = df_monumentos_pt['urls'].apply(garantir_lista)
df_monumentos_pt['urls']

0      [http://upload.wikimedia.org/wikipedia/commons...
1      [https://upload.wikimedia.org/wikipedia/common...
2      [https://upload.wikimedia.org/wikipedia/common...
4      [https://upload.wikimedia.org/wikipedia/common...
5      [https://upload.wikimedia.org/wikipedia/common...
                             ...                        
591    [https://upload.wikimedia.org/wikipedia/common...
592    [https://upload.wikimedia.org/wikipedia/common...
593    [https://upload.wikimedia.org/wikipedia/common...
594    [https://upload.wikimedia.org/wikipedia/common...
595    [https://upload.wikimedia.org/wikipedia/common...
Name: urls, Length: 588, dtype: object

### 4.1. Recolha de dados extra - (*https://commons.wikimedia.org/wiki/Category:Monumentos_Nacionais_in_Portugal_by_name*)

In [86]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import time
import re
from urllib.parse import urljoin
import os
import ast
from unidecode import unidecode

# Função para obter tipos de instância do Wikidata usando a API
def obter_tipos_instancia_de(wikibase_item):
    if not wikibase_item:
        return None
    url = f'https://www.wikidata.org/wiki/Special:EntityData/{wikibase_item}.json'
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        claims = data['entities'][wikibase_item].get('claims', {})
        instancia_ids = []

        if 'P31' in claims:  # P31: instance of
            for claim in claims['P31']:
                mainsnak = claim.get('mainsnak', {})
                datavalue = mainsnak.get('datavalue', {})
                value = datavalue.get('value', {})
                instancia_id = value.get('id')
                if instancia_id:
                    instancia_ids.append(instancia_id)

        # Obter labels para os Q-ids encontrados
        if instancia_ids:
            ids_str = '|'.join(instancia_ids)
            url_labels = f'https://www.wikidata.org/w/api.php?action=wbgetentities&ids={ids_str}&format=json&props=labels&languages=en'
            labels_response = requests.get(url_labels)
            labels_response.raise_for_status()
            labels_data = labels_response.json()

            labels = []
            for qid in instancia_ids:
                label = labels_data['entities'].get(qid, {}).get('labels', {}).get('en', {}).get('value')
                if label:
                    labels.append(label)

            return '; '.join(labels) if labels else None
        return None
    except Exception as e:
        print(f"Erro ao obter tipos de instância de {wikibase_item}: {e}")
        return None

# Converter URL da miniatura para URL da imagem original
def converter_para_url_original(url_thumb):
    if "/thumb/" in url_thumb:
        partes = url_thumb.split("/thumb/")
        pasta_base = partes[0]
        restante = partes[1]
        partes_restante = restante.split("/")
        if len(partes_restante) >= 3:
            caminho = "/".join(partes_restante[:2])
            nome_ficheiro = partes_restante[2]
            return f"{pasta_base}/{caminho}/{nome_ficheiro}"
    return url_thumb

# Extrair informações de uma página de monumento
def extrair_info_monumento(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Nome do monumento
        nome_raw = soup.find('h1', {'id': 'firstHeading'})
        nome = nome_raw.text.strip().replace('Category:', '') if nome_raw else None

        # Extrair lat e lon do GeoHack
        lat = lon = None
        geohack_link = soup.find('a', href=re.compile(r'geohack.toolforge.org'))
        if geohack_link and geohack_link.has_attr('href'):
            href = geohack_link['href']
            match = re.search(r'params=([\d\.]+)_N_(-?[\d\.]+)_(W|E)', href)
            if match:
                lat = float(match.group(1))
                lon_val = float(match.group(2))
                lon = -abs(lon_val) if match.group(3) == 'W' else lon_val

        # Fallback para span.geo
        if lat is None or lon is None:
            coord = soup.find('span', {'class': 'geo'})
            if coord:
                partes = coord.text.strip().split(';')
                if len(partes) == 2:
                    try:
                        lat = float(partes[0].strip())
                        lon = float(partes[1].strip())
                    except ValueError:
                        pass

        # Obter wikibase_item (Q-id)
        wikibase_item = None
        wikidata_link = soup.find('li', {'id': 't-wikibase'})
        if wikidata_link and wikidata_link.a and wikidata_link.a.has_attr('href'):
            wikibase_item = wikidata_link.a['href'].split('/')[-1]

        tipos_instancia_de = obter_tipos_instancia_de(wikibase_item)

        # Categorias
        categorias = [cat.text.strip() for cat in soup.select('div#mw-normal-catlinks ul li')]
        supercategory = categorias[0] if categorias else None
        hierarchical_label = categorias[1:] if len(categorias) > 1 else None

        # Imagens (urls)
        urls = []
        extensoes_validas = ('.png', '.jpg', '.jpeg', '.webp')

        # Procurar imagens na categoria media
        for file_link in soup.select('div#mw-category-media a.image'):
            img_tag = file_link.find('img')
            if img_tag and img_tag.has_attr('src'):
                src = img_tag['src']
                if src.startswith('//'):
                    src = 'https:' + src
                if src.lower().endswith(extensoes_validas):
                    urls.append(converter_para_url_original(src))

        # Caso não encontre, procura em todas as imagens
        if not urls:
            for img_tag in soup.find_all('img'):
                src = img_tag.get('src', '')
                if src.startswith('//'):
                    src = 'https:' + src
                if any(domain in src for domain in ['commons.wikimedia.org', 'upload.wikimedia.org']):
                    if src.lower().endswith(extensoes_validas):
                        urls.append(converter_para_url_original(src))

        urls = list(set(urls))  # Remove duplicados

        # Localizações administrativas - filtragem simples
        localizacoes_administrativas = [cat for cat in categorias if re.search(r'\b(in|of)\b', cat, re.I)]
        localizacoes_administrativas = '; '.join(localizacoes_administrativas) if localizacoes_administrativas else None

        return {
            'category': url,
            'supercategory': supercategory,
            'hierarchical_label': hierarchical_label,
            'urls': urls,
            'lat': lat,
            'lon': lon,
            'nome_monumento': nome,
            'pais': 'Portugal',
            'localizacoes_administrativas': localizacoes_administrativas,
            'tipos_instancia_de': tipos_instancia_de,
            'wikibase_item': wikibase_item
        }

    except Exception as e:
        print(f"Erro ao processar {url}: {e}")
        return None

# Obter todas as páginas de monumentos, filtrando pela primeira letra do nome
def obter_links_monumentos():
    base_url = 'https://commons.wikimedia.org/w/index.php?title=Category:Monumentos_Nacionais_in_Portugal_by_name'
    urls = set()

    for letra in tqdm([chr(i) for i in range(ord('A'), ord('Z') + 1)], desc="A recolher links por letra"):
        url_pagina = f"{base_url}&from={letra}"
        while url_pagina:
            try:
                response = requests.get(url_pagina)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Selecionar todos os li dentro da div de categoria
                for li in soup.select('div.mw-category div.mw-category-group ul li'):
                    # Tentar encontrar o link em diferentes estruturas possíveis
                    link = None
                    
                    # Caso 1: Link direto no li (estrutura simples)
                    if li.a and 'Category:' in li.a.get('href', ''):
                        link = li.a
                    else:
                        # Caso 2: Link dentro de uma estrutura CategoryTree
                        category_tree_item = li.find('div', class_='CategoryTreeItem')
                        if category_tree_item:
                            link = category_tree_item.find('a', href=lambda x: x and 'Category:' in x)
                    
                    if link and link.has_attr('href'):
                        nome_monumento = link.text.strip()
                        
                        # Verificação da primeira letra (case insensitive e sem acentos)
                        if unidecode(nome_monumento[0].upper()) == letra:
                            full_link = urljoin('https://commons.wikimedia.org', link['href'])
                            urls.add(full_link)

                # Procurar link para a próxima página (next page)
                next_link = soup.find('a', string='next page')
                url_pagina = urljoin('https://commons.wikimedia.org', next_link['href']) if next_link else None

            except Exception as e:
                print(f"Erro ao processar letra {letra}: {e}")
                break

    return list(urls)

# Caminho para guardar ficheiro CSV
dataLandMarkTrain_dir = data_dir / 'land_mark' / 'train'
file_path = os.path.join(dataLandMarkTrain_dir, 'monumentos_extras.csv')

if os.path.exists(file_path):
    print(f"Ficheiro {file_path} já existe. A carregar os dados...")
    df_monumentos_extra = pd.read_csv(file_path)
    # Converter a coluna 'urls' de string para lista de strings
    df_monumentos_extra['urls'] = df_monumentos_extra['urls'].apply(ast.literal_eval)
else:
    print("Ficheiro não encontrado. A extrair dados do site...")

    # Obter links das páginas de monumentos
    links = obter_links_monumentos()

    # Extrair dados com multi-threading
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        resultados = list(tqdm(executor.map(extrair_info_monumento, links), total=len(links), desc="A extrair dados"))

    # Filtrar resultados válidos
    resultados = [res for res in resultados if res]

    # Atribuir IDs únicos
    for idx, res in enumerate(resultados, start=1):
        res['landmark_id'] = idx

    # Criar DataFrame
    df_monumentos_extra = pd.DataFrame(resultados)

    # Garantir colunas existentes para reordenar
    colunas_ordem = [
        'landmark_id', 'urls', 'category', 'supercategory', 'hierarchical_label',
        'lat', 'lon', 'nome_monumento', 'pais',
        'localizacoes_administrativas', 'tipos_instancia_de', 'wikibase_item'
    ]
    colunas_existentes = [col for col in colunas_ordem if col in df_monumentos_extra.columns]

    df_monumentos_extra = df_monumentos_extra[colunas_existentes]

    # Exportar para CSV
    df_monumentos_extra.to_csv(file_path, index=False)

# Mostrar primeiras linhas
pd.reset_option('display.max_colwidth')
df_monumentos_extra.head()

Ficheiro C:\Users\diogo\Desktop\APVC\APVC-ProjetoFinal\data\land_mark\train\monumentos_extras.csv já existe. A carregar os dados...


Unnamed: 0,landmark_id,urls,category,supercategory,hierarchical_label,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item
0,1,[https://upload.wikimedia.org/wikipedia/common...,https://commons.wikimedia.org/wiki/Category:Ca...,Military buildings in Lisbon,"['Buildings named after Saint George', 'Castle...",38.71389,-9.13333,Castelo de São Jorge,Portugal,Military buildings in Lisbon; Castles in Portu...,Wikimedia category,Q25816511
1,2,[https://upload.wikimedia.org/wikipedia/common...,https://commons.wikimedia.org/wiki/Category:Fo...,Fortresses in Viana do Castelo (district),"['Valença, Portugal', 'Monumentos Nacionais in...",42.02771,-8.646844,Fortificações da Praça de Valença do Minho,Portugal,Fortresses in Viana do Castelo (district); Mon...,fort; cultural heritage,Q10353229
2,3,[https://upload.wikimedia.org/wikipedia/common...,https://commons.wikimedia.org/wiki/Category:S%...,Roman Catholic cathedrals in Portugal,"['Churches in Silves', 'Monumentos Nacionais i...",37.1901,-8.4387,Sé Catedral de Silves,Portugal,Roman Catholic cathedrals in Portugal; Churche...,cathedral; cultural heritage,Q1107435
3,4,[https://upload.wikimedia.org/wikipedia/common...,https://commons.wikimedia.org/wiki/Category:Ca...,Monsaraz,['Castles in Portugal classified as Monumento ...,38.443184,-7.380692,Castelo de Monsaraz,Portugal,Castles in Portugal classified as Monumento Na...,castle; cultural heritage,Q5049794
4,5,[https://upload.wikimedia.org/wikipedia/common...,https://commons.wikimedia.org/wiki/Category:Ig...,Churches in Marco de Canaveses,"['Monumentos Nacionais in Porto (district)', '...",41.20812,-8.201104,Igreja de Santo André (Vila Boa de Quires),Portugal,Churches in Marco de Canaveses; Monumentos Nac...,church building; cultural heritage,Q10300684


In [87]:
df_monumentos_extra.shape

(814, 12)

In [88]:
pd.set_option('display.max_colwidth', None)
df_monumentos_extra[df_monumentos_extra['nome_monumento'].str.contains("Muralhas e fossos da cidade de Évora", case=False, na=False)]["urls"]

783    [https://upload.wikimedia.org/wikipedia/commons/4/4b/SIPA_logo_%28cropped%29.png, https://upload.wikimedia.org/wikipedia/commons/f/ff/Wikidata-logo.svg, https://upload.wikimedia.org/wikipedia/commons/b/b0/Openstreetmap_logo.svg, https://upload.wikimedia.org/wikipedia/commons/b/bc/Commons-emblem-issue.svg, https://upload.wikimedia.org/wikipedia/commons/6/69/OOjs_UI_icon_help.svg, https://upload.wikimedia.org/wikipedia/commons/a/a7/Igespar_logo_flyer_2.svg, https://upload.wikimedia.org/wikipedia/commons/4/41/Castelo_de_%C3%89vora%2C_Muralhas.jpg, https://upload.wikimedia.org/wikipedia/commons/5/54/Evora_%2835481402692%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c7/Commons_to_Wikidata_QuickStatements.svg, https://upload.wikimedia.org/wikipedia/commons/7/73/Blue_pencil.svg]
Name: urls, dtype: object

In [89]:
df_monumentos_pt.shape

(588, 14)

#### 4.1.1. Merge com o data set inicial

In [90]:
colunas_extra = set(df_monumentos_extra.columns)
colunas_pt = set(df_monumentos_pt.columns)

# Colunas que existem em df_monumentos_extra mas não em df_monumentos_pt
colunas_somente_extra = colunas_extra - colunas_pt

# Colunas que existem em df_monumentos_pt mas não em df_monumentos_extra
colunas_somente_pt = colunas_pt - colunas_extra

print("Colunas apenas em df_monumentos_extra:", colunas_somente_extra)
print("Colunas apenas em df_monumentos_pt:", colunas_somente_pt)

Colunas apenas em df_monumentos_extra: set()
Colunas apenas em df_monumentos_pt: {'natural_or_human_made', 'images'}


In [91]:
pd.reset_option('display.max_colwidth')
df_monumentos_pt.drop(columns=list(colunas_somente_pt), inplace=True, errors='ignore')
df_monumentos_pt

Unnamed: 0,landmark_id,category,supercategory,hierarchical_label,urls,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item
0,145,http://commons.wikimedia.org/wiki/Category:Est...,monument,,[http://upload.wikimedia.org/wikipedia/commons...,41.146500,-8.611360,Monument to Pedro IV (Porto),Portugal,"['Cedofeita, Santo Ildefonso, Sé, Miragaia, Sã...","['monument', 'statue', 'cultural heritage']",Q11783
1,596,http://commons.wikimedia.org/wiki/Category:Lar...,square,square,[https://upload.wikimedia.org/wikipedia/common...,38.715472,-9.136583,Praça Martim Moniz,Portugal,['Lisbon'],['square'],Q20716902
2,1090,http://commons.wikimedia.org/wiki/Category:Aze...,locality,,[https://upload.wikimedia.org/wikipedia/common...,38.840956,-9.461906,Azenhas do Mar,Portugal,['Colares'],['locality'],Q1648341
4,1536,http://commons.wikimedia.org/wiki/Category:Cha...,drinking fountain,fountain,[https://upload.wikimedia.org/wikipedia/common...,38.745214,-9.145839,Chafariz de Entrecampos,Portugal,['Alvalade'],"['drinking fountain', 'cultural heritage']",Q9739040
5,1574,http://commons.wikimedia.org/wiki/Category:Mat...,freguesia of Portugal,,[https://upload.wikimedia.org/wikipedia/common...,37.823333,-25.520278,Matriz,Portugal,['Ribeira Grande'],['freguesia of Portugal'],Q1999414
...,...,...,...,...,...,...,...,...,...,...,...,...
591,201945,http://commons.wikimedia.org/wiki/Category:Cas...,castle,castle / fort,[https://upload.wikimedia.org/wikipedia/common...,37.190989,-8.437906,Castle of Silves,Portugal,['Silves'],"['castle', 'cultural heritage', 'archaeologica...",Q1971757
592,202380,http://commons.wikimedia.org/wiki/Category:Col...,theater,theatre,[https://upload.wikimedia.org/wikipedia/common...,41.146900,-8.605511,Coliseu do Porto,Portugal,"['Cedofeita, Santo Ildefonso, Sé, Miragaia, Sã...","['theatre building', 'movie theater', 'cultura...",Q2982699
593,202661,http://commons.wikimedia.org/wiki/Category:Pic...,mountain,mountain,[https://upload.wikimedia.org/wikipedia/common...,32.758733,-16.942256,Pico Ruivo,Portugal,['Madeira'],['mountain'],Q473169
594,202667,http://commons.wikimedia.org/wiki/Category:Edi...,building,,[https://upload.wikimedia.org/wikipedia/common...,38.732470,-9.155876,"Edifício sito na Rua Alexandre Herculano, 57",Portugal,['Santo António'],"['building', 'cultural heritage']",Q9698147


In [92]:
def comparar_monumentos_por_wikibase(df1, df2, coluna_wikibase='wikibase_item', coluna_nome='nome_monumento'):
    """
    Compara dois DataFrames com base na coluna wikibase_item e verifica se os nomes correspondentes são iguais.
    
    Args:
        df1: Primeiro DataFrame (no seu caso, df_monumentos_extra)
        df2: Segundo DataFrame (no seu caso, df_monumentos_pt)
        coluna_wikibase: Nome da coluna com os identificadores Wikidata
        coluna_nome: Nome da coluna com os nomes dos monumentos
    
    Returns:
        Um DataFrame com as comparações
    """
    # Criar dicionários mapeando wikibase_item para nome em cada DataFrame
    dict1 = df1.dropna(subset=[coluna_wikibase]).set_index(coluna_wikibase)[coluna_nome].to_dict()
    dict2 = df2.dropna(subset=[coluna_wikibase]).set_index(coluna_wikibase)[coluna_nome].to_dict()
    
    # Encontrar itens em comum
    wikibase_comuns = set(dict1.keys()).intersection(set(dict2.keys()))
    
    # Preparar lista de resultados
    resultados = []
    for wikibase in wikibase_comuns:
        nome1 = dict1[wikibase]
        nome2 = dict2[wikibase]
        resultados.append({
            'wikibase_item': wikibase,
            'nome_df1': nome1,
            'nome_df2': nome2,
            'nomes_iguais': nome1 == nome2
        })
    
    # Criar DataFrame com os resultados
    df_comparacao = pd.DataFrame(resultados)
    
    # Calcular estatísticas
    total_comuns = len(wikibase_comuns)
    iguais = df_comparacao['nomes_iguais'].sum()
    diferentes = total_comuns - iguais
    
    print(f"\nEstatísticas de comparação:")
    print(f"Total de itens com wikibase_item em comum: {total_comuns}")
    print(f"Nomes idênticos: {iguais} ({iguais/total_comuns:.1%})")
    print(f"Nomes diferentes: {diferentes} ({diferentes/total_comuns:.1%})")
    
    return df_comparacao

# Usar a função
df_comparacao_nomes = comparar_monumentos_por_wikibase(df_monumentos_extra, df_monumentos_pt)

# Mostrar alguns exemplos onde os nomes são diferentes (se houver)
if not df_comparacao_nomes[~df_comparacao_nomes['nomes_iguais']].empty:
    print("\nExemplos de monumentos com wikibase_item igual mas nomes diferentes:")
    display(df_comparacao_nomes[~df_comparacao_nomes['nomes_iguais']])


Estatísticas de comparação:
Total de itens com wikibase_item em comum: 210
Nomes idênticos: 84 (40.0%)
Nomes diferentes: 126 (60.0%)

Exemplos de monumentos com wikibase_item igual mas nomes diferentes:


Unnamed: 0,wikibase_item,nome_df1,nome_df2,nomes_iguais
3,Q5049810,Castelo de Penamacor,Castle of Penamacor,False
4,Q10327237,Mata Nacional do Buçaco,Buçaco Forest,False
5,Q5049744,Castelo de Alter do Chão,Castle of Alter do Chão,False
6,Q5049803,Castelo de Mértola,Castle of Mértola,False
7,Q3078430,Forte de Peniche,Peniche Fortress,False
...,...,...,...,...
203,Q69513,Palácio Nacional da Pena,Pena Palace,False
204,Q2970726,Castelo de Soure,Castle of Soure,False
205,Q1004331,Sé do Funchal,Cathedral of Funchal,False
206,Q17125232,Capela de Nossa Senhora de Guadalupe (Raposeira),Hermitage of Our Lady of Guadalupe,False


In [93]:
# Cópias dos DataFrames originais
df_pt = df_monumentos_pt.copy()
df_extra = df_monumentos_extra.copy()

# 1. Adicionar coluna SerMonumento: 1 para extra, 0 para pt
df_extra['SerMonumento'] = 1
df_pt['SerMonumento'] = 0

# 2. Garantir que 'urls' são listas
def garantir_lista(x):
    if isinstance(x, list):
        return x
    elif pd.isna(x) or x is None:
        return []
    else:
        return [x]

df_extra['urls'] = df_extra['urls'].apply(garantir_lista)
df_pt['urls'] = df_pt['urls'].apply(garantir_lista)

# 3. Vamos fazer merge usando df_extra como base
df_merged = df_extra.merge(df_pt[['wikibase_item', 'urls']], 
                          on='wikibase_item', 
                          how='left', 
                          suffixes=('_extra', '_pt'))

# 4. Concatenar as listas de URLs sem repetir
def unir_listas(lista1, lista2):
    set_urls = set(lista1)
    if isinstance(lista2, list):
        set_urls.update(lista2)
    return list(set_urls)

df_merged['urls'] = df_merged.apply(lambda row: unir_listas(row['urls_extra'], row['urls_pt']), axis=1)

# 5. Remover as colunas temporárias
df_merged.drop(columns=['urls_extra', 'urls_pt'], inplace=True)

# 6. Agora acrescentar as linhas do df_pt que não estão no df_extra
df_pt_rest = df_pt[~df_pt['wikibase_item'].isin(df_extra['wikibase_item'])].copy()

# Concatenar o df_merged (base extra com urls unidas) com os restantes do pt
df_monumentos_finalPT = pd.concat([df_merged, df_pt_rest], ignore_index=True)
df_monumentos_finalPT.drop(columns=['landmark_id'], inplace=True, errors='ignore')
df_monumentos_finalPT

Unnamed: 0,category,supercategory,hierarchical_label,lat,lon,nome_monumento,pais,localizacoes_administrativas,tipos_instancia_de,wikibase_item,SerMonumento,urls
0,https://commons.wikimedia.org/wiki/Category:Ca...,Military buildings in Lisbon,"['Buildings named after Saint George', 'Castle...",38.713890,-9.133330,Castelo de São Jorge,Portugal,Military buildings in Lisbon; Castles in Portu...,Wikimedia category,Q25816511,1,[https://upload.wikimedia.org/wikipedia/common...
1,https://commons.wikimedia.org/wiki/Category:Fo...,Fortresses in Viana do Castelo (district),"['Valença, Portugal', 'Monumentos Nacionais in...",42.027710,-8.646844,Fortificações da Praça de Valença do Minho,Portugal,Fortresses in Viana do Castelo (district); Mon...,fort; cultural heritage,Q10353229,1,[https://upload.wikimedia.org/wikipedia/common...
2,https://commons.wikimedia.org/wiki/Category:S%...,Roman Catholic cathedrals in Portugal,"['Churches in Silves', 'Monumentos Nacionais i...",37.190100,-8.438700,Sé Catedral de Silves,Portugal,Roman Catholic cathedrals in Portugal; Churche...,cathedral; cultural heritage,Q1107435,1,[https://upload.wikimedia.org/wikipedia/common...
3,https://commons.wikimedia.org/wiki/Category:Ca...,Monsaraz,['Castles in Portugal classified as Monumento ...,38.443184,-7.380692,Castelo de Monsaraz,Portugal,Castles in Portugal classified as Monumento Na...,castle; cultural heritage,Q5049794,1,[https://upload.wikimedia.org/wikipedia/common...
4,https://commons.wikimedia.org/wiki/Category:Ig...,Churches in Marco de Canaveses,"['Monumentos Nacionais in Porto (district)', '...",41.208120,-8.201104,Igreja de Santo André (Vila Boa de Quires),Portugal,Churches in Marco de Canaveses; Monumentos Nac...,church building; cultural heritage,Q10300684,1,[https://upload.wikimedia.org/wikipedia/common...
...,...,...,...,...,...,...,...,...,...,...,...,...
1187,http://commons.wikimedia.org/wiki/Category:Igr...,church building,church,32.680000,-17.104444,Igreja de Nossa Senhora da Luz,Portugal,['Ponta do Sol'],"['church building', 'cultural heritage', 'pari...",Q10300407,0,[https://upload.wikimedia.org/wikipedia/common...
1188,http://commons.wikimedia.org/wiki/Category:Bar...,Natura 2000 protected area,parks,40.970227,-8.644984,Barrinha de Esmoriz,Portugal,"['Centro region', 'Norte Region']","['Natura 2000 site', 'site of community import...",Q16497003,0,[https://upload.wikimedia.org/wikipedia/common...
1189,http://commons.wikimedia.org/wiki/Category:Col...,theater,theatre,41.146900,-8.605511,Coliseu do Porto,Portugal,"['Cedofeita, Santo Ildefonso, Sé, Miragaia, Sã...","['theatre building', 'movie theater', 'cultura...",Q2982699,0,[https://upload.wikimedia.org/wikipedia/common...
1190,http://commons.wikimedia.org/wiki/Category:Pic...,mountain,mountain,32.758733,-16.942256,Pico Ruivo,Portugal,['Madeira'],['mountain'],Q473169,0,[https://upload.wikimedia.org/wikipedia/common...


In [94]:
pd.set_option('display.max_colwidth', None)
df_monumentos_finalPT[["nome_monumento", "urls", "wikibase_item"]].head(2)

Unnamed: 0,nome_monumento,urls,wikibase_item
0,Castelo de São Jorge,"[https://upload.wikimedia.org/wikipedia/commons/5/56/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933254953%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d7/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851264228574%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/ff/Wikidata-logo.svg, https://upload.wikimedia.org/wikipedia/commons/e/eb/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851286489491%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f2/2019-05-16_13-44-15_PT_Lisbon_JHe_K70_%2854146662415%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/58/Castelo_de_Sao_Jorge_%2840549415000%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7d/Castelo_de_S%C3%A3o_Jorge_%2843981141894%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/72/2019-05-16_13-41-20_PT_Lisbon_JHe_K70_%2854145546222%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/fa/Castelo_de_S._Jorge_-_panoramio_%284%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a9/Castelo_de_Sao_Jorge_%2840549409760%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e0/Castelo_De_S%C3%A3o_Jorge_%28214018829%29.jpeg, https://upload.wikimedia.org/wikipedia/commons/b/b7/Castelo_de_Sao_Jorge_%2841634004854%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c4/Castelo_de_Sao_Jorge_%2846750089072%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/81/2019-05-16_13-47-18_PT_Lisbon_JHe_K70_%2854148967216%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/78/2019-05-16_12-22-39_PT_Lisbon_JHe_K70_%2854144955867%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/be/Castelo_de_Sao_Jorge%2C_Lisbon_%2849934076657%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/76/Castelo_de_Sao_Jorge_%2843866649780%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933253503%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e4/Castelo_de_Sao_Jorge_%2841633989634%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/09/Castelo_de_Sao_Jorge_%2842356426751%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/84/Castelo_de_Sao_Jorge_%2846077446494%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/29/Castelo_de_Sao_Jorge_%2841633992664%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/72/Castelo_de_Sao_Jorge_%2828482958148%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b1/Castelo_de_S._Jorge_-_panoramio_%285%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5c/Castelo_de_Sao_Jorge_%2842356427541%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/08/Castelo_de_Sao_Jorge_%2828482956838%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/14/Beautiful_Lisboa.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d3/Castelo_de_Sao_Jorge_%2841454462195%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/87/Castelo_de_Sao_Jorge_%2842307950552%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ec/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851320363477%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5f/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2842730356934%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1a/Castelo_de_Sao_Jorge_%2842307932622%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/97/Castelo_de_Sao_Jorge_%2841633991774%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b7/Castelo_de_Sao_Jorge_%2842307943212%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6a/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933254573%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5b/2019-05-16_13-47-50_PT_Lisbon_JHe_K70_%2854146358743%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d6/Castelo_de_Sao_Jorge_%2840549435680%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2b/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851295961970%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Castelo_de_Sao_Jorge_%2832926595158%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/32/2019-05-16_13-38-40_PT_Lisbon_JHe_K70_%2854146726314%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/57/Castelo_de_Sao_Jorge_%2842356428231%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b5/Castelo_de_Sao_Jorge_%2842356389141%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/97/Castelo_de_S%C3%A3o_Jorge_-_4974807356.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7b/2019-05-16_14-29-16_PT_Lisbon_JHe_N_%2854146182619%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d1/Castelo_de_Sao_Jorge_%2827486619327%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/db/Castelo_de_Sao_Jorge_%2842356430621%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b0/Openstreetmap_logo.svg, https://upload.wikimedia.org/wikipedia/commons/2/26/2019-05-16_13-49-25_PT_Lisbon_JHe_K70_%2854146073796%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ea/Castelo_de_S%C3%A3o_Jorge_%2843981141344%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/be/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933256003%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/51/Castelo_de_Sao_Jorge%2C_Lisbon_%2849934076507%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/44/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933255213%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/71/Castelo_de_Sao_Jorge_%2827486653017%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/56/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2841802019120%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a7/Igespar_logo_flyer_2.svg, https://upload.wikimedia.org/wikipedia/commons/e/eb/Castelo_de_Sao_Jorge_%2841633993624%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/77/2019-05-16_12-35-41_PT_Lisbon_JHe_K70_%2854145816071%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/37/2019-05-16_13-49-45_PT_Lisbon_JHe_K70_%2854145220727%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/47/Castelo_de_Sao_Jorge_%2842307938592%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8d/Castelo_de_Sao_Jorge_%2841633981094%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/15/Castelo_de_Sao_Jorge_%2827486651547%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/43/Castelo_de_Sao_Jorge_%2842307931702%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/9f/Castelo_de_Sao_Jorge%2C_Lisbon_%2849934077722%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/45/Castelo_de_Sao_Jorge_%2841454488545%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/fb/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851285742947%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/39/Castelo_de_Sao_Jorge%2C_Lisbon_%2849934075687%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c6/Castelo_de_Sao_Jorge_%2827486666847%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b7/Castelo_de_Sao_Jorge_%2840549432090%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e9/Castelo_de_Sao_Jorge_%2841633989204%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7b/Castelo_de_Sao_Jorge_%2842356426151%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/00/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2853042849774%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d3/011499_-_Lisboa_%2821691724913%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7d/Castelo_de_Sao_Jorge_%2827486667807%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/db/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2829446405708%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/62/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2841926709960%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e6/2019-05-16_13-55-51_PT_Lisbon_JHe_K70_%2854146358393%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/04/Castelo_de_Sao_Jorge%2C_Lisbon%2C_Portugal.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b6/Castelo_de_Sao_Jorge_%2840549391870%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/06/Castelo_de_Sao_Jorge_%2828482974718%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f1/Castelo_de_Sao_Jorge_%2841633991054%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/42/Castelo_de_Sao_Jorge_%2841634009874%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4c/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851284673248%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/93/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2851286661088%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/53/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2825274454018%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d7/Castelo_de_Sao_Jorge_%2840549418920%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/de/Castelo_de_Sao_Jorge_%2841454473585%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/92/Castelo_de_Sao_Jorge_%2841454482765%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/86/Castelo_de_S%C3%A3o_Jorge_%28Grande_Panorama_de_Lisboa%2C_MNAz%29.png, https://upload.wikimedia.org/wikipedia/commons/7/7f/Castelo_de_Sao_Jorge_%2842307941612%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1e/2019-05-16_13-39-28_PT_Lisbon_JHe_K70_%2854146683388%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/2a/Castelo_de_Sao_Jorge_%2841454476445%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/45/Castelo_de_S%C3%A3o_Jorge_-_Lisboa_-_Portugal_%2831891063787%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c3/2019-05-16_14-29-44_PT_Lisbon_JHe_N_%2854146319320%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f2/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933254703%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/28/2019-05-16_13-26-32_PT_Lisbon_JHe_N_%2854146773329%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4b/SIPA_logo_%28cropped%29.png, https://upload.wikimedia.org/wikipedia/commons/c/c8/Castelo_de_S._Jorge_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/9/9f/Bridge_over_the_current_dry_moat%2C_S%C3%A3o_Jorge_Castle%2C_lisbon.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4b/Castelo_de_Sao_Jorge_%2845684674751%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e0/Castelo_de_Sao_Jorge%2C_Lisbon_%2849933771266%29.jpg, ...]",Q25816511
1,Fortificações da Praça de Valença do Minho,"[https://upload.wikimedia.org/wikipedia/commons/3/3d/The_walls_of_Valen%C3%A7a_VI_%2850342417423%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/81/Valen%C3%A7a_do_Minho_%28P%29%2C_2011%2C_La_piazzaforte._%286220038158%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6a/Fortaleza_de_Valen%C3%A7a_%283870879624%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e0/Detalle_das_defensas_de_Valen%C3%A7a_-eue_-_17.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c2/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%2820%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/ff/Wikidata-logo.svg, https://upload.wikimedia.org/wikipedia/commons/f/f0/Valen%C3%A7a_%2849599004043%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/04/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2834%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c5/Valen%C3%A7a_-_panoramio_%2832%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/36/Darkness_doesn%27t_last_forever_%2850343097521%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d4/The_walls_of_Valen%C3%A7a_III_%2850340743108%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/91/Valenca%2C_Portugal-4_%288611372986%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8e/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%282%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/df/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%283%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7c/Fortaleza_de_Valen%C3%A7a_do_Minho_%2816139820298%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e3/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.005_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c7/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2831%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b8/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.010_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8b/Town_hall_of_Valenca.jpg, https://upload.wikimedia.org/wikipedia/commons/1/19/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%289%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/54/Fortaleza_de_Valen%C3%A7a_%283870913716%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/35/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.002_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2829%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8d/Valen%C3%A7a_%2849599200683%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c4/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2827%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c1/Fortificacoes_da_Praca_de_Valenca_do_Minho_03.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b1/Valen%C3%A7a_-_panoramio_%2848%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/df/Fortaleza_de_Valen%C3%A7a_-_Portugal_%F0%9F%87%B5%F0%9F%87%B9_%2854274933389%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/b5/Fortaleza_de_Valen%C3%A7a_-_Portugal_%F0%9F%87%B5%F0%9F%87%B9_%2854305587548%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8a/Cal%C3%A7ada_da_Gaviarra%2C_Valen%C3%A7a_Fortress%2C_Minho%2C_Portugal_%28PPL3-Altered%29_julesvernex2.jpg, https://upload.wikimedia.org/wikipedia/commons/d/db/Fortaleza_de_Valen%C3%A7a_-_Portugal_%F0%9F%87%B5%F0%9F%87%B9_%2854274933409%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/61/Valen%C3%A7a_-_panoramio_%283%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/8e/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%2810%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f6/D%C3%ADa_soleado_en_Valen%C3%A7a_do_Mi%C3%B1o_%2813499126875%29.jpg, https://upload.wikimedia.org/wikipedia/commons/4/49/4930_Valen%C3%A7a%2C_Portugal_-_panoramio_%2811%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/55/3_Muralhas_da_Fortaleza_de_Valen%C3%A7a.JPG, https://upload.wikimedia.org/wikipedia/commons/7/7d/The_walls_of_Valen%C3%A7a_IV_%2850341137538%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/cf/Fortificacoes_da_Praca_de_Valenca_do_Minho_05.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3f/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.001_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d5/The_walls_of_Valen%C3%A7a_XIV_%2850349786233%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/5c/Portugal-Fortaleza_de_Valenca_do_Minho-P1240617_%2825798535051%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/15/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.008_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/0/01/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2821%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a9/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2830%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/dd/Fortaleza_de_Valen%C3%A7a_do_Minho_%2816326524752%29_%282%29.jpg, https://upload.wikimedia.org/wikipedia/commons/7/78/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2815%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/06/Castelo_de_Valen%C3%A7a_%2815297980463%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/ae/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2811%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3f/Valen%C3%A7a.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ec/Tui_e_Valen%C3%A7a_no_Livro_das_Fortalezas_de_Duarte_de_Armas_%281509-1510%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f9/Apr_26%2C_2022_El_Kinder_Camino_Portugu%C3%A9s_Tour_%2852081338022%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1a/Portugal-Fortaleza_de_Valenca_do_Minho-P1160845_%2825772712332%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e5/Fortaleza_de_Valen%C3%A7a_%283870902326%29.jpg, https://upload.wikimedia.org/wikipedia/commons/8/82/Fortaleza_de_Valen%C3%A7a_-_Portugal_%F0%9F%87%B5%F0%9F%87%B9_%2854273804627%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/16/Fortifica%C3%A7%C3%B5es_em_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/7/75/Valenca_alfredo_magalhaes_busto.JPG, https://upload.wikimedia.org/wikipedia/commons/b/b0/Openstreetmap_logo.svg, https://upload.wikimedia.org/wikipedia/commons/9/9f/Valen%C3%A7a_-_panoramio_%2853%29.jpg, https://upload.wikimedia.org/wikipedia/commons/2/26/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2813%29.jpg, https://upload.wikimedia.org/wikipedia/commons/f/f1/The_walls_of_Valen%C3%A7a_XIX_%2850369451647%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/95/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a_de_Valen%C3%A7a.jpg, https://upload.wikimedia.org/wikipedia/commons/a/ad/Fort_Valenca_%2847002050182%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c6/Fortaleza_de_Valen%C3%A7a_%283870197861%29.jpg, https://upload.wikimedia.org/wikipedia/commons/0/0f/Valen%C3%A7a_%2849599107993%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ee/The_walls_of_Valen%C3%A7a_XI_%2850346843831%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/ea/Fortaleza_de_Valen%C3%A7a_%283870125123%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a7/Igespar_logo_flyer_2.svg, https://upload.wikimedia.org/wikipedia/commons/d/d3/Fortaleza_de_Valen%C3%A7a_%283870958878%29.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e8/Fortificacoes_da_Praca_de_Valenca_do_Minho_08.jpg, https://upload.wikimedia.org/wikipedia/commons/d/df/Fortaleza_de_Valen%C3%A7a_do_Minho_%2816141483797%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/d4/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.007_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/7/79/Fortaleza_de_Valen%C3%A7a_-_3.jpg, https://upload.wikimedia.org/wikipedia/commons/7/7b/Monumento_a_los_ca%C3%ADdos_portugueses_de_la_Primera_Guerra_Mundial_%283870107219%29.jpg, https://upload.wikimedia.org/wikipedia/commons/6/66/Valen%C3%A7a_%2849599119168%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a1/Fortaleza_de_Valen%C3%A7a_%283870860222%29.jpg, https://upload.wikimedia.org/wikipedia/commons/9/9d/Murallas_de_Valen%C3%A7a_do_Minho.JPG, https://upload.wikimedia.org/wikipedia/commons/6/6a/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.012_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/c/cd/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2832%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/11/Valen%C3%A7a_do_Minho_%28P%29%2C_2011%2C_La_piazzaforte._%286220038066%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/3f/Rua_Dr._Jose_Augusto_Vieira_16_in_Valenca.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e8/Pra%C3%A7a-forte_de_Valen%C3%A7a_%2825%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/dd/Valen%C3%A7a_%2815295081343%29.jpg, https://upload.wikimedia.org/wikipedia/commons/5/55/Fortifica%C3%A7%C3%B5es_de_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/0/05/VALENCIA_DO_MI%C3%91O_-_panoramio.jpg, https://upload.wikimedia.org/wikipedia/commons/8/86/Rio_Mi%C3%B1o_y_al_fondo_las_murallas_de_Valen%C3%87a.jpg, https://upload.wikimedia.org/wikipedia/commons/8/89/Murallas_de_la_Fortaleza_Valen%C3%A7a_do_Mi%C3%B1o.jpg, https://upload.wikimedia.org/wikipedia/commons/c/ce/Revelim_-_Valen%C3%A7a.jpg, https://upload.wikimedia.org/wikipedia/commons/4/4d/Fortaleza_de_Valen%C3%A7a_%283870091269%29.jpg, https://upload.wikimedia.org/wikipedia/commons/d/db/Fortaleza_de_Valen%C3%A7a_-_5.jpg, https://upload.wikimedia.org/wikipedia/commons/6/6f/Livro_das_Fortalezas_112-2_Valen%C3%A7a.jpg, https://upload.wikimedia.org/wikipedia/commons/e/e9/Rua_Conselheiro_Lopes_da_Silva_in_Valenca_%282%29.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c0/Tres_%284359445451%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/34/Valen%C3%A7a_%2849598807608%29.jpg, https://upload.wikimedia.org/wikipedia/commons/b/bc/Fortaleza_de_Valen%C3%A7a_-_4.jpg, https://upload.wikimedia.org/wikipedia/commons/c/c5/The_walls_of_Valen%C3%A7a_XV_%2850350486296%29.jpg, https://upload.wikimedia.org/wikipedia/commons/1/1b/The_walls_of_Valen%C3%A7a_XVIII_%2850367024847%29.jpg, https://upload.wikimedia.org/wikipedia/commons/3/34/Capela_de_Nosso_Senhor_do_Encontro_in_Valenca.jpg, https://upload.wikimedia.org/wikipedia/commons/3/39/Fortifica%C3%A7%C3%B5es_da_Pra%C3%A7a.006_-_Valen%C3%A7a_do_Minho.jpg, https://upload.wikimedia.org/wikipedia/commons/6/69/Fort_Valenca_%2847002054112%29.jpg, https://upload.wikimedia.org/wikipedia/commons/a/a0/Fortaleza_de_Valen%C3%A7a_-_Portugal_%F0%9F%87%B5%F0%9F%87%B9_%2854305587438%29.jpg, ...]",Q10353229


In [101]:
import requests
from urllib.parse import urlparse
import concurrent.futures
from tqdm import tqdm
import pandas as pd
from pathlib import Path

# Caminho para o ficheiro
file_path = dataLandMarkTrain_dir / 'df_monumentos_finalPT.csv'

# Funções de verificação e limpeza
def verificar_url_imagem(url, timeout=5):
    try:
        parsed = urlparse(url)
        if not all([parsed.scheme, parsed.netloc]):
            return False
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        content_type = response.headers.get('Content-Type', '').lower()
        return response.status_code == 200 and content_type.startswith('image/')
    except Exception:
        return False

def limpar_urls_imagens(urls, verificar_existencia=True, max_workers=20):
    if not isinstance(urls, list):
        return urls
    palavras_indesejadas = ['logo', 'icon', 'symbol', 'badge', 'emblem', 'shield', 'sipa']
    urls_filtradas = []
    for url in urls:
        lower_url = url.lower()
        if any(lower_url.endswith(ext) for ext in ['.svg', '.gif']):
            continue
        if any(palavra in lower_url for palavra in palavras_indesejadas):
            continue
        urls_filtradas.append(url)
    if verificar_existencia and urls_filtradas:
        urls_validas = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {executor.submit(verificar_url_imagem, url): url for url in urls_filtradas}
            for future in tqdm(concurrent.futures.as_completed(future_to_url),
                               total=len(urls_filtradas),
                               desc="Verificando URLs"):
                url = future_to_url[future]
                try:
                    if future.result():
                        urls_validas.append(url)
                except Exception:
                    continue
        return urls_validas
    return urls_filtradas

# Verificação de existência do ficheiro
if file_path.exists():
    print(f"Ficheiro já existe: {file_path.name}. A carregar...")
    df_monumentos_finalPT = pd.read_csv(file_path)
    df_monumentos_finalPT.head()
else:
    print("Ficheiro não existe. A processar URLs e a gerar DataFrame...")
    # Aqui assumes que já tens o DataFrame df_monumentos_finalPT original carregado com as URLs (como listas)
    df_monumentos_finalPT['urls'] = [limpar_urls_imagens(url_list) for url_list in tqdm(df_monumentos_finalPT['urls'], desc="Processando monumentos")]
    df_monumentos_finalPT.to_csv(file_path, index=False)
    print(f"Ficheiro guardado em: {file_path}")
    df_monumentos_finalPT.head()

Filtrando imagens...


Verificando URLs: 100%|██████████| 201/201 [00:11<00:00, 16.98it/s]
Verificando URLs: 100%|██████████| 246/246 [00:14<00:00, 17.23it/s]86s/it]
Verificando URLs: 100%|██████████| 112/112 [00:06<00:00, 16.66it/s]30s/it]
Verificando URLs: 100%|██████████| 116/116 [00:06<00:00, 16.61it/s]30s/it]
Verificando URLs: 100%|██████████| 13/13 [00:00<00:00, 15.05it/s]9.00s/it]
Verificando URLs: 100%|██████████| 21/21 [00:01<00:00, 14.07it/s]6.07s/it]
Verificando URLs: 100%|██████████| 156/156 [00:09<00:00, 16.92it/s]52s/it]
Verificando URLs: 100%|██████████| 201/201 [00:11<00:00, 17.02it/s]06s/it]
Verificando URLs: 100%|██████████| 17/17 [00:01<00:00, 15.87it/s]7.90s/it]
Verificando URLs: 100%|██████████| 45/45 [00:02<00:00, 15.19it/s]5.77s/it]
Verificando URLs: 100%|██████████| 6/6 [00:00<00:00, 10.81it/s],  4.91s/it]
Verificando URLs: 100%|██████████| 17/17 [00:01<00:00, 15.85it/s] 3.58s/it]
Verificando URLs: 100%|██████████| 18/18 [00:01<00:00, 15.79it/s].82s/it]  
Verificando URLs: 100%|██████

KeyboardInterrupt: 

### 4.2. Visualização da localização dos "Monumentos"

In [100]:
import folium
from folium import IFrame
from folium.plugins import Fullscreen, MiniMap
from branca.element import Template, MacroElement
import html

# Criar o mapa centrado em Portugal
mapa = folium.Map(location=[38.5, -21.0], zoom_start=6, tiles=None, control_scale=True)

# Adicionar diferentes estilos de mapa
folium.TileLayer('CartoDB positron', name='Claro').add_to(mapa)
folium.TileLayer('CartoDB dark_matter', name='Escuro').add_to(mapa)
folium.TileLayer('OpenStreetMap', name='Padrão').add_to(mapa)

# Plugins extra
Fullscreen(position='topright').add_to(mapa)
MiniMap(toggle_display=True, position='bottomright').add_to(mapa)

# Criar um dicionário para armazenar as camadas por supercategoria
camadas = {}

# Definir cores baseado em SerMonumento
cores = {
    1: {'color': '#27ae60', 'fill_color': '#2ecc71'},  # Verde para monumentos
    0: {'color': '#e74c3c', 'fill_color': '#c0392b'}   # Vermelho para não-monumentos
}

# Iterar sobre os monumentos
for _, row in df_monumentos_finalPT.dropna(subset=['lat', 'lon']).iterrows():
    nome = html.escape(str(row.get('nome_monumento', 'Monumento')))
    
    # Verificar se é monumento (default para 1 se não existir a coluna)
    ser_monumento = row.get('SerMonumento', 1)
    cor_config = cores.get(ser_monumento, {'color': '#3498db', 'fill_color': '#2980b9'})  # Default azul
    
    imagem_urls = row.get('urls', [])
    imagem = imagem_urls[0] if isinstance(imagem_urls, list) and imagem_urls else None

    supercat_raw = row.get('supercategory', [])
    supercat_list = supercat_raw if isinstance(supercat_raw, list) else [supercat_raw]
    local = ', '.join(row.get('localizacoes_administrativas', [])) if isinstance(row.get('localizacoes_administrativas'), list) else row.get('localizacoes_administrativas', '')
    tipos = ', '.join(row.get('tipos_instancia_de', [])) if isinstance(row.get('tipos_instancia_de'), list) else row.get('tipos_instancia_de', '')

    # Adicionar badge indicando monumento/não-monumento
    status = "Monumento" if ser_monumento == 1 else "Não-Monumento"
    status_color = "#27ae60" if ser_monumento == 1 else "#e74c3c"
    
    # HTML estilizado
    html_popup = f"""
    <div style="width:240px; font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background:white; border-radius:10px; box-shadow:0 4px 12px rgba(0,0,0,0.2); overflow:hidden;">
        {"<img src='"+imagem+"' style='width:100%; height:auto; display:block;'>" if imagem else ""}
        <div style="padding:10px;">
            <h4 style="margin:0 0 10px 0; font-size:16px; text-align:center; color:#2c3e50;">{nome}</h4>
            <div style="background:{status_color}; color:white; padding:2px 5px; border-radius:3px; display:inline-block; margin-bottom:8px; font-size:12px;">
                {status}
            </div>
            <p style="margin:4px 0;"><b>Supercategoria:</b> {html.escape(', '.join(supercat_list))}</p>
            <p style="margin:4px 0;"><b>Localização:</b> {html.escape(str(local))}</p>
            <p style="margin:4px 0;"><b>Tipo:</b> {html.escape(str(tipos))}</p>
        </div>
    </div>
    """

    iframe = IFrame(html=html_popup, width=260, height=360)
    popup = folium.Popup(iframe, max_width=300)

    for supercat in supercat_list:
        if not supercat:
            supercat = "Sem Categoria"

        # Criar camada se ainda não existir
        if supercat not in camadas:
            camada = folium.FeatureGroup(name=supercat, show=True)
            camadas[supercat] = camada
            camada.add_to(mapa)

        # Adicionar marcador à camada correspondente
        folium.CircleMarker(
            location=[row['lat'], row['lon']],
            radius=8,
            color=cor_config['color'],
            fill=True,
            fill_color=cor_config['fill_color'],
            fill_opacity=0.9,
            tooltip=f"{nome} ({status})",
            popup=popup
        ).add_to(camadas[supercat])

# Adicionar legenda
template = """
{% macro html(this, kwargs) %}
<div style="
    position: fixed; 
    bottom: 50px;
    left: 50px;
    width: 150px;
    height: 80px;
    z-index:9999;
    font-size:14px;
    background: white;
    padding: 10px;
    border-radius: 5px;
    box-shadow: 0 0 10px rgba(0,0,0,0.2);
">
    <p style="margin:0 0 5px 0;"><b>Legenda</b></p>
    <div style="display:flex; align-items:center; margin-bottom:3px;">
        <div style="background:#27ae60; width:12px; height:12px; border-radius:50%; margin-right:5px;"></div>
        Monumentos
    </div>
    <div style="display:flex; align-items:center;">
        <div style="background:#e74c3c; width:12px; height:12px; border-radius:50%; margin-right:5px;"></div>
        Não-Monumentos
    </div>
</div>
{% endmacro %}
"""

macro = MacroElement()
macro._template = Template(template)
mapa.get_root().add_child(macro)

# Adicionar controlo de camadas
folium.LayerControl(collapsed=False, position='topleft').add_to(mapa)

# Guardar o mapa
mapa.save("mapa_monumentos.html")

### 4.3. Guardar as imagens

In [None]:
import unicodedata
import re
pd.reset_option('display.max_colwidth')

# Função para limpar nomes de pastas
def limpar_nome(nome):
    nome = unicodedata.normalize('NFKD', nome).encode('ASCII', 'ignore').decode('utf-8')
    nome = re.sub(r'[\\/*?:"<>|]', "", nome)  # remover caracteres inválidos
    nome = nome.strip().replace(' ', '_')
    return nome

# Pasta Train Base
dataTrain_dir = data_dir / 'train'
dataTrain_dir.mkdir(exist_ok=True)

# Simular browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Função para descarregar uma imagem (com cache e paralelismo)
def descarregar_imagem(monumento_nome, url, i):
    try:
        nome_limpo = limpar_nome(monumento_nome)
        pasta_monumento = dataTrain_dir / nome_limpo
        pasta_monumento.mkdir(exist_ok=True)

        # Geração de nome único por hash da URL
        hash_url = hashlib.md5(url.encode()).hexdigest()
        extensao = url.split('.')[-1].split('?')[0][:4]
        nome_ficheiro = f"{i:03d}_{hash_url}.{extensao}"
        caminho_ficheiro = pasta_monumento / nome_ficheiro

        if caminho_ficheiro.exists():
            return  # já existe

        resposta = requests.get(url, headers=headers, timeout=15)
        if resposta.status_code == 200:
            with open(caminho_ficheiro, 'wb') as f:
                f.write(resposta.content)
        else:
            print(f"Erro ao descarregar {url}: status {resposta.status_code}")
    except Exception as e:
        print(f"Erro com '{monumento_nome}': {e}")

# Criar lista de tarefas
tarefas = []
for _, row in df_monumentos_pt.iterrows():
    nome = str(row.get("nome_monumento", "monumento_desconhecido"))
    urls = row.get("urls", [])
    if isinstance(urls, list):
        for i, url in enumerate(urls):
            tarefas.append((nome, url, i))

# Descarregar em paralelo
with ThreadPoolExecutor(max_workers=16) as executor:
    list(tqdm(executor.map(lambda x: descarregar_imagem(*x), tarefas), total=len(tarefas)))

  6%|▌         | 869/14948 [00:57<08:23, 27.96it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/f1/2010-03-02_12_41_08_Portugal-Santa_Luzia.jpg: status 404
Erro ao descarregar http://upload.wikimedia.org/wikipedia/commons/6/64/2010-03-02_12_40_20_Portugal-Santa_Luzia.jpg: status 404


  9%|▉         | 1406/14948 [02:03<52:08,  4.33it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/4b/Pico_do_Arieiro_DSC_0084.JPG_%2835210542960%29.jpg: status 404


 10%|▉         | 1426/14948 [02:06<50:58,  4.42it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/f0/Pico_do_Arieiro_DSC_0079.JPG_%2835558479826%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/ba/Pico_do_Arieiro_DSC_0080.JPG_%2834788339763%29.jpg: status 404


 11%|█         | 1660/14948 [02:13<05:42, 38.83it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/b5/Raiway_bridges_%2836675374855%29.jpg: status 404


 11%|█▏        | 1708/14948 [02:18<13:39, 16.16it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/f8/D._Lu%C3%ADs_Bridge.JPG: status 404


 13%|█▎        | 1992/14948 [03:27<4:42:37,  1.31s/it]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/07/P8062937_ShiftN2_%287833274928%29.jpg: status 404


 16%|█▌        | 2421/14948 [03:38<08:55, 23.38it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/a/af/333524_a.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/5/51/Adro_da_Capela_de_Santo_Amaro_-_Lisboa_-_Portuga_%2837669571036%29.jpg: status 404


 20%|█▉        | 2921/14948 [04:11<15:41, 12.78it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/1/1f/IMG_2799.JPG_%288059849169%29.jpg: status 404


 22%|██▏       | 3294/14948 [04:48<14:23, 13.49it/s]  

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/60/Lisbon-20160210-029_%2825788996065%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/c/c9/2008_-_panoramio_-_%D0%92%D0%B0%D0%BB%D0%B5%D1%80%D0%B8%D0%B9_%D0%94%D0%B5%D0%B4_%283%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/2/2f/Plaza_del_Comercio%2C_Lisboa%2C_Portugal%2C_2012-05-12%2C_DD_03.JPG: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/84/Plaza_del_Comercio%2C_Lisboa%2C_Portugal%2C_2012-05-12%2C_DD_01.JPG: status 404


 26%|██▌       | 3836/14948 [04:57<04:04, 45.40it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/fa/Le%C3%A7a_da_Palmeira_IMG_3158.JPG_%286104708437%29.jpg: status 404Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/9/94/Le%C3%A7a_da_Palmeira_IMG_3160.JPG_%286104711629%29.jpg: status 404

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/c/c3/Le%C3%A7a_da_Palmeira_IMG_3089.JPG_%286104517397%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/6e/Le%C3%A7a_da_Palmeira_IMG_3165.JPG_%286105266406%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d4/Le%C3%A7a_da_Palmeira_IMG_3079.JPG_%286105048112%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/77/Le%C3%A7a_da_Palmeira_IMG_3162.JPG_%286105260738%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/7f/Le%C3%A7a_da_Palmeira_IMG_3088.JPG_%286104514785%29.jpg: status 404
Erro ao descarregar 

 26%|██▋       | 3948/14948 [05:00<04:16, 42.97it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/0b/Le%C3%A7a_da_Palmeira_IMG_3145.JPG_%286104672661%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/7/72/Le%C3%A7a_da_Palmeira_IMG_3102.JPG_%286105105458%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/b7/Le%C3%A7a_da_Palmeira_IMG_3087.JPG_%286104995067%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/5/50/Le%C3%A7a_da_Palmeira_IMG_3135.JPG_%286105197938%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/4d/Le%C3%A7a_da_Palmeira_IMG_3100.JPG_%286105100890%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d9/Le%C3%A7a_da_Palmeira_IMG_3149.JPG_%286105229718%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/6e/Le%C3%A7a_da_Palmeira_IMG_3134.JPG_%286104648291%29.jpg: status 404
Erro ao descarregar 

 29%|██▉       | 4300/14948 [05:16<05:09, 34.39it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/b8/Ponte_Luis_I.jpg: status 404


 40%|███▉      | 5935/14948 [08:16<13:51, 10.84it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/b5/PM_33428_P_Lamego.jpg: status 404


 45%|████▍     | 6700/14948 [09:11<27:42,  4.96it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/c/c2/Est%C3%A1dio_do_Restelo_%28croppped%29.png: status 404


 48%|████▊     | 7224/14948 [09:56<19:35,  6.57it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/40/Exercise_TRIDENT_JUNCTURE_%2821808370373%29.jpg: status 404


 48%|████▊     | 7228/14948 [09:56<19:01,  6.76it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/fb/Exercise_TRIDENT_JUNCTURE_%2822241527080%29.jpg: status 404


 56%|█████▋    | 8429/14948 [11:08<04:07, 26.35it/s]

Erro ao descarregar http://upload.wikimedia.org/wikipedia/commons/8/8c/Portugal_Cabo_S_Vincence.jpg: status 404


 61%|██████    | 9142/14948 [11:59<08:00, 12.09it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/4/4f/%D0%A4%D0%BE%D1%80%D1%82_%D0%B4%D0%B5_%D0%A1%D0%B0%D0%BD_%D0%91%D1%80%D0%B0%D1%88_%D0%B4%D0%B5_%D0%A1%D0%B0%D0%BD%D0%BA%D1%81%D0%B5%D1%82_%28Forte_de_S%C3%A3o_Br%C3%A1s_de_Sanxete%29_-_panoramio.jpg: status 404


 62%|██████▏   | 9321/14948 [12:07<04:25, 21.16it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/a/a0/Royal_barge_of_Joao_VI%2C_Navy_Museum_of_Lisbon_%28Portugal%29.JPG: status 404


 64%|██████▍   | 9632/14948 [13:00<20:09,  4.40it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/e/e0/IMG_2733.JPG_%288056656868%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/0/0e/IMG_2739.JPG_%288056657517%29.jpg: status 404


 66%|██████▌   | 9838/14948 [13:12<08:54,  9.55it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/8b/IMG_2744.JPG_%288056664408%29.jpg: status 404


 67%|██████▋   | 9966/14948 [13:24<07:38, 10.88it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/1/19/Igreja_de_S%C3%A3o_Domingos_em_Viana_do_Castelo.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/6/69/P8093304crw_%287881501342%29.jpg: status 404


 68%|██████▊   | 10205/14948 [13:38<06:35, 11.98it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/8e/Igreja_das_Carmelitas_Porto_01.jpg: status 404


 70%|██████▉   | 10428/14948 [14:01<07:38,  9.87it/s]

Erro ao descarregar http://upload.wikimedia.org/wikipedia/commons/b/be/FortalzaSagres.jpg: status 404


 72%|███████▏  | 10730/14948 [14:23<05:51, 11.99it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/b/bf/DIMG_7326_%281873279911%29.jpg: status 404


 80%|███████▉  | 11952/14948 [15:23<00:45, 65.24it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/9/97/Altstadt_Faro_2.jpg: status 404


 91%|█████████▏| 13653/14948 [17:54<00:48, 26.52it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/8/87/Monastery_%2839047661370%29.jpg: status 404


 93%|█████████▎| 13925/14948 [18:45<03:18,  5.16it/s]

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/3/38/IMG_2973.JPG_%288071340831%29.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/da/IMG_2967.JPG_%288071332448%29.jpg: status 404


 99%|█████████▉| 14804/14948 [19:13<00:05, 24.73it/s] 

Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/d/d8/Silves_-_Rua_Diogo_Manuel_03.2018.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/9/91/Cidade_de_Silves12.jpg: status 404
Erro ao descarregar https://upload.wikimedia.org/wikipedia/commons/f/f6/Cidade_de_Silves.JPG: status 404


100%|██████████| 14948/14948 [19:18<00:00, 12.90it/s]


In [None]:
# Contagem
total_monumentos = 0
total_imagens = 0
imagens_por_monumento = {}

# Iterar pelas subpastas (cada pasta = um "monumento")
for pasta_monumento in sorted(dataTrain_dir.iterdir()):
    if pasta_monumento.is_dir():
        imagens = list(pasta_monumento.glob('*.*'))  # todas as imagens (qualquer extensão)
        num_imagens = len(imagens)
        imagens_por_monumento[pasta_monumento.name] = num_imagens
        total_monumentos += 1
        total_imagens += num_imagens

# Resultados
print(f"Total de monumentos: {total_monumentos}")
print(f"Total de imagens: {total_imagens}")
print("\nNúmero de imagens por monumento:")
for nome, num in imagens_por_monumento.items():
    print(f"- {nome}: {num}")


Total de monumentos: 601
Total de imagens: 15023

Número de imagens por monumento:
- 25_de_Abril_Bridge: 122
- 25_Fontes_Falls: 39
- Achada_do_Teixeira: 4
- Alcobaca_Monastery: 41
- Alfanzina_Lighthouse: 3
- Alminhas_da_Ponte: 15
- Alto_de_Sao_Joao_Cemetery: 31
- Alto_Lindoso_Dam: 4
- Amoreira_Aqueduct: 16
- Animatografo_do_Rossio: 6
- Anta_de_Pavia: 8
- Anta_de_Pera_do_Moco: 15
- Antiga_Forca_de_Freixiel: 2
- Apulia_beach: 60
- Aqueduto_da_Prata: 18
- Aqueduto_de_Santa_Clara_(Vila_do_Conde): 20
- Aqueduto_de_Sao_Sebastiao: 22
- Arco_da_Porta_Nova: 24
- Arco_da_Vila: 29
- Arnel_Point_Lighthouse: 12
- Arrabida_Bridge: 165
- Arrabida_Natural_Park: 35
- Arripiado: 5
- Aveiro_Lighthouse: 29
- Avenida_dos_Aliados: 87
- Azenhas_do_Mar: 17
- Barrinha_de_Esmoriz: 2
- Basilica_de_Sao_Pedro: 5
- Basilica_do_Sagrado_Coracao_de_Jesus: 4
- Basilica_of_Our_Lady_of_the_Martyrs,_Lisboa: 5
- Basilica_of_the_Holy_Trinity,_Fatima: 34
- Batalha_Monastery: 112
- Belem_Cultural_Center: 63
- Bom_Jesus_do_Mon