# IMPORT DES DONNÉES

In [143]:
import requests
import shutil
import os
from PIL import Image
import PIL.ExifTags
import pandas as pd
import sys
import json
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

# Get cities

user_agent = "Mozilla/5.0"

PROJECT_DATA_DIR = "../../data/projet"
SUB_DATA_DIR = "/import"


def get_exif(img_path: str):
    img = PIL.Image.open(img_path)
    return {
        PIL.ExifTags.TAGS[k]: v
        for k, v in img._getexif().items()
        if k in PIL.ExifTags.TAGS
    }


def merge_metadata(data):
    for i in range(len(data)):
        metadata = get_exif(data[i]["local_path"])

        data[i]["have_exif"] = len(metadata) != 0

        if data[i]["have_exif"]:
            data[i] = data[i] | metadata


def format_import_data_path(image_id):
    return f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/images/{image_id}.jpg"


def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def download_images(row):
    print(type(row))

    def download_image(url, save_path=None):
        headers = {
            "User-Agent": "Mozilla/5.0",
        }

        if not save_path:
            save_path = os.path.basename(url)

        request = requests.get(url, allow_redirects=True, headers=headers, stream=True)

        if request.status_code != 200:
            print(f"Erreur lors du téléchargement des images {request.status_code}")
        else:
            with open(save_path, "wb") as image:
                request.raw.decode_content = True

                shutil.copyfileobj(request.raw, image)
                print("> Image sauvegardé :", save_path)

        return request.status_code

    ret = []

    print("Téléchargement de l'image :", row["local_path"])
    download_image(row["origin_path"],row["local_path"])


def conv(x):
    year, month, day = tuple(x.split("T")[0].split('-'))
    return pd.Period(year=int(year), month=int(month), day=int(day), freq="D")


def format_cities_data(raw_data):
    ret: [] = []
    last_row = {}
    i = 0
    for row in raw_data:
        if row["image"]["value"].endswith("jpg") or row["image"]["value"].endswith("jpeg"):
            try:
                new_row = {
                    "titre": row["peintureLabel"]["value"],
                    "date": conv(row["inceptionLabel"]["value"]),
                    "createur": row["createurLabel"]["value"],
                    "location": row["locationLabel"]["value"],
                    "origin_path": row["image"]["value"]
                }

                is_new = True

                for key in new_row.keys():
                    if new_row[key] == last_row.get(key,""):
                        is_new = False

                if is_new:
                    ret.append(
                        new_row
                    )
                    new_row["local_path"] = format_import_data_path(i)
                    i+=1

                    last_row = new_row

            except ValueError as e:
                pass

    return ret

In [144]:
results = get_results(endpoint_url, """
    SELECT DISTINCT ?peintureLabel ?createurLabel  ?inceptionLabel ?locationLabel ?image WHERE {
      ?peinture wdt:P31 wd:Q11060274;
        wdt:P170 ?createur;
        wdt:P276 ?location;
        wdt:P571 ?inception;
        wdt:P18 ?image.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    LIMIT 2500
""")

In [145]:
from pandas import json_normalize

raw_data = results["results"]["bindings"]

data_dict = format_cities_data(raw_data)

dataframe = json_normalize(data_dict)

dataframe = dataframe.astype(
    dtype={"titre": "<U200", "date": pd.PeriodDtype.type()}
)

dataframe.reset_index()

# dataframe.apply(download_images, axis=1)

# merge_metadata(data_dict)

# merge_metadata(res)
# print(res)
dataframe

Unnamed: 0,titre,date,createur,location,origin_path,local_path
0,The Small Passion: Christ Before Annas,1511-01-01,Albrecht Dürer,Cleveland Museum of Art,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/0.jpg
1,Portrait of Jacobus Revius,1630-01-01,Jonas Suyderhoef,http://www.wikidata.org/.well-known/genid/b97d...,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/1.jpg
2,Ecce Homo,1515-01-01,Albrecht Dürer,Chester Beatty Library,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/2.jpg
3,"Fuji from Ushibori, Province of Hitachi",1831-01-01,Katsushika Hokusai,Cleveland Museum of Art,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/3.jpg
4,The Flagellation,1512-01-01,Albrecht Dürer,National Gallery of Art,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/4.jpg
...,...,...,...,...,...,...
141,Q22243367,1608-01-01,Aegidius Sadeler,Groeningemuseum,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/141.jpg
142,Q22251528,1899-01-01,Maurice Denis,Museum of Fine Arts Ghent (MSK),http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/142.jpg
143,Tenby from the N.W,1812-01-01,Charles Norris,National Library of Wales,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/143.jpg
144,Q22281831,1918-01-01,Léon Spilliaert,Mu.ZEE - Kunstmuseum aan Zee,http://commons.wikimedia.org/wiki/Special:File...,../../data/projet/import/images/144.jpg


In [137]:
with open(f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/data.json", "w") as f:
    json_str = json.dumps(res, indent=4)
    f.write(json_str)

print(res)
# download_image("http://toto.fr")

NameError: name 'res' is not defined