# IMPORT DES DONNÉES

In [62]:
import requests
import shutil
import os
from PIL import Image
import PIL.ExifTags
import pandas as pd
import sys
import json
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

# Get cities

user_agent = "Mozilla/5.0"

PROJECT_DATA_DIR = "../../data/projet"
SUB_DATA_DIR = "/import"


def get_exif(img_path: str):
    img = PIL.Image.open(img_path)
    return {
        PIL.ExifTags.TAGS[k]: v
        for k, v in img._getexif().items()
        if k in PIL.ExifTags.TAGS
    }


def merge_metadata(data):
    for i in range(len(data)):
        metadata = get_exif(data[i]["local_path"])

        data[i]["have_exif"] = len(metadata) != 0

        if data[i]["have_exif"]:
            data[i] = data[i] | metadata


def format_import_data_path(image_id):
    return f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/images/{image_id}.jpg"


def get_results(endpoint_url, query):
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def download_images(row):
    print(type(row))

    def download_image(url, save_path=None):
        headers = {
            "User-Agent": "Mozilla/5.0",
        }

        if not save_path:
            save_path = os.path.basename(url)

        request = requests.get(url, allow_redirects=True, headers=headers, stream=True)

        if request.status_code != 200:
            print(f"Erreur lors du téléchargement des images {request.status_code}")
        else:
            with open(save_path, "wb") as image:
                request.raw.decode_content = True

                shutil.copyfileobj(request.raw, image)
                print("> Image sauvegardé :", save_path)

        return request.status_code

    ret = []

    print("Téléchargement de l'image :", row["local_path"])
    download_image(row["origin_path"],row["local_path"])


def conv(x):
    year, month, day = tuple(x.split("T")[0].split('-'))
    return pd.Period(year=int(year), month=int(month), day=int(day), freq="D")


def format_cities_data(raw_data):
    ret: [] = []
    for i, row in enumerate(raw_data):
        if row["image"]["value"].endswith("jpg") or row["image"]["value"].endswith("jpeg"):
            try:
                d = conv(row["inceptionLabel"]["value"])

                ret.append(
                    {
                        "titre": row["peintureLabel"]["value"],
                        "date": d,
                        "createur": row["createurLabel"]["value"],
                        "location": row["locationLabel"]["value"],
                        "local_path": format_import_data_path(i),
                        "origin_path": row["image"]["value"]
                    }
                )

            except ValueError as e:
                pass

    return ret

In [None]:
results = get_results(endpoint_url, """
    SELECT DISTINCT ?peintureLabel ?createurLabel  ?inceptionLabel ?locationLabel ?image WHERE {
      ?peinture wdt:P31 wd:Q11060274;
        wdt:P170 ?createur;
        wdt:P276 ?location;
        wdt:P571 ?inception;
        wdt:P18 ?image.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
    }
    LIMIT 1500
""")

In [63]:
from pandas import json_normalize

raw_data = results["results"]["bindings"]

data_dict = format_cities_data(raw_data)

dataframe = json_normalize(data_dict)

dataframe = dataframe.astype(
    dtype={"titre": "<U200", "date": pd.PeriodDtype.type(), "createur": "<U200"}
)

dataframe.reset_index()

dataframe.apply(download_images, axis=1)

# merge_metadata(data_dict)

# merge_metadata(res)
# print(res)
dataframe


<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/0.jpg
> Image sauvegardé : ../../data/projet/import/images/0.jpg
<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/1.jpg
> Image sauvegardé : ../../data/projet/import/images/1.jpg
<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/2.jpg
> Image sauvegardé : ../../data/projet/import/images/2.jpg
<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/4.jpg
> Image sauvegardé : ../../data/projet/import/images/4.jpg
<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/5.jpg
> Image sauvegardé : ../../data/projet/import/images/5.jpg
<class 'pandas.core.series.Series'>
Téléchargement de l'image : ../../data/projet/import/images/6.jpg
> Image sauvegardé : ../../data/projet/import/images/6.jpg
<class 'pandas.core.series.Series'

KeyboardInterrupt: 

In [None]:
with open(f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/data.json", "w") as f:
    json_str = json.dumps(res, indent=4)
    f.write(json_str)

print(res)
# download_image("http://toto.fr")