# IMPORT DES DONNÉES

In [69]:
import requests
import shutil
import os
from PIL import Image
import PIL.ExifTags
import pandas as pd
import sys
import json
from SPARQLWrapper import SPARQLWrapper, JSON

# Wikidata SPARQL endpoint that get_results() is called with below.
endpoint_url = "https://query.wikidata.org/sparql"

# User-Agent string handed to SPARQLWrapper when it queries the endpoint.

user_agent = "Mozilla/5.0"

# Base directory and sub-directory concatenated to build the output paths
# (downloaded images and the exported data.json).
PROJECT_DATA_DIR = "../../data/projet"
SUB_DATA_DIR = "/import"


def get_exif(img_path: str):
    """Return the EXIF metadata of an image as a ``{tag name: value}`` dict.

    Args:
        img_path: Path of the image file to inspect.

    Returns:
        A dict mapping human-readable EXIF tag names (from
        ``PIL.ExifTags.TAGS``) to their raw values. Tags whose numeric id is
        not listed in ``PIL.ExifTags.TAGS`` are dropped. An empty dict is
        returned when the image carries no EXIF data.
    """
    # The context manager closes the underlying file handle; without it every
    # call leaks an open file until the image object is garbage collected.
    with PIL.Image.open(img_path) as img:
        # `_getexif` is a private Pillow helper that is not defined on every
        # image class, so look it up defensively instead of assuming it exists.
        read_exif = getattr(img, "_getexif", None)
        # Read the EXIF block once (the data is parsed on each call).
        raw_exif = read_exif() if read_exif is not None else None

    if not raw_exif:
        return {}

    return {
        PIL.ExifTags.TAGS[tag_id]: value
        for tag_id, value in raw_exif.items()
        if tag_id in PIL.ExifTags.TAGS
    }


def merge_metadata(row):
    """Summarise the EXIF metadata of the image referenced by ``row``.

    Args:
        row: Mapping with a ``"local_path"`` entry pointing at the image file.

    Returns:
        A dict with a boolean ``"have_exif"`` entry and, only when EXIF data
        is present, an ``"orientation"`` entry holding the value of the EXIF
        ``Orientation`` tag (``None`` if that tag is absent).
    """
    exif_tags = get_exif(row["local_path"])
    has_exif = len(exif_tags) > 0
    summary = {"have_exif": has_exif}

    # Printed before the orientation is added, so only the flag is shown.
    print(summary)

    if not has_exif:
        return summary

    summary["orientation"] = exif_tags.get("Orientation")
    return summary


def format_import_data_path(image_id):
    """Build the local ``.jpg`` path under the import images directory for ``image_id``."""
    images_dir = f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/images"
    return f"{images_dir}/{image_id}.jpg"


def get_results(endpoint_url, query):
    """Run ``query`` against the SPARQL endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper`` produces when converting a response
        requested in the ``JSON`` return format.
    """
    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


def download_images(row):
    """Download the image described by ``row`` to its local path.

    Args:
        row: Mapping with an ``"origin_path"`` entry (source URL) and a
            ``"local_path"`` entry (destination file path).

    Returns:
        None. Progress and HTTP errors are reported on standard output.

    Raises:
        requests.RequestException: If the request fails at the network level
            (including exceeding the timeout).
        OSError: If the destination file cannot be written.
    """

    def download_image(url, save_path=None, timeout=30):
        """Stream ``url`` into ``save_path`` and return the HTTP status code."""
        headers = {
            "User-Agent": "Mozilla/5.0",
        }

        if not save_path:
            save_path = os.path.basename(url)

        # Using the response as a context manager releases the streamed
        # connection even when the body is not fully consumed; the timeout
        # keeps a stalled server from blocking the whole run forever.
        with requests.get(
            url,
            allow_redirects=True,
            headers=headers,
            stream=True,
            timeout=timeout,
        ) as response:
            if response.status_code != 200:
                print(f"Erreur lors du téléchargement des images {response.status_code}")
                return response.status_code

            # Make sure the destination directory exists before writing.
            target_dir = os.path.dirname(save_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            with open(save_path, "wb") as image:
                # Let urllib3 undo any gzip/deflate transfer encoding.
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, image)

            print("> Image sauvegardé :", save_path)
            return response.status_code

    print("Téléchargement de l'image :", row["local_path"])
    download_image(row["origin_path"], row["local_path"])


def conv(x):
    """Convert a ``YYYY-MM-DD[T...]`` timestamp string into a daily ``pd.Period``.

    Only the part before the first ``"T"`` is used. A ``ValueError`` is raised
    when that part does not split into exactly three integer fields.
    """
    date_part = x.partition("T")[0]
    year, month, day = date_part.split("-")
    return pd.Period(freq="D", year=int(year), month=int(month), day=int(day))


def format_cities_data(raw_data):
    """Turn SPARQL result bindings into a filtered list of flat row dicts.

    Args:
        raw_data: Iterable of bindings; each one is indexed as
            ``binding[<variable>]["value"]`` for ``image``, ``peintureLabel``,
            ``inceptionLabel``, ``createurLabel`` and ``locationLabel``.

    Returns:
        A list of dicts with the keys ``titre``, ``date``, ``createur``,
        ``location``, ``origin_path`` and ``local_path``. ``local_path`` is
        numbered sequentially over the rows that are kept.

    Filtering rules:
        * Only bindings whose image URL ends with ``jpg`` or ``jpeg`` are
          considered (the check is case-sensitive).
        * Bindings whose inception value makes ``conv`` raise ``ValueError``
          are skipped silently.
        * A candidate is discarded when ANY one of its fields equals the
          corresponding field of the most recently kept row (or equals ``""``
          while nothing has been kept yet).
          NOTE(review): this also drops consecutive distinct items that merely
          share e.g. a creator or location; confirm this is intended.
    """
    kept_rows = []
    previous = {}
    next_id = 0

    for binding in raw_data:
        image_url = binding["image"]["value"]
        if not image_url.endswith(("jpg", "jpeg")):
            continue

        try:
            candidate = {
                "titre": binding["peintureLabel"]["value"],
                "date": conv(binding["inceptionLabel"]["value"]),
                "createur": binding["createurLabel"]["value"],
                "location": binding["locationLabel"]["value"],
                "origin_path": image_url,
            }
        except ValueError:
            # Inception value could not be parsed as a date: skip the binding.
            continue

        shares_a_field = any(
            value == previous.get(key, "") for key, value in candidate.items()
        )
        if shares_a_field:
            continue

        candidate["local_path"] = format_import_data_path(next_id)
        kept_rows.append(candidate)
        next_id += 1
        previous = candidate

    return kept_rows

In [84]:
# Query the endpoint for up to 2500 distinct (label, creator, inception,
# location, image) rows of items that are instances (P31) of wd:Q11060274,
# with labels resolved in French and sorted by item label, descending.
paintings_query = """
    SELECT DISTINCT ?peintureLabel ?createurLabel  ?inceptionLabel ?locationLabel ?image WHERE {
      ?peinture wdt:P31 wd:Q11060274;
        wdt:P170 ?createur;
        wdt:P276 ?location;
        wdt:P571 ?inception;
        wdt:P18 ?image.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
    }
    ORDER BY DESC(?peintureLabel)
    LIMIT 2500
"""
results = get_results(endpoint_url, paintings_query)

In [90]:
from pandas import json_normalize

# The SPARQL JSON response keeps its result rows under results -> bindings.
raw_data = results["results"]["bindings"]

# Filter the bindings and flatten them into plain row dicts
# (titre, date, createur, location, origin_path, local_path).
data_dict = format_cities_data(raw_data)

dataframe = json_normalize(data_dict)

# NOTE(review): `pd.PeriodDtype.type()` calls the dtype's scalar type with no
# arguments instead of building a dtype; presumably `pd.PeriodDtype(freq="D")`
# was intended for the "date" column -- confirm against the pandas version used.
dataframe = dataframe.astype(
    dtype={"titre": "<U200", "date": pd.PeriodDtype.type()}
)

# NOTE(review): reset_index() returns a new DataFrame by default and its result
# is not assigned here, so `dataframe` is left unchanged by this statement.
dataframe.reset_index()

# Disabled steps: downloading the images and collecting their EXIF summary.
# dataframe.apply(download_images, axis=1)
# tmp = dataframe.apply(merge_metadata, axis=1)

# merge_metadata(data_dict)
# dataframe.apply(download_images, axis=1)

# Export the table as JSON keyed by row index, next to the images directory.
dataframe.to_json(f"{PROJECT_DATA_DIR}{SUB_DATA_DIR}/data.json",orient="index")
# print(res)

0         {'have_exif': True, 'orientation': 1}
1                          {'have_exif': False}
2                          {'have_exif': False}
3                          {'have_exif': False}
4                          {'have_exif': False}
                         ...                   
301    {'have_exif': True, 'orientation': None}
302    {'have_exif': True, 'orientation': None}
303    {'have_exif': True, 'orientation': None}
304    {'have_exif': True, 'orientation': None}
305    {'have_exif': True, 'orientation': None}
Length: 306, dtype: object