# Project

## Dépendances

In [80]:
! pip install SPARQLWrapper
! pip install exifread
! pip install pandas
! pip install PIL

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: gpg 1.14.0-unknown has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of gpg or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mDefaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: gpg 1.14.0-unknown has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of gpg or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

## Collecte de données

In [81]:
import pandas as pd
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint of the Wikidata query service; handed to get_results() below.
endpoint_url = "https://query.wikidata.org/sparql"

# Get cities
# The query asks for at most 100 distinct rows (LIMIT 100). Each row carries an
# item (?grandeville), the value of its wdt:P17 property (?pays), the value of
# its wdt:P18 property (?image) and the labels of the item and of ?pays; the
# label service is configured with language "fr".
# Only items with wdt:P31 = wd:Q1549591 match. Judging by the variable names,
# P31/Q1549591 is presumably "instance of: big city", P17 the country and P18
# an image - TODO confirm the ids on Wikidata.
# NOTE(review): there is no ORDER BY, so the selection of rows is presumably
# not guaranteed to be stable between runs - confirm if reproducibility matters.
query = """SELECT DISTINCT ?grandeville ?grandevilleLabel ?pays ?paysLabel ?image {
  ?grandeville wdt:P31 wd:Q1549591;
               wdt:P17 ?pays;
               wdt:P18 ?image.
 SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
}
LIMIT 100"""


def get_results(endpoint_url, query):
    """Send ``query`` to a SPARQL endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: Text of the SPARQL query.

    Returns:
        The value produced by ``SPARQLWrapper.query().convert()`` with the
        return format set to JSON.
    """
    # The user agent advertises the major.minor version of the running interpreter.
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(
        endpoint_url, agent=f"WDQS-example Python/{major}.{minor}"
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


# Run the query once, then flatten every binding into a
# (city label, country label, image URL) tuple.
results = get_results(endpoint_url, query)
array = [
    (
        binding["grandevilleLabel"]["value"],
        binding["paysLabel"]["value"],
        binding["image"]["value"],
    )
    for binding in results["results"]["bindings"]
]

# Build the frame and cast its three columns with the "<U200" dtype.
dataframe = pd.DataFrame(array, columns=["ville", "pays", "image"]).astype(
    dtype={"ville": "<U200", "pays": "<U200", "image": "<U200"}
)
dataframe

Unnamed: 0,ville,pays,image
0,Krasnoïarsk,Russie,http://commons.wikimedia.org/wiki/Special:File...
1,São Paulo,Brésil,http://commons.wikimedia.org/wiki/Special:File...
2,Boston,États-Unis,http://commons.wikimedia.org/wiki/Special:File...
3,Le Caire,Égypte,http://commons.wikimedia.org/wiki/Special:File...
4,Szczecin,Pologne,http://commons.wikimedia.org/wiki/Special:File...
...,...,...,...
95,Padoue,Italie,http://commons.wikimedia.org/wiki/Special:File...
96,Buenos Aires,Argentine,http://commons.wikimedia.org/wiki/Special:File...
97,Lyon,France,http://commons.wikimedia.org/wiki/Special:File...
98,Le Mans,France,http://commons.wikimedia.org/wiki/Special:File...


### Téléchargement des images

In [82]:
import requests
import shutil
import os

def download_image(url, timeout=30):
    """Download ``url`` into the local ``images`` directory.

    Args:
        url: Address of the image. Its last path component, exactly as it
            appears in the URL (percent-escapes included), is used as the
            file name.
        timeout: Seconds handed to ``requests.get`` so that one stalled server
            cannot block the whole download run indefinitely.

    Returns:
        The HTTP status code of the response. The file is only written when
        the status code is 200.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    # Using the response as a context manager releases the streamed connection
    # even if writing the file fails half-way.
    with requests.get(
        url, allow_redirects=True, headers=headers, stream=True, timeout=timeout
    ) as response:
        if response.status_code == 200:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs("images", exist_ok=True)

            # Name the local file after the last path component of the URL.
            filename = os.path.join("images", os.path.basename(url))
            with open(filename, "wb") as image:
                # Let urllib3 undo any transfer encoding (gzip/deflate) while copying.
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, image)
        return response.status_code

def clear_images_directory():
    """Remove the ``images`` directory together with its contents, if present."""
    target = "images"
    if not os.path.exists(target):
        # Nothing to clean up.
        return
    shutil.rmtree(target)

# Start every execution from an empty 'images' directory.
clear_images_directory()

# 'dataframe' holds one image URL per row in its 'image' column; download each
# of them. The resulting Series of HTTP status codes is the cell's output.
dataframe["image"].apply(download_image)

0     200
1     200
2     200
3     200
4     200
     ... 
95    200
96    200
97    200
98    200
99    200
Name: image, Length: 100, dtype: int64

### Enregistrement des métadonnées
#### Import

In [1]:
import os
import json
from PIL import Image, TiffImagePlugin
import PIL.ExifTags
from sklearn.cluster import KMeans
import numpy as np

#### Define the directory path and initialize the list to store metadata:

In [2]:
# Folder the downloaded images are read from, and the list that will collect
# one metadata record per image.
directory_path = "images"
all_metadata = []

if not os.path.isdir(directory_path):
    print(f"Error: Directory '{directory_path}' does not exist!")
    # Equivalent to sys.exit(), but does not depend on `sys`, which the import
    # cell of this section does not import (a NameError would hide the message).
    raise SystemExit

#### Define the function to cast data types

In [3]:
def cast(v):
    """Recursively convert an EXIF value into plain Python data.

    Args:
        v: A value read from an EXIF mapping.

    Returns:
        * ``bytes``       -> ``str`` (undecodable bytes are replaced),
        * ``tuple``       -> ``tuple`` with every item converted,
        * ``list``        -> ``list`` with every item converted,
        * ``dict``        -> a NEW ``dict`` with every value converted
          (the argument itself is left untouched),
        * ``IFDRational`` -> ``float``, or ``None`` when the denominator is 0,
        * anything else is returned unchanged.
    """
    # Built-in types are tested first; the order does not affect the result
    # because the categories are mutually exclusive.
    if isinstance(v, bytes):
        return v.decode(errors="replace")
    if isinstance(v, tuple):
        return tuple(cast(item) for item in v)
    if isinstance(v, list):
        return [cast(item) for item in v]
    if isinstance(v, dict):
        # Build a fresh dict instead of rewriting the caller's mapping in place.
        return {key: cast(value) for key, value in v.items()}
    if isinstance(v, TiffImagePlugin.IFDRational):
        if v.denominator == 0:
            return None  # Handle division by zero gracefully
        return float(v.numerator) / float(v.denominator)
    return v

#### Define the function to get main colors

In [4]:
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans

def get_main_colors(image_path, num_clusters):
    """Return the dominant colours of an image, found with KMeans clustering.

    Args:
        image_path: Either an already opened ``PIL.Image.Image`` (this is what
            ``get_image_metadata`` passes) or a path to an image file.
        num_clusters: Number of clusters to fit, i.e. number of colours returned.

    Returns:
        A list with one integer NumPy array of three values (R, G, B) per
        cluster centre.
    """
    # Normalise to RGB so greyscale, palette and RGBA images also yield an
    # (n_pixels, 3) array; without this KMeans receives 1-D or 4-column data.
    if isinstance(image_path, Image.Image):
        rgb_image = image_path.convert("RGB")
    else:
        with Image.open(image_path) as opened:
            rgb_image = opened.convert("RGB")

    numarray = np.asarray(rgb_image, dtype=np.uint8).reshape(-1, 3)
    clusters = KMeans(n_clusters=num_clusters, n_init=2)
    clusters.fit(numarray)

    # Cluster centres are floats; truncate them to integer channel values.
    return [center.astype(int) for center in clusters.cluster_centers_]


#### Define the function to get the image properties

In [7]:
def get_image_metadata(imgfile, hasExif):
    """Collect format, size, dominant colours and orientation of an image.

    Args:
        imgfile: An opened ``PIL.Image.Image``.
        hasExif: Whether the caller found EXIF data; when true, the EXIF
            Orientation tag is consulted to account for rotated photos.

    Returns:
        A dict with the keys ``format``, ``size``, ``main_colors`` (three
        ``[R, G, B]`` lists of plain ints, so the record can be dumped as
        JSON) and ``orientation`` (``'Landscape'``, ``'Portrait'`` or
        ``'Square'``).
    """
    metadata = {}

    # Get image format
    metadata['format'] = imgfile.format

    # Get image size
    metadata['size'] = imgfile.size

    # Get 3 main colors as plain ints: NumPy arrays are not JSON serialisable.
    metadata['main_colors'] = [
        [int(channel) for channel in color]
        for color in get_main_colors(imgfile, 3)
    ]

    # EXIF tag 274 (Orientation) encodes rotation/mirroring, not aspect:
    # 1 is "as stored" and 3 is "rotated 180 degrees". Only values 5-8 involve
    # a quarter turn and therefore swap the displayed width and height.
    exif_orientation = None
    if hasExif:
        # Public API; returns an empty mapping when the file carries no EXIF.
        exif_orientation = imgfile.getexif().get(274)

    width, height = imgfile.size
    if exif_orientation in (5, 6, 7, 8):
        width, height = height, width

    # The aspect itself comes from the (display) dimensions.
    if width > height:
        metadata['orientation'] = 'Landscape'
    elif width < height:
        metadata['orientation'] = 'Portrait'
    else:
        metadata['orientation'] = 'Square'
    return metadata

#### Loop through files in the directory and extract metadata

In [8]:
import os
import json
import sys
from PIL import Image, TiffImagePlugin
import PIL.ExifTags

def _json_fallback(value):
    """Convert a value ``json`` cannot encode natively into plain Python data."""
    # NumPy arrays and scalars expose tolist(); anything else is stored as text.
    if hasattr(value, "tolist"):
        return value.tolist()
    return str(value)


directory_path = "images"
all_metadata = []

if not os.path.isdir(directory_path):
    print(f"Error: Directory '{directory_path}' does not exist!")
    sys.exit()

for filename in os.listdir(directory_path):
    image_location = os.path.join(directory_path, filename)
    # Only regular files with a known image extension (case-insensitive) are processed.
    if not (
        os.path.isfile(image_location)
        and filename.lower().endswith((".jpg", ".jpeg", ".png"))
    ):
        continue
    print(f"Found image: {filename}")

    try:
        # The context manager closes the file handle once the image is processed.
        with Image.open(image_location) as imgfile:
            # Parse the EXIF block once and reuse it.
            exif_data = imgfile._getexif()

            metadata_dict = {}

            hasExif = bool(exif_data)

            if hasExif:
                for k, v in exif_data.items():
                    # Keep only tags that have a known human-readable name.
                    if k in PIL.ExifTags.TAGS:
                        metadata_dict[PIL.ExifTags.TAGS[k]] = cast(v)
            else:
                print(f"  - No EXIF data found for {filename}")

            # Get additional image metadata (format, size, orientation)
            image_metadata = get_image_metadata(imgfile, hasExif)

        metadata_dict.update(image_metadata)

        all_metadata.append({filename: metadata_dict})
        if not metadata_dict:
            print(f"  - No metadata found for {filename}")

    except Exception as e:
        # Best effort: report the failing file and carry on with the others.
        print(f"  - Error processing {filename}: {e}")

# From here on `directory_path` names the output folder, not the image folder.
directory_path = "output"
output_file = "directory_metadata.json"

os.makedirs(directory_path, exist_ok=True)

output_location = directory_path + "/" + output_file
with open(output_location, "w") as json_file:
    # `default` keeps the dump from aborting on values such as NumPy arrays.
    json.dump(all_metadata, json_file, indent=4, default=_json_fallback)

print(f"Directory metadata saved to {output_location} (if no errors occurred)")


Found image: 19-03-03-Maribor-RalfR-DJI%200444.jpg
Found image: Szczecin%20aerial%203a.jpg
Found image: Omsk%20Collage%202016.png
  - No EXIF data found for Omsk%20Collage%202016.png
Found image: Zagreb%20%2829255640143%29.jpg
Found image: Grenoble%2001.JPG
Found image: Stadtbild%20K%C3%B6ln%20%2850MP%29.jpg


# Tagging
## Main colors

In [2]:

import numpy
from PIL import Image
from sklearn.cluster import KMeans

def get_main_colors(image_path, num_clusters):
    """Return the dominant colours of an image, found with KMeans clustering.

    This redefinition replaces the earlier ``get_main_colors`` in the kernel,
    so it accepts both kinds of argument used in this notebook.

    Args:
        image_path: Path to an image file, or an already opened
            ``PIL.Image.Image``.
        num_clusters: Number of clusters to fit, i.e. number of colours returned.

    Returns:
        A list with one integer NumPy array of three values (R, G, B) per
        cluster centre.
    """
    # Normalise to RGB so greyscale, palette and RGBA images also yield an
    # (n_pixels, 3) array for KMeans.
    if isinstance(image_path, Image.Image):
        rgb_image = image_path.convert("RGB")
    else:
        # The context manager closes the file once the pixels are loaded.
        with Image.open(image_path) as imgfile:
            print(imgfile)
            rgb_image = imgfile.convert("RGB")

    numarray = numpy.asarray(rgb_image, dtype=numpy.uint8).reshape(-1, 3)
    clusters = KMeans(n_clusters=num_clusters, n_init=2)
    clusters.fit(numarray)

    # Cluster centres are floats; truncate them to integer channel values.
    return [center.astype(int) for center in clusters.cluster_centers_]

# Example usage: print the three dominant colours of a sample picture.
example_image = "../../images/flower.jpg"
main_colors = get_main_colors(example_image, 3)
print(main_colors)

<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x480 at 0x7FF3CFC43880>
[array([239, 178,   7]), array([20, 23, 18]), array([63, 79, 61])]
