Projet
===

- Johan PLANCHON
- Julien CAPOSIENA

## 1. Collecte de données

1) Créer un dossier appelé `images`

In [51]:
import os

if not os.path.exists('./images'):
    os.mkdir('./images')

2) Télécharger les images sous licence ouverte dans le dossier images (minimum 100 images).

In [None]:
!pip install sparqlwrapper

On va récupérer 100 langages de programmation et leur logo

In [121]:
import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """
SELECT DISTINCT ?catBreed ?catBreedLabel ?image {
    ?catBreed wdt:P31 wd:Q43577;
        wdt:P18 ?image.
    SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,en". }
}
LIMIT 100
"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (
        sys.version_info[0],
        sys.version_info[1],
    )
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


array = []
results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    array.append(
        (
            result["catBreedLabel"]["value"],
            result["image"]["value"],
        )
    )

In [122]:
dataframe = pd.DataFrame(array, columns=["catBreed", "image"])
dataframe = dataframe.astype(
    dtype={"catBreed": "<U200", "image": "<U200"}
)
dataframe

Unnamed: 0,catBreed,image
0,Modern Siamese,http://commons.wikimedia.org/wiki/Special:File...
1,Lykoi,http://commons.wikimedia.org/wiki/Special:File...
2,Burmese,http://commons.wikimedia.org/wiki/Special:File...
3,Balinais,http://commons.wikimedia.org/wiki/Special:File...
4,Balinais,http://commons.wikimedia.org/wiki/Special:File...
...,...,...
92,Mau arabe,http://commons.wikimedia.org/wiki/Special:File...
93,European shorthair,http://commons.wikimedia.org/wiki/Special:File...
94,Burmilla,http://commons.wikimedia.org/wiki/Special:File...
95,Oriental shorthair,http://commons.wikimedia.org/wiki/Special:File...


On télécharge les images dans le dossier `images`

In [125]:
import requests
import shutil

def download_image(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    request = requests.get(url, allow_redirects=True, headers=headers, stream=True)
    if request.status_code == 200:
        with open('images\\' + os.path.basename(url), "wb") as image:
            request.raw.decode_content = True
            shutil.copyfileobj(request.raw, image)
    return request.status_code

In [126]:
dataframe.image.apply(download_image)

0     200
1     200
2     200
3     200
4     200
     ... 
92    200
93    200
94    200
95    200
96    200
Name: image, Length: 97, dtype: int64

3) On enregistre les métadonnées de chaque image comme la taille de l'image, le format de l'image (.jpeg, .png, etc.), l'orientation de l'image (paysage, portrait, carré, etc.), date de création, modèle d'appareil photo, etc. dans un ou plusieurs fichiers JSON. Vous pouvez utiliser les informations Exif présentes dans les fichiers d'images.

In [425]:
import mimetypes as mt
import pandas as pd
from PIL import Image
from PIL import TiffImagePlugin
from PIL.ExifTags import TAGS

def get_metadata():
    metadata = dict()
    headers = ["Extension"]

    for filename in os.listdir('.\\images'):
        if filename.endswith('.svg'):
            continue

        image = Image.open('.\\images\\' + filename)
        exif_data = image.getexif()

        exif = {}
        exif["Extension"] = mt.guess_type('.\\images\\' + filename)[0]

        if exif_data:  # s'il existe des informations EXIF
            for tag, value in exif_data.items():
                if tag in TAGS:
                    if isinstance(value, TiffImagePlugin.IFDRational):
                        exif[TAGS[tag]] = float(value)
                    elif isinstance(value, tuple):
                        exif[TAGS[tag]] = tuple(float(t) if isinstance(t, TiffImagePlugin.IFDRational) else t for t in value)
                    elif isinstance(value, bytes):
                        exif[TAGS[tag]] = value.decode(errors="replace")
                    else:
                        exif[TAGS[tag]] = value

                    if TAGS[tag] not in headers:
                        headers.append(TAGS[tag])

        metadata[filename] = exif

    return metadata, headers


metadata, headers = get_metadata()

framed_metadata = pd.DataFrame.from_dict(metadata, orient='index', columns=headers)
for data in framed_metadata:
    for header in headers:
        framed_metadata.loc[framed_metadata[header].isna(), [header]] = "No " + header

framed_metadata

%D0%A4%D0%BE%D1%82%D0%BE%20%D0%BA%D1%83%D0%BD%D0%BE%D0%B2.jpg    image/jpeg
0612%20chartreux%20vitscha%20orsbleus.jpg                        image/jpeg
8-month-old%20male%20Lykoi.jpg                                   image/jpeg
Abessinierkater1.jpg                                             image/jpeg
Abyssinian%20cat.png                                              image/png
                                                                    ...    
Tonkinese.gif                                                     image/gif
Toyger%20male%20queenanne.JPG                                    image/jpeg
Turkse%20angora.jpg                                              image/jpeg
Tuxedo%20longhair%20cat%20-%20Spanky.jpg                         image/jpeg
Van%20Cat%201%202015.JPG                                         image/jpeg
Name: Extension, Length: 97, dtype: object
%D0%A4%D0%BE%D1%82%D0%BE%20%D0%BA%D1%83%D0%BD%D0%BE%D0%B2.jpg    2.0
0612%20chartreux%20vitscha%20orsbleus.jpg           

Unnamed: 0,Extension,ResolutionUnit,ExifOffset,Make,Model,Software,Orientation,DateTime,YCbCrPositioning,XResolution,...,InterColorProfile,XPKeywords,YCbCrCoefficients,WhitePoint,PrimaryChromaticities,ExposureTime,SubjectDistanceRange,XMLPacket,ImageResources,NewSubfileType
%D0%A4%D0%BE%D1%82%D0%BE%20%D0%BA%D1%83%D0%BD%D0%BE%D0%B2.jpg,image/jpeg,2.0,220.0,Canon,Canon EOS 10D,Adobe Photoshop CS3 Windows,1.0,2007:11:23 12:35:11,1.0,180.0,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
0612%20chartreux%20vitscha%20orsbleus.jpg,image/jpeg,No ResolutionUnit,No ExifOffset,No Make,No Model,No Software,No Orientation,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
8-month-old%20male%20Lykoi.jpg,image/jpeg,No ResolutionUnit,38.0,No Make,No Model,No Software,1.0,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
Abessinierkater1.jpg,image/jpeg,No ResolutionUnit,No ExifOffset,No Make,No Model,No Software,No Orientation,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
Abyssinian%20cat.png,image/png,No ResolutionUnit,No ExifOffset,No Make,No Model,No Software,No Orientation,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Tonkinese.gif,image/gif,No ResolutionUnit,No ExifOffset,No Make,No Model,No Software,No Orientation,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
Toyger%20male%20queenanne.JPG,image/jpeg,2.0,260.0,Canon,Canon EOS 550D,Paint Shop Pro Photo 12.00��������������,1.0,2012:03:15 12:41:06�,2.0,72.0,...,No InterColorProfile,l�u�c�k�y���,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
Turkse%20angora.jpg,image/jpeg,2.0,288.0,"CASIO COMPUTER CO.,LTD",EX-Z750,Adobe Photoshop CS4 Windows,1.0,2009:09:14 22:54:16,1.0,72.0,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType
Tuxedo%20longhair%20cat%20-%20Spanky.jpg,image/jpeg,No ResolutionUnit,No ExifOffset,No Make,No Model,No Software,No Orientation,No DateTime,No YCbCrPositioning,No XResolution,...,No InterColorProfile,No XPKeywords,No YCbCrCoefficients,No WhitePoint,No PrimaryChromaticities,No ExposureTime,No SubjectDistanceRange,No XMLPacket,No ImageResources,No NewSubfileType


In [30]:
with open('metadata.json', 'w') as f:
    f.write(framed_metadata.to_json(orient='index'))

## Étiquetage et annotation

Pour cette tâche, vous devez rechercher les sources disposant d'informations supplémentaires comme les balises, les catégories, etc.
Premièrement on créé une fonction pour transformer un tuple RGB en couleur humaine (ex (255, 0, 0) -> Red)

In [31]:
from scipy.spatial import KDTree
from webcolors import (
    CSS3_HEX_TO_NAMES,
    hex_to_rgb,
)

def convert_rgb_to_names(rgb_tuple):
    css3_db = CSS3_HEX_TO_NAMES
    names = []
    rgb_values = []
    for color_hex, color_name in css3_db.items():
        names.append(color_name)
        rgb_values.append(hex_to_rgb(color_hex))

    kdt_db = KDTree(rgb_values)
    distance, index = kdt_db.query(rgb_tuple)
    return names[index]

Ensuite on créé une fonction pour récupérer les 2 couleurs les plus présentes dans l'image passée en paramètres

In [32]:
from PIL import Image
import numpy as np
import math
from sklearn.cluster import KMeans

def get_most_seen_colors(file_path):
    img_file = Image.open(file_path).convert("RGBA")

    nb_colors = 2
    clusters = KMeans(n_clusters=nb_colors, n_init=2)

    numarray = np.array(img_file.getdata(), np.uint8)
    clusters.fit(numarray)

    np.arange(0, nb_colors + 1)

    most_seen_colors = []
    for i in range(nb_colors):
        most_seen_colors.append(
            (
                math.ceil(clusters.cluster_centers_[i][0]),
                math.ceil(clusters.cluster_centers_[i][1]),
                math.ceil(clusters.cluster_centers_[i][2]),
            )
        )

    return most_seen_colors

On ajoute les 2 couleurs les plus présentes dans chaque image en tant que tag dans le fichier `metadata.json`

In [33]:
import json

with open('metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

    for image in metadata:
        most_seen_colors = get_most_seen_colors('images/' + image)

        for index, color in enumerate(most_seen_colors):
            metadata[image]["Color" + str(index + 1)] = convert_rgb_to_names(color)

In [34]:
with open('metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)

On ajoute un tag à toutes les images avec l'input utilisateur

In [35]:
import json

with open('metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

    tag = input('Ajouter un tag aux images: ')

    for image in metadata:
        metadata[image][tag] = False

with open('metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file)
    metadata_file.close()

## Analyses de données

On attribue `y` images likées à `i` utilisateurs (sans doublon)

In [365]:
seed = 1312

In [3]:
import random, os

nb_users = 5
nb_images_likes = 20
images = os.listdir('.\\images')
nb_files = len(images)
users = []

for i in range(nb_users):
    users.append({ 'liked_images': [] })

    for y in range(nb_images_likes):
        liked_image = random.randrange(0, nb_files)

        if liked_image not in users[i]:
            users[i]['liked_images'].append(images[liked_image])

users

[{'liked_images': ['Chairman%20Meow%20Bao.jpg',
   'British%20Longhair%20-%20Blue%20Bicolor.jpg',
   'SOMns.jpg',
   'High%20mitted.jpg',
   'Serengetimalecat.jpg',
   'Cat%200063.JPG',
   'Tuxedo%20longhair%20cat%20-%20Spanky.jpg',
   'KOR%201149.jpg',
   'Chairman%20Meow%20Bao.jpg',
   'Cream%20tabby%20exotic%20cat.jpg',
   'Gustav%20chocolate.jpg',
   'Minskin%20Kitten%20Female%20blue%20tabby%20color-pattern.jpg',
   'Serengetimalecat.jpg',
   'Shadow%20the%20Bombay%20Cat.jpg',
   'Javanese%20cat.jpg',
   'Orientalshorthairs2.jpg',
   'Maine%20Coon%20kittens%20NO%20Sigdalskauen.jpg',
   'Toyger%20male%20queenanne.JPG',
   'JapaneseBobtailBlueEyedMi-ke.JPG',
   '%D0%A4%D0%BE%D1%82%D0%BE%20%D0%BA%D1%83%D0%BD%D0%BE%D0%B2.jpg']},
 {'liked_images': ['European-cat%20show%20Helsinki%202008.JPG',
   'SOMns.jpg',
   'Lazuli01.jpg',
   'British%20Longhair%20-%20Blue%20Bicolor.jpg',
   'Sokoke%20dalili.jpg',
   'Mainecoonbandit.jpg',
   'Seal%20Point%20Siamese%20Kitten.JPG',
   'Bluebell%202an

Ensuite on test des images pour savoir si nos 5 utilisateurs vont aimer les images.
On a dû ajuster les données car si les champs `XResolution` et `YResolution` étaient vide ils contenaient `"No XResoolution"` au lieu de `0.0`

In [415]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn import tree
import pandas as pd
import math

def will_user_like_given_image(user, image_to_test):
    images = []
    result = []
    for image in metadata:
        if image in user['liked_images']:
            result.append("Favorite")
        else:
            result.append("Not favorite")
        images.append(metadata[image])

    dataframe = pd.DataFrame(images, columns=["Color1", "Color2", "XResolution", "YResolution", "DateTime"])
    resultframe = pd.DataFrame(result, columns=["Favorite"])

    le1 = LabelEncoder()
    dataframe["Color1"] = le1.fit_transform(dataframe["Color1"])

    le2 = LabelEncoder()
    dataframe["Color2"] = le2.fit_transform(dataframe["Color2"])

    le3 = LabelEncoder()
    dataframe["XResolution"] = le3.fit_transform(dataframe["XResolution"])

    le4 = LabelEncoder()
    dataframe["YResolution"] = le4.fit_transform(dataframe["YResolution"])

    le5 = LabelEncoder()
    dataframe["DateTime"] = le5.fit_transform(dataframe["DateTime"])


    le6 = LabelEncoder()
    resultframe["Favorite"] = le6.fit_transform(resultframe["Favorite"])

    dtc = tree.DecisionTreeClassifier(random_state=seed)
    dtc = dtc.fit(dataframe.values, resultframe)

    # prediction
    prediction = dtc.predict(
        [
            [
                le1.transform([image_to_test["Color1"]])[0],
                le2.transform([image_to_test["Color2"]])[0],
                le3.transform([image_to_test["XResolution"]])[0],
                le4.transform([image_to_test["YResolution"]])[0],
                le5.transform([image_to_test["DateTime"]])[0],
            ]
        ]
    )

    print(le6.inverse_transform(prediction)[0])
    print(dtc.feature_importances_)

In [429]:
import json

image_to_test = metadata["Bluebell%202ans.jpg"]
metadata = json.load(open('metadata.json'))

for user in users:
    will_user_like_given_image(user, image_to_test)

Not favorite
[0.38583515 0.33717865 0.02369489 0.         0.25329131]
Favorite
[0.42754203 0.26203128 0.13328265 0.01765562 0.15948841]
Not favorite
[0.2607727  0.28454018 0.05075485 0.11026867 0.29366361]
Not favorite
[0.30134584 0.23112235 0.01100448 0.08054833 0.37597901]
Favorite
[0.31665089 0.20436066 0.14383319 0.07488117 0.26027409]
