# Projet de traitement de données massives

## Partie 1

### Collecte de données

#### Extraction des informations

Cloner le dataset depuis git


In [72]:
import git
# Check out via HTTPS
#git.Repo.clone_from('https://github.com/Lseig/Images_TDM.git', 'Images_git')


Extraction des informations des images


In [73]:
import PIL.Image
import os
import json
from sklearn.cluster import KMeans
import numpy
from colorthief import ColorThief
import time
import git

from scipy.spatial import KDTree
from webcolors import hex_to_rgb
from webcolors import CSS3_HEX_TO_NAMES


class ImageAnalysis:
    """Extract per-image metadata from a folder and dump it to ``json_data.json``.

    Instantiating the class immediately scans ``directory`` (see
    :meth:`extract_metadata`); the instance itself keeps no state.
    """

    # Pixel-area thresholds used by taille().
    BIG_MIN_AREA = 1200 * 900
    MEDIUM_MIN_AREA = 500 * 500

    # Number of dominant colours recorded for every image.
    PALETTE_SIZE = 5

    # Lazily built (names, KDTree) pair, shared so the tree is built only once.
    _color_index = None

    def __init__(self, directory):
        """Scan ``directory`` and write the resulting metadata file."""
        self.extract_metadata(directory)

    def orientation(self, height, width):
        """Return "Portrait", "Paysage" or "Carre" from the image dimensions."""
        if height > width:
            return "Portrait"
        if width > height:
            return "Paysage"
        return "Carre"

    def taille(self, height, width):
        """Classify the image area (height * width) as "Big", "Medium" or "Small"."""
        area = height * width
        if area >= self.BIG_MIN_AREA:
            return "Big"
        if area >= self.MEDIUM_MIN_AREA:
            return "Medium"
        return "Small"

    def _get_color_index(self):
        """Return the cached (names, KDTree) pair built from the CSS3 colour table."""
        cls = type(self)
        if cls._color_index is None:
            names = []
            rgb_values = []
            for color_hex, color_name in CSS3_HEX_TO_NAMES.items():
                names.append(color_name)
                rgb_values.append(hex_to_rgb(color_hex))
            cls._color_index = (names, KDTree(rgb_values))
        return cls._color_index

    def convert_rgb_to_names(self, rgb_tuple):
        """Return the name of the CSS3 colour nearest to ``rgb_tuple``.

        "Nearest" is whatever ``KDTree.query`` returns for the RGB triple.
        """
        names, tree = self._get_color_index()
        _, index = tree.query(rgb_tuple)
        return names[index]

    def extract_metadata(self, directory_name):
        """Analyse every regular file of ``directory_name`` and write ``json_data.json``.

        For each image the output maps the file name to its height, width,
        orientation, size class, stringified EXIF data and the names of its
        dominant colours. A "Couleur dominante N" entry is ``None`` when the
        palette holds fewer than N colours.

        Raises whatever PIL / ColorThief raise for a file that is not a
        readable image.
        """
        dataDict = {}
        for filename in os.listdir(directory_name):
            path = os.path.join(directory_name, filename)
            # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) are not images.
            if not os.path.isfile(path):
                continue

            # The context manager releases the file handle once we are done.
            with PIL.Image.open(path) as img:
                height, width = img.height, img.width
                # _getexif is a private helper; guard the lookup so image
                # classes that lack it yield None instead of AttributeError.
                exif_reader = getattr(img, "_getexif", None)
                exif_data = exif_reader() if callable(exif_reader) else None

            palette = ColorThief(path).get_palette(color_count=self.PALETTE_SIZE)

            entry = {
                "Height": height,
                "Width": width,
                "Orientation": self.orientation(height, width),
                "Taille": self.taille(height, width),
                "Exif": str(exif_data),
            }
            # NOTE(review): the palette may hold fewer entries than requested
            # (e.g. images with very few distinct colours) - confirm against
            # colorthief; missing ranks are stored as None rather than crashing.
            for rank in range(self.PALETTE_SIZE):
                if rank < len(palette):
                    color_name = self.convert_rgb_to_names(palette[rank])
                else:
                    color_name = None
                entry[f"Couleur dominante {rank + 1}"] = color_name
            dataDict[filename] = entry

        with open('json_data.json', 'w') as outfile:
            json.dump(dataDict, outfile)


In [74]:
def generate_training_file(directory="image_like", output_file="training_file.json"):
    """Ask the user to rate every image of ``directory`` and save the answers.

    Each regular file triggers one prompt. The answer is stripped and
    lower-cased, and the prompt is repeated until it is one of "y" (like),
    "n" (dislike) or "u" (no opinion). Answers are stored lower-case because
    the code that reads this file compares against lower-case "y"/"n",
    whereas the prompt invites upper-case letters.

    Args:
        directory: folder whose files are rated (default "image_like").
        output_file: JSON file to write (default "training_file.json").

    The JSON maps each file name to ``{"like": <answer>}``.
    """
    valid_answers = ("y", "n", "u")
    like_dict = {}

    for filename in os.listdir(directory):
        # Sub-directories (e.g. .ipynb_checkpoints) are not images to rate.
        if not os.path.isfile(os.path.join(directory, filename)):
            continue

        like = ""
        while like not in valid_answers:
            like = input("Entrer Y si vous aimez l'image " + filename +
                         " ou N si vous ne l'aimez pas ou U si vous n'avez pas d'avis : ")
            like = like.strip().lower()
        like_dict[filename] = {"like": like}

    with open(output_file, 'w') as outfile:
        json.dump(like_dict, outfile)


# Interactive step: prompts once per file of the "image_like" folder and
# writes the answers to training_file.json.
generate_training_file()


In [75]:
class LikeAnalysis:
    """Join the user's ratings with the image metadata.

    After construction:
      * ``aDict``   - content of ``json_data.json`` (file name -> metadata),
      * ``like``    - metadata dicts of the images rated "y",
      * ``dislike`` - metadata dicts of the images rated "n".
    """

    def __init__(self, training_file, image_folder):
        """Analyse ``image_folder`` and merge the result with ``training_file``.

        Args:
            training_file: path of the JSON file mapping file names to
                ``{"like": <answer>}``.
            image_folder: folder handed to ``ImageAnalysis``.
        """
        self.training_file = training_file
        # Instantiated only for its side effect: it (re)writes json_data.json,
        # which concatenateLikesAnalysis() reads back below.
        ImageAnalysis(image_folder)
        self.like = []
        self.dislike = []
        self.aDict = {}
        self.image_folder = image_folder
        self.concatenateLikesAnalysis()

    def concatenateLikesAnalysis(self):
        '''
        Merge the training file with the image-analysis file so that the
        user's tastes can be established.

        Ratings are compared case-insensitively ("Y" and "y" both count as a
        like). Images that are unrated, or rated anything other than y/n, are
        left out of both lists. The lists are rebuilt from scratch on every
        call, so calling this method again does not duplicate entries.
        '''
        with open('json_data.json', 'r') as analysis_file:
            self.aDict = json.load(analysis_file)

        with open(self.training_file) as training_fileA:
            likeDict = json.load(training_fileA)

        liked = []
        disliked = []
        for filename, metadata in self.aDict.items():
            rating = likeDict.get(filename)
            if rating is None:
                # Image analysed but never rated.
                continue
            answer = str(rating.get("like", "")).strip().lower()
            if answer == "y":
                liked.append(metadata)
            elif answer == "n":
                disliked.append(metadata)

        self.like = liked
        self.dislike = disliked

# Constructing LikeAnalysis first runs ImageAnalysis on "image_like" (which
# rewrites json_data.json), then loads that file together with
# training_file.json to fill the like / dislike lists.
AnalysisResult = LikeAnalysis("training_file.json", "image_like")

In [79]:
class suggestionAlgorythm:
    """Build a preference profile from a like/dislike analysis.

    ``likeProfile`` is filled by :meth:`calculateSizeLike`.
    """

    def __init__(self, AnalysisResult):
        """Copy the like/dislike lists, image folder and metadata dict
        from ``AnalysisResult``."""
        self.like = AnalysisResult.like
        self.dislike = AnalysisResult.dislike
        self.image_folder = AnalysisResult.image_folder
        self.aDict = AnalysisResult.aDict
        self.likeProfile = {}

    def calculateSizeLike(self):
        """Score each size class and store the result in ``likeProfile``.

        For each of "Big", "Medium" and "Small" the score is
        (liked images of that size - disliked images of that size) divided by
        the total number of liked and disliked images. When there are no
        rated images at all, every score is 0.0 instead of raising
        ZeroDivisionError.
        """
        sizes = ("Big", "Medium", "Small")
        balance = dict.fromkeys(sizes, 0)

        # +1 for every liked image, -1 for every disliked one; any other
        # "Taille" value is ignored.
        for images, weight in ((self.like, 1), (self.dislike, -1)):
            for image in images:
                size = image["Taille"]
                if size in balance:
                    balance[size] += weight

        rated_count = len(self.like) + len(self.dislike)
        for size in sizes:
            self.likeProfile[size] = balance[size] / rated_count if rated_count else 0.0

    def calculateLikeProbability(self):
        """Placeholder: no probability model is implemented yet, returns None."""
        probability = None
        return probability

    def suggestImage(self):
        """Return True when the like probability exceeds 0.9.

        Returns False while calculateLikeProbability() yields None, because
        comparing None with a number would raise TypeError.
        """
        probability = self.calculateLikeProbability()
        if probability is None:
            return False
        return probability > 0.9


# Build the size-preference profile from the likes / dislikes held by
# AnalysisResult and show it.
t = suggestionAlgorythm(AnalysisResult)
t.calculateSizeLike()
# likeProfile maps "Big" / "Medium" / "Small" to their scores.
print(t.likeProfile)


{'Big': -0.2, 'Medium': 0.0, 'Small': 0.0}
