In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from PIL import Image
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split

import gensim
from gensim import corpora, downloader
from gensim.utils import simple_preprocess

import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from tensorflow.keras.layers import Input, BatchNormalization,concatenate, Dense, Embedding, Flatten, GlobalAveragePooling1D, GlobalAveragePooling2D
from tensorflow.keras.models import Model, Sequential

from tensorflow.keras.applications.xception import Xception,preprocess_input 

from joblib import dump, load

2023-06-12 09:40:29.411916: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
all_data = pd.read_csv("../data/X_train_update.csv", index_col='Unnamed: 0')
target = pd.read_csv("../data/Y_train_CVw08PX.csv", index_col='Unnamed: 0')

In [3]:
all_data.head()

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


In [4]:
label_encoder = preprocessing.LabelEncoder()

target_le = label_encoder.fit_transform(target['prdtypecode'].to_numpy())

In [5]:
sentences = all_data['designation']

# tokenisationx
tokens = [simple_preprocess(sent, min_len = 3) for sent in sentences]

In [6]:
# calcul de la longueur max des séquences de tokens
size_tokens = map(lambda x:len(x),tokens)
max_len = max(size_tokens)

print("taille maximum des sequences de tokens :", max_len )

taille maximum des sequences de tokens : 33


In [7]:
# import de l'embedding préentrainé GLoVe dimension 100
embeddings_index = {}

f = open("../data/glove/glove.6B.100d.txt")

for line in f :
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coeffs 
f.close()

In [8]:
# création du dicitonnaire
dictionary = corpora.Dictionary(tokens)

print("nombre de phrases traitées :", dictionary.num_docs)
print("nombre de mots traités :", dictionary.num_pos)

nombre de phrases traitées : 84916
nombre de mots traités : 696547


In [9]:
# tri du dictionnaire par ordre de fréquence d'appartion décroissante des tokens
sort_dictionary = sorted(dictionary.cfs.items(), key = lambda t : -t[1])
words = [dictionary[t[0]] for t in sort_dictionary]

# création de la matrice d'embedding de dimension 100 pour les "max_words" les plus fréquents
embedding_dim = 100
max_words = len(dictionary)+1 # on prend tout le dicitonnaire ici

embedding_matrix = np.zeros((max_words,embedding_dim))

for  i, word in enumerate(words) :
    if i < max_words :
        embedding_vector = embeddings_index.get(word)
    
    if  embedding_vector is not None :
        embedding_matrix[i] = embedding_vector
        
embedding_matrix.shape

(60867, 100)

In [10]:
# transformation des tokens de chaque ligne en son identifiant unique dans le dicitonnaire
tokens2id = [dictionary.doc2idx(t) for t in tokens]

# padding des séquences d'ID pour qu'ils aient tous une taille de 33 (max_len)
tokens2id_pad = pad_sequences(tokens2id , maxlen =  max_len, padding = 'post')

tokens2id_pad.shape

(84916, 33)

In [11]:
# Ajout d'un colone 'tokens2id' au df des données avant le spilt Train/Test/Valid/
all_data['tokens2id'] = pd.Series(list(tokens2id_pad))

In [12]:
all_data.head()

Unnamed: 0,designation,description,productid,imageid,tokens2id
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,"[3, 2, 6, 4, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,"[19, 14, 9, 8, 28, 20, 25, 8, 10, 21, 18, 12, ..."
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,"[33, 38, 31, 30, 32, 34, 39, 36, 35, 37, 0, 0,..."
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,"[45, 42, 43, 40, 44, 41, 0, 0, 0, 0, 0, 0, 0, ..."
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,"[46, 14, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
#Split Train / Test / Valid (60 / 20 / 20)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(all_data, target_le, test_size=0.2, random_state = 0, stratify = target_le) 
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(X_train_raw, y_train, test_size=0.2, random_state = 0, stratify = y_train) 

In [14]:
print("Taille totale du jeu reduit : {} entrées".format(len(all_data)))
print("Train : {} \nValidation : {} \nTest : {}".format(len(X_train_raw),len(X_valid_raw),len( X_test_raw)))

Taille totale du jeu reduit : 84916 entrées
Train : 54345 
Validation : 13587 
Test : 16984


In [15]:
images_dir = "../data/images/image_train/"

# liste des chemins vers les images
X_train_path = [images_dir + "image_" + str(X_train_raw.iloc[k,3]) + "_product_"+str(X_train_raw.iloc[k,2])+".jpg" for k in range(len(X_train_raw))]
X_valid_path = [images_dir + "image_" + str(X_valid_raw.iloc[k,3]) + "_product_"+str(X_valid_raw.iloc[k,2])+".jpg" for k in range(len(X_valid_raw))]
X_test_path = [images_dir + "image_" + str(X_test_raw.iloc[k,3]) + "_product_"+str(X_test_raw.iloc[k,2])+".jpg" for k in range(len(X_test_raw))]

In [16]:
image_size = 299 # Modèle Xception
batch_size = 32
prefetch_factor = tf.data.experimental.AUTOTUNE 

In [17]:
# fonction pour le chargement des images sans augmentation
def load_image(filepath) :
    # Chargement de l'image du df en mémoire
    im = tf.io.read_file(filepath)
    
    # Décodage de l'info dans un tenseur RGB
    im = tf.image.decode_jpeg(im, 3)
    
    # Retourne l'image à la bonne dimension 
    im = tf.image.resize(im, size=(image_size, image_size))
    
    # Preprocess du modèle
    im = tf.keras.applications.xception.preprocess_input(im)
        
    return im

In [18]:
# fonction pour le chargement des images AVEC augmentation

import random
# Génerateur aléatoire
rng = tf.random.Generator.from_seed(123, alg='philox')

# fonction pour le chargement des images
def load_transform_image(filepath) :
    # Chargement de l'image du df en mémoire
    im = tf.io.read_file(filepath)
    
    # Décodage de l'info dans un tenseur RGB
    im = tf.image.decode_jpeg(im, 3)
    
    # Data Augmentation : ajustement aléatoire du contraste
    contrast_factor = random.random() + 1.0
    im = tf.image.adjust_contrast(im,contrast_factor = contrast_factor)
    
    # Data Augmentation : retournement horizontal aléatoire
    im = tf.image.stateless_random_flip_left_right(im,rng.make_seeds(2)[0])
   
    # Retourne l'image à la bonne dimension 
    im = tf.image.resize(im, size=(image_size, image_size))
    
    # Preprocess du modèle
    im = tf.keras.applications.xception.preprocess_input(im)
        
    return im

2023-06-12 09:40:55.807111: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:
# Données pour la validation au cours de l'entrainement
# définition du dataset
dataset_valid = tf.data.Dataset.from_tensor_slices((X_valid_path, y_valid))

# application de la fonction load_image au dataset
dataset_valid = dataset_valid.map(lambda y, z : [load_image(y),z], num_parallel_calls = -1).cache()

# regroupement en batchs
dataset_valid = dataset_valid.batch(batch_size).prefetch(prefetch_factor)

In [20]:
# Données de test après l'entrainement
# définition du dataset
dataset_test = tf.data.Dataset.from_tensor_slices((X_test_path, y_test))

# application de la fonction load_image au dataset
dataset_test = dataset_test.map(lambda y, z : [load_image(y),z], num_parallel_calls = -1).cache()

# regroupement en batchs
dataset_test = dataset_test.batch(batch_size).prefetch(prefetch_factor)

In [21]:
# Création des données d'entrainement
# définition du dataset
dataset_train = tf.data.Dataset.from_tensor_slices((X_train_path, y_train))

# application de la fonction load_image au dataset
dataset_train = dataset_train.map(lambda y, z : [(load_transform_image(y)),z], num_parallel_calls = -1).cache()

# regroupement en batchs
dataset_train = dataset_train.shuffle(1000).batch(batch_size).prefetch(prefetch_factor)

In [22]:
# Modèle Xception 
xception = Xception(weights='imagenet', 
                    include_top= False,
                    input_shape = (image_size,image_size,3)) 

for layer in xception.layers[:105]:
    layer.trainable = False
    
for layer in xception.layers[105:]:
    layer.trainable = True

In [23]:
# Nombre de classes en sortie
n_class = 27

In [24]:
## Modèle 1 : 1 simple classifieur
model = Sequential()
model.add(xception)
model.add(GlobalAveragePooling2D()) 
model.add(Dense(n_class, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 xception (Functional)       (None, 10, 10, 2048)      20861480  
                                                                 
 global_average_pooling2d (G  (None, 2048)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 27)                55323     
                                                                 
Total params: 20,916,803
Trainable params: 8,457,683
Non-trainable params: 12,459,120
_________________________________________________________________


In [25]:
model.compile(optimizer= 'adam', 
              loss = 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [26]:
from tensorflow.keras import callbacks

# Instanciation du callback 
checkpoint = callbacks.ModelCheckpoint(filepath = "../data/checkpoint_xception_dataset_full",
                                       monitor = 'val_loss',
                                       save_best_only = True,
                                       save_weights_only = True,
                                       mode = 'min',
                                       save_freq = 'epoch')

In [27]:
history = model.fit(dataset_train,
                    epochs = 4,
                    validation_data = dataset_valid, 
                    callbacks = [checkpoint])

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [30]:
model.save('../models/xception_unfreeze105_4ep_dataset_full_data.h5')

In [None]:
dump(model, "../models/xception_unfreeze105_4ep_dataset_full_data.joblib")

In [None]:
# Récupération de l'accuray pour les 3 modèles
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

In [33]:
epochs = range(1, len(acc)+1)

In [None]:
# Visualisation de performances des 3 modèles pendant l'apprentissage
plt.figure(figsize = (12,8))

plt.subplot(121)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Xception unfreeze / 10 epochs")
plt.plot(epochs,acc, "--", label = "Training " )
plt.plot(epochs,val_acc,label = "Validation")

plt.subplot(122)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Xception unfreeze / 10 epochs")
plt.plot(epochs,loss, "--", label = "Training " )
plt.plot(epochs,val_loss,label = "Validation")

plt.legend();

In [22]:
reload_model = load("../models/xception_unfreeze105_4ep_dataset_full_data.joblib")

In [23]:
score = reload_model.evaluate(dataset_test)

print("Accuracy - Modèle Xception Full Data :",score[1])

Accuracy - Modèle Xception Full Data : 0.6200541853904724
