In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
from glob import glob
import io
import re
import pickle

# Import image processing/reading libraries
import cv2
import matplotlib.pyplot as plt
from skimage import transform
import albumentations as A



# CNN libraries
# example of tending the vgg16 model
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.applications.vgg19 import preprocess_input
from tensorflow.keras.preprocessing import image
import os
from keras.layers import Flatten

# Import NLP libraries
from sentence_transformers import SentenceTransformer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer


# Other libraries
from IPython.display import clear_output

## BOW - TF-IDF

In [8]:
description=pd.read_csv("../data/Dataset+projet+prétraitement+textes+images/Flipkart/flipkart_com-ecommerce_sample_1050.csv")

## Preprocessing du text avec transformation en minuscule, eliminiation de stopwords
def text_processing(x):
    x=x.lower()
    pattern = r'[0-9]'
    x = re.sub(pattern, '', x)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(x)
    word_tokens = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
            
    return " ".join(filtered_sentence)

description["processed_description"]=description["description"].apply(lambda x: text_processing(x))

In [19]:
# Application de TF-IDF
transformer = TfidfVectorizer(max_df=0.5,min_df=0.001)
vectors = transformer.fit_transform(description["processed_description"])
feature_names = transformer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

In [21]:
# Sauvegarde du fichier vectoriser sous forme serialisee
with open("tfidf_vectorizer.pkl","wb") as f:
    pickle.dump(df,f)

In [22]:
df

Unnamed: 0,aa,aapno,able,abode,absorbency,absorbent,abstract,abstracts,ac,accent,...,ym,york,young,youth,youthful,yuva,zero,zipper,zone,zyxel
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.183971,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.076418,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1046,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1047,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1048,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Sentence vectorizer

In [39]:
# Vectoriser avec un modele BERT
description=pd.read_csv("../data/Dataset+projet+prétraitement+textes+images/Flipkart/flipkart_com-ecommerce_sample_1050.csv")
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')
test=description["description"].iloc[0]
sentence_embeddings = vectorizer.encode(test)
sentence_embeddings = sentence_embeddings.reshape(1,sentence_embeddings.shape[0])
vectorized_description=[]
for a in description["description"]:
    sentence_embeddings = vectorizer.encode(a)
    sentence_embeddings = sentence_embeddings.reshape(1,sentence_embeddings.shape[0])
    vectorized_description.append(sentence_embeddings.reshape(384))

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

## CNN Vectorizer

In [43]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Creation du modele  cnn VGG16 en modifiant l'input
model = VGG16(weights='imagenet', include_top=False,input_shape=(258,477,3))
flat1 = Flatten()(model.layers[-1].output)
output = Dense(500, activation='softmax')(flat1)
model=Model(inputs=model.inputs,outputs=output)


In [44]:
img=cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/7b72c92c2f6c40268628ec5f14c6d590.jpg")

In [45]:
# Format de l'image entrante
scale_percent = 20 # percent of original size
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
  
# resize image
resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
resized.shape

(258, 477, 3)

In [57]:
img=cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/7b72c92c2f6c40268628ec5f14c6d590.jpg")
scale_percent = 20 # percent of original size
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
cnn_features1=[]
i=0
# Extraction des features des images originales
for a in description["image"]:
    print(i)
    
    img = cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/"+a)
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    x = image.img_to_array(resized)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = model.predict(x)
    cnn_features1.append(features.reshape(500))
    i=i+1
    clear_output(wait=True)
    
features1=pd.concat([pd.DataFrame(cnn_features1),pd.DataFrame(vectorized_description)],axis=1,ignore_index=True)

1049


In [58]:
cnn_features2=[]
i=0
# Extraction des features des images data1
for a in description["image"]:
    print(i)
    img = cv2.imread("../data/data_pretraitement/data1/"+a)
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    x = image.img_to_array(resized)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = model.predict(x)
    cnn_features2.append(features.reshape(500))
    i=i+1
    clear_output(wait=True)
    
features2=pd.concat([pd.DataFrame(cnn_features2),pd.DataFrame(vectorized_description)],axis=1,ignore_index=True)

1049


In [59]:
cnn_features3=[]
i=0
# Extraction des features des images data2
for a in description["image"]:
    print(i)
    img = cv2.imread("../data/data_pretraitement/data2/"+a)
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    x = image.img_to_array(resized)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = model.predict(x)
    cnn_features3.append(features.reshape(500))
    i=i+1
    clear_output(wait=True)


features3=pd.concat([pd.DataFrame(cnn_features3),pd.DataFrame(vectorized_description)],axis=1,ignore_index=True)

1049


In [60]:
cnn_features4=[]
i=0
# Extraction des features des images data3
for a in description["image"]:
    print(i)
    img = cv2.imread("../data/data_pretraitement/data3/"+a)
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    x = image.img_to_array(resized)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = model.predict(x)
    cnn_features4.append(features.reshape(500))
    i=i+1
    clear_output(wait=True)


features4=pd.concat([pd.DataFrame(cnn_features4),pd.DataFrame(vectorized_description)],axis=1,ignore_index=True)

1049


In [61]:
# Sauvegarde des features sous forme serialisee 
with open("vectorized_cnn.pkl","wb") as f:
    pickle.dump([features1,features2,features3,features4],f)

## SIFT

In [24]:
img=cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/7b72c92c2f6c40268628ec5f14c6d590.jpg")
scale_percent = 20 # percent of original size
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
gray_scale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

In [25]:
# Fonction extrayant un dico de features par image
def step1(description,path):
    #sift_features=[]
    i=0
    dico=[]
    for a in description["image"]:
        print(i)
        img = cv2.imread(path+a)
        width=img.shape[1]
        height=img.shape[0]
        if width > 500 or height > 500:
            scale_percent = 20 # percent of original size
            width = int(img.shape[1] * scale_percent / 100)
            height = int(img.shape[0] * scale_percent / 100)
            dim = (width, height)
            resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
        else :
            resized = img
        #gray_scale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        sift = cv2.xfeatures2d.SIFT_create()
        kp, des = sift.detectAndCompute(resized, None)
        #sift_features.append(des)
        if des is not None:
            for d in des:
                dico.append(d)
        i=i+1
        clear_output(wait=True)
    return dico

In [17]:
# Code de la fonction step1 sans fonction
sift_features1=[]
i=0
dico=[]
for a in description["image"]:
    print(i)
    
    img = cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/"+a)
    width=img.shape[1]
    height=img.shape[0]
    if width > 500 or height > 500:
        scale_percent = 20 # percent of original size
        width = int(img.shape[1] * scale_percent / 100)
        height = int(img.shape[0] * scale_percent / 100)
        dim = (width, height)
        resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    else :
        resized = img
    #gray_scale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    sift = cv2.xfeatures2d.SIFT_create()
    kp, des = sift.detectAndCompute(resized, None)
    sift_features1.append(des)
    if des is not None:
        for d in des:
            dico.append(d)
    i=i+1
    clear_output(wait=True)

NameError: name 'MiniBatchKMeans' is not defined

In [26]:
# Fonction creant et fittant un algorithme kmeans sur un dictionnaire
def step2(k,batch,dico):
    kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch).fit(dico)
    return kmeans

In [27]:
# Fonction creant une liste de features
def step3(description,path,kmeans):
    histo_list = []
    i=0
    for a in description["image"]:
        print(i)

        img = cv2.imread(path+a)
        print(path+a)
        width=img.shape[1]
        height=img.shape[0]
        if width > 500 or height > 500:
            scale_percent = 20 # percent of original size
            width = int(img.shape[1] * scale_percent / 100)
            height = int(img.shape[0] * scale_percent / 100)
            dim = (width, height)
            resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
        else :
            resized = img
        sift = cv2.xfeatures2d.SIFT_create()

        kp, des = sift.detectAndCompute(resized, None)  
        histo = np.zeros(620)
        nkp = np.size(kp)

        if des is not None:
            for d in des:
                idx = kmeans.predict([d])
                histo[idx] += 1/nkp 
        i=i+1
        clear_output(wait=True)
        histo_list.append(histo)
    return histo_list

In [21]:
# Code de la fonction step3 sans fonction
histo_list = []
i=0
for a in description["image"]:
    print(i)
    
    img = cv2.imread("../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/"+a)
    width=img.shape[1]
    height=img.shape[0]
    if width > 500 or height > 500:
        scale_percent = 20 # percent of original size
        width = int(img.shape[1] * scale_percent / 100)
        height = int(img.shape[0] * scale_percent / 100)
        dim = (width, height)
        resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    else :
        resized = img
    sift = cv2.xfeatures2d.SIFT_create()
    
    kp, des = sift.detectAndCompute(resized, None)  
    histo = np.zeros(620)
    nkp = np.size(kp)

    if des is not None:
        for d in des:
            idx = kmeans.predict([d])
            histo[idx] += 1/nkp 
    i=i+1
    clear_output(wait=True)
    histo_list.append(histo)

1049


In [29]:
# Creation des features visuelles et textuelles pour les images originales
dico = step1(description,"../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/")
kmeans = step2(620,100,dico)
histo_list = step3(description,"../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/",kmeans)
siftfeatures1=pd.concat([pd.DataFrame(histo_list),df],ignore_index=True,axis=1)

1049
../data/Dataset+projet+prétraitement+textes+images/Flipkart/Images/f2f027ad6a6df617c9f125173da71e44.jpg


In [30]:
# Creation des features visuelles et textuelles pour les images data1
dico = step1(description,"../data/data_pretraitement/data1/")
kmeans = step2(620,100,dico)
histo_list = step3(description,"../data/data_pretraitement/data1/",kmeans)
siftfeatures2=pd.concat([pd.DataFrame(histo_list),df],ignore_index=True,axis=1)

1049
../data/data_pretraitement/data1/f2f027ad6a6df617c9f125173da71e44.jpg


In [31]:
# Creation des features visuelles et textuelles pour les images data2
dico = step1(description,"../data/data_pretraitement/data2/")
kmeans = step2(620,100,dico)
histo_list = step3(description,"../data/data_pretraitement/data2/",kmeans)
siftfeatures3=pd.concat([pd.DataFrame(histo_list),df],ignore_index=True,axis=1)

1049
../data/data_pretraitement/data2/f2f027ad6a6df617c9f125173da71e44.jpg


In [32]:
# Creation des features visuelles et textuelles pour les images data3
dico = step1(description,"../data/data_pretraitement/data3/")
kmeans = step2(620,100,dico)
histo_list = step3(description,"../data/data_pretraitement/data3/",kmeans)
siftfeatures4=pd.concat([pd.DataFrame(histo_list),df],ignore_index=True,axis=1)

1049
../data/data_pretraitement/data3/f2f027ad6a6df617c9f125173da71e44.jpg


In [33]:
# Sauvegarde du fichier sous format serialise
with open("vectorized_sift.pkl","wb") as f:
    pickle.dump([siftfeatures1,siftfeatures2,siftfeatures3,siftfeatures4],f)