<h2>Module 3 (Generate embeddings of images)</h2

this module will use some pre-trained and get embeddings from their last layer

In [4]:
import tensorflow as tf
import datetime as dt
import cv2
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import json
import os

In [2]:
#loading the pre-trained DenseNet121 model 
tf.keras.backend.clear_session()
model_embedding = tf.keras.applications.DenseNet121(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=(520,520,3),
    pooling=None,
)
def load_img(path):

  img = cv2.imread(path,cv2.IMREAD_UNCHANGED) 
  img = cv2.resize(img,(520,520),interpolation=cv2.INTER_AREA)
  img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
  img = tf.image.convert_image_dtype(img,tf.float32)[tf.newaxis, ...] 
  return img 

#function to get the embedding
def get_embeddings(path):
  img = load_img(path)
  op = model_embedding.predict(img,steps=1)
  op = tf.squeeze(op, axis=None, name=None)
  op = tf.reduce_mean(op, axis=(0,1), keepdims=False, name=None).numpy()
  #op = op.eval(session=tf.Session()) 
  op = op.tolist()
  return op

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
start_time = dt.datetime.now()
em1 = get_embeddings('/content/image1000_0.jpg')
print("Time taken to generate embeddings ",dt.datetime.now()-start_time)

Time taken to generate embeddings  0:00:32.519287


<h3>Module 4 (Store embeddings)</h3>

this module will store all our embeddings in Elastic Search and Faiss so can easily retrive them using similarity logic

In [5]:
!pip install elasticsearch
!pip install -q kaggle

Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/09/93/461a042becf2a35a666fb7dbb2fa31f0f766dfd1b01e7d971f4ad51f0d69/elasticsearch-7.12.0-py2.py3-none-any.whl (334kB)
[K     |█                               | 10kB 23.4MB/s eta 0:00:01[K     |██                              | 20kB 14.2MB/s eta 0:00:01[K     |███                             | 30kB 13.2MB/s eta 0:00:01[K     |████                            | 40kB 12.4MB/s eta 0:00:01[K     |█████                           | 51kB 8.6MB/s eta 0:00:01[K     |█████▉                          | 61kB 9.1MB/s eta 0:00:01[K     |██████▉                         | 71kB 9.2MB/s eta 0:00:01[K     |███████▉                        | 81kB 9.4MB/s eta 0:00:01[K     |████████▉                       | 92kB 9.6MB/s eta 0:00:01[K     |█████████▉                      | 102kB 8.3MB/s eta 0:00:01[K     |██████████▊                     | 112kB 8.3MB/s eta 0:00:01[K     |███████████▊                    | 

In [3]:
#downloading the required files
# !gdown --id 1iVn4WhmZoq7lWgB1cPE6F8LzkQbf1dQ8
!gdown --id 1wDMBZIABSRsdiLYQ5Y9uBzh41a5nX2qt
# !gdown --id 1CBDJaBAq7HDM-Qkp91oVus4cSS3EIkMn

Downloading...
From: https://drive.google.com/uc?id=1wDMBZIABSRsdiLYQ5Y9uBzh41a5nX2qt
To: /content/Final.csv
1.67GB [00:15, 109MB/s]


In [10]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
from elasticsearch import Elasticsearch , helpers

In [12]:
#downloaded the dataset from kaggle
!kaggle datasets download -d shreyas90999/mycasestudy02ee

Downloading mycasestudy02ee.zip to /content
100% 11.1G/11.1G [04:26<00:00, 48.4MB/s]
100% 11.1G/11.1G [04:26<00:00, 44.6MB/s]


In [13]:
!mkdir data
!unzip -q '/content/mycasestudy02ee.zip' -d '/content/data'

In [14]:
!rm -rf '/content/mycasestudy02ee.zip'

In [15]:
#get data from json file
folder = []
json_file =[]
for (root,dirs,files) in tqdm(os.walk('/content/data/', topdown=False)):
  for directory in dirs:
    for i in os.listdir(root+directory):
      if i.endswith('.json'):
        folder.append(directory)
        json_file.append(i)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [16]:
#creating a csv and list by grouping categories into the main categories with the similar type of products 
super_cat = []
cat = []
info = []
file_name = []
url =[]
for i in range(len(json_file)):
  f = open('/content/data/' + folder[i] + '/' + json_file[i] )
  data = json.load(f)
  for j in data['name']:
    if folder[i]=='women_boots' or folder[i]=='women_casual_shoes' or folder[i]=='women_flats' or folder[i]=='women_heels':
       super_cat.append('foot_ware')
       cat.append(folder[i])
       info.append(json_file[i])
       file_name.append(j)
       url.append(data['url'])
    if folder[i]=='women_shirts_tops_tees':
      super_cat.append('upper_ware')
      cat.append(folder[i])
      info.append(json_file[i])
      file_name.append(j)
      url.append(data['url'])
    if folder[i]=='women_jeans_jeggings' or folder[i]=='women_shorts_skirts' or folder[i]=='women_trousers':
      super_cat.append('lower_ware')
      cat.append(folder[i])
      info.append(json_file[i])
      file_name.append(j)
      url.append(data['url'])
  f.close()

In [17]:
#creating a dataframe with list of categories
df = pd.DataFrame({ 'id':np.arange(len(file_name)),
                    'super_cat':super_cat,
                   'cat':cat,
                   'info':info,
                   'file_name':file_name,
                   'url':url})

In [21]:
df.shape

(75298, 6)

In [93]:
#initializing the elasticsearch object 
es = Elasticsearch(
    cloud_id='i-o-optimized-deployment:dXMtd2VzdDEuZ2NwLmNsb3VkLmVzLmlvJGE2NTI0MTAyMDY3YzQ2ZjZiZjhlZGVhMjE4OTI0YWI2JGNjYjdhMDkxOTEwMjQ1MGZiZjE0YjM5ZTI5ODlhZjQx',
    http_auth=('elastic', 'x9CH4HKxVjMX18SSPuSVDVkb'),
)

In [None]:
#creating the schema in elasticsearch
create_query = {
    "mappings": {
        "properties": {
            "description_vector": {
                "type": "dense_vector",
                "dims": 1024
            }
        }
    }
}
#create index
es.indices.create(index="upper_ware", body=create_query)
es.indices.create(index="foot_ware", body=create_query)
es.indices.create(index="lower_ware", body=create_query)

In [43]:
#genrate docs and embedding to transfer to Elastic search
docs = []
emd = []
count =0
for row in tqdm(df.iterrows()):
  json_obj = {}
  path = root + row[1]['cat']+'/' + row[1]['file_name']
  em = get_embeddings(path)
  json_obj['_index']=row[1]['super_cat']
  json_obj['_id']=row[1]['id']
  json_obj['url'] = row[1]['url']
  json_obj['cat'] = row[1]['cat']
  json_obj['file_name'] = row[1]['file_name']
  json_obj['description_vector'] = em
  emd.append(em)
  docs.append(json_obj)
  

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [44]:
#adding the embedding into the dataframe
df['description_vector']=emd

In [45]:
df.head()

Unnamed: 0,id,super_cat,cat,info,file_name,url,description_vector
0,0,lower_ware,women_trousers,image1014.json,image1014_0.jpg,https://www.myntra.com/trousers/fablestreet/fa...,"[0.00033520295983180404, 0.0021647484973073006..."
1,1,lower_ware,women_trousers,image1014.json,image1014_1.jpg,https://www.myntra.com/trousers/fablestreet/fa...,"[0.00034161005169153214, 0.0010327675845474005..."
2,2,lower_ware,women_trousers,image1014.json,image1014_2.jpg,https://www.myntra.com/trousers/fablestreet/fa...,"[0.0002880047250073403, 0.002158018760383129, ..."
3,3,lower_ware,women_trousers,image1014.json,image1014_3.jpg,https://www.myntra.com/trousers/fablestreet/fa...,"[0.00036859430838376284, 0.0016940443310886621..."
4,4,lower_ware,women_trousers,image1014.json,image1014_4.jpg,https://www.myntra.com/trousers/fablestreet/fa...,"[0.00037159663042984903, 0.0008401929517276585..."


In [46]:
#saving into the file for further use
df.to_csv('Final.csv')

In [5]:
df = pd.read_csv('Final.csv')

In [None]:
#transfer all embeddings to Elastic Search
helpers.bulk(es, docs)

(75298, [])

In [99]:
#Search query ( will now search for similar products)
def search_similar_image(query_vec,index_label):
  search_query = {
      "size": 10,
      "_source": {
          "includes": ["file_name","url"]
      },
      "query": {
          "script_score": {
              "query": {
                  "match_all": {}
              },
              "script": {
                  #"source": "cosineSimilarity(params.queryVector, 'description_vector') + 1.0",
                  "source": "1 / l2norm(params.queryVector,'description_vector') + 1.0",
                  "params": {
                      "queryVector": query_vec
                  }
              }
          }
      }
  }
  response = es.search(
      index= index_label,
      body=search_query
  )

  return response

In [80]:
query_emb = get_embeddings('/content/image1000_0.jpg')

In [100]:
%%time
index_label = 'upper_ware'
response = search_similar_image(query_emb,index_label)

CPU times: user 16 ms, sys: 1.07 ms, total: 17.1 ms
Wall time: 124 ms


In [101]:
#below are results of similar items that we added in Elastic search
response

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '47119',
    '_index': 'upper_ware',
    '_score': 119978.21,
    '_source': {'file_name': 'image1000_0.jpg',
     'url': 'https://www.myntra.com/tops/forever-new/forever-new-women-white-solid-shirred-ruffle-crop-top/11782572/buy'},
    '_type': '_doc'},
   {'_id': '51256',
    '_index': 'upper_ware',
    '_score': 119978.21,
    '_source': {'file_name': 'image950_0.jpg',
     'url': 'https://www.myntra.com/tops/forever-new/forever-new-women-white-solid-shirred-ruffle-crop-top/11782572/buy'},
    '_type': '_doc'},
   {'_id': '47120',
    '_index': 'upper_ware',
    '_score': 1.172966,
    '_source': {'file_name': 'image1000_1.jpg',
     'url': 'https://www.myntra.com/tops/forever-new/forever-new-women-white-solid-shirred-ruffle-crop-top/11782572/buy'},
    '_type': '_doc'},
   {'_id': '51257',
    '_index': 'upper_ware',
    '_score': 1.172966,
    '_source': {'file_name': 'image950_1.jpg',

<h3>Storing in Faiss</h3>

In [6]:
!pip install faiss-gpu

Collecting faiss-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/5d/36/383911b8edf8c29cb7e9e8aee4e6b69b0f36c52237e3a06ce64a9551ef22/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl (89.4MB)
[K     |████████████████████████████████| 89.4MB 52kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0


In [7]:
import faiss                   # make faiss available

In [18]:
#retriving embedding into list from the dataframe
list_emb = []
for i in range(len(df['description_vector'])):
  list_emb.append(eval(df['description_vector'][i]))

In [21]:
emb = np.array(list_emb)
img_emb = emb.astype('float32')

In [24]:
img_emb.shape

(75298, 1024)

In [43]:
#cosin quantizer
nlist = 5  # number of clusters
PerVec = 8
SubPerVec = 8
quantizer = faiss.IndexFlatIP(1024)
# index = faiss.IndexFlatIP(1024)
index = faiss.IndexIVFPQ(quantizer,1024,nlist,PerVec,SubPerVec)
index.train(img_emb)
index.add(img_emb)

In [51]:
#euclidean distance quantizer
quantizer = faiss.IndexFlatL2(1024)   # build the index
index = faiss.IndexIVFPQ(quantizer,1024,nlist,PerVec,SubPerVec)
index.train(img_emb)
print(index.is_trained)
index.add(img_emb)                  # add vectors to the index
print(index.ntotal)

True
75298


In [28]:
query_emb = get_embeddings('/content/image1.jpg')

In [29]:
# converting the list to array for seraching the product
query_emb = np.array(query_emb)
query_emb= query_emb.astype('float32')

In [30]:
row_vector = query_emb[np.newaxis,:]
row_vector.shape

(1, 1024)

In [52]:
# search the product from the embedding saved with euclidean into the faiss
%%time
k = 10                          # we want to see 4 nearest neighbors
D, I = index.search(row_vector, k) # sanity check
print(I[0]) #index
print(D) #distance

[14926 16324 16069 14575 13245 16028 14688 14125 15779 16071]
[[14.260212 14.486645 14.486645 14.498741 14.859512 14.954227 15.036573
  15.09613  15.114634 15.211807]]
CPU times: user 4.15 ms, sys: 2.01 ms, total: 6.16 ms
Wall time: 6.47 ms


In [53]:
#filtering the product based on the index
rec_pro = df[df.id.isin(I[0])].drop('Unnamed: 0',axis=1)

In [54]:
rec_pro

Unnamed: 0,id,super_cat,cat,info,file_name,url,description_vector
13245,13245,lower_ware,women_jeans_jeggings,image33.json,image33_1.jpg,https://www.myntra.com/jeans/desigual/desigual...,"[0.00032169456244446337, 0.0015724558616057038..."
14125,14125,lower_ware,women_jeans_jeggings,image270.json,image270_1.jpg,https://www.myntra.com/jeans/tommy-hilfiger/to...,"[0.00032137089874595404, 0.0015712501481175423..."
14575,14575,lower_ware,women_jeans_jeggings,image202.json,image202_1.jpg,https://www.myntra.com/jeans/american-eagle-ou...,"[0.0002735695743467659, 0.0015679027419537306,..."
14688,14688,lower_ware,women_jeans_jeggings,image251.json,image251_1.jpg,https://www.myntra.com/jeans/marks--spencer/ma...,"[0.00028662089607678354, 0.0015935034025460482..."
14926,14926,lower_ware,women_jeans_jeggings,image669.json,image669_1.jpg,https://www.myntra.com/jeans/american-eagle-ou...,"[0.00031227667932398617, 0.0018144890200346708..."
15779,15779,lower_ware,women_jeans_jeggings,image279.json,image279_1.jpg,https://www.myntra.com/jeans/levis/levis-women...,"[0.0003411159268580377, 0.001750853261910379, ..."
16028,16028,lower_ware,women_jeans_jeggings,image226.json,image226_1.jpg,https://www.myntra.com/jeans/only/only-women-b...,"[0.00030201056506484747, 0.0016176414210349321..."
16069,16069,lower_ware,women_jeans_jeggings,image454.json,image454_0.jpg,https://www.myntra.com/jeans/superdry/superdry...,"[0.0003155630547553301, 0.0010675698285922408,..."
16071,16071,lower_ware,women_jeans_jeggings,image454.json,image454_2.jpg,https://www.myntra.com/jeans/superdry/superdry...,"[0.0002897631493397057, 0.00159936782438308, 0..."
16324,16324,lower_ware,women_jeans_jeggings,image504.json,image504_0.jpg,https://www.myntra.com/jeans/superdry/superdry...,"[0.0003155630547553301, 0.0010675698285922408,..."


In [44]:
# search the product from the embedding saved with cosin index into the faiss
%%time
k = 10                          # we want to see 4 nearest neighbors
D, I = index.search(row_vector, k) # sanity check
print(I[0]) #index
print(D) #distance
#filtering the product based on the index
rec_pro = df[df.id.isin(I[0])]

[11870 16571 15226 15833 16518 14533  3559 10929 10265  7553]
[[15.234959 16.703098 16.761396 16.761396 17.074736 17.074736 17.364878
  17.364878 17.711271 17.711271]]
CPU times: user 9.1 ms, sys: 0 ns, total: 9.1 ms
Wall time: 8.96 ms


In [49]:
rec_pro.drop('Unnamed: 0',axis=1)

Unnamed: 0,id,super_cat,cat,info,file_name,url,description_vector
3559,3559,lower_ware,women_trousers,image1130.json,image1130_0.jpg,https://www.myntra.com/trousers/hm/hm-women-ca...,"[0.00034993584267795086, 0.0014610572252422571..."
7553,7553,lower_ware,women_trousers,image522.json,image522_2.jpg,https://www.myntra.com/trousers/kazo/kazo-wome...,"[0.0002554995589889586, 0.002343848580494523, ..."
10265,10265,lower_ware,women_trousers,image422.json,image422_2.jpg,https://www.myntra.com/trousers/kazo/kazo-wome...,"[0.0002554995589889586, 0.002343848580494523, ..."
10929,10929,lower_ware,women_trousers,image1080.json,image1080_0.jpg,https://www.myntra.com/trousers/hm/hm-women-ca...,"[0.00034993584267795086, 0.0014610572252422571..."
11870,11870,lower_ware,women_trousers,image445.json,image445_1.jpg,https://www.myntra.com/trousers/jump-usa/jump-...,"[0.0002904768625739962, 0.0008964059525169432,..."
14533,14533,lower_ware,women_jeans_jeggings,image528.json,image528_1.jpg,https://www.myntra.com/jeans/levis/levis-women...,"[0.0003462400636635721, 0.0012777912197634578,..."
15226,15226,lower_ware,women_jeans_jeggings,image462.json,image462_1.jpg,https://www.myntra.com/jeans/united-colors-of-...,"[0.00033629938843660057, 0.001100885565392673,..."
15833,15833,lower_ware,women_jeans_jeggings,image512.json,image512_1.jpg,https://www.myntra.com/jeans/united-colors-of-...,"[0.00033629938843660057, 0.001100885565392673,..."
16518,16518,lower_ware,women_jeans_jeggings,image478.json,image478_1.jpg,https://www.myntra.com/jeans/levis/levis-women...,"[0.0003462400636635721, 0.0012777912197634578,..."
16571,16571,lower_ware,women_jeans_jeggings,image209.json,image209_1.jpg,https://www.myntra.com/jeans/levis/levis-women...,"[0.0003346421290189028, 0.0015800799010321498,..."


In [55]:
# saving the stored faiss into the file 
faiss.write_index(index,"img_embedding_quantize.index")

In [56]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [58]:
! cp img_embedding_quantize.index 'drive/MyDrive/Applied AI/CS2/'

In [10]:
index = faiss.read_index("img_embedding.index")  # load the index