In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
from typing import Union

In [None]:
from PIL import Image as PImage
from PIL.Image import Image

In [None]:
from pathlib import Path

In [None]:
from collections import OrderedDict

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d

In [None]:
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torchvision import (transforms, datasets)
# from torchvision import prototype as P

## Image search

In [None]:
from typing import Union
from pathlib import Path
from tqdm import tqdm

```bash
! pip install -U opencv-python
```

In [None]:
import cv2

In [None]:
from torch import no_grad
from torch.jit import ScriptModule
from torchvision.models import (resnet34, resnet50, wide_resnet50_2)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
size = 256
imsz = 224
IMG_SUFF = {'.jpg', '.jpeg', '.png'}
path = Path('../data')

In [None]:
class ToPILImage(object):
    """Transform that makes sure its output is a PIL image.

    PIL images pass through untouched; any other input is handed to a
    ``transforms.ToPILImage`` instance built with the requested ``mode``.
    """

    def __init__(self, mode=None):
        super().__init__()
        self.to_pil = transforms.ToPILImage(mode=mode)

    def convert(self, img: Union[np.ndarray, Image]):
        """
        Return the given image as a PIL image
        Args:
            img: input image

        Returns:
            ``img`` itself when it already is a PIL image, otherwise the
            output of the wrapped ``transforms.ToPILImage``
        """
        if isinstance(img, Image):
            return img
        return self.to_pil(img)

    def __call__(self, *args, **kwargs):
        return self.convert(*args, **kwargs)

    def __repr__(self):
        # the mode is only shown when one was configured
        mode = self.to_pil.mode
        inner = '' if mode is None else f'mode={mode}'
        return f'{self.__class__.__name__}({inner})'


class Img2Vec(object):
    """Wraps a backbone network so that images can be turned into embedding arrays"""

    def __init__(
        self, backbone: Union[nn.Module, ScriptModule], trfm: transforms, device: str = 'cpu',
        func:callable = None, ptrf:callable=None):
        super().__init__()
        self.device = torch.device(device)
        # put the backbone into eval mode when it offers one, then move it to the device
        if hasattr(backbone, 'eval'):
            backbone = backbone.eval()
        self.backbone = backbone.to(device)
        # ``func`` replaces the plain backbone call when supplied
        self.call_backbone = func or self.backbone
        self.trfm = trfm
        self.ptrf = ptrf

    def preprocess(self, *xs: Union[Image, np.ndarray]) -> Tensor:
        """
        Apply the input transform to every item and batch the results
        Args:
            *xs: input data

        Returns:
            stacked tensor placed on ``self.device``
        """
        batch = list(map(self.trfm, xs))
        return torch.stack(batch).to(self.device)

    @no_grad()
    def forward(self, *xs: Union[Image, np.ndarray]) -> np.ndarray:
        """
        Embed the given inputs
        Args:
            *xs: input data

        Returns:
            model output (after the optional ``ptrf`` post-transform) as a numpy array
        """
        feats = self.call_backbone(self.preprocess(*xs))
        if self.ptrf:
            feats = self.ptrf(feats)
        return feats.cpu().data.numpy()

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

In [None]:
# Preprocessing applied to every image before it reaches the backbone:
# force PIL -> resize to `size` -> centre crop to `imsz` -> tensor -> per-channel normalisation
vec_trsfm = transforms.Compose(
    [
        ToPILImage(mode='RGB'),
        transforms.Resize(size),
        transforms.CenterCrop(imsz),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

#### Prepare data

In [None]:
search_path = path / 'search'

In [None]:
dir_paths = [dp for dp in search_path.iterdir() if dp.is_dir()]

In [None]:
dir_paths

In [None]:
img_pts = [im_pt for dp in dir_paths for im_pt in dp.iterdir() if im_pt.suffix in IMG_SUFF]

In [None]:
def read_img(im_pt):
    """
    Read an image from disk with OpenCV and return it in RGB channel order
    Args:
        im_pt: path to the image file

    Returns:
        the decoded 3-channel image as returned by OpenCV, converted to RGB

    Raises:
        FileNotFoundError: if OpenCV could not read or decode the file
    """
    # IMREAD_COLOR always yields a 3-channel BGR image, so the colour conversion
    # below cannot fail on grayscale files the way it could with IMREAD_ANYCOLOR
    img = cv2.imread(str(im_pt), cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError(f'Cannot read image: {im_pt}')

    # OpenCV decodes to BGR; the rest of the notebook (matplotlib, the 'RGB' transform) expects RGB
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
def read_pil_img(im_pt):
    """
    Read an image from disk as a fully loaded PIL image
    Args:
        im_pt: path to the image file

    Returns:
        PIL image whose pixel data is already in memory
    """
    # PImage.open is lazy and keeps the file open; loading inside the context
    # manager reads the pixels and releases the file handle, so holding many
    # images in a list does not exhaust file descriptors
    with PImage.open(im_pt) as img:
        img.load()

    return img

In [None]:
imgs = [read_img(ip) for ip in img_pts]

In [None]:
pil_imgs = [read_pil_img(ip) for ip in img_pts]

#### Initialize the feature extractor

In [None]:
cut = 1

In [None]:
body = wide_resnet50_2(pretrained=True)

In [None]:
body

In [None]:
class FlattenLayer(nn.Module):
    """Module that collapses every dimension after the first into a single one"""

    def __init__(self):
        super().__init__()

    def forward(self, x: Tensor) -> Tensor:
        # keep dim 0, merge all remaining dims
        return x.flatten(start_dim=1)

In [None]:
backbone = nn.Sequential(*list(body.children())[:-cut])
net = nn.Sequential(backbone, FlattenLayer())

In [None]:
net

In [None]:
img_vec = Img2Vec(net, vec_trsfm, device=device)

In [None]:
# Embed the images one at a time; every call yields a batch of one, so keep row 0
vecs = []
with tqdm(
    imgs, desc='Vectorizing images', total=len(imgs), position=0, leave=True) as pimgs:
    for im in pimgs:
        vecs.append(img_vec(im)[0])

In [None]:
vecs[0].shape, len(vecs)

In [None]:
img_vecs = list(zip(imgs, vecs))

In [None]:
vecs[0].shape

In [None]:
imgs[0].shape[0] * imgs[0].shape[1] * imgs[0].shape[2] 

In [None]:
img_vecs[0][0].shape

In [None]:
for im in imgs:
    plt.imshow(im)
    plt.show()
    plt.close()

#### Compare vectors

In [None]:
from scipy.spatial.distance import cosine

In [None]:
import matplotlib.pyplot as plt

In [None]:
def top_vecs(qi, top_k=5, model=img_vec, comp_vecs=img_vecs, normalize:bool=False):
    """
    Rank stored items by cosine distance to a query image
    Args:
        qi: query image, in whatever format ``model`` accepts
        top_k: maximum number of closest items to return
        model: callable mapping the image to a batch of embeddings
        comp_vecs: iterable of ``(item, vector)`` pairs to search through
        normalize: L2-normalise the query embedding before comparing

    Returns:
        list of ``(distance, item)`` tuples sorted by ascending distance
    """
    qv = np.asarray(model(qi))
    if normalize:
        # the embedding is a numpy array, which has no ``.norm`` method;
        # use the numpy equivalent of ``norm(dim=-1, keepdim=True)``
        qv = qv / np.linalg.norm(qv, axis=-1, keepdims=True)
    qv = qv[0]  # the model returns a batch; the query is its only row

    resul_pts = [(cosine(qv, vc), pt) for pt, vc in comp_vecs]
    # sort on the distance only, the items themselves may not be comparable
    resul_pts.sort(key=lambda x: x[0])

    return resul_pts[:top_k]

#### Query images

- ch_1.jpg
- ch_2.jpg
- ft_1.jpeg
- ft_2.jpg
- rv_1.jpeg
- rv_2.jpeg
- st_1.jpeg
- st_2.jpeg
- st_3.jpg
- ct_1.jpg
- ct_2.jpg

In [None]:
query_path = path / 'queries'
query_path.mkdir(exist_ok=True)

In [None]:
qim = read_img(query_path / 'ft_2.jpg')

In [None]:
res = top_vecs(qim)

In [None]:
#res

In [None]:
plt.imshow(qim)
plt.show()
plt.close()

In [None]:
for dist, res_img in res:
    plt.title(f'dist={dist}')
    plt.imshow(res_img)
    plt.show()
    plt.close()

#### Go into the details

In [None]:
qim.shape

In [None]:
plt.imshow(qim)
plt.show()
plt.close()

In [None]:
qtens = img_vec.preprocess(qim)

In [None]:
backbone = img_vec.backbone
backbone

In [None]:
conv_part = backbone[0][:-1]
pool_part = backbone[0][-1]
flat_part = backbone[-1]

In [None]:
conv_part

In [None]:
pool_part

In [None]:
flat_part

In [None]:
[111111]

In [None]:
# Scratch note kept as a comment: none of these names is defined in the
# notebook, so executing the cell raised NameError and stopped "Run All".
# [img1, img2, img3, img4]
# [img]

In [None]:
con_tens = conv_part(qtens)
con_tens[0].shape

In [None]:
pool_tens = pool_part(con_tens)
pool_tens[0].shape

In [None]:
lin_tens = flat_part(pool_tens)
lin_tens[0].shape

#### Search with CLIP model

``` pip install -U ftfy regex tqdm ```

``` pip install -U git+https://github.com/openai/CLIP.git ```

In [None]:
import clip

In [None]:
clip_net, clip_preprocess = clip.load('ViT-B/32', device=device, jit=False)

In [None]:
img2clip = Img2Vec(
    clip_net, clip_preprocess, device=device, 
    func=clip_net.encode_image, ptrf=lambda x: x / x.norm(dim=-1, keepdim=True))

In [None]:
# Embed every PIL image with CLIP; every call yields a batch of one, so keep row 0
with tqdm(
    pil_imgs, desc='Vectorizing images (PIL)', 
    total=len(pil_imgs), position=0, leave=True) as pimgs:
    # iterate over the progress-bar handle bound above (`plimgs` was an undefined name)
    clip_vecs = [img2clip(im)[0] for im in pimgs]

In [None]:
# Sanity check on the CLIP embeddings themselves (not the ResNet `vecs` list)
clip_vecs[0].shape, len(clip_vecs)

- ch_1.jpg
- ch_2.jpg
- ft_1.jpeg
- ft_2.jpg
- rv_1.jpeg
- rv_2.jpeg
- st_1.jpeg
- st_2.jpeg
- st_3.jpg
- ct_1.jpg
- ct_2.jpg

In [None]:
pil_vecs = list(zip(imgs, clip_vecs))

In [None]:
clip_qim = read_pil_img(query_path / 'ft_2.jpg')

In [None]:
clip_res = top_vecs(clip_qim, model=img2clip, comp_vecs=pil_vecs, normalize=False)

In [None]:
plt.imshow(clip_qim)
plt.show()
plt.close()

In [None]:
for dist, res_img in clip_res:
    plt.title(f'dist={dist}')
    plt.imshow(res_img)
    plt.show()
    plt.close()

## Visualize vectors

In [None]:
img_pts[0]#.parent.name

In [None]:
y_img = [im_pt.parent.name for im_pt in img_pts]

In [None]:
y_img = np.array(y_img)

In [None]:
y_nms, y_cn = np.unique(y_img, return_counts=True)
y_nms, y_cn

In [None]:
# Show every indexed image together with its class label (the parent directory name)
for pv, y_im in zip(pil_vecs, y_img):
    # the value shown is a label, not a distance
    plt.title(f'label={y_im}')
    plt.imshow(pv[0])
    plt.show()
    plt.close()
    

In [None]:
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans, AgglomerativeClustering

In [None]:
y_nms_ls = y_nms.tolist()
y_nms_ls = [(nm, idx) for idx, nm in enumerate(y_nms_ls)]
y_dic = dict(y_nms_ls)
y_nms_ls, y_dic

In [None]:
y_img_nb = np.array([y_dic[y_nm] for y_nm in y_img])
y_img_nb

In [None]:
pca = PCA(n_components=3)
pca = pca.fit(clip_vecs)
X_img_pc = pca.transform(clip_vecs)

In [None]:
y_chf = np.choose(y_img_nb, list(range(len(y_nms_ls)))).astype(float)
y_chf

In [None]:
for name, label in y_nms_ls:
    print(name)

In [None]:
def plot_clusters(X_pc, y_ch):
    """
    Scatter-plot 3-D points coloured by class and annotate each class centroid
    Args:
        X_pc: 2-D array-like; columns 0, 1 and 2 are used as the plotted coordinates
        y_ch: integer class label for every row of ``X_pc``; matched against the
            ``(name, label)`` pairs of the notebook-level ``y_nms_ls``

    Returns:
        None, the figure is displayed with ``plt.show()``
    """
    # accept plain lists as well; boolean masking below needs real arrays
    X_pc = np.asarray(X_pc)
    y_ch = np.asarray(y_ch)

    fig = plt.figure(1, figsize=(12, 6))
    plt.clf()

    ax = fig.add_subplot(111, projection="3d", elev=48, azim=134)
    ax.set_position([0, 0, 0.95, 1])
    plt.cla()
    for name, label in y_nms_ls:
        members = X_pc[y_ch == label]
        if members.shape[0] == 0:
            # class not present in y_ch: a mean over nothing would be NaN
            continue
        cx, cy, cz = members[:, :3].mean(axis=0)
        ax.text3D(
            cx,
            cy + 1.5,  # lift the caption slightly above the centroid
            cz,
            name,
            horizontalalignment="center",
            bbox=dict(alpha=0.5, edgecolor="w", facecolor="w"),
        )
    # The labels are used directly as colour values. The previous
    # np.choose(y_ch, range(n)) call was an identity mapping and np.choose
    # refuses to work with more than a small fixed number of choices.
    ax.scatter(
        X_pc[:, 0], X_pc[:, 1], X_pc[:, 2],
        c=y_ch.astype(float), cmap=plt.cm.nipy_spectral, edgecolor="k")

    ax.xaxis.set_ticklabels([])
    ax.yaxis.set_ticklabels([])
    ax.zaxis.set_ticklabels([])

    plt.show()

In [None]:
plot_clusters(X_img_pc, y_img_nb)

## Text search

``` pip install -U sentence-transformers ```

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
model_dbr = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [None]:
model_dbl = SentenceTransformer('stsb-bert-large')

In [None]:
model_rbb = SentenceTransformer('stsb-roberta-base')

In [None]:
model_rbl = SentenceTransformer('stsb-roberta-large')

In [None]:
# Example sentences to embed
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode all sentences in one call
embeddings = model_dbl.encode(sentences)

# Show every sentence next to its embedding and the embedding's shape
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print(f'{embedding.shape=}')
    print("")

## Semantic similarity

In [None]:
# Two parallel lists: sentences1[i] is compared with sentences2[i]
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]

sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    #'The new movie is so great',
    'The new movie is so horrable',
]

# Embed both lists as tensors
embeddings1 = model_dbl.encode(sentences1, convert_to_tensor=True)
embeddings2 = model_dbl.encode(sentences2, convert_to_tensor=True)

# Cosine-similarity scores between the two embedding sets
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Print each aligned pair with its score (entry [i][i] of the score matrix)
for i, (first, second) in enumerate(zip(sentences1, sentences2)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, cosine_scores[i][i]))

## Search

In [None]:
embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [None]:
# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.',
          'A pasta is eating man .',
          'A pasta is eattyen by man .'
          ]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']


# Find the closest sentences of the corpus (at most 5) for each query, based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against the corpus, then torch.topk for the best top_k scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # report the actual number of results instead of a hard-coded 5
    print(f"\nTop {top_k} most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    # Alternatively, util.semantic_search performs cosine similarity + top-k in one call:
    # hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
    # hits = hits[0]      # Get the hits for the first query
    # for hit in hits:
    #     print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

In [None]:
embedder

In [None]:
embedder_body = nn.Sequential(embedder)[-1]

In [None]:
embedder_body

In [None]:
sent_vec = embedder_body.encode('The test sentence')

In [None]:
sent_vec.shape