# Encoding Item Textual and Visual Modality Features for Amazon2023

In [1]:
from typing import Callable, Iterable

import torch, tqdm, os
import pandas as pd
from freerec.data.tags import USER, ITEM
from freerec.data.utils import download_from_url
from freerec.utils import export_pickle

from concurrent.futures import ThreadPoolExecutor

import torchdata.datapipes as dp
from transformers import AutoImageProcessor, AutoModel
from sentence_transformers import SentenceTransformer
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


You can download the pretrained models from [hf-mirror](https://hf-mirror.com/) and place them under `model_cache_dir`; the encoders below are loaded from that local directory.

In [2]:
# Local directory holding the pretrained encoder checkpoints.
model_cache_dir: str = "../models"

# Processed dataset to encode, plus the directories derived from its name.
dataset: str = "Amazon2023Baby_554811_ROU"
datadir: str = "../data/Processed/" + dataset
image_folder: str = os.path.join(datadir, "item_images", "large")

In [3]:
# Read the tab-separated item table of the dataset and preview its first rows.
item_df = pd.read_csv(os.path.join(datadir, "item.txt"), sep="\t")
item_df.head(5)

Unnamed: 0,ITEM,TEXT,IMAGE_URL
0,0,Title: Baby Tracker® - Daily Childcare Journal...,https://m.media-amazon.com/images/I/41Bb6wf+qU...
1,1,Title: Disney Mickey and Friends Baby Beginner...,https://m.media-amazon.com/images/I/61tpXz7AAT...
2,2,Title: SoftPlay Fisher-Price Precious Planet C...,https://m.media-amazon.com/images/I/516RIuPn3R...
3,3,Title: Christian Art Gifts Girl Baby Book of M...,https://m.media-amazon.com/images/I/41xViU1RwR...
4,4,Title: Christian Art Gifts Boy Baby Book of Me...,https://m.media-amazon.com/images/I/415eCH3JG5...


We first download the images from the given URLs. Please check `image_size` before going on.

In [32]:
def download_images(item_df: pd.DataFrame):
    """Download every item's image into `image_folder` as `<item id>.jpg`.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table providing the `ITEM.name` column (item ids) and the
        `IMAGE_URL` column (image URLs).

    Notes
    -----
    Rows without a usable URL are skipped. Downloads run on a thread pool;
    a failed download does not abort the others, but the number of failures
    is reported through a warning once all downloads have finished.
    """
    import warnings

    ids = item_df[ITEM.name]
    urls = item_df['IMAGE_URL']
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                download_from_url,
                url=url,
                root=image_folder,
                filename=f"{id_}.jpg",
                log=False
            )
            for id_, url in zip(ids, urls)
            # A missing URL is read by pandas as NaN, which is truthy, so a
            # plain `if url:` would still submit it; require a non-blank string.
            if isinstance(url, str) and url.strip()
        ]

        failed = 0
        first_error = None
        # Iterate over the futures (not the submissions) so that the progress
        # bar reflects finished downloads and worker exceptions are observed.
        for future in tqdm.tqdm(futures, desc="Download images: "):
            try:
                future.result()
            except Exception as error:  # best effort: keep downloading the rest
                failed += 1
                if first_error is None:
                    first_error = error

    if failed:
        warnings.warn(
            f"{failed} of {len(futures)} image downloads failed "
            f"(first error: {first_error!r})"
        )

In [33]:
# Fetch all item images into `image_folder`.
download_images(item_df=item_df)

Download images: : 30380it [00:32, 937.29it/s] 


Next, we encode the visual modality; the textual modality follows.

In [4]:
def load_image(idx: int, processor):
    """Load and preprocess the image stored for item `idx`.

    Parameters
    ----------
    idx : int
        Item index; the image is read from `image_folder/{idx}.jpg`.
    processor
        Callable image processor invoked as
        `processor(images=..., return_tensors='pt')`.

    Returns
    -------
    tuple
        `(idx, has_image, pixel_values)` where `has_image` is False when the
        image could not be read and a blank 224x224 RGB placeholder was
        processed instead.
    """
    path = os.path.join(image_folder, f"{idx}.jpg")
    try:
        # `Image.open` keeps the file open until it is closed explicitly;
        # `convert` returns an independent in-memory copy, so the handle can
        # be released right away.
        with Image.open(path) as raw:
            image = raw.convert('RGB')
        has_image = True
    except OSError:
        # OSError covers a missing file (FileNotFoundError) as well as
        # unreadable / truncated downloads reported by PIL, so a single bad
        # file does not abort the whole encoding run.
        image = Image.new('RGB', (224, 224))
        has_image = False
    return idx, has_image, processor(images=image, return_tensors='pt')['pixel_values'][0]

def encode_visual_modality(
    item_df: pd.DataFrame,
    model: str, model_dir: str,
    num_workers: int = 4, batch_size: int = 128,
):
    """Encode item images with a pretrained vision model and export the features.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table; only its length is used, images are looked up by row
        position (`{idx}.jpg`) through `load_image`.
    model : str
        Name of the model folder inside `model_dir`.
    model_dir : str
        Directory holding the locally cached models.
    num_workers : int
        Number of DataLoader workers used for image loading.
    batch_size : int
        Number of images encoded per forward pass.

    Returns
    -------
    torch.Tensor
        Features of shape (N, D), one row per item in row order. Rows of
        items without a readable image are replaced by the mean feature of
        the items that have one. The tensor is also exported to
        `datadir/visual_{model}.pkl`.
    """
    from functools import partial
    import warnings

    processor = AutoImageProcessor.from_pretrained(
        os.path.join(model_dir, model), local_files_only=True
    )

    _process = partial(load_image, processor=processor)

    datapipe = dp.iter.IterableWrapper(
        range(len(item_df))
    ).sharding_filter().map(
        _process
    )
    dataloader = torch.utils.data.DataLoader(
        datapipe,
        num_workers=num_workers, batch_size=batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = AutoModel.from_pretrained(
        os.path.join(model_dir, model), local_files_only=True
    ).to(device).eval()

    vIndices = []
    vMasks = []
    vFeats = []
    with torch.no_grad():
        for (indices, has_images, images) in tqdm.tqdm(dataloader, desc="Visual batches: "):
            vIndices.append(indices)
            vMasks.append(has_images)
            outputs = encoder(pixel_values=images.to(device)).last_hidden_state
            if outputs.ndim == 3:
                # vit (Batch, Sequence, D): keep the first token
                outputs = outputs[:, 0]
            else:
                # resnet (Batch, D, K, K): average over the spatial positions
                outputs = outputs.flatten(2).mean(-1)
            vFeats.append(
                outputs.detach().cpu()
            )
    vIndices = torch.cat(vIndices, dim=0)
    vMasks = torch.cat(vMasks, dim=0)
    vFeats = torch.cat(vFeats, dim=0).flatten(1) # (N, D)

    # Batches are not guaranteed to arrive in item order when several workers
    # are used, so restore the original order from the recorded indices.
    order = vIndices.argsort()
    vFeats = vFeats[order]
    vMasks = vMasks[order]
    assert vFeats.size(0) == len(item_df), (
        f"Expected {len(item_df)} feature rows but got {vFeats.size(0)}"
    )

    if vMasks.any():
        fill = vFeats[vMasks].mean(dim=0, keepdim=True) # (1, D)
    else:
        # The mean over an empty selection is NaN and would turn every
        # exported feature into NaN; fall back to zeros instead.
        warnings.warn(
            "No item image could be loaded; visual features are filled with zeros."
        )
        fill = torch.zeros_like(vFeats[:1])
    # (N, 1) mask and (1, D) fill broadcast against the (N, D) features.
    vFeats = torch.where(vMasks.unsqueeze(-1), vFeats, fill)

    export_pickle(
        vFeats, os.path.join(
            datadir, f"visual_{model}.pkl"
        )
    )
    return vFeats

In [7]:
# Encode the item images with the locally cached ViT checkpoint.
encode_visual_modality(item_df, model_dir=model_cache_dir, model="vit-base-16-224")

Visual batches: : 240it [03:45,  1.06it/s]                       


tensor([[-0.1044, -0.2491, -0.2130,  ..., -0.0333, -0.4862,  0.2023],
        [-0.1021,  0.0347, -0.0537,  ..., -0.0105, -0.0494,  0.0819],
        [-0.0634,  0.2227,  0.0578,  ..., -0.0093,  0.1451,  0.2042],
        ...,
        [-0.1225, -0.0066,  0.0214,  ..., -0.1641, -0.0435,  0.0231],
        [ 0.0448, -0.2399,  0.0342,  ..., -0.1069,  0.0291, -0.0334],
        [-0.0702,  0.3238, -0.1198,  ..., -0.3347,  0.2070, -0.0101]])

In [5]:
def encode_textual_modality(
    item_df: pd.DataFrame,
    model: str, model_dir: str,
    batch_size: int = 128
):
    """Encode item texts with a SentenceTransformer model and export the features.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table providing the `TEXT` column.
    model : str
        Name of the model folder inside `model_dir`.
    model_dir : str
        Directory holding the locally cached models.
    batch_size : int
        Number of sentences encoded per forward pass.

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per item, in row order. The tensor is also
        exported to `datadir/textual_{model}.pkl`.
    """
    # Hand the encoder a plain list of strings: it indexes its input by
    # integer, which on a pandas Series is a label lookup and only matches row
    # order for a default index. Missing texts are encoded as empty strings.
    sentences = item_df['TEXT'].fillna('').astype(str).tolist()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encoder = SentenceTransformer(
        os.path.join(model_dir, model),
        device=device
    ).eval()

    with torch.no_grad():
        tFeats = encoder.encode(
            sentences,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
    assert tFeats.size(0) == len(item_df), (
        f"Expected {len(item_df)} feature rows but got {tFeats.size(0)}"
    )

    export_pickle(
        tFeats, os.path.join(
            datadir, f"textual_{model}.pkl"
        )
    )
    return tFeats

In [6]:
# Encode the item texts with the locally cached MiniLM sentence encoder.
encode_textual_modality(item_df, model_dir=model_cache_dir, model="all-MiniLM-L6-v2")

Batches: 100%|██████████| 259/259 [00:45<00:00,  5.66it/s]


tensor([[-0.0899,  0.0039, -0.0233,  ..., -0.0025,  0.0305,  0.0443],
        [ 0.0385, -0.0299,  0.0062,  ...,  0.0310, -0.0348,  0.0290],
        [ 0.0183, -0.0791,  0.0461,  ..., -0.0465,  0.0215,  0.0571],
        ...,
        [-0.0214,  0.0251, -0.0389,  ..., -0.0851,  0.0558,  0.0507],
        [ 0.0217,  0.0355, -0.0526,  ...,  0.0204,  0.0282,  0.0283],
        [-0.0817, -0.0837, -0.0051,  ..., -0.0638,  0.0584,  0.0338]])

The following code extracts both the visual and the textual features with CLIP.

In [7]:

def encode_clip_textual_visual_modality(
    item_df: pd.DataFrame,
    img_clip_model: str,
    text_clip_model: str,
    model_dir: str,
    batch_size: int = 128
):
    """Encode item images and texts with CLIP models and export both features.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table providing the `TEXT` column; images are looked up by row
        position as `image_folder/{idx}.jpg`.
    img_clip_model : str
        Name of the image encoder folder inside `model_dir`.
    text_clip_model : str
        Name of the text encoder folder inside `model_dir`.
    model_dir : str
        Directory holding the locally cached models.
    batch_size : int
        Number of samples encoded per forward pass.

    Returns
    -------
    tuple of torch.Tensor
        `(vFeats, tFeats)`, each with one row per item in row order. Visual
        rows of items without a readable image are replaced by the mean
        visual feature of the items that have one. The tensors are also
        exported to `datadir/visual_{img_clip_model}.pkl` and
        `datadir/textual_{text_clip_model}.pkl`.
    """
    import warnings

    # NOTE: every decoded image is kept in memory until encoding is done.
    images = []
    has_image = []
    for idx in range(len(item_df)):
        try:
            # `convert` returns an in-memory copy, so the file handle opened
            # by `Image.open` can be released immediately.
            with Image.open(
                os.path.join(
                    image_folder, f"{idx}.jpg"
                )
            ) as raw:
                image = raw.convert('RGB')
            has_image.append(True)
        except OSError:
            # Missing file (FileNotFoundError) or unreadable / truncated
            # download: use a blank placeholder and mark the item as missing.
            image = Image.new('RGB', (224, 224))
            has_image.append(False)
        images.append(image)
    vMasks = torch.tensor(has_image, dtype=torch.bool)

    # Plain list of strings: the encoder indexes its input by integer, which
    # on a pandas Series is a label lookup. Missing texts become empty strings.
    sentences = item_df['TEXT'].fillna('').astype(str).tolist()

    # Keep the two-GPU placement when it is available, otherwise share the
    # single GPU or fall back to the CPU instead of failing on 'cuda:1'.
    num_gpus = torch.cuda.device_count()
    if num_gpus >= 2:
        img_device, text_device = 'cuda:0', 'cuda:1'
    elif num_gpus == 1:
        img_device = text_device = 'cuda:0'
    else:
        img_device = text_device = 'cpu'

    img_encoder = SentenceTransformer(
        os.path.join(model_dir, img_clip_model),
        device=img_device
    ).eval()
    text_encoder = SentenceTransformer(
        os.path.join(model_dir, text_clip_model),
        device=text_device
    ).eval()

    with torch.no_grad():
        vFeats = img_encoder.encode(
            images,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
        tFeats = text_encoder.encode(
            sentences,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
    assert vFeats.size(0) == len(item_df), (
        f"Expected {len(item_df)} visual rows but got {vFeats.size(0)}"
    )
    assert tFeats.size(0) == len(item_df), (
        f"Expected {len(item_df)} textual rows but got {tFeats.size(0)}"
    )

    if vMasks.any():
        fill = vFeats[vMasks].mean(dim=0, keepdim=True) # (1, D)
    else:
        # The mean over an empty selection is NaN and would turn every
        # exported visual feature into NaN; fall back to zeros instead.
        warnings.warn(
            "No item image could be loaded; visual features are filled with zeros."
        )
        fill = torch.zeros_like(vFeats[:1])
    # (N, 1) mask and (1, D) fill broadcast against the (N, D) features.
    vFeats = torch.where(vMasks.unsqueeze(-1), vFeats, fill)

    export_pickle(
        vFeats, os.path.join(
            datadir, f"visual_{img_clip_model}.pkl"
        )
    )

    export_pickle(
        tFeats, os.path.join(
            datadir, f"textual_{text_clip_model}.pkl"
        )
    )
    return vFeats, tFeats

In [8]:
# Encode images and texts with the CLIP image / multilingual text checkpoints.
encode_clip_textual_visual_modality(
    item_df=item_df,
    model_dir=model_cache_dir,
    img_clip_model="clip-vit-b-32",
    text_clip_model="clip-vit-b-32-multilingual-v1",
)

Batches: 100%|██████████| 259/259 [04:28<00:00,  1.04s/it]
Batches: 100%|██████████| 259/259 [02:19<00:00,  1.86it/s]


(tensor([[-0.2293,  0.0023,  0.1084,  ...,  0.4313, -0.1574,  0.0206],
         [-0.3903, -0.2919, -0.0827,  ...,  0.3836, -0.0038,  0.3730],
         [-0.2293,  0.0023,  0.1084,  ...,  0.4313, -0.1574,  0.0206],
         ...,
         [-0.4276,  0.3432,  0.2271,  ...,  0.7927, -0.6071,  0.3222],
         [-0.4272,  0.3286,  0.1283,  ..., -0.0385,  0.0927, -0.0549],
         [-0.2324, -0.3409,  0.0483,  ..., -0.0207,  0.3209, -0.1621]]),
 tensor([[ 2.0746e-05,  7.1700e-02, -1.1730e-01,  ..., -6.6910e-02,
           4.1709e-02, -3.1474e-02],
         [-2.5776e-02,  1.2629e-01,  2.8502e-02,  ...,  1.1202e-01,
          -1.3230e-02, -7.9388e-02],
         [ 3.2196e-02,  1.1812e-01, -8.3679e-02,  ...,  2.1190e-02,
           8.2094e-03, -2.7704e-02],
         ...,
         [ 5.8675e-03,  6.8395e-02, -1.3923e-01,  ..., -7.9439e-02,
          -1.3898e-01, -1.5873e-01],
         [ 1.8296e-02,  1.2071e-01, -9.2982e-02,  ...,  3.5933e-02,
          -1.6972e-02, -9.6719e-03],
         [-1.6469e-