# Encoding Item Textual and Visual Modality Features for Amazon2023

In [1]:
from typing import Callable, Iterable

import torch, tqdm, os
import pandas as pd
from freerec.data.tags import USER, ITEM
from freerec.data.utils import download_from_url
from freerec.utils import export_pickle

from concurrent.futures import ThreadPoolExecutor

import torchdata.datapipes as dp
from transformers import AutoImageProcessor, AutoModel
from sentence_transformers import SentenceTransformer
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


The pretrained models can be downloaded from [hf-mirror](https://hf-mirror.com/). Place them under `model_cache_dir` (defined below), from which the encoders load them.

In [2]:
# Directory holding the locally cached pretrained checkpoints.
model_cache_dir: str = "../models"

# Name of the processed dataset and the directory it lives in.
dataset: str = "Amazon2023Beauty_10104811_ROU"
datadir: str = "../data/Processed/" + dataset

# Images are stored under `<datadir>/item_images/large`.
image_folder: str = os.path.join(datadir, "item_images", "large")

In [3]:
# `item.txt` is a tab-separated table stored directly under `datadir`.
item_df = pd.read_csv(os.path.join(datadir, "item.txt"), sep="\t")
# Preview the first five rows (the default of `head`).
item_df.head()

Unnamed: 0,ITEM,TEXT,IMAGE_URL
0,0,Title: Klutz Metallic Glam Nail Studio Activit...,https://m.media-amazon.com/images/I/51dyKdZMlC...
1,1,Title: Versace Bright Crystal Eau de Toilette ...,https://m.media-amazon.com/images/I/41lnN8CpvE...
2,2,Title: Conair CD82ZCS Instant Heat Curling Iro...,https://m.media-amazon.com/images/I/31N529CJ78...
3,3,Title: Conair CD82ZCS Instant Heat Curling Iro...,https://m.media-amazon.com/images/I/31N529CJ78...
4,4,Title: Refill Cartridges CCR\nFeatures: 1. Cle...,https://m.media-amazon.com/images/I/41KIM5M9xi...


We first download the images from the given URLs. Please check the image size (the `large` sub-folder configured in `image_folder`) before going on.

In [32]:
def download_images(item_df: pd.DataFrame):
    """Download every item's image into `image_folder` as `<item id>.jpg`.

    Downloads run concurrently in a thread pool and are best-effort: a failed
    download does not abort the others, and the number of failures is printed
    once all downloads have finished.

    Parameters
    ----------
    item_df : pd.DataFrame
        Must provide the `ITEM.name` column (item ids) and an `IMAGE_URL`
        column. Rows whose URL is missing (NaN/None) or blank are skipped.
    """
    from concurrent.futures import as_completed

    ids = item_df[ITEM.name]
    urls = item_df['IMAGE_URL']
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(
                download_from_url,
                url=url,
                root=image_folder,
                filename=f"{id_}.jpg",
                log=False
            )
            for id_, url in zip(ids, urls)
            # A missing URL is read by pandas as NaN, which is truthy, so a
            # plain `if url` would still submit it; require a non-blank string.
            if isinstance(url, str) and url.strip()
        ]
        failures = 0
        # Iterate over completed futures so the bar tracks finished downloads
        # (not mere submissions) and exceptions are not silently dropped.
        for future in tqdm.tqdm(
            as_completed(futures), total=len(futures), desc="Download images: "
        ):
            if future.exception() is not None:
                failures += 1
    if failures:
        print(f"{failures} of {len(futures)} image downloads failed.")

In [33]:
# Fetch the image of every item listed in `item_df` into `image_folder`.
download_images(item_df)

Download images: : 30380it [00:32, 937.29it/s] 


Next, we encode the visual modality.

In [6]:
def load_image(idx: int, processor):
    """Load and preprocess the image stored as `<idx>.jpg` in `image_folder`.

    Parameters
    ----------
    idx : int
        Item index; also the stem of the image file name.
    processor
        Callable invoked as `processor(images=..., return_tensors='pt')` whose
        result exposes `'pixel_values'` (e.g. a Hugging Face image processor).

    Returns
    -------
    tuple
        `(idx, has_image, pixel_values)` where `has_image` is False when the
        file is missing or unreadable and a blank 224x224 RGB placeholder was
        processed instead.
    """
    path = os.path.join(image_folder, f"{idx}.jpg")
    try:
        # `convert` returns an independent image, so the file handle can be
        # released as soon as the context manager exits.
        with Image.open(path) as raw:
            image = raw.convert('RGB')
        has_image = True
    except OSError:
        # FileNotFoundError, PIL's UnidentifiedImageError and truncated-file
        # errors are all OSError subclasses: treat any of them as "no image"
        # rather than crashing the data-loading worker on one bad download.
        image = Image.new('RGB', (224, 224))
        has_image = False
    return idx, has_image, processor(images=image, return_tensors='pt')['pixel_values'][0]

def encode_visual_modality(
    item_df: pd.DataFrame,
    model: str, model_dir: str,
    num_workers: int = 4, batch_size: int = 128,
):
    """Encode every item's image with a pretrained vision model.

    Images are read from `image_folder` as `<row position>.jpg`. Items without
    a usable image receive the mean feature of the items that have one. The
    result is saved via `export_pickle` to `<datadir>/visual_<model>.pkl`.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table; only its length is used here.
    model : str
        Name of the checkpoint directory inside `model_dir`.
    model_dir : str
        Directory containing locally cached checkpoints (no download is attempted).
    num_workers : int
        Number of DataLoader workers used for image loading.
    batch_size : int
        Number of images per forward pass.

    Returns
    -------
    torch.Tensor
        CPU tensor of shape (len(item_df), D), ordered by item index.
    """
    import warnings
    from functools import partial

    checkpoint = os.path.join(model_dir, model)
    processor = AutoImageProcessor.from_pretrained(
        checkpoint, local_files_only=True
    )

    _process = partial(load_image, processor=processor)

    datapipe = dp.iter.IterableWrapper(
        range(len(item_df))
    ).sharding_filter().map(
        _process
    )
    dataloader = torch.utils.data.DataLoader(
        datapipe,
        num_workers=num_workers, batch_size=batch_size,
        shuffle=False
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = AutoModel.from_pretrained(
        checkpoint, local_files_only=True
    ).to(device).eval()

    vIndices = []
    vMasks = []
    vFeats = []
    with torch.no_grad():
        for (indices, has_images, images) in tqdm.tqdm(dataloader, desc="Visual batches: "):
            vIndices.append(indices)
            vMasks.append(has_images)
            outputs = encoder(pixel_values=images.to(device)).last_hidden_state
            if outputs.ndim == 3:
                # vit (Batch, Sequence, D): keep the first token
                outputs = outputs[:, 0]
            else:
                # resnet (Batch, D, K, K): average over spatial positions
                outputs = outputs.flatten(2).mean(-1)
            vFeats.append(
                outputs.detach().cpu()
            )
    vIndices = torch.cat(vIndices, dim=0)
    vMasks = torch.cat(vMasks, dim=0)
    vFeats = torch.cat(vFeats, dim=0).flatten(1) # (N, D)

    # Batches may not arrive in index order when several workers are used,
    # so restore the item order once and apply it to both tensors.
    order = vIndices.argsort()
    vFeats = vFeats[order]
    vMasks = vMasks[order]
    assert vFeats.size(0) == len(item_df), "Unknown errors happen ..."

    vMasks = vMasks.to(vFeats.device)
    if vMasks.any():
        # Replace placeholder rows by the mean of the real image features;
        # `torch.where` broadcasts the (N, 1) mask and the (1, D) mean.
        mean = vFeats[vMasks].mean(dim=0, keepdim=True)
        vFeats = torch.where(vMasks.unsqueeze(-1), vFeats, mean)
    else:
        # The mean over an empty selection is NaN and would overwrite every row.
        warnings.warn(
            "No item image could be loaded; visual features are the placeholder encodings."
        )

    export_pickle(
        vFeats, os.path.join(
            datadir, f"visual_{model}.pkl"
        )
    )
    return vFeats

In [7]:
# Encode the item images with the `dino-resnet50` checkpoint found in `model_cache_dir`.
encode_visual_modality(
    item_df,
    model="dino-resnet50",
    model_dir=model_cache_dir
)

Visual batches: : 240it [03:45,  1.06it/s]                       


tensor([[-0.1044, -0.2491, -0.2130,  ..., -0.0333, -0.4862,  0.2023],
        [-0.1021,  0.0347, -0.0537,  ..., -0.0105, -0.0494,  0.0819],
        [-0.0634,  0.2227,  0.0578,  ..., -0.0093,  0.1451,  0.2042],
        ...,
        [-0.1225, -0.0066,  0.0214,  ..., -0.1641, -0.0435,  0.0231],
        [ 0.0448, -0.2399,  0.0342,  ..., -0.1069,  0.0291, -0.0334],
        [-0.0702,  0.3238, -0.1198,  ..., -0.3347,  0.2070, -0.0101]])

In [8]:
def encode_textual_modality(
    item_df: pd.DataFrame,
    model: str, model_dir: str,
    batch_size: int = 128
):
    """Encode the `TEXT` column of `item_df` with a sentence-transformers model.

    The result is saved via `export_pickle` to `<datadir>/textual_<model>.pkl`.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table providing a `TEXT` column; missing texts are encoded as
        the empty string.
    model : str
        Name of the checkpoint directory inside `model_dir`.
    model_dir : str
        Directory containing the cached checkpoints.
    batch_size : int
        Number of sentences per forward pass.

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per item, in the row order of `item_df`.
    """
    # Hand the encoder a plain list of strings: integer indexing on a pandas
    # Series is label-based (so it only works with a default RangeIndex), and
    # missing texts are float NaN rather than str.
    sentences = item_df['TEXT'].fillna('').astype(str).tolist()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = SentenceTransformer(
        os.path.join(model_dir, model),
        device=device
    ).eval()

    with torch.no_grad():
        tFeats = encoder.encode(
            sentences,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
    assert tFeats.size(0) == len(item_df), "Unknown errors happen ..."

    export_pickle(
        tFeats, os.path.join(
            datadir, f"textual_{model}.pkl"
        )
    )
    return tFeats

In [9]:
# Encode the item texts with the `all-MiniLM-L6-v2` checkpoint found in `model_cache_dir`.
encode_textual_modality(
    item_df,
    model="all-MiniLM-L6-v2",
    model_dir=model_cache_dir
)

Batches: 100%|██████████| 238/238 [00:33<00:00,  7.13it/s]


tensor([[-0.0750,  0.0625,  0.0448,  ..., -0.0347, -0.0460,  0.0341],
        [-0.0064, -0.0411,  0.1195,  ..., -0.0291,  0.0006, -0.0331],
        [-0.0876,  0.0186, -0.0125,  ..., -0.0514, -0.0609,  0.0226],
        ...,
        [-0.0577,  0.0095,  0.0716,  ..., -0.0412,  0.0741,  0.0501],
        [-0.0054,  0.0916,  0.0795,  ...,  0.0172,  0.0215,  0.0051],
        [-0.0679,  0.0423,  0.0517,  ..., -0.0599,  0.0046,  0.0219]])

The following code extracts both visual and textual features with CLIP.

In [10]:

def encode_clip_textual_visual_modality(
    item_df: pd.DataFrame,
    img_clip_model: str,
    text_clip_model: str,
    model_dir: str,
    batch_size: int = 128
):
    """Encode item images and texts with a pair of CLIP sentence-transformers models.

    Images are read from `image_folder` as `<row position>.jpg`; items without
    a usable image receive the mean feature of the items that have one. The
    results are saved via `export_pickle` to `<datadir>/visual_<img_clip_model>.pkl`
    and `<datadir>/textual_<text_clip_model>.pkl`.

    Note that all images are held in memory at once before encoding.

    Parameters
    ----------
    item_df : pd.DataFrame
        Item table providing a `TEXT` column; missing texts are encoded as
        the empty string.
    img_clip_model : str
        Checkpoint directory (inside `model_dir`) of the image encoder.
    text_clip_model : str
        Checkpoint directory (inside `model_dir`) of the text encoder.
    model_dir : str
        Directory containing the cached checkpoints.
    batch_size : int
        Number of inputs per forward pass.

    Returns
    -------
    tuple of torch.Tensor
        `(vFeats, tFeats)`, both on CPU with one row per item.
    """
    import warnings

    n_items = len(item_df)
    images = []
    vMasks = torch.ones(n_items, dtype=torch.bool)
    for idx in range(n_items):
        path = os.path.join(image_folder, f"{idx}.jpg")
        try:
            # `convert` returns an independent image, so the file can be closed.
            with Image.open(path) as raw:
                image = raw.convert('RGB')
        except OSError:
            # Covers missing files as well as unreadable/truncated downloads
            # (FileNotFoundError and PIL's UnidentifiedImageError are OSErrors).
            image = Image.new('RGB', (224, 224))
            vMasks[idx] = False
        images.append(image)

    # Plain list of strings: integer indexing on a pandas Series is label-based
    # and missing texts are float NaN rather than str.
    sentences = item_df['TEXT'].fillna('').astype(str).tolist()

    # Use two GPUs when present (as before); otherwise share the single GPU
    # or fall back to CPU instead of failing on a non-existent `cuda:1`.
    n_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if n_gpus >= 2:
        img_device, text_device = 'cuda:0', 'cuda:1'
    elif n_gpus == 1:
        img_device = text_device = 'cuda:0'
    else:
        img_device = text_device = 'cpu'

    img_encoder = SentenceTransformer(
        os.path.join(model_dir, img_clip_model),
        device=torch.device(img_device)
    ).eval()
    text_encoder = SentenceTransformer(
        os.path.join(model_dir, text_clip_model),
        device=torch.device(text_device)
    ).eval()

    with torch.no_grad():
        vFeats = img_encoder.encode(
            images,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
        tFeats = text_encoder.encode(
            sentences,
            convert_to_tensor=True,
            batch_size=batch_size, show_progress_bar=True
        ).cpu()
    assert vFeats.size(0) == len(item_df), "Unknown errors happen ..."
    assert tFeats.size(0) == len(item_df), "Unknown errors happen ..."

    vMasks = vMasks.to(vFeats.device)
    if vMasks.any():
        # Replace placeholder rows by the mean of the real image features;
        # `torch.where` broadcasts the (N, 1) mask and the (1, D) mean.
        mean = vFeats[vMasks].mean(dim=0, keepdim=True)
        vFeats = torch.where(vMasks.unsqueeze(-1), vFeats, mean)
    else:
        # The mean over an empty selection is NaN and would overwrite every row.
        warnings.warn(
            "No item image could be loaded; visual features are the placeholder encodings."
        )

    export_pickle(
        vFeats, os.path.join(
            datadir, f"visual_{img_clip_model}.pkl"
        )
    )

    export_pickle(
        tFeats, os.path.join(
            datadir, f"textual_{text_clip_model}.pkl"
        )
    )
    return vFeats, tFeats

In [11]:
# Encode images with `clip-vit-b-32` and texts with `clip-vit-b-32-multilingual-v1`,
# both loaded from `model_cache_dir`.
encode_clip_textual_visual_modality(
    item_df,
    img_clip_model="clip-vit-b-32",
    text_clip_model="clip-vit-b-32-multilingual-v1",
    model_dir=model_cache_dir
)

Batches: 100%|██████████| 238/238 [04:07<00:00,  1.04s/it]
Batches: 100%|██████████| 238/238 [01:39<00:00,  2.39it/s]


(tensor([[-0.0367, -0.4885,  0.3980,  ..., -0.1649,  0.4623,  0.0621],
         [-0.2082, -0.0440,  0.0904,  ...,  0.3867, -0.0183, -0.0969],
         [-0.0832,  0.3915, -0.0901,  ...,  0.9376,  0.2346,  0.1945],
         ...,
         [-0.1495,  0.3978,  0.2217,  ...,  0.1837, -0.4292,  0.1551],
         [-0.2783,  0.1620,  0.1765,  ...,  0.4337, -0.3054, -0.1246],
         [-0.2874, -0.1172,  0.1805,  ...,  0.1124,  0.0043,  0.2117]]),
 tensor([[-0.0480,  0.1438, -0.0667,  ...,  0.0107,  0.0527, -0.1216],
         [ 0.0211,  0.1272, -0.0593,  ...,  0.0686,  0.0270, -0.0941],
         [-0.0333,  0.1581, -0.0986,  ...,  0.0823,  0.0271,  0.0128],
         ...,
         [ 0.0293,  0.1610, -0.0843,  ..., -0.0547, -0.0385, -0.0440],
         [ 0.1392,  0.0145, -0.0654,  ..., -0.0097,  0.0556, -0.1122],
         [ 0.0427,  0.1206, -0.1094,  ...,  0.0034, -0.0012, -0.0377]]))