In [None]:
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations

In [1]:
# unzip h-and-m-personalized-fashion-recommendations.zip
!unzip h-and-m-personalized-fashion-recommendations.zip

Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: articles.csv            
  inflating: customers.csv           
  inflating: images/010/0108775015.jpg  
  inflating: images/010/0108775044.jpg  
  inflating: images/010/0108775051.jpg  
  inflating: images/011/0110065001.jpg  
  inflating: images/011/0110065002.jpg  
  inflating: images/011/0110065011.jpg  
  inflating: images/011/0111565001.jpg  
  inflating: images/011/0111565003.jpg  
  inflating: images/011/0111586001.jpg  
  inflating: images/011/0111593001.jpg  
  inflating: images/011/0111609001.jpg  
  inflating: images/011/0112679048.jpg  
  inflating: images/011/0112679052.jpg  
  inflating: images/011/0114428026.jpg  
  inflating: images/011/0114428030.jpg  
  inflating: images/011/0116379047.jpg  
  inflating: images/011/0118458003.jpg  
  inflating: images/011/0118458004.jpg  
  inflating: images/011/0118458028.jpg  
  inflating: images/011/0118458029.jpg  
  inflating: images/011/0118458034.jpg  
  in

In [None]:
import torch
import skimage.io as io
import clip
from PIL import Image
import pickle
import json
import os
from tqdm import tqdm
import argparse
from fashion_clip.fashion_clip import FashionCLIP
import numpy as np

fclip = FashionCLIP('fashion-clip')


def main(clip_model_type: str):
    device = torch.device('cuda:0')
    out_path = f"./data/coco/oscar_split_{clip_model_type}_train.pkl"
    #clip_model_name = clip_model_type.replace('/', '_')
    #clip_model, preprocess = clip.load(clip_model_type, device=device, jit=False)
    with open('./data/coco/annotations/train_caption.json', 'r') as f:
        data = json.load(f)
    print("%0d captions loaded from json " % len(data))
    all_embeddings = []
    all_captions = []
    for i in tqdm(range(len(data))):
        d = data[i]
        img_id = d["image_id"]
        filename = f"./data/coco/train2014/COCO_train2014_{int(img_id):012d}.jpg"
        if not os.path.isfile(filename):
            filename = f"./data/coco/val2014/COCO_val2014_{int(img_id):012d}.jpg"
        pil_image = io.imread(filename)
        image_embeddings = fclip.encode_images([pil_image], batch_size=1)
        image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
        prefix = torch.tensor(image_embeddings).to(device)
        #image = preprocess(Image.fromarray(image)).unsqueeze(0).to(device)
        #with torch.no_grad():
        #    prefix = clip_model.encode_image(image).cpu()
        d["clip_embedding"] = i
        all_embeddings.append(prefix)
        all_captions.append(d)
        if (i + 1) % 10000 == 0:
            with open(out_path, 'wb') as f:
                pickle.dump({"clip_embedding": torch.cat(all_embeddings, dim=0), "captions": all_captions}, f)

    with open(out_path, 'wb') as f:
        pickle.dump({"clip_embedding": torch.cat(all_embeddings, dim=0), "captions": all_captions}, f)

    print('Done')
    print("%0d embeddings saved " % len(all_embeddings))
    return 0


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--clip_model_type', default="fashion", choices=('RN50', 'RN101', 'RN50x4', 'ViT-B/32', 'fashion'))
    args = parser.parse_args()
    exit(main(args.clip_model_type))


In [3]:
# read articles.csv
import pandas as pd
from collections import Counter
articles = pd.read_csv("./articles.csv")
# drop items that have the same description
subset = articles.drop_duplicates("detail_desc").copy()

# remove items of unkown category
subset = subset[~subset["product_group_name"].isin(["Unknown"])]

# FashionCLIP has a limit of 77 tokens, let's play it safe and drop things with more than 40 tokens
subset = subset[subset["detail_desc"].apply(lambda x : 4 < len(str(x).split()))]

# We also drop products types that do not occur very frequently in this subset of data
most_frequent_product_types = [k for k, v in dict(Counter(subset["product_type_name"].tolist())).items() if v > 10]
subset = subset[subset["product_type_name"].isin(most_frequent_product_types)]

In [5]:
subset.head(3)
subset.to_csv("subset_data.csv", index=False)
f"There are {len(subset)} elements in the dataset"

'There are 37811 elements in the dataset'

In [8]:
# read subset_data.csv
import pandas as pd
from PIL import Image
subset = pd.read_csv("subset_data.csv")
# iterate across rows
for i, row in subset.iterrows():
    if i > 5:
        break
    print(row["article_id"])
    print(row["detail_desc"])
    # get the image from Images folder in format LLM/Images/040/0400246010.jpg
    img_path = f"./Images/0{str(row['article_id'])[:2]}/0{str(row['article_id'])}.jpg"
    print(img_path)
    # get the image from the path
    img = Image.open(img_path)
    img.show()

108775015
Jersey top with narrow shoulder straps.
./Images/010/0108775015.jpg
110065001
Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.
./Images/011/0110065001.jpg
111565001
Semi shiny nylon stockings with a wide, reinforced trim at the top. Use with a suspender belt. 20 denier.
./Images/011/0111565001.jpg
111586001
Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.
./Images/011/0111586001.jpg
111593001
Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.
./Images/011/0111593001.jpg
111609001
Opaque matt tights. 200 denier.
./Images/011/0111609001.jpg
