In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import os
from pathlib import Path
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn.functional as F
import torch
!pip install peft



# Features table of product items

### This table contains all h&m articles with details such as a type of product, a color, a product group and other features.  
**Article data description:**

- `article_id` : A unique identifier of every article.
- `product_code`, `prod_name` : A unique identifier of every product and its name (not the same).
- `product_type`, `product_type_name` : The group of product_code and its name.
- `product_group_name` : Product Group. Father to product type.
- `graphical_appearance_no`, `graphical_appearance_name` : The group of graphics and its name.
- `colour_group_code`, `colour_group_name` : The group of color and its name.
- `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` : The added color info.
- `department_no`, `department_name` : A unique identifier of every department and its name.
- `index_code`, `index_name` : A unique identifier of every index and its name.
- `index_group_no`, `index_group_name` : A group of indices and its name.
- `section_no`, `section_name` : A unique identifier of every section and its name.
- `garment_group_no`, `garment_group_name` : A unique identifier of every garment and its name.
- `detail_desc` : Details.

In [None]:
text_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'
articles = pd.read_csv(text_path)
print(articles.shape) # 100k data points
articles.head(5)

# LoRA finetune CLIP (single column)

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    CLIPProcessor, 
    CLIPModel, 
    TrainingArguments, 
    Trainer
)

# Load CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
model

## Dataset on product_group_name 

In [None]:
import os
import pandas as pd
from datasets import Dataset
import itertools

# 图片文本目录路径
images_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/images'
text_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'

# 读取csv文件
# df = pd.read_csv(text_path)
df = articles
df['image_path'] = ''

# 为每个article_id添加对应的图片路径，并将表格保存到/kaggle/working/articles_with_image_path.csv
for i in range(df.shape[0]):
    # 用article_id获取图片路径
    article_id = str(df.iloc[i]['article_id'])
    image_path = images_path + f'/0{article_id[:2]}' + f'/0{article_id}.jpg'
    if not os.path.exists(image_path):
        continue
    df.loc[i, 'image_path'] = image_path
    

df.to_csv('/kaggle/working/articles_with_image_path.csv')

In [None]:
# remove all the columns that their image_path is NAN
df = pd.read_csv('/kaggle/working/articles_with_image_path.csv')
filter_product_group_name = ['Unknown','Underwear/nightwear','Cosmetic','Bags','Items',
    'Furniture','Garment and Shoe care','Stationery','Interior textile','Fun']
format_df = df[df['image_path'].notna() & (df['image_path'] != '') & (~df['product_group_name'].isin(filter_product_group_name))]
print(f'清洗后的df行列数: {format_df.shape}')

# 将df拆分为测试集df和训练集df，并保证product_group_name类别比例保持一致。
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

def stratified_split(df, text_column, test_size=0.2, random_state=42):
    # 为每个唯一的 text 值分配一个类别标签
    df['text_category'] = pd.Categorical(df[text_column]).codes
    
    # 初始化 StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    # 进行分层抽样
    for train_index, test_index in splitter.split(df, df['text_category']):
        train_df = df.iloc[train_index].copy()
        test_df = df.iloc[test_index].copy()
    
    # 删除临时的 'text_category' 列
    train_df.drop('text_category', axis=1, inplace=True)
    test_df.drop('text_category', axis=1, inplace=True)
    
    return train_df, test_df

# 进行分层抽样
train_df, val_df = stratified_split(format_df, text_column='product_group_name', test_size=0.2)

print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(val_df)}")

# 检查每个集合中各类别的比例
def check_proportions(df, column):
    return df[column].value_counts(normalize=True)

print("\n训练集中的类别比例:")
print(check_proportions(train_df, 'product_group_name'))

print("\n验证集中的类别比例:")
print(check_proportions(val_df, 'product_group_name'))
# Unknown之后的占比太小（只占了300个dp不到）感觉可以删除，但如何处理这部分的推理

In [None]:
# 数据集构造
class TextImageDataset(Dataset): # 接受处理后的df构建迭代器数据集
    def __init__(self, dataframe, prompt):
        self.dataframe = dataframe
        self.prompt = prompt
        
    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        text = row['product_group_name']
        text = self.prompt.format(text)
        image = Image.open(row['image_path'])

        return {
            'text': text,
            'image': image,
        }


# 创建数据集实例
prompt = 'A photo of a {}'
train_dataset = TextImageDataset(train_df, prompt)
val_dataset = TextImageDataset(val_df, prompt)

# Multi labels training

## Model loading and lora config

In [2]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    CLIPProcessor, 
    CLIPModel, 
    TrainingArguments, 
    Trainer
)

# Load CLIP model and processor
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", 'k_proj'],
    lora_dropout=0.05,
    bias="none",
#     task_type="classification"
)

peft_model = get_peft_model(model, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)
print(peft_model.print_trainable_parameters())



trainable params: 1,474,560 || all params: 152,751,873 || trainable%: 0.9653
None


## Data processing and Dataset 

In [3]:
df = pd.read_csv('/kaggle/working/articles_with_image_path.csv')
df = df[df['image_path'].notna() & (df['image_path'] != '')]
target_columns = ["product_group_name", "product_type_name", "graphical_appearance_name", "colour_group_name", "perceived_colour_value_name", "perceived_colour_master_name", "department_name", "index_name", "index_group_name", "section_name", "garment_group_name"]
# 11 target col
print(df.shape)
df.head()

(105100, 27)


Unnamed: 0.1,Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,image_path
0,0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,/kaggle/input/h-and-m-personalized-fashion-rec...
1,1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,/kaggle/input/h-and-m-personalized-fashion-rec...
2,2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,/kaggle/input/h-and-m-personalized-fashion-rec...
3,3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",/kaggle/input/h-and-m-personalized-fashion-rec...
4,4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",/kaggle/input/h-and-m-personalized-fashion-rec...


In [4]:
def split_dataset(df, val_ratio=0.2, seed=42):
    """
    将DataFrame划分为训练集和验证集
    
    Args:
        df: 输入的DataFrame
        val_ratio: 验证集占比，默认0.2
        seed: 随机种子，用于复现结果
        
    Returns:
        train_df: 训练集DataFrame
        val_df: 验证集DataFrame
    """
    # 设置随机种子
    np.random.seed(seed)
    
    # 获取总行数
    n_total = len(df)
    
    # 计算验证集大小
    n_val = int(n_total * val_ratio)
    
    # 生成随机索引
    indices = np.random.permutation(n_total)
    val_indices = indices[:n_val]
    train_indices = indices[n_val:]
    
    # 划分数据集
    val_df = df.iloc[val_indices].copy().reset_index(drop=True)
    train_df = df.iloc[train_indices].copy().reset_index(drop=True)
    
    print(f"总数据量: {n_total}")
    print(f"训练集大小: {len(train_df)} ({1-val_ratio:.1%})")
    print(f"验证集大小: {len(val_df)} ({val_ratio:.1%})")
    
    return train_df, val_df

train_df, val_df = split_dataset(df, val_ratio=0.2)

columns_unique_labels = {}
for t_col in target_columns:
    columns_unique_labels[t_col] = list(df[t_col].unique())
for k,v in columns_unique_labels.items():
    print(f'\nClass {k} has {len(v)} unique labels:')
    print(v)

总数据量: 105100
训练集大小: 84080 (80.0%)
验证集大小: 21020 (20.0%)

Class product_group_name has 19 unique labels:
['Garment Upper body', 'Underwear', 'Socks & Tights', 'Garment Lower body', 'Accessories', 'Items', 'Nightwear', 'Unknown', 'Underwear/nightwear', 'Shoes', 'Swimwear', 'Garment Full body', 'Cosmetic', 'Interior textile', 'Bags', 'Furniture', 'Garment and Shoe care', 'Fun', 'Stationery']

Class product_type_name has 131 unique labels:
['Vest top', 'Bra', 'Underwear Tights', 'Socks', 'Leggings/Tights', 'Sweater', 'Top', 'Trousers', 'Hair clip', 'Umbrella', 'Pyjama jumpsuit/playsuit', 'Bodysuit', 'Hair string', 'Unknown', 'Hoodie', 'Sleep Bag', 'Hair/alice band', 'Belt', 'Boots', 'Bikini top', 'Swimwear bottom', 'Underwear bottom', 'Swimsuit', 'Skirt', 'T-shirt', 'Dress', 'Hat/beanie', 'Kids Underwear top', 'Shorts', 'Shirt', 'Cap/peaked', 'Pyjama set', 'Sneakers', 'Sunglasses', 'Cardigan', 'Gloves', 'Earring', 'Bag', 'Blazer', 'Other shoe', 'Jumpsuit/Playsuit', 'Sandals', 'Jacket', 'Cos

In [5]:
class TrainDataset(Dataset):
    def __init__(self, dataframe, target_columns):
        self.dataframe = dataframe
        self.target_columns = target_columns
        self.prompt = 'The {column} of photo is {label}' # temporary, need to be changed
            
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        image = Image.open(row['image_path'])
        labels = []
        for col in self.target_columns:
            labels.append(row[col].strip())
        
        text_list = [self.prompt.format(column=' '.join(c.split('_')), label=l) for c, l in zip(self.target_columns, labels)]
        text = '. '.join(text_list)
        # select the labels from k columns to create the prompt
        return {
            'image': image,
            'text': text,
        }
    
# Define data collator for training dataset
def collate_fn(batch):
#     print(batch)
    texts = [item['text'] for item in batch]
    images = [item['image'] for item in batch]
    
    # 处理文本
    text_inputs = processor(
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
#     print(text_inputs['input_ids'].shape)
    # 处理图像
    image_inputs = processor(
        images=images,
        return_tensors="pt",
    )
    
    # 合并文本和图像的输入
    inputs = {
        'input_ids': text_inputs['input_ids'],
        'attention_mask': text_inputs['attention_mask'],
        'pixel_values': image_inputs['pixel_values'],
    }
    
    return inputs

# 重复构造train_ds再连接起来
from torch.utils.data import Dataset, ConcatDataset

t_columns_list = [
    ["product_group_name", "product_type_name", "graphical_appearance_name"],
    ["colour_group_name", "perceived_colour_value_name", "perceived_colour_master_name"],
    ["department_name", "index_name", "index_group_name"],
    ["section_name", "garment_group_name"],
] 
train_ds_list = []
for t_cols in t_columns_list:
    train_ds = TrainDataset(train_df, t_cols)
    train_ds_list.append(train_ds)

train_ds = ConcatDataset(train_ds_list)
print(len(train_ds))
for i in range(0, 2):
    print(train_ds[i])

336320
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1166x1750 at 0x795F0F6CB0D0>, 'text': 'The product group name of photo is Garment Full body. The product type name of photo is Dress. The graphical appearance name of photo is Check'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1470x1750 at 0x795F0F6CBCA0>, 'text': 'The product group name of photo is Garment Upper body. The product type name of photo is Hoodie. The graphical appearance name of photo is Solid'}


In [6]:
class ValidateDataset(Dataset):
    def __init__(self, dataframe, target_columns):
        self.dataframe = dataframe
        self.target_columns = target_columns

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        image = Image.open(row['image_path'])
        dp = {'image': image}
        for col in self.target_columns:
            dp[col] = row[col]
        return dp

def collate_fn_val(batch):
    batch_list = {}
    for item in batch:
        for k,v in item.items():
            if k not in batch_list:
                batch_list[k] = [v]
            else:
                batch_list[k].append(v)
    return batch_list
    
val_ds = ValidateDataset(val_df, target_columns)
for i in range(2):
    print(val_ds[i])

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=722x1750 at 0x795F161AEEC0>, 'product_group_name': 'Garment Lower body', 'product_type_name': 'Trousers', 'graphical_appearance_name': 'Denim', 'colour_group_name': 'Blue', 'perceived_colour_value_name': 'Medium Dusty', 'perceived_colour_master_name': 'Blue', 'department_name': 'Young Boy Denim', 'index_name': 'Children Sizes 134-170', 'index_group_name': 'Baby/Children', 'section_name': 'Young Boy', 'garment_group_name': 'Trousers Denim'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1414x1750 at 0x795F0F6CB970>, 'product_group_name': 'Garment Upper body', 'product_type_name': 'Blazer', 'graphical_appearance_name': 'Solid', 'colour_group_name': 'Black', 'perceived_colour_value_name': 'Dark', 'perceived_colour_master_name': 'Black', 'department_name': 'Suit', 'index_name': 'Ladieswear', 'index_group_name': 'Ladieswear', 'section_name': 'Womens Tailoring', 'garment_group_name': 'Dressed'}


## Inference

In [7]:
# 需要推理的目标column name和每个column的unique label
def single_column_infer_fn(model, images, column, unique_labels):
    '''
    input: 
        model: CLIP model
        processor: CLIP processor (global).
        images: list of images
        column: the specific column name
        unique_labels: all possible unique labels in this column
    output: 
        probs: classification probabilities
    '''
    infer_prompt = 'The {c} of photo is {label}' # temporary, need to be changed
    col = ' '.join(column.split('_')) # replaced _ by space
    unique_prompts = [infer_prompt.format(c=col, label=l) for l in unique_labels]
#     print(unique_prompts)
    device = model.device
    inputs = processor(
        text=unique_prompts,
        images=images,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    pixel_values = inputs['pixel_values'].to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask if attention_mask is not None else None,
            pixel_values=pixel_values
        )
        logits_per_image = outputs.logits_per_image
    return logits_per_image

In [8]:
def compute_loss_and_eval(model, val_dl, columns_unique_labels):
    '''
    columns_unique_labels: {'column_name1': ['unique_label_name'], 'column_name2': ['unique_label_name']}.
    val_dl: 'image', 'column1', 'column2' ...
    '''
    device = model.device
    print(f'{len(columns_unique_labels)} column need to be evaluated')
    acc_loss_dict = {}
    for column,unique_labels in columns_unique_labels.items():
        print(f'\nEvaluation on column: {column}')
        all_preds = []
        all_labels = []
        single_column_loss = 0
        for batch in tqdm(val_dl):
            logits_per_image = single_column_infer_fn(model, batch['image'], column, unique_labels)
            
            probs = F.softmax(logits_per_image, dim=-1)
            preds = [unique_labels[idx] for idx in probs.argmax(dim=-1).cpu().numpy()]
            labels = batch[column]
#             is necessary to compute loss?
            # 将字符串标签转换为索引
            label_indices = torch.tensor([unique_labels.index(label) for label in labels], 
                                       device=device)
            # 计算交叉熵损失
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits_per_image, label_indices)
#             print(len(batch[column]))
            single_column_loss += loss.item() * len(batch[column])
            all_preds.extend(preds)
            all_labels.extend(labels)
        accuracy = sum(p == l for p, l in zip(all_preds, all_labels)) / len(all_labels)
        single_column_loss = single_column_loss / len(all_labels)
        acc_loss_dict[column] = (accuracy, single_column_loss)
        print(f'Model accuracy on {column}: {accuracy}')
        print(f'Loss on {column}: {single_column_loss}')
    return acc_loss_dict
    

In [None]:
# original model evaluation on validate dataset
val_dl = DataLoader(val_ds, batch_size=256, collate_fn=collate_fn_val)
tmp = compute_loss_and_eval(model, val_dl, columns_unique_labels)

11 column need to be evaluated

Evaluation on column: product_group_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on product_group_name: 0.5303044719314938
Loss on product_group_name: 1.444383701452406

Evaluation on column: product_type_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on product_type_name: 0.3889153187440533
Loss on product_type_name: 2.223591276081713

Evaluation on column: graphical_appearance_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on graphical_appearance_name: 0.07117031398667935
Loss on graphical_appearance_name: 4.683136934558059

Evaluation on column: colour_group_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on colour_group_name: 0.3423882017126546
Loss on colour_group_name: 2.419921142821307

Evaluation on column: perceived_colour_value_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on perceived_colour_value_name: 0.19757373929590866
Loss on perceived_colour_value_name: 2.039835844076213

Evaluation on column: perceived_colour_master_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on perceived_colour_master_name: 0.49700285442435777
Loss on perceived_colour_master_name: 1.8545446902655285

Evaluation on column: department_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on department_name: 0.06745956232159847
Loss on department_name: 5.418500460997635

Evaluation on column: index_name


  0%|          | 0/83 [00:00<?, ?it/s]

Model accuracy on index_name: 0.32331113225499525
Loss on index_name: 2.577958757326106

Evaluation on column: index_group_name


  0%|          | 0/83 [00:00<?, ?it/s]

## Training loop func

In [None]:
import wandb

def train(
    model=None, 
    train_ds=None, 
    val_ds=None, 
    n_epochs=3, 
    batch_size=128, 
    lr=1e-5, 
    infer_fn=None,
    project_name="H&M",  # wandb project name
    run_name='10.09',  # optional wandb run name
):
    # Initialize wandb
    wandb.init(
        project=project_name,
        name=run_name,
        config={
            "learning_rate": lr,
            "epochs": n_epochs,
            "batch_size": batch_size
        }
    )

    train_dl = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_fn_val)
    optimizer = AdamW(model.parameters(), lr=lr)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    global_step = 0
    print('Training Begin...')
    
    for epoch in range(n_epochs):
        model.train()
        running_loss = 0.0    
        for batch in tqdm(train_dl):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            
            outputs = model(input_ids=input_ids, pixel_values=pixel_values)
            logits_per_image = outputs.logits_per_image
            logits_per_text = outputs.logits_per_text
            
            labels = torch.arange(logits_per_image.size(0)).to(device)
            loss_img = F.cross_entropy(logits_per_image, labels)
            loss_txt = F.cross_entropy(logits_per_text, labels)
            loss = (loss_img + loss_txt) / 2
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            global_step += 1
            
            # Log training loss every 100 steps
            if global_step % 100 == 0:
                wandb.log({
                    "train/loss": running_loss / 100,
                    "train/step": global_step
                })
                running_loss = 0.0
        
        model.eval()
        acc_loss_dict = compute_loss_and_eval(model, val_dl, columns_unique_labels)
        
        # Log validation metrics
        log_dict = {
            f"val/epoch": epoch,
            f"val/loss": loss.item()
        }
        
        # Log individual column metrics
        for column, (accuracy, single_column_loss) in acc_loss_dict.items():
            log_dict.update({
                f"val/loss_{column}": single_column_loss,
                f"val/accuracy_{column}": accuracy
            })
        
        wandb.log(log_dict)
        
        print(f"Epoch {epoch}, Loss: {loss.item()}")
        print(f"Evaluation results on val_dl:\n {acc_loss_dict}")
    
    # Close wandb run
    wandb.finish()

## Ready to Train

In [None]:
train_arg = {
    'model': peft_model,
    'train_ds': train_ds,
    'val_ds': val_ds,
    'n_epochs': 3,
    'batch_size': 256,
    'lr': 1e-5,
    'infer_fn': None,
}
torch.cuda.empty_cache()
train(**train_arg)