In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import os
from pathlib import Path
!pip install peft

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.12.0


In [3]:
text_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'
articles = pd.read_csv(text_path)
print(articles.shape) # 100k data points
articles.head(5)

(105542, 25)


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


# Images and their features (articles.csv)

### This table contains all h&m articles with details such as a type of product, a color, a product group and other features.  
**Article data description:**

- `article_id` : A unique identifier of every article.
- `product_code`, `prod_name` : A unique identifier of every product and its name (not the same).
- `product_type`, `product_type_name` : The group of product_code and its name.
- `product_group_name` : Product Group. Father to product type.
- `graphical_appearance_no`, `graphical_appearance_name` : The group of graphics and its name.
- `colour_group_code`, `colour_group_name` : The group of color and its name.
- `perceived_colour_value_id`, `perceived_colour_value_name`, `perceived_colour_master_id`, `perceived_colour_master_name` : The added color info.
- `department_no`, `department_name` : A unique identifier of every department and its name.
- `index_code`, `index_name` : A unique identifier of every index and its name.
- `index_group_no`, `index_group_name` : A group of indices and its name.
- `section_no`, `section_name` : A unique identifier of every section and its name.
- `garment_group_no`, `garment_group_name` : A unique identifier of every garment and its name.
- `detail_desc` : Details.

In [None]:
for col in articles.columns:
    if not 'no' in col and not 'code' in col and not 'id' in col:
        un_n = articles[col].nunique()
        print(f'n of unique {col}: {un_n}')

In [None]:
f, ax = plt.subplots(figsize=(15, 7))
ax = sns.histplot(data=articles, y='index_name', color='skyblue', ax=ax)
ax.set_xlabel('count by index name')
ax.set_ylabel('index name')
plt.show()

In [None]:
f, ax = plt.subplots(figsize=(15, 7))
ax = sns.histplot(data=articles, y='garment_group_name', color='orange', hue='index_group_name', multiple="stack")
ax.set_xlabel('count by garment group')
ax.set_ylabel('garment group')
plt.show()

In [None]:
articles.groupby(['index_group_name', 'index_name']).count()['article_id']

In [None]:
pd.options.display.max_rows = None
articles.groupby(['product_group_name', 'product_type_name']).count()['article_id']

# LoRA finetune CLIP

In [3]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    CLIPProcessor, 
    CLIPModel, 
    TrainingArguments, 
    Trainer
)

# Load CLIP model and processor on CPU
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)
model

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [None]:
# How process processing images
from PIL import Image
# Download and open an example image
jpg = '/kaggle/input/h-and-m-personalized-fashion-recommendations/images/017/0176754003.jpg'
image = Image.open(jpg)

# Process the image
inputs = processor(images=image, return_tensors="pt")

# Visualize the processing steps
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Original image
axs[0].imshow(image)
axs[0].set_title("Original Image")
axs[0].axis('off')

# Resized and cropped image
resized_image = image.resize((224, 224))
axs[1].imshow(resized_image)
axs[1].set_title("Resized and Cropped")
axs[1].axis('off')

# Normalized image
normalized_image = inputs['pixel_values'][0].permute(1, 2, 0)
axs[2].imshow(normalized_image)
axs[2].set_title("Normalized")
axs[2].axis('off')

plt.tight_layout()
plt.show()

print("Original image shape:", image.size)
print("Processed image shape:", inputs['pixel_values'].shape)
print("Pixel value range:", inputs['pixel_values'].min().item(), "to", inputs['pixel_values'].max().item())

## product_group_name

In [None]:
import os
import pandas as pd
from datasets import Dataset
import itertools

# 图片文本目录路径
images_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/images'
text_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'

# 读取csv文件
# df = pd.read_csv(text_path)
df = articles
df['image_path'] = ''

# 为每个article_id添加对应的图片路径，并将表格保存到/kaggle/working/articles_with_image_path.csv
for i in range(df.shape[0]):
    # 用article_id获取图片路径
    article_id = str(df.iloc[i]['article_id'])
    image_path = images_path + f'/0{article_id[:2]}' + f'/0{article_id}.jpg'
    if not os.path.exists(image_path):
        continue
    df.loc[i, 'image_path'] = image_path
    

df.to_csv('/kaggle/working/articles_with_image_path.csv')

In [4]:
# remove all the columns that their image_path is NAN
df = pd.read_csv('/kaggle/working/articles_with_image_path.csv')
filter_product_group_name = ['Unknown','Underwear/nightwear','Cosmetic','Bags','Items',
    'Furniture','Garment and Shoe care','Stationery','Interior textile','Fun']
format_df = df[df['image_path'].notna() & (df['image_path'] != '') & (~df['product_group_name'].isin(filter_product_group_name))]
print(f'清洗后的df行列数: {format_df.shape}')

# 将df拆分为测试集df和训练集df，并保证product_group_name类别比例保持一致。
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

def stratified_split(df, text_column, test_size=0.2, random_state=42):
    # 为每个唯一的 text 值分配一个类别标签
    df['text_category'] = pd.Categorical(df[text_column]).codes
    
    # 初始化 StratifiedShuffleSplit
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    # 进行分层抽样
    for train_index, test_index in splitter.split(df, df['text_category']):
        train_df = df.iloc[train_index].copy()
        test_df = df.iloc[test_index].copy()
    
    # 删除临时的 'text_category' 列
    train_df.drop('text_category', axis=1, inplace=True)
    test_df.drop('text_category', axis=1, inplace=True)
    
    return train_df, test_df

# 进行分层抽样
train_df, test_df = stratified_split(format_df, text_column='product_group_name', test_size=0.2)

print(f"训练集大小: {len(train_df)}")
print(f"测试集大小: {len(test_df)}")

# 检查每个集合中各类别的比例
def check_proportions(df, column):
    return df[column].value_counts(normalize=True)

print("\n训练集中的类别比例:")
print(check_proportions(train_df, 'product_group_name'))

print("\n测试集中的类别比例:")
print(check_proportions(test_df, 'product_group_name'))
# Unknown之后的占比太小（只占了300个dp不到）感觉可以删除，但如何处理这部分的推理

清洗后的df行列数: (104803, 27)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_category'] = pd.Categorical(df[text_column]).codes


训练集大小: 83842
测试集大小: 20961

训练集中的类别比例:
product_group_name
Garment Upper body    0.407242
Garment Lower body    0.188641
Garment Full body     0.126679
Accessories           0.105007
Underwear             0.052110
Shoes                 0.049200
Swimwear              0.029818
Socks & Tights        0.023198
Nightwear             0.018105
Name: proportion, dtype: float64

测试集中的类别比例:
product_group_name
Garment Upper body    0.407232
Garment Lower body    0.188636
Garment Full body     0.126664
Accessories           0.105005
Underwear             0.052144
Shoes                 0.049187
Swimwear              0.029817
Socks & Tights        0.023186
Nightwear             0.018129
Name: proportion, dtype: float64


In [17]:
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class TextImageDataset(Dataset): # 接受处理后的df构建迭代器数据集
    def __init__(self, dataframe, prompt):
        self.dataframe = dataframe
        self.prompt = prompt
        
    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        text = row['product_group_name']
        text = self.prompt.format(text)
        image = Image.open(row['image_path'])
        print(f"Sample at index {index}: {text}") 
        return {
            'text': text,
            'image': image,
        }


# 创建数据集实例
# df = pd.DataFrame({'text': ['example1', 'example2'], 'image_path': ['path/to/image1.jpg', None]})
prompt = 'A photo of a {}'
train_dataset = TextImageDataset(train_df, prompt)
test_dataset = TextImageDataset(test_df, prompt)

In [28]:
from transformers import DataCollator
class CustomDataCollator(DataCollator):
    def __init__(self):
        pass

    def __call__(self, batch):
        texts = [item['text'] for item in batch]
        images = [item['image'] for item in batch]
        
        # 处理文本
        text_inputs = processor(
            text=texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        
        # 处理图像
        image_inputs = processor(
            images=images,
            return_tensors="pt",
        )
        
        # 合并文本和图像的输入
        inputs = {
            'input_ids': text_inputs['input_ids'],
            'attention_mask': text_inputs['attention_mask'],
            'pixel_values': image_inputs['pixel_values'],
        }
        
        return inputs
    
data_collator = CustomDataCollator()

TypeError: NewType.__init__() takes 3 positional arguments but 4 were given

In [24]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", 'k_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(model, lora_config)
print(peft_model.print_trainable_parameters())

# Define data collator
def collate_fn(batch):
    texts = [item['text'] for item in batch]
    images = [item['image'] for item in batch]
    
    # 处理文本
    text_inputs = processor(
        text=texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    
    # 处理图像
    image_inputs = processor(
        images=images,
        return_tensors="pt",
    )
    
    # 合并文本和图像的输入
    inputs = {
        'input_ids': text_inputs['input_ids'],
        'attention_mask': text_inputs['attention_mask'],
        'pixel_values': image_inputs['pixel_values'],
    }
    
    return inputs

# Set up training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/clip_lora_output",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to='tensorboard'
)

# Initialize Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collate_fn,
)

trainable params: 1,474,560 || all params: 152,751,873 || trainable%: 0.9653
None


In [25]:
# Start training
trainer.train()

# Save the fine-tuned model
peft_model.save_pretrained("./clip_peft_finetuned")

TypeError: 'DataLoader' object is not subscriptable