In [1]:
!pip install peft
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
from peft import LoraConfig, get_peft_model
import wandb

Collecting peft
  Downloading peft-0.13.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.1-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.1


In [2]:
def get_image_paths_and_labels_from_df(df, data_dir):
    """Collect the image files that exist on disk for the articles in ``df``.

    Args:
        df: DataFrame with an ``article_id`` column.
        data_dir: Dataset root. Images are looked up at
            ``{data_dir}/images/0{first two characters of the id}/0{id}.jpg``.

    Returns:
        A ``(image_paths, labels)`` tuple of equally long lists. ``labels[i]``
        is the sub-DataFrame of ``df`` whose ``article_id`` equals the article
        behind ``image_paths[i]``. Articles without an image file are skipped.
    """
    image_paths = []
    labels = []

    # Resolve the row positions of every article_id in a single pass instead
    # of re-scanning the whole frame with a boolean mask for each article,
    # which is quadratic in the number of rows.
    rows_by_article_id = df.groupby("article_id", sort=False).indices

    for article_id in df["article_id"].values:
        image_path = f"{data_dir}/images/0{str(article_id)[:2]}/0{article_id}.jpg"
        # Only articles whose image file is present contribute a path/label pair.
        if not os.path.exists(image_path):
            continue
        image_paths.append(image_path)
        labels.append(df.iloc[rows_by_article_id[article_id]])

    return image_paths, labels

class ImageDataset(torch.utils.data.Dataset):
    """Dataset of image files addressed by path.

    Every path is checked for existence at construction time, and the numeric
    file stem (the part of the file name before the first ``.``) is stored as
    the image id.

    Args:
        image_paths: Sequence of ``/``-separated image file paths.
        processor: Optional callable invoked as
            ``processor(images=..., return_tensors="pt", padding=True)``.

    Raises:
        FileNotFoundError: If any of the given paths does not exist.
    """

    def __init__(self, image_paths, processor=None):
        self.image_paths = image_paths
        self.processor = processor
        self.image_ids = []

        for path in self.image_paths:
            if not os.path.exists(path):
                raise FileNotFoundError(f"Image {path} not found.")
            file_name = path.split("/")[-1]
            stem = file_name.split(".")[0]
            self.image_ids.append(int(stem))

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, image_id)`` for position ``idx``.

        Without a processor the opened PIL image is returned as-is; with one,
        the first entry of the processor's ``pixel_values`` is returned.
        """
        image = Image.open(self.image_paths[idx])
        if self.processor is None:
            return image, self.image_ids[idx]
        processed = self.processor(images=image, return_tensors="pt", padding=True)
        return processed["pixel_values"][0], self.image_ids[idx]

In [3]:
# Fix the seed of PyTorch's global random number generator.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x78f2d2d98310>

In [4]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Model weights and preprocessing config come from the same checkpoint and are
# cached under ./model.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint, cache_dir="model", local_files_only=False)
processor = CLIPProcessor.from_pretrained(clip_checkpoint, cache_dir="model", local_files_only=False)

model = model.to(device)

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [5]:
# Dataset root (also used to locate the image files) and the article metadata table.
data_dir = '/kaggle/input/h-and-m-personalized-fashion-recommendations'
text_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'
articles = pd.read_csv(filepath_or_buffer=text_path)

In [6]:
# Positional row index of every article, keyed by its article_id.
article_id_to_idx = dict(zip(articles["article_id"], range(len(articles))))

# Every column of the table is treated as a candidate "class". For each one,
# keep its unique values (in order of first appearance) and the inverse
# value -> index mapping.
class_names = list(articles.columns)
label_names = {}
label_names_to_idx = {}
for class_name in class_names:
    unique_values = articles[class_name].unique()
    label_names[class_name] = unique_values
    label_names_to_idx[class_name] = dict(zip(unique_values, range(len(unique_values))))

article_ids = label_names["article_id"]

# Attributes the model is trained to predict.
#selected_class_names = ["product_type_name", "graphical_appearance_name"]
selected_class_names = [
    "product_group_name",
    "product_type_name",
    "graphical_appearance_name",
    "colour_group_name",
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "department_name",
    "index_name",
    "index_group_name",
    "section_name",
    "garment_group_name",
]

In [7]:
# Split on product_code groups rather than on single rows, so rows sharing a
# product_code always land in the same split.
grouped = articles.groupby("product_code")
groups = [member_rows for _product_code, member_rows in grouped]

# 80% of the groups go to training; the remaining 20% is halved into
# validation and test.
train_groups, holdout_groups = train_test_split(groups, test_size=0.2, random_state=42)
val_groups, test_groups = train_test_split(holdout_groups, test_size=0.5, random_state=42)

train_df = pd.concat(train_groups)
val_df = pd.concat(val_groups)
test_df = pd.concat(test_groups)

print(f"{len(train_df)=} {len(val_df)=} {len(test_df)=}")

len(train_df)=84445 len(val_df)=10534 len(test_df)=10563


In [8]:
# Resolve image paths (and the matching label rows) for each split, in
# train / val / test order.
(train_paths, train_labels), (val_paths, val_labels), (test_paths, test_labels) = (
    get_image_paths_and_labels_from_df(split_df, data_dir)
    for split_df in (train_df, val_df, test_df)
)

In [9]:
class MultiOutputLayer(torch.nn.Module):
    """Two-layer MLP head: Linear -> SiLU -> Dropout -> Linear.

    Args:
        input_size: Number of input features.
        inter_size: Width of the hidden layer.
        output_size: Number of output features.
        dropout: Dropout probability applied after the activation. Defaults to
            0.5, the value that used to be hard-coded, so existing callers are
            unaffected.
    """

    def __init__(self, input_size, inter_size, output_size, dropout=0.5):
        super().__init__()
        # Attribute names (fc1, fc2) are part of the state_dict keys of saved
        # checkpoints, so they are kept as they are.
        self.fc1 = torch.nn.Linear(input_size, inter_size)
        self.fc2 = torch.nn.Linear(inter_size, output_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.act = torch.nn.SiLU()

    def forward(self, x):
        """Map ``x`` (last dimension ``input_size``) to ``output_size`` features."""
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [10]:
class MultiOutputClipModel(torch.nn.Module):
    """CLIP wrapper with one image-side projection head per class name.

    For every class name the second element of the vision model's output is
    passed through its own ``MultiOutputLayer`` and compared (matrix product)
    with the projected text features of that class's prompts.

    Args:
        clip_model: Object exposing ``vision_model``, ``text_model``,
            ``text_projection`` and ``config``.
        class_names: Names of the attributes; one head is created per name.
        vision_hidden_size: Input size of every head.
        inter_size: Hidden size of every head.
        output_size: Output size of every head; it has to match the output
            width of ``text_projection`` for the matrix product to be valid.
    """

    def __init__(self, clip_model, class_names, vision_hidden_size, inter_size, output_size):
        super().__init__()
        self.clip_model = clip_model
        self.class_names = class_names

        heads = {}
        for class_name in self.class_names:
            heads[class_name] = MultiOutputLayer(vision_hidden_size, inter_size, output_size)
        self.output_layers = torch.nn.ModuleDict(heads)

    def forward(
        self,
        text_input_dict,
        pixel_values,
        # position_ids = None,
        output_attentions = None,
        output_hidden_states = None,
        return_dict = None,
    ):
        """Return a dict mapping each class name to its image-vs-prompt logits.

        Args:
            text_input_dict: Maps each class name to a mapping that provides
                ``"input_ids"`` and ``"attention_mask"`` for that class's prompts.
            pixel_values: Image batch forwarded to the vision model.
            output_attentions, output_hidden_states, return_dict: Forwarded to
                the vision and text models; ``None`` falls back to the
                corresponding value in ``clip_model.config``.
        """
        # Unset flags fall back to the wrapped model's configuration.
        if output_attentions is None:
            output_attentions = self.clip_model.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.clip_model.config.output_hidden_states
        if return_dict is None:
            return_dict = self.clip_model.config.use_return_dict

        vision_outputs = self.clip_model.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Element 1 of the vision output -- presumably the pooled image
        # representation; confirm against the CLIP vision model docs.
        image_features = vision_outputs[1]

        # Phase 1: one image embedding per class head.
        vision_embeds_dict = {}
        for class_name, head in self.output_layers.items():
            vision_embeds_dict[class_name] = head(image_features)

        # Phase 2: encode the prompts of every class with the text model.
        text_outputs_dict = {}
        for class_name in self.class_names:
            class_tokens = text_input_dict[class_name]
            text_outputs_dict[class_name] = self.clip_model.text_model(
                input_ids=class_tokens["input_ids"],
                attention_mask=class_tokens["attention_mask"],
                # position_ids=position_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        # Phase 3: project element 1 of each text output.
        text_embeds_dict = {}
        for class_name, text_outputs in text_outputs_dict.items():
            text_embeds_dict[class_name] = self.clip_model.text_projection(text_outputs[1])

        # Phase 4: image embeddings times transposed prompt embeddings.
        logits_per_image_dict = {}
        for class_name in self.output_layers.keys():
            image_embeds = vision_embeds_dict[class_name]
            prompt_embeds = text_embeds_dict[class_name]
            logits_per_image_dict[class_name] = image_embeds @ prompt_embeds.T

        return logits_per_image_dict

In [11]:
# Custom criterion: cross-entropy summed over all classes.
class MultiOutputClipCriterion(torch.nn.Module):
    """Sum of one ``CrossEntropyLoss`` term per class name.

    Args:
        class_names: Keys to read from the logits and labels dicts.
    """

    def __init__(self, class_names):
        super().__init__()
        self.class_names = class_names
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, logits_dict, labels_dict):
        """Return the summed loss; the integer ``0`` when there are no class names."""
        per_class_losses = (
            self.criterion(logits_dict[name], labels_dict[name])
            for name in self.class_names
        )
        return sum(per_class_losses)

In [13]:
# Wrap each split's image paths in a dataset that applies the CLIP processor.
train_dataset = ImageDataset(image_paths=train_paths, processor=processor)
val_dataset = ImageDataset(image_paths=val_paths, processor=processor)
test_dataset = ImageDataset(image_paths=test_paths, processor=processor)

# Only the training loader shuffles; the evaluation loaders keep file order and
# use a larger batch size.
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, shuffle=True, batch_size=128)
val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, shuffle=False, batch_size=256)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, shuffle=False, batch_size=256)

In [14]:
# Manual freezing of the base model is left disabled; the trainable-parameter
# listing printed further down shows LoRA weights for the wrapped CLIP model.
# for param in model.parameters():
#     param.requires_grad = False

# LoRA adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=8,                    # rank of the low-rank update matrices (adjustable)
    lora_alpha=32,          # scaling factor (adjustable)
    target_modules=["q_proj", "v_proj", "k_proj"],  # which layers receive LoRA adapters
    lora_dropout=0.05,      # dropout applied inside the adapters
    bias="none",            # which biases to train ("none", "all", "lora_only")
    # task_type is deliberately not set. "classification" is not a member of
    # peft.TaskType (valid values include SEQ_CLS, TOKEN_CLS,
    # FEATURE_EXTRACTION, ...), and PEFT releases that validate this field
    # raise ValueError for unknown values. Without a task type,
    # get_peft_model returns the generic PeftModel wrapper, which is what the
    # printed model repr shows.
)

# Apply LoRA to the CLIP model.
# NOTE: re-running this cell wraps the already wrapped model again; reload the
# base model first when re-executing.
model = get_peft_model(model, lora_config)

In [15]:
# One head per selected class on top of the LoRA-wrapped CLIP model:
# 768 input features -> 128 hidden units -> 512 output features per head.
mo_model = MultiOutputClipModel(
    clip_model=model,
    class_names=selected_class_names,
    vision_hidden_size=768,
    inter_size=128,
    output_size=512,
)
mo_model = mo_model.to(device)
mo_model.train()

MultiOutputClipModel(
  (clip_model): PeftModel(
    (base_model): LoraModel(
      (model): CLIPModel(
        (text_model): CLIPTextTransformer(
          (embeddings): CLIPTextEmbeddings(
            (token_embedding): Embedding(49408, 512)
            (position_embedding): Embedding(77, 512)
          )
          (encoder): CLIPEncoder(
            (layers): ModuleList(
              (0-11): 12 x CLIPEncoderLayer(
                (self_attn): CLIPSdpaAttention(
                  (k_proj): lora.Linear(
                    (base_layer): Linear(in_features=512, out_features=512, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=512, bias

In [16]:
# Print the name of every parameter in mo_model that still requires gradients.
for param_name, parameter in mo_model.named_parameters():
    if not parameter.requires_grad:
        continue
    print(param_name)

clip_model.base_model.model.text_model.encoder.layers.0.self_attn.k_proj.lora_A.default.weight
clip_model.base_model.model.text_model.encoder.layers.0.self_attn.k_proj.lora_B.default.weight
clip_model.base_model.model.text_model.encoder.layers.0.self_attn.v_proj.lora_A.default.weight
clip_model.base_model.model.text_model.encoder.layers.0.self_attn.v_proj.lora_B.default.weight
clip_model.base_model.model.text_model.encoder.layers.0.self_attn.q_proj.lora_A.default.weight
clip_model.base_model.model.text_model.encoder.layers.0.self_attn.q_proj.lora_B.default.weight
clip_model.base_model.model.text_model.encoder.layers.1.self_attn.k_proj.lora_A.default.weight
clip_model.base_model.model.text_model.encoder.layers.1.self_attn.k_proj.lora_B.default.weight
clip_model.base_model.model.text_model.encoder.layers.1.self_attn.v_proj.lora_A.default.weight
clip_model.base_model.model.text_model.encoder.layers.1.self_attn.v_proj.lora_B.default.weight
clip_model.base_model.model.text_model.encoder.lay

In [17]:
# Tokenize one "A photo of a <label>" prompt per unique label of every selected
# class and move the resulting batch to the target device.
text_input_dict = dict(
    (
        selected_name,
        processor(
            text=[f"A photo of a {label}" for label in label_names[selected_name]],
            return_tensors="pt",
            padding=True,
        ).to(device),
    )
    for selected_name in selected_class_names
)

In [18]:

from kaggle_secrets import UserSecretsClient

# Training objects: summed per-class cross-entropy and AdamW over mo_model.
num_epochs = 10  # adjust as needed
step = 0  # global wandb step counter, advanced once per training batch
criteria = MultiOutputClipCriterion(class_names=selected_class_names)
optimizer = torch.optim.AdamW(params=mo_model.parameters(), lr=1e-4)

# Authenticate against Weights & Biases with the key kept in Kaggle secrets,
# then open the run that the training loop logs to.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)
wandb.init(name='multiclass_layer2', project="clip-lora_2")
def validate(model, dataloader, criteria, device, text_inputs, class_names):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Ground-truth labels are looked up in the notebook-level ``articles`` table
    through ``article_id_to_idx`` and encoded with ``label_names_to_idx``.
    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as ``model(pixel_values=..., text_input_dict=...)``
            that returns a dict of logits keyed by class name.
        dataloader: Iterable yielding ``(images, image_ids)`` batches, where
            ``image_ids`` is a tensor of article ids.
        criteria: Callable ``criteria(logits_dict, labels_dict)`` returning a
            scalar loss tensor.
        device: Device the images and label tensors are moved to.
        text_inputs: Per-class tokenized prompts forwarded to the model.
        class_names: Names of the attributes to score.

    Returns:
        ``(avg_loss, accuracy)``: ``avg_loss`` is the sample-weighted mean of
        the batch losses divided by the number of classes; ``accuracy`` maps
        each class name to its fraction of correct predictions.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = {class_name: 0 for class_name in class_names}
    total_samples = 0

    with torch.no_grad():
        for images, image_ids in tqdm(dataloader):
            images = images.to(device)
            batch_size = images.size(0)
            logits_per_image_dict = model(pixel_values=images, text_input_dict=text_inputs)

            # Resolve article ids to dataframe rows once per batch rather than
            # once per (sample, class) pair.
            row_indices = [article_id_to_idx[image_id] for image_id in image_ids.tolist()]

            # Ground-truth label index of every image, per class.
            true_labels_dict = {}
            for class_name in class_names:
                label_to_idx = label_names_to_idx[class_name]
                batch_labels = articles.loc[row_indices, class_name]
                true_labels_dict[class_name] = torch.tensor(
                    [label_to_idx[label] for label in batch_labels]
                ).to(device)

            # The criterion returns a batch mean, so weight it by the batch
            # size to obtain a sample-weighted average at the end.
            loss = criteria(logits_per_image_dict, true_labels_dict)
            total_loss += loss.item() * batch_size

            # Count correct top-1 predictions per class.
            total_samples += batch_size
            for class_name in class_names:
                _, preds = torch.max(logits_per_image_dict[class_name], dim=1)
                total_correct[class_name] += (preds == true_labels_dict[class_name]).sum().item()

    if total_samples == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError below.
        raise ValueError("validate() received a dataloader that yielded no samples.")

    avg_loss = total_loss / total_samples / len(class_names)
    accuracy = {class_name: total_correct[class_name] / total_samples for class_name in class_names}
    return avg_loss, accuracy


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mliulianhang[0m ([33mliulianhang-kth-royal-institute-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112844911109379, max=1.0…

In [21]:

# Main training loop: one pass over train_dataloader per epoch, followed by a
# validation pass. Per-batch metrics and per-epoch validation metrics are
# logged to the wandb run opened earlier.
for epoch in range(num_epochs):
    mo_model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for images, image_ids in tqdm(train_dataloader):
        images = images.to(device)
        logits_per_image_dict = mo_model(pixel_values=images, text_input_dict=text_input_dict)

        # Look up the ground-truth label index of every image, per class:
        # article id -> row in `articles` -> label value -> label index.
        true_labels_dict = {
            class_name: [label_names_to_idx[class_name][articles.loc[article_id_to_idx[image_id.item()], class_name]] 
                         for image_id in image_ids]
            for class_name in selected_class_names
        }
        true_labels_dict = {class_name: torch.tensor(true_labels).to(device) 
                            for class_name, true_labels in true_labels_dict.items()}

        # Compute the loss; it is scaled by the batch size so that the epoch
        # average below is weighted by sample count.
        loss = criteria(logits_per_image_dict, true_labels_dict)
        total_loss += loss.item() * images.size(0)

        # Compute accuracy: `correct` counts correct top-1 predictions summed
        # over all selected classes for this batch.
        correct = 0
        total_samples += images.size(0)
        for class_name in selected_class_names:
            _, preds = torch.max(logits_per_image_dict[class_name], dim=1)
            correct += (preds == true_labels_dict[class_name]).sum().item()
        total_correct += correct

        # Backpropagate and take an optimizer step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the batch loss and the batch accuracy (averaged over classes) to wandb.
        log_dict = {
            "train_loss": loss.item(),
            "train_accuracy": correct / images.size(0) / len(selected_class_names)
        }
        # Disabled per-class training accuracy logging; note that
        # `total_correct_per_class` is not defined anywhere in this loop.
#         for class_name in selected_class_names:
#             accuracy = total_correct_per_class[class_name] / total_samples
#             log_dict[f"train_accuracy_{class_name}"] = accuracy

        wandb.log(log_dict, step=step)
        step += 1

    # Epoch-level averages: per sample and per class.
    avg_loss = total_loss / total_samples / len(selected_class_names)
    accuracy = total_correct / total_samples / len(selected_class_names)
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_loss:.4f}, Train Accuracy: {accuracy:.4f}")

    # Run validation at the end of every epoch (validate() switches the model
    # to eval mode; the next epoch switches it back with mo_model.train()).
    val_loss, val_accuracy_dict = validate(mo_model, val_dataloader, criteria, device, text_input_dict, selected_class_names)
    val_accuracy = sum(val_accuracy_dict.values()) / len(val_accuracy_dict)
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Log the validation loss and the per-class validation accuracy to wandb.
    log_dict = {
        "val_loss": val_loss,
        "val_accuracy": val_accuracy
    }
    # NOTE: this loop rebinds `accuracy`, which held the epoch's training accuracy.
    for class_name, accuracy in val_accuracy_dict.items():
        log_dict[f"val_accuracy_{class_name}"] = accuracy

    # `step` was already advanced after the last training batch, so this entry
    # shares its step with the first training batch of the next epoch.
    wandb.log(log_dict, step=step)


wandb.finish()


  2%|▏         | 1/41 [00:19<12:43, 19.08s/it]


KeyboardInterrupt: 

In [1]:
# Save the model weights.
torch.save(mo_model.state_dict(), "model/final_output_clip_model2.pth")

# Evaluate on the test split once training has finished.
test_loss, test_accuracy_dict = validate(
    model=mo_model,
    dataloader=test_dataloader,
    criteria=criteria,
    device=device,
    text_inputs=text_input_dict,
    class_names=selected_class_names,
)
print(f"Test Loss: {test_loss:.4f}")

# Show the accuracy of every class.
print("Test Accuracy per Class:")
for class_name, accuracy in test_accuracy_dict.items():
    print(f"{class_name}: {accuracy:.4f}")

# Compute and show the accuracy averaged over classes.
test_accuracy = sum(test_accuracy_dict.values()) / len(test_accuracy_dict)
print(f"Average Test Accuracy: {test_accuracy:.4f}")


NameError: name 'torch' is not defined

In [None]:
# from kaggle_secrets import UserSecretsClient
# num_epochs = 10  # Adjust as needed
# criteria = MultiOutputClipCriterion(class_names=selected_class_names)
# optimizer = torch.optim.AdamW(mo_model.parameters(), lr=1e-4)
# step = 0
# user_secrets = UserSecretsClient()
# secret = user_secrets.get_secret("wandb_key")
# wandb.login(key=secret)
# wandb.init(project="clip-lora_2", name='multiclass_layer')
# for epoch in range(num_epochs):
#     mo_model.train()
#     total_loss = 0.0
#     total_correct = 0
#     total_samples = 0

#     for images, image_ids in tqdm(train_dataloader):
#         images = images.to(device)
#         logits_per_image_dict = mo_model(pixel_values=images, text_input_dict=text_input_dict)

#         # Get true labels from image_ids
#         true_labels_dict = {
#             class_name: [label_names_to_idx[class_name][articles.loc[article_id_to_idx[image_id.item()], class_name]] 
#                        for image_id in image_ids]
#             for class_name in selected_class_names
#         }
#         true_labels_dict = {class_name: torch.tensor(true_labels).to(device) 
#                             for class_name, true_labels in true_labels_dict.items()}
        
#         # Compute loss
#         loss = criteria(logits_per_image_dict, true_labels_dict)
#         total_loss += loss.item() * images.size(0)

#         # Predictions and accuracy
#         correct = 0
#         total_samples += images.size(0)
#         for class_name in selected_class_names:
#             _, preds = torch.max(logits_per_image_dict[class_name], dim=1)
#             correct += (preds == true_labels_dict[class_name]).sum().item()
#         total_correct += correct

#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         # log the loss and accuracy to wandb
#         wandb.log({"loss": loss.item(), "accuracy": correct / images.size(0) / len(selected_class_names)},
#                   step=step)
#         step += 1

#     avg_loss = total_loss / total_samples / len(selected_class_names)
#     accuracy = total_correct / total_samples / len(selected_class_names)
#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

#     # Validate after each epoch
#     # val_loss, val_accuracy = validate(model, val_dataloader, criteria, device, text_inputs, class_name)
#     # print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# wandb.finish()

# # Save the model
# torch.save(mo_model.state_dict(), "model/2_output_clip_model-3.pth")

In [None]:
# def validate(model, dataloader, criteria, device, text_inputs, class_names):
#     model.eval()
#     total_loss = 0.0
#     total_correct = {class_name: 0 for class_name in class_names}
#     total_samples = 0

#     with torch.no_grad():
#         for images, image_ids in tqdm(dataloader):
#             images = images.to(device)
#             logits_per_image_dict = model(pixel_values=images, text_input_dict=text_inputs)

#             # Get true labels from image_ids
#             true_labels_dict = {
#                 class_name: [label_names_to_idx[class_name][articles.loc[article_id_to_idx[image_id.item()], class_name]] 
#                            for image_id in image_ids]
#                 for class_name in class_names
#             }
#             true_labels_dict = {class_name: torch.tensor(true_labels).to(device)
#                                 for class_name, true_labels in true_labels_dict.items()}
            
#             # Compute loss
#             loss = criteria(logits_per_image_dict, true_labels_dict)
#             total_loss += loss.item() * images.size(0)

#             # Predictions and accuracy
#             total_samples += images.size(0)
#             for class_name in class_names:
#                 _, preds = torch.max(logits_per_image_dict[class_name], dim=1)
#                 total_correct[class_name] += (preds == true_labels_dict[class_name]).sum().item()

#     avg_loss = total_loss / total_samples / len(class_names)
#     accuracy = {class_name: total_correct[class_name] / total_samples for class_name in class_names}
#     return avg_loss, accuracy

In [None]:
# val_dataset = util.ImageDataset(val_paths, processor)
# val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=256, shuffle=False)
# test_dataset = util.ImageDataset(test_paths, processor)
# test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=256, shuffle=False)

In [None]:
# avg_loss, accuracy = validate(mo_model, val_dataloader, criteria, device, text_input_dict, selected_class_names)

In [None]:
# print(avg_loss)
# print(accuracy)

In [None]:
# torch.save(mo_model, "model/2_output_clip_model.pt")