In [35]:
import torch
import pandas as pd
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder


csv_path = "./data/product_list.csv"
data = pd.read_csv(csv_path)

label_encoder = LabelEncoder()
data["Encoded_Label"] = label_encoder.fit_transform(data["Product_BusinessUnitDesc"])
# Save the updated CSV (optional, for debugging)
data.to_csv("product_list_with_labels.csv", index=False)


In [36]:
data.head()


Unnamed: 0,MMC,Product_BusinessUnitDesc,Encoded_Label
0,010M03A1116X9000,W RTW,2
1,010M09A1101X0863,W RTW,2
2,010M27A0006X0200,W RTW,2
3,011A11A1166X4150,W RTW,2
4,011A11A1166X9000,W RTW,2


In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [38]:
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



In [39]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset
import pandas as pd

class MMCImageTextDataset(Dataset):
    def __init__(self, image_dir, csv_path, processor, transform=None):
        self.image_dir = image_dir
        self.data = pd.read_csv(csv_path)
        self.processor = processor
        self.transform = transform
        self.label_mapping = dict(zip(self.data["MMC"], self.data["Encoded_Label"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get MMC and label
        row = self.data.iloc[idx]
        mmc = row["MMC"]
        label = self.label_mapping[mmc]

        # Check if image exists in the folder
        image_path = os.path.join(self.image_dir, f"{mmc}.jpeg")
        if not os.path.exists(image_path):
            # Skip if the image doesn't exist
            return None

        # Load image
        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return {
            "image": image,
            "label": torch.tensor(label, dtype=torch.long),
        }


In [40]:

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# Initialize Dataset
image_dir = "./data/DAM"  # Directory containing images named by MMC
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
dataset = MMCImageTextDataset(image_dir, 'product_list_with_labels.csv', processor, transform=transform)


In [41]:
from transformers import BatchEncoding
import logging 
def collate_fn(batch):
    batch = [item for item in batch if item is not None]  # Remove None values
    images = torch.stack([item["image"] for item in batch])
    labels = torch.tensor([item["label"] for item in batch], dtype=torch.long)
    return {
        "image": images,
        "label": labels,
    }



In [42]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

# Initialize model, optimizer, and loss function
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

# DataLoader for training
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Training Loop
epochs =10
model.train()

for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        outputs = model.get_image_features(pixel_values=images)  
        logits_per_image = outputs  # Image features are directly used for classification
        # Calculate loss
        loss = criterion(logits_per_image, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        preds = logits_per_image.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += len(labels)

    # Epoch summary
    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = 100 * correct / total
    print(f"Epoch {epoch + 1} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")


Training Epoch 1/10: 100%|██████████| 87/87 [00:22<00:00,  3.92it/s]


Epoch 1 - Loss: 1.6790, Accuracy: 56.22%


Training Epoch 2/10: 100%|██████████| 87/87 [00:22<00:00,  3.94it/s]


Epoch 2 - Loss: 0.7814, Accuracy: 73.68%


Training Epoch 3/10: 100%|██████████| 87/87 [00:23<00:00,  3.69it/s]


Epoch 3 - Loss: 0.6672, Accuracy: 76.50%


Training Epoch 4/10: 100%|██████████| 87/87 [00:24<00:00,  3.50it/s]


Epoch 4 - Loss: 0.5393, Accuracy: 81.78%


Training Epoch 5/10: 100%|██████████| 87/87 [00:29<00:00,  2.91it/s]


Epoch 5 - Loss: 0.4968, Accuracy: 82.61%


Training Epoch 6/10: 100%|██████████| 87/87 [00:35<00:00,  2.46it/s]


Epoch 6 - Loss: 0.4498, Accuracy: 85.07%


Training Epoch 7/10: 100%|██████████| 87/87 [00:36<00:00,  2.38it/s]


Epoch 7 - Loss: 0.4172, Accuracy: 85.21%


Training Epoch 8/10: 100%|██████████| 87/87 [00:37<00:00,  2.31it/s]


Epoch 8 - Loss: 0.4527, Accuracy: 84.31%


Training Epoch 9/10: 100%|██████████| 87/87 [00:38<00:00,  2.26it/s]


Epoch 9 - Loss: 0.4207, Accuracy: 85.32%


Training Epoch 10/10: 100%|██████████| 87/87 [00:38<00:00,  2.23it/s]

Epoch 10 - Loss: 0.3857, Accuracy: 86.33%





In [43]:
model_save_path = "trained_model.pth"

# Save the model and optimizer state
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch + 1,
}, model_save_path)


In [44]:
print(dataset.label_mapping)

{'010M03A1116X9000': 2, '010M09A1101X0863': 2, '010M27A0006X0200': 2, '011A11A1166X4150': 2, '011A11A1166X9000': 2, '011B13A6014X0200': 2, '011B48A3874X0863': 2, '011D01A1166X5645': 2, '011D01A1166X9000': 2, '011D02A1166X3250': 2, '011G05ACMIXX0863': 2, '011G07A1166X9000': 2, '011G12A1166X4150': 2, '011G13A1212X0200': 2, '011J12A1166X9000': 2, '011J22X8801X9000': 2, '011J50A8976X9632': 2, '011P11A1166X9000': 2, '011P12A3862X0863': 2, '011P34A1166X5435': 2, '011R33A7970X9639': 2, '011R68A1212X4150': 2, '011R70A1166X9000': 2, '011V41A3062X5435': 2, '012A09A3232X5597': 2, '012B03A3985X5902': 2, '012J03A3236X0835': 2, '013L02A4006X5435': 2, '013T03WC437X9663': 2, '013T05TU430X0854': 2, '014C10AM034X1705': 2, '014E01CM514X0863': 2, '014E20CM514X0863': 2, '014E20TU507X0850': 2, '014P02AM042X5800': 2, '014S26AM028X2847': 2, '014S52AM044X9650': 2, '014S52WC029X0863': 2, '014S55AM042X0820': 2, '014S58AM304X0200': 2, '014S59AM005X4220': 2, '014S62TM033X5902': 2, '014S73AM105X4235': 2, '014S92AM0

In [45]:
import torch
from torchvision import transforms
from PIL import Image
from torch.utils.data import DataLoader

# Test Dataset for inference
class TestImageDataset(torch.utils.data.Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith((".png", ".jpg", ".jpeg"))]
        self.transform = transform

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return {
            "image": image,
            "image_name": self.image_files[idx]
        }

# Define transformations for test images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Load test images
test_image_dir = "./data/test_image_headmind"  
test_dataset = TestImageDataset(image_dir=test_image_dir, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

model.eval()

# Perform inference
with torch.no_grad():
    for batch in test_loader:
        images = batch["image"].to(device)
        image_names = batch["image_name"]

        # Get image features and predict labels
        image_features = model.get_image_features(pixel_values=images)
        logits_per_image = image_features  # Same as during training
        predicted_label = logits_per_image.argmax(dim=-1).item()

        # Print the result
        print(f"Image: {image_names[0]} - Predicted Label: {label_encoder.inverse_transform([predicted_label])}")


Image: image-20210928-102713-12d2869d.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102718-2474636a.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102721-8eaea48f.jpg - Predicted Label: ['W Bags']
Image: image-20210928-102725-7e28b44c.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102729-f53d9faf.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102732-8502aded.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102747-05631ad4.jpg - Predicted Label: ['W SLG']
Image: image-20210928-102758-fd41d453.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102802-39d3c54f.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102806-79d7094b.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-102940-20cd277d.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-103146-c8d2fedb.jpg - Predicted Label: ['W Accessories']
Image: image-20210928-103157-a46cc321.jpg - Predicted Label: ['W Ac