In [2]:
import numpy as np 
import pandas as pd 
import ast 
from itertools import chain 
from sklearn.preprocessing import MultiLabelBinarizer 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import cv2 
from matplotlib import pyplot as plt
from IPython.display import display
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader

df = pd.read_csv('/kaggle/input/all-shapes-and-colors/train_v2.csv')
df['label'] = df['label'].apply(ast.literal_eval) 

# create set of all labels
label_set = set(chain.from_iterable(df['label']))

# binarize lables and put into dataframe (multi-hot vector of classes)
mlb = MultiLabelBinarizer()
labels_binarized = mlb.fit_transform(df['label'])
labels_binarized = [np.array(label) for label in labels_binarized]
labels_df = pd.DataFrame({"label_vectors": labels_binarized})
df = pd.concat([df, labels_df], axis=1)

In [3]:
'''
Given the problem of multi-label classification, I decided to proceed with a CNN archetitecture as  
it would be the best approach to extract features from images. I considered using classical 
computer vision strategies such as color filtering and finding contours, however the problem definition
asked for a module solution that does not hard code the number of classes. This solution using a CNN 
takes the number of different lables from the csv as an input and is therefore modular to the # of classes.

The CNN architecture is best at extracting features from images as its designed to mimick the way humans
proccess visual information. This is done through its 3 types of layers, convolutional, pooling and FC.
Applying a learnable filter, the kernel, across the image in stride in the convolutional layer, learning
features. Then a pooling layer to reduce spatial size and a FC layer to make the predictions. 

Given that this is a multi-label classification problem, I applied a sigmoid activation function to the outputs
rather than the normal softmax as I wanted each label the chance to activate rather than have the tensor add to 1. 

'''

class CNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__() 
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc = nn.Linear(32 * 16 * 16, num_classes)

    def forward(self, x):
        # ReLU to add non-linearity
        x = self.pool(F.relu(self.conv1(x)))  
        x = self.pool(F.relu(self.conv2(x)))  
        x = x.view(x.size(0), -1)     
        x = self.fc(x)       
        return x

In [4]:
# custom dataset that helps load images from the csv
class ShapeColorDataset(Dataset):
    def __init__(self, df, image_root, class_names, transform=None):
        self.df = df
        self.image_root = image_root
        self.class_names = class_names
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = cv2.imread(f"{self.image_root}/{row['image_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = row['label_vectors']
        label = label.astype(np.float32)
        if self.transform:
            img = self.transform(img)
        return img, label


In [5]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64,64)),
    transforms.ToTensor(),
])

dataset = ShapeColorDataset(df, image_root='/kaggle/input/all-shapes-and-colors/dataset_v2', class_names=mlb.classes_, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True) 

model = CNN(num_classes=len(mlb.classes_))
criterion = nn.BCEWithLogitsLoss() # use binary cross-entropy loss as its the best for multi-label outputs
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(50):
    for images, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")


Epoch 1 Loss: 0.3220
Epoch 2 Loss: 0.4719
Epoch 3 Loss: 0.4560
Epoch 4 Loss: 0.3991
Epoch 5 Loss: 0.4359
Epoch 6 Loss: 0.4533
Epoch 7 Loss: 0.3052
Epoch 8 Loss: 0.2972
Epoch 9 Loss: 0.3696
Epoch 10 Loss: 0.4572
Epoch 11 Loss: 0.3342
Epoch 12 Loss: 0.3058
Epoch 13 Loss: 0.3357
Epoch 14 Loss: 0.3435
Epoch 15 Loss: 0.4230
Epoch 16 Loss: 0.2989
Epoch 17 Loss: 0.2030
Epoch 18 Loss: 0.2264
Epoch 19 Loss: 0.3461
Epoch 20 Loss: 0.1427
Epoch 21 Loss: 0.2452
Epoch 22 Loss: 0.1545
Epoch 23 Loss: 0.2273
Epoch 24 Loss: 0.1312
Epoch 25 Loss: 0.1550
Epoch 26 Loss: 0.0408
Epoch 27 Loss: 0.2136
Epoch 28 Loss: 0.1057
Epoch 29 Loss: 0.1092
Epoch 30 Loss: 0.2682
Epoch 31 Loss: 0.1224
Epoch 32 Loss: 0.1181
Epoch 33 Loss: 0.1541
Epoch 34 Loss: 0.1050
Epoch 35 Loss: 0.0635
Epoch 36 Loss: 0.1199
Epoch 37 Loss: 0.0452
Epoch 38 Loss: 0.0441
Epoch 39 Loss: 0.0545
Epoch 40 Loss: 0.0197
Epoch 41 Loss: 0.0353
Epoch 42 Loss: 0.0216
Epoch 43 Loss: 0.0306
Epoch 44 Loss: 0.0156
Epoch 45 Loss: 0.0271
Epoch 46 Loss: 0.00

In [11]:
test_df = pd.read_csv('/kaggle/input/all-shapes-and-colors/test_v2.csv')

all_labels = sorted(set(chain.from_iterable(df['label'])))
index_to_class = {i: label for i, label in enumerate(all_labels)}

# dataset class load images from the csv
class TestDataset(Dataset):
    def __init__(self, df, image_root, class_names, transform=None):
        self.df = df
        self.image_root = image_root
        self.class_names = class_names
        self.transform = transform

    def __len__(self):
        return len(self.df)

    # a special type of python function that tells the object how to behave when accessed with []
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = cv2.imread(f"{self.image_root}/{row['image_path']}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(img)
        return img, row['image_path']


test_dataset = TestDataset(test_df, image_root='/kaggle/input/all-shapes-and-colors/dataset_v2', class_names=mlb.classes_, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True) # makes data effecient for training

model.eval()
predictions = []

with torch.no_grad():
    for images, image_paths in test_loader:
        # compute predictions on images and apply sigmoid to output values
        outputs = model(images)
        probs = torch.sigmoid(outputs)             
        preds = (probs > 0.5).int()    
        
        # convert the predictions to label names
        for i in range(preds.size(0)):
            pred_labels = [index_to_class[j] for j in range(len(preds[i])) if preds[i][j] == 1]
            predictions.append({
                'image_path': image_paths[i],
                'label': pred_labels
            })

pred_df = pd.DataFrame(predictions)
pred_df.to_csv('predictions.csv', index=False)