In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import pdb
from sklearn.model_selection import train_test_split
from PIL import Image
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

import warnings
import datetime

# Suppress the FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, message=".*Series.__getitem__ treating keys as positions is deprecated.*")

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, df, transform=None, base_img_path="./data/images/"):
        self.df=df
        self.transform = transform
        self.base_img_path=base_img_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.base_img_path + self.df.iloc[idx][1]
        caption = self.df.iloc[idx][8]
        label = self.df.iloc[idx][6]

        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        #image is already tensor cutesey of transform sequence
        return image, torch.tensor(caption), torch.tensor(label, dtype=torch.float32)




# Model definition
class MultiModalModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_dim, num_classes=18, lstm_layers=1, bidirectional=True):
        super(MultiModalModel, self).__init__()

        self.resnet = models.resnet34(pretrained=True)
        self.resnet_output_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        self.embedding_dim = embed_size
        self.hidden_dim = hidden_dim
        self.num_layers = lstm_layers
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers,
                            batch_first=True, bidirectional=self.bidirectional)

        lstm_output_dim = self.hidden_dim * 2 if self.bidirectional else self.hidden_dim
        classifier_input_dim = self.resnet_output_dim + lstm_output_dim
        self.resnet_classifier = nn.Sequential(
            nn.BatchNorm1d(self.resnet_output_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(self.resnet_output_dim, num_classes)
        )
        self.lstm_classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(lstm_output_dim, num_classes),
        )

        self.classifier = nn.Sequential(
            #nn.BatchNorm1d(num_classes * 2),
            # nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(num_classes * 2, num_classes),
            # nn.ReLU(),
            # nn.Linear(512, num_classes),
            #nn.Sigmoid()
        )

    def forward(self, images, captions):
        image_features = self.resnet(images)
        embedded = self.embedding(captions)
        lstm_out, _ = self.lstm(embedded)

        if self.bidirectional:
            lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)
        else:
            lstm_out = lstm_out[:, -1, :]

        resnet_classifier_output = self.resnet_classifier(image_features)
        lstm_classifier_output = self.lstm_classifier(lstm_out)

        combined_features = torch.cat((resnet_classifier_output, lstm_classifier_output), dim=1)
        # combined_features = torch.cat((image_features, lstm_out), dim=1)
        output = self.classifier(combined_features)

        return output

# Training function
def train_epoch(model, train_loader, val_loader, num_epochs, criterion, optimizer, scheduler, device, log_file, clip_effect=0.5, model_path="./multimodalmodels/", history_path="./history/", save_prefix="multimodalmodel_resnet_34_clipped_val0_5_two_class_heads_"):
    best_val_acc = 0.0
    history = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_f1_score': []
    }
    count_epoch = 0
    # Train the model
    for epoch in tqdm(range(num_epochs), position=0, leave=True):
        count_epoch += 1
        model.train()
        train_loss = 0.0
        for images, captions, labels in train_loader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images, captions)
            loss = criterion(outputs, labels.float())
            loss.backward()

            clip_grad_value_(model.parameters(), clip_effect)

            optimizer.step()

            train_loss += loss.item() * images.size(0)

        scheduler.step()
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        all_labels = []
        all_outputs = []
        with torch.no_grad():
            for images, captions, labels in tqdm(val_loader, position=0, leave=True):
                images, captions, labels = images.to(device), captions.to(device), labels.to(device)

                outputs = model(images, captions)
                loss = criterion(outputs, labels.float())
                val_loss += loss.item() * images.size(0)

                all_labels.append(labels.cpu().numpy())
                probabilities = torch.sigmoid(outputs)
                all_outputs.append(probabilities.cpu().numpy())

        val_loss /= len(val_loader.dataset)

        all_labels = np.concatenate(all_labels, axis=0)
        all_outputs = np.concatenate(all_outputs, axis=0)
        val_accuracy = accuracy_score(all_labels, (all_outputs > 0.5).astype(int))
        val_f1 = f1_score(all_labels, (all_outputs > 0.5), average='micro')

        history['epoch'].append(epoch + 1)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1_score'].append(val_f1)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1 Score: {val_f1:.4f}')

        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            torch.save(model.state_dict(), model_path + save_prefix + '_vacc{}_vf1{}_vlss{}_epoch{}.pth'.format(int(val_accuracy *100), int(val_f1*100), int(val_loss*100), count_epoch))
            #torch.save(history, history_path + save_prefix + '_vacc{}_vf1{}_vlss{}.pth'.format(val_accuracy, val_f1, val_loss)

        time_stamp = datetime.datetime.now()
        pd.DataFrame(history).to_csv(history_path + save_prefix + "_{}vacc{}_epoch{}.csv".format(time_stamp.strftime("%d_%m_%y_%H_%M"), int(val_accuracy*100), count_epoch), index=False)

# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])




In [6]:
train = pd.read_csv('./processed-data/train.csv', converters={'EncodedLabels': pd.eval, 'TokensWithPadding': pd.eval})
# train = train[:1024]
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)

train_set = CustomDataset(train_df, transform)
val_set = CustomDataset(val_df, transform)
trainloader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=10)
valloader = DataLoader(val_set, batch_size=128, shuffle=True, num_workers=10)

In [7]:


vocab_size = 7330
embed_size = 28
hidden_dim = 256
num_classes = 18
num_epochs = 15
log_file = 'training_log.csv'
lr = 0.001

model = MultiModalModel(vocab_size, embed_size, hidden_dim, num_classes)

# model.load_state_dict(torch.load("./multimodalmodels/multimodalmodel_vacc63_vf181_vlss11.pth"))

model.to(device)
criterion = nn.BCEWithLogitsLoss()#BCELoss()
optimizer = optim.Adagrad(model.parameters(), lr=lr)#Adam(model.parameters(), lr=lr)
optimizer.param_groups[0]['initial_lr'] = lr
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2,  eta_min=0.00001, last_epoch=num_epochs-1)



# train_epoch(model, trainloader, valloader, num_epochs, criterion, optimizer, scheduler, device, log_file)



## Get Predictions

In [75]:
class TestDataset(Dataset):
    def __init__(self, df, transform=None, base_img_path="./data/images/"):
        self.df=df
        self.transform = transform
        self.base_img_path=base_img_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.base_img_path + self.df.iloc[idx][1]
        caption = self.df.iloc[idx][4]
        print(type(caption))

        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        #image is already tensor cutesey of transform sequence
        return image, caption, self.df.iloc[idx][1]


In [76]:
def predict(model, data_loader, save_path, device):
    model.eval()
    all_outputs = []
    with torch.no_grad():
        for images, captions, imageid in tqdm(data_loader, position=0, leave=True):
            print(images)
            print(captions)
            print(imageid)
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions)
            outputs = torch.sigmoid(outputs) > 0.5
            all_outputs.extend((imageid, outputs.cpu().numpy()))

    return all_outputs

In [77]:
model = MultiModalModel(vocab_size, embed_size, hidden_dim, num_classes)

model.load_state_dict(torch.load("./multimodalmodels/multimodalmodel_resnet_34_clipped_val0_5_two_class_heads__vacc52_vf170_vlss12_epoch14.pth"))
model.to(device)



MultiModalModel(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

In [80]:
import ast
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
test_df = pd.read_csv('./processed-data/test.csv', converters={'TokensWithPadding': pd.eval})

def dataframeColumnToTensor(column):
  array = []
  for row in column:
    array.append(row)
  return torch.tensor(array)

test_df["TokensWithPadding"] = dataframeColumnToTensor(test_df["TokensWithPadding"])


test_loader = DataLoader(TestDataset(test_df, transform), batch_size=1, shuffle=False, num_workers=10)

AttributeError: 'list' object has no attribute 'tolist'

In [None]:
predictions = predict(model, test_loader, "", device)

In [None]:
# os.path.getsize('./best_model.pth')

97191964