In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import pdb
from sklearn.model_selection import train_test_split
from PIL import Image
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

import warnings
import datetime

# Suppress the FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, message=".*Series.__getitem__ treating keys as positions is deprecated.*")

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, df, transform=None, base_img_path="./data/images/"):
        self.df=df
        self.transform = transform
        self.base_img_path=base_img_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.base_img_path + self.df.iloc[idx][1]
        caption = self.df.iloc[idx][8]
        label = self.df.iloc[idx][6]

        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        #image is already tensor cutesey of transform sequence
        return image, torch.tensor(caption), torch.tensor(label, dtype=torch.float32)




# Model definition
class MultiModalModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_dim, num_classes=18, lstm_layers=1, bidirectional=True):
        super(MultiModalModel, self).__init__()

        self.resnet = models.resnet34(pretrained=True)
        self.resnet_output_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        self.embedding_dim = embed_size
        self.hidden_dim = hidden_dim
        self.num_layers = lstm_layers
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers,
                            batch_first=True, bidirectional=self.bidirectional)

        lstm_output_dim = self.hidden_dim * 2 if self.bidirectional else self.hidden_dim
        classifier_input_dim = self.resnet_output_dim + lstm_output_dim
        self.resnet_classifier = nn.Sequential(
            nn.BatchNorm1d(self.resnet_output_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(self.resnet_output_dim, num_classes)
        )
        self.lstm_classifier = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(lstm_output_dim, num_classes),
        )

        self.classifier = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(num_classes * 2, num_classes)
        )

    def forward(self, images, captions):
        image_features = self.resnet(images)
        embedded = self.embedding(captions)
        lstm_out, _ = self.lstm(embedded)

        if self.bidirectional:
            lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)
        else:
            lstm_out = lstm_out[:, -1, :]

        resnet_classifier_output = self.resnet_classifier(image_features)
        lstm_classifier_output = self.lstm_classifier(lstm_out)

        combined_features = torch.cat((resnet_classifier_output, lstm_classifier_output), dim=1)
        # combined_features = torch.cat((image_features, lstm_out), dim=1)
        output = self.classifier(combined_features)

        return output

# Training function
def train_epoch(model, train_loader, val_loader, num_epochs, criterion, optimizer, scheduler, device, log_file, clip_effect=10, model_path="./models/", history_path="./history/", save_prefix="second_clasification_head_"):
    best_val_acc = 0.0
    history = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_f1_score': []
    }
    count_epoch = 0
    # Train the model
    for epoch in tqdm(range(num_epochs), position=0, leave=True):
        count_epoch += 1
        model.train()
        train_loss = 0.0
        for images, captions, labels in train_loader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images, captions)
            loss = criterion(outputs, labels.float())
            loss.backward()

            clip_grad_value_(model.parameters(), clip_effect)

            optimizer.step()

            train_loss += loss.item() * images.size(0)

        # scheduler.step()
        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        all_labels = []
        all_outputs = []
        with torch.no_grad():
            for images, captions, labels in tqdm(val_loader, position=0, leave=True):
                images, captions, labels = images.to(device), captions.to(device), labels.to(device)

                outputs = model(images, captions)
                loss = criterion(outputs, labels.float())
                val_loss += loss.item() * images.size(0)

                all_labels.append(labels.cpu().numpy())
                probabilities = (torch.sigmoid(outputs).cpu().numpy()  > 0.5).astype(int)

                # assert(x > 0 for x in probabilities[1:])
                all_outputs.append(probabilities)

        val_loss /= len(val_loader.dataset)

        scheduler.step(val_loss)
        all_labels = np.concatenate(all_labels, axis=0)
        all_outputs = np.concatenate(all_outputs, axis=0)
        val_accuracy = accuracy_score(all_labels, all_outputs)
        val_f1 = f1_score(all_labels, all_outputs, average='micro')

        history['epoch'].append(epoch + 1)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1_score'].append(val_f1)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1 Score: {val_f1:.4f}')

        if (val_accuracy > best_val_acc and best_val_acc > 0.50) or count_epoch % 20 == 0:
            best_val_acc = val_accuracy
            torch.save(model.state_dict(), model_path + save_prefix + '_vacc{}_vf1{}_vlss{}_epoch{}.pth'.format(int(val_accuracy *100), int(val_f1*100), int(val_loss*100), count_epoch))
            

        time_stamp = datetime.datetime.now()
        pd.DataFrame(history).to_csv(history_path + save_prefix + "_{}vacc{}_epoch{}.csv".format(time_stamp.strftime("%d_%m_%y_%H_%M"), int(val_accuracy*100), count_epoch), index=False)

# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('./processed-data/train.csv', converters={'EncodedLabels': pd.eval, 'TokensWithPadding': pd.eval})
# train = train[:1024]
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)

train_set = CustomDataset(train_df, transform)
val_set = CustomDataset(val_df, transform)
trainloader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=12)
valloader = DataLoader(val_set, batch_size=64, shuffle=True, num_workers=12)



In [None]:


vocab_size = 7330
embed_size = 28
hidden_dim = 256
num_classes = 18
num_epochs = 70
log_file = 'training_log.csv'
lr = 0.1

model = MultiModalModel(vocab_size, embed_size, hidden_dim, num_classes)

# model.load_state_dict(torch.load("./multimodalmodels/multimodalmodel_resnet_34_clipped_val0_5_two_class_heads_noclip_oldhead_vacc62_vf180_vlss9_epoch20.pth"))
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optimizer = optim.Adadelta(model.parameters(), lr=lr)#Adam(model.parameters(), lr=lr)
optimizer.param_groups[0]['initial_lr'] = lr
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.01, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)

train_epoch(model, trainloader, valloader, num_epochs, criterion, optimizer, scheduler, device, log_file)

100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.59it/s]
  1%|▌                                         | 1/70 [00:57<1:06:41, 58.00s/it]

Epoch [1/70], Train Loss: 0.2940, Val Loss: 0.1816, Val Accuracy: 0.4530, Val F1 Score: 0.5975


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.36it/s]
  3%|█▏                                        | 2/70 [01:56<1:06:05, 58.32s/it]

Epoch [2/70], Train Loss: 0.1999, Val Loss: 0.1758, Val Accuracy: 0.4530, Val F1 Score: 0.5975


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.98it/s]
  4%|█▊                                        | 3/70 [02:55<1:05:11, 58.39s/it]

Epoch [3/70], Train Loss: 0.1912, Val Loss: 0.1685, Val Accuracy: 0.4530, Val F1 Score: 0.5975


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.82it/s]
  6%|██▍                                       | 4/70 [03:53<1:04:22, 58.52s/it]

Epoch [4/70], Train Loss: 0.1815, Val Loss: 0.1592, Val Accuracy: 0.4527, Val F1 Score: 0.6128


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.53it/s]
  7%|███                                       | 5/70 [04:52<1:03:27, 58.58s/it]

Epoch [5/70], Train Loss: 0.1721, Val Loss: 0.1508, Val Accuracy: 0.4490, Val F1 Score: 0.6244


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.63it/s]
  9%|███▌                                      | 6/70 [05:51<1:02:31, 58.62s/it]

Epoch [6/70], Train Loss: 0.1647, Val Loss: 0.1427, Val Accuracy: 0.4550, Val F1 Score: 0.6315


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.38it/s]
 10%|████▏                                     | 7/70 [06:50<1:01:39, 58.73s/it]

Epoch [7/70], Train Loss: 0.1585, Val Loss: 0.1384, Val Accuracy: 0.4780, Val F1 Score: 0.6493


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.90it/s]
 11%|████▊                                     | 8/70 [07:48<1:00:38, 58.69s/it]

Epoch [8/70], Train Loss: 0.1537, Val Loss: 0.1364, Val Accuracy: 0.4803, Val F1 Score: 0.6571


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 15.17it/s]
 13%|█████▋                                      | 9/70 [08:47<59:36, 58.63s/it]

Epoch [9/70], Train Loss: 0.1491, Val Loss: 0.1336, Val Accuracy: 0.4870, Val F1 Score: 0.6622


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.96it/s]
 14%|██████▏                                    | 10/70 [09:45<58:37, 58.62s/it]

Epoch [10/70], Train Loss: 0.1461, Val Loss: 0.1305, Val Accuracy: 0.4907, Val F1 Score: 0.6724


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.52it/s]
 16%|██████▊                                    | 11/70 [10:44<57:39, 58.63s/it]

Epoch [11/70], Train Loss: 0.1430, Val Loss: 0.1304, Val Accuracy: 0.4897, Val F1 Score: 0.6731


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.47it/s]
 17%|███████▎                                   | 12/70 [11:43<56:43, 58.69s/it]

Epoch [12/70], Train Loss: 0.1396, Val Loss: 0.1325, Val Accuracy: 0.4993, Val F1 Score: 0.6779


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.68it/s]
 19%|███████▉                                   | 13/70 [12:41<55:46, 58.70s/it]

Epoch [13/70], Train Loss: 0.1365, Val Loss: 0.1277, Val Accuracy: 0.5010, Val F1 Score: 0.6816


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.86it/s]
 20%|████████▌                                  | 14/70 [13:40<54:46, 58.69s/it]

Epoch [14/70], Train Loss: 0.1341, Val Loss: 0.1296, Val Accuracy: 0.5083, Val F1 Score: 0.6875


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.44it/s]
 21%|█████████▏                                 | 15/70 [14:39<53:49, 58.72s/it]

Epoch [15/70], Train Loss: 0.1316, Val Loss: 0.1281, Val Accuracy: 0.5057, Val F1 Score: 0.6867


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 15.22it/s]
 23%|█████████▊                                 | 16/70 [15:37<52:47, 58.66s/it]

Epoch [16/70], Train Loss: 0.1295, Val Loss: 0.1271, Val Accuracy: 0.5110, Val F1 Score: 0.6953


100%|███████████████████████████████████████████| 47/47 [00:03<00:00, 14.61it/s]
 24%|██████████▍                                | 17/70 [16:36<51:50, 58.70s/it]

Epoch [17/70], Train Loss: 0.1272, Val Loss: 0.1261, Val Accuracy: 0.5090, Val F1 Score: 0.6922
