In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import pdb
from sklearn.model_selection import train_test_split
from PIL import Image
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import warnings

# Suppress the FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, message=".*Series.__getitem__ treating keys as positions is deprecated.*")

# Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, df, transform=None, base_img_path="./data/images/"):
        self.df=df
        self.transform = transform
        self.base_img_path=base_img_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_path = self.base_img_path + self.df.iloc[idx][1]
        caption = self.df.iloc[idx][8]
        label = self.df.iloc[idx][6]

        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        #image is already tensor cutesey of transform sequence
        return image, torch.tensor(caption), torch.tensor(label, dtype=torch.float32)




# Model definition
class MultiModalModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_dim, num_classes=18, lstm_layers=2, bidirectional=True):
        super(MultiModalModel, self).__init__()

        self.resnet = models.resnet34(pretrained=True)
        self.resnet_output_dim = self.resnet.fc.in_features
        self.resnet.fc = nn.Identity()

        self.embedding_dim = embed_size
        self.hidden_dim = hidden_dim
        self.num_layers = lstm_layers
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers,
                            batch_first=True, bidirectional=self.bidirectional)

        lstm_output_dim = self.hidden_dim * 2 if self.bidirectional else self.hidden_dim
        classifier_input_dim = self.resnet_output_dim + lstm_output_dim

        self.classifier = nn.Sequential(
            nn.Linear(classifier_input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
            nn.Sigmoid()
        )

    def forward(self, images, captions):
        image_features = self.resnet(images)
        embedded = self.embedding(captions)
        lstm_out, _ = self.lstm(embedded)

        if self.bidirectional:
            lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)
        else:
            lstm_out = lstm_out[:, -1, :]

        combined_features = torch.cat((image_features, lstm_out), dim=1)
        output = self.classifier(combined_features)

        return output

# Training function
def train_epoch(model, train_loader, val_loader, num_epochs, criterion, optimizer, scheduler, device, log_file):
    best_val_acc = 0.0
    history = {
        'epoch': [],
        'train_loss': [],
        'val_loss': [],
        'val_accuracy': [],
        'val_f1_score': []
    }

    for epoch in tqdm(range(num_epochs), position=0, leave=True):
        model.train()
        train_loss = 0.0
        for images, captions, labels in train_loader:
            images, captions, labels = images.to(device), captions.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images, captions)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item() * images.size(0)

        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0.0
        all_labels = []
        all_outputs = []
        with torch.no_grad():
            for images, captions, labels in tqdm(val_loader, position=0, leave=True):
                images, captions, labels = images.to(device), captions.to(device), labels.to(device)

                outputs = model(images, captions)
                loss = criterion(outputs, labels.float())
                val_loss += loss.item() * images.size(0)

                all_labels.append(labels.cpu().numpy())
                all_outputs.append(outputs.cpu().numpy())

        val_loss /= len(val_loader.dataset)
        all_labels = np.concatenate(all_labels, axis=0)
        all_outputs = np.concatenate(all_outputs, axis=0)
        val_accuracy = accuracy_score(all_labels, (all_outputs > 0.5))
        val_f1 = f1_score(all_labels, (all_outputs > 0.5), average='micro')

        history['epoch'].append(epoch + 1)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)
        history['val_f1_score'].append(val_f1)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1 Score: {val_f1:.4f}')

        if val_accuracy > best_val_acc:
            best_val_acc = val_accuracy
            torch.save(model.state_dict(), 'best_model.pth')

        # pd.DataFrame(history).to_csv(log_file, index=False)

# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x / x.abs().max())
])




In [70]:
train = pd.read_csv('./processed-data/train.csv', converters={'EncodedLabels': pd.eval, 'TokensWithPadding': pd.eval})
# train = train[:1024]
train_df, val_df = train_test_split(train, test_size=0.1, random_state=42)

train_set = CustomDataset(train_df, transform)
val_set = CustomDataset(val_df, transform)
trainloader = DataLoader(train_set, batch_size=256, shuffle=True)
valloader = DataLoader(val_set, batch_size=256, shuffle=True)

In [75]:


vocab_size = 10000
embed_size = 28
hidden_dim = 256
num_classes = 18
num_epochs = 10
log_file = 'training_log.csv'
lr = 0.001

model = MultiModalModel(vocab_size, embed_size, hidden_dim, num_classes)

# model.load_state_dict(torch.load("./best_model.pth"))

model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
optimizer.param_groups[0]['initial_lr'] = lr
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2,  eta_min=0.000001, last_epoch=num_epochs-1)



train_epoch(model, trainloader, valloader, num_epochs, criterion, optimizer, scheduler, device, log_file)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:17<00:00,  1.45s/it]
 10%|██████████████▊                                                                                                                                     | 1/10 [03:26<31:02, 206.99s/it]

Epoch [1/10], Train Loss: 0.2236, Val Loss: 3.4139, Val Accuracy: 0.4857, Val F1 Score: 0.6361


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:17<00:00,  1.45s/it]
 20%|█████████████████████████████▌                                                                                                                      | 2/10 [06:54<27:36, 207.06s/it]

Epoch [2/10], Train Loss: 0.1313, Val Loss: 0.1020, Val Accuracy: 0.6290, Val F1 Score: 0.7869


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:17<00:00,  1.44s/it]
 30%|████████████████████████████████████████████▍                                                                                                       | 3/10 [10:20<24:06, 206.69s/it]

Epoch [3/10], Train Loss: 0.0973, Val Loss: 0.1063, Val Accuracy: 0.6400, Val F1 Score: 0.7987


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:17<00:00,  1.44s/it]
 40%|███████████████████████████████████████████████████████████▏                                                                                        | 4/10 [13:47<20:40, 206.83s/it]

Epoch [4/10], Train Loss: 0.0885, Val Loss: 0.0965, Val Accuracy: 0.6473, Val F1 Score: 0.8092


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:17<00:00,  1.44s/it]
 50%|██████████████████████████████████████████████████████████████████████████                                                                          | 5/10 [17:14<17:13, 206.78s/it]

Epoch [5/10], Train Loss: 0.0796, Val Loss: 0.0991, Val Accuracy: 0.6470, Val F1 Score: 0.8101


 50%|██████████████████████████████████████████████████████████████████████████                                                                          | 5/10 [17:51<17:51, 214.35s/it]


KeyboardInterrupt: 

In [None]:
def predict(data_loader, save_path):
  model.eval()
  predictions = []
  with torch.no_grad():
      for images, captions in tqdm(data_loader, position=0, leave=True):
          images, captions = images.to(device), captions.to(device)

          outputs = model(images, captions)
          loss = criterion(outputs, labels.float())
          val_loss += loss.item() * images.size(0)

          all_labels.append(labels.cpu().numpy())
          all_outputs.append(outputs.cpu().numpy())

In [62]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [44]:
import os
print(os.listdir())

['training_metrics.txt', 'models', 'processeddata', 'failedTests', 'training_log.txt', 'lstm.ipynb', 'data', 'exploration.ipynb', 'processed-data', 'README.md', '.git', 'temp.txt', '.gitignore', 'resnet50.ipynb', 'predictions', 'best_model.pth', 'training_log.csv', 'preprocessText.ipynb']


In [63]:
os.path.getsize('./best_model.pth')

97191964