In [29]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(0))
# ^ Comment out if cpu :)

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time

Mon May 13 17:59:55 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.10              Driver Version: 551.61         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...    On  |   00000000:01:00.0 Off |                  N/A |
| N/A   48C    P8              9W /   64W |     358MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [31]:
# The numpy arrays seem to have been converted to strings when saved to csv. We need to convert them back into lists

train = pd.read_csv('./processed-data/train.csv', converters={'EncodedLabels': pd.eval, 'TokensWithPadding': pd.eval})
train.head()


Unnamed: 0.1,Unnamed: 0,ImageID,Labels,Caption,Tokens,LabelIds,EncodedLabels,DictionaryIds,TokensWithPadding
0,0,0.jpg,[1],Woman in swim suit holding parasol on sunny day.,"['woman', 'swim', 'suit', 'holding', 'parasol'...",[0],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7236, 6376, 6297, 3077, 4457, 6316, 1714]","[7236, 6376, 6297, 3077, 4457, 6316, 1714, 0, ..."
1,1,1.jpg,"[1, 19]",A couple of men riding horses on top of a gree...,"['couple', 'men', 'riding', 'horses', 'top', '...","[0, 17]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1524, 3891, 5237, 3117, 6650, 2827, 2351]","[1524, 3891, 5237, 3117, 6650, 2827, 2351, 0, ..."
2,2,2.jpg,[1],They are brave for riding in the jungle on tho...,"['brave', 'riding', 'jungle', 'eleph']",[0],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[754, 5237, 3368, 2093]","[754, 5237, 3368, 2093, 0, 0, 0, 0, 0, 0, 0, 0..."
3,3,3.jpg,"[8, 3, 13]",a black and silver clock tower at an intersect...,"['black', 'silver', 'clock', 'tower', 'interse...","[7, 2, 11]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[616, 5718, 1257, 6684, 3267, 4138, 6738]","[616, 5718, 1257, 6684, 3267, 4138, 6738, 0, 0..."
4,4,4.jpg,"[8, 3, 7]",A train coming to a stop on the tracks out side.,"['train', 'coming', 'stop', 'tracks', 'sid']","[7, 2, 6]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","[6704, 1367, 6178, 6694, 5677]","[6704, 1367, 6178, 6694, 5677, 0, 0, 0, 0, 0, ..."


In [32]:
# Mismatched storage :(
print(type(train['EncodedLabels']))
print(type(train['EncodedLabels'][0]))
print(type(train['TokensWithPadding']))
print(type(train['TokensWithPadding'][0]))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


In [33]:
# Make type consistent and convert to tensor
def dataframeColumnToTensor(column):
  array = []
  for row in column:
    array.append(row.tolist())
  return torch.tensor(array)

In [34]:
trainingTokens = dataframeColumnToTensor(train['TokensWithPadding'])
trainingEncodedLabels = dataframeColumnToTensor(train['EncodedLabels'])
# Sanity check for tensors
print(type(trainingTokens))
print(type(trainingTokens[0]))
print(type(trainingEncodedLabels))
print(type(trainingEncodedLabels[0]))
print(type(trainingEncodedLabels[0][0]))

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [51]:
# Split out validation set
xTrain, xVal, yTrain, yVal = train_test_split(
  trainingTokens,
  trainingEncodedLabels,
  test_size=0.15,
  random_state=7
)

# Tokens
print(xTrain.shape)
print(xVal.shape)
# Labels
print(yTrain.shape)
print(yVal.shape)

torch.Size([25496, 28])
torch.Size([4500, 28])
torch.Size([25496, 18])
torch.Size([4500, 18])


In [52]:
#  Hyperparameter
BATCH_SIZE=128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

trainData = TensorDataset(xTrain, yTrain)
testData = TensorDataset(xVal, yVal)

trainLoader = DataLoader(dataset=trainData, batch_size=BATCH_SIZE, shuffle=True)
testLoader = DataLoader(dataset=testData, shuffle=True)

In [53]:
with open('./processed-data/dictionary.json') as f:
    dictionary = json.load(f)

In [54]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, dictionary_size, num_labels, dropout_prob, bidirectional):
        super(LSTMClassifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dictionary_size = dictionary_size
        self.num_labels = num_labels
        self.dropout_prob = dropout_prob
        self.bidirectional = bidirectional

        self.word_embeddings = nn.Embedding(dictionary_size, input_size)

        # two lstms? num_layers=2
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

        # Linear layer for each label
        self.classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(p=dropout_prob),
                nn.Linear(in_features=hidden_size * (2 if bidirectional else 1), out_features=1)
            ) for _ in range(num_labels)
        ])

    def forward(self, sentences):
        # Create the word embeddings
        embeds = self.word_embeddings(sentences)

        # Pass it through the LSTM
        lstm_out, _ = self.lstm(embeds)

        # Get the last time step output for each label
        if self.bidirectional:
            lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_size], lstm_out[:, 0, self.hidden_size:]), dim=1)
        else:
            lstm_out = lstm_out[:, -1, :]
        
        # Calculate the predictions for each label
        outputs = {f'label{i+1}': torch.sigmoid(self.classifiers[i](lstm_out)).squeeze() for i in range(self.num_labels)}
        return outputs

In [70]:
# Hyperparameters
INPUT_SIZE = 128
HIDDEN_SIZE = 256
DICTIONARY_SIZE = len(dictionary)
LEARNING_RATE=0.001
NUM_LABELS=18
DROPOUT_RATE=0.4
BIDIRECTIONAL=False

# Initialize model, loss function, and optimizer
model = LSTMClassifier(INPUT_SIZE, HIDDEN_SIZE, DICTIONARY_SIZE, NUM_LABELS, DROPOUT_RATE, BIDIRECTIONAL).to(device)

model = LSTMClassifier(INPUT_SIZE, HIDDEN_SIZE, DICTIONARY_SIZE).to(device)

# Load previous model for continued training
# Caveat - the model structure can't be changed else the weight dimensions won't match
# model.load_state_dict(torch.load('./models/lstm.pt'))
# model.eval()

loss_function = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [83]:
def calculate_loss(loss_func, outputs, targets):
    total_loss = 0
    for idx, key in enumerate(outputs):
        output = outputs[key].to(device).double()
        target = targets[:, idx].to(device).double()
        total_loss += loss_func(output, target)
    return total_loss

def combine_and_threshold_predictions(class_predictions):
    combined_tensor = None
    for i, key in enumerate(class_predictions.keys()):
        if i == 0:
            combined_tensor = class_predictions[key]
        else:
            all_entries = []
            for i in range(combined_tensor.shape[0]):
                l = combined_tensor[i].tolist()
                if isinstance(l, float):
                    l = [l, class_predictions[key][i].item()]
                else:
                    l.append(class_predictions[key][i].item())
                all_entries.append(l)
            combined_tensor = torch.tensor(all_entries)
    return combined_tensor.detach().apply_(lambda x: int(x > 0.5))

def training_epoch(model, optimizer, loss_function, data_loader):
    epoch_loss = 0.0
    correct_predictions = 0
    total_examples = 0

    for tokens, targets in data_loader:
        tokens = tokens.to(device)
        targets = targets.to(device)

        batch_size = tokens.shape[0]
        total_examples += batch_size

        model.train()
        optimizer.zero_grad()

        class_predictions = model(tokens)
        loss = calculate_loss(loss_function, class_predictions, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * batch_size

        combined_predictions = combine_and_threshold_predictions(class_predictions)

        correct_predictions += accuracy_score(combined_predictions.cpu().numpy(), targets.cpu().numpy())*batch_size
    return epoch_loss, correct_predictions, total_examples

epochs = 20

startTime = time.time()

for epoch in range(epochs):
    loss, correct, total = training_epoch(model, optimizer, loss_function, trainLoader)
        
    if epoch % 5 == 0 or epoch == epochs - 1:
            print(f'Time elapsed: {(time.time() - startTime) / 60:.1f} mins')
            print(f'Epoch: {epoch + 1}, Training loss: {loss / total:.4f}, Training accuracy: {correct / total * 100:.2f}%')

print(f'Total Training Time: {(time.time() - startTime) / 60:.4f} mins')

Time elapsed: 0.4 mins
Epoch: 1, Training loss: 882.5585, Training accuracy: 56.79%


KeyboardInterrupt: 

In [81]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 5.105MB


In [84]:
torch.save(model.state_dict(), './models/lstm.pt')