In [1]:
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(0))
# ^ Comment out if cpu :)

import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import csv
from datetime import datetime

NVIDIA GeForce RTX 3060 Laptop GPU


In [2]:
# The numpy arrays seem to have been converted to strings when saved to csv. We need to convert them back into lists

train = pd.read_csv('./processed-data/train.csv', converters={'EncodedLabels': pd.eval, 'TokensWithPadding': pd.eval})
train.head()


Unnamed: 0.1,Unnamed: 0,ImageID,Labels,Caption,Tokens,LabelIds,EncodedLabels,DictionaryIds,TokensWithPadding
0,0,0.jpg,[1],Woman in swim suit holding parasol on sunny day.,"['woman', 'swim', 'suit', 'holding', 'parasol'...",[0],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7236, 6376, 6297, 3077, 4457, 6316, 1714]","[7236, 6376, 6297, 3077, 4457, 6316, 1714, 0, ..."
1,1,1.jpg,"[1, 19]",A couple of men riding horses on top of a gree...,"['couple', 'men', 'riding', 'horses', 'top', '...","[0, 17]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1524, 3891, 5237, 3117, 6650, 2827, 2351]","[1524, 3891, 5237, 3117, 6650, 2827, 2351, 0, ..."
2,2,2.jpg,[1],They are brave for riding in the jungle on tho...,"['brave', 'riding', 'jungle', 'eleph']",[0],"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[754, 5237, 3368, 2093]","[754, 5237, 3368, 2093, 0, 0, 0, 0, 0, 0, 0, 0..."
3,3,3.jpg,"[8, 3, 13]",a black and silver clock tower at an intersect...,"['black', 'silver', 'clock', 'tower', 'interse...","[7, 2, 11]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[616, 5718, 1257, 6684, 3267, 4138, 6738]","[616, 5718, 1257, 6684, 3267, 4138, 6738, 0, 0..."
4,4,4.jpg,"[8, 3, 7]",A train coming to a stop on the tracks out side.,"['train', 'coming', 'stop', 'tracks', 'sid']","[7, 2, 6]","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","[6704, 1367, 6178, 6694, 5677]","[6704, 1367, 6178, 6694, 5677, 0, 0, 0, 0, 0, ..."


In [3]:
# Mismatched storage :(
print(type(train['EncodedLabels']))
print(type(train['EncodedLabels'][0]))
print(type(train['TokensWithPadding']))
print(type(train['TokensWithPadding'][0]))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


In [4]:
# Make type consistent and convert to tensor
def dataframeColumnToTensor(column):
  array = []
  for row in column:
    array.append(row.tolist())
  return torch.tensor(array)

In [5]:
trainingTokens = dataframeColumnToTensor(train['TokensWithPadding'])
trainingEncodedLabels = dataframeColumnToTensor(train['EncodedLabels'])
# Sanity check for tensors
print(type(trainingTokens))
print(type(trainingTokens[0]))
print(type(trainingEncodedLabels))
print(type(trainingEncodedLabels[0]))
print(type(trainingEncodedLabels[0][0]))

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [6]:
# Split out validation set
xTrain, xVal, yTrain, yVal = train_test_split(
  trainingTokens,
  trainingEncodedLabels,
  test_size=0.15,
  random_state=7
)

# Tokens
print(xTrain.shape)
print(xVal.shape)
# Labels
print(yTrain.shape)
print(yVal.shape)

In [7]:
#  Hyperparameter
BATCH_SIZE=128

trainData = TensorDataset(trainingTokens, trainingEncodedLabels)
validationData = TensorDataset(xVal, yVal)

trainLoader = DataLoader(dataset=trainData, batch_size=BATCH_SIZE, shuffle=True)
validationLoader = DataLoader(dataset=validationData, shuffle=True)

In [8]:
with open('./processed-data/dictionary.json') as f:
    dictionary = json.load(f)

In [9]:
import torch.nn as nn

# Define classifier
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, dictionary_size, num_labels, dropout_prob, bidirectional):
        super(LSTMClassifier, self).__init__()
        # MAx token length
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Vocabulary
        self.dictionary_size = dictionary_size
        self.num_labels = num_labels
        self.dropout_prob = dropout_prob
        self.bidirectional = bidirectional

        self.word_embeddings = nn.Embedding(dictionary_size, input_size)

        # Test with two lstms. num_layers=2
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)

        # Linear layer for each label
        self.classifiers = nn.ModuleList([
            nn.Sequential(
                nn.Dropout(p=dropout_prob),
                nn.Linear(in_features=hidden_size * (2 if bidirectional else 1), out_features=1)
            ) for _ in range(num_labels)
        ])

    def forward(self, sentences):
        # Create the word embeddings
        embeds = self.word_embeddings(sentences)

        # Pass it through the LSTM
        lstm_out, _ = self.lstm(embeds)

        # Get the last time step output for each label
        if self.bidirectional:
            lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_size], lstm_out[:, 0, self.hidden_size:]), dim=1)
        else:
            lstm_out = lstm_out[:, -1, :]
        
        # Calculate the predictions for each label
        outputs = {f'label{i+1}': torch.sigmoid(self.classifiers[i](lstm_out)).squeeze() for i in range(self.num_labels)}
        return outputs

In [10]:
# Hyperparameters
INPUT_SIZE = 28
HIDDEN_SIZE = 256
DICTIONARY_SIZE = len(dictionary)
LEARNING_RATE=0.001
NUM_LABELS=18
DROPOUT_RATE=0.5
BIDIRECTIONAL=True

# Initialize model, loss function, and optimizer
model = LSTMClassifier(INPUT_SIZE, HIDDEN_SIZE, DICTIONARY_SIZE, NUM_LABELS, DROPOUT_RATE, BIDIRECTIONAL).to(device)

# Load previous model for continued training
# Caveat - the model structure can't be changed else the weight dimensions won't match
# model.load_state_dict(torch.load('models/lstm_89per_600e_1lstm_50drop_bidirectional.pt'))
# model.eval()

loss_function = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [140]:
def calculate_loss(loss_func, outputs, targets):
    total_loss = 0
    for idx, key in enumerate(outputs):
        output = outputs[key].to(device).double()
        target = targets[:, idx].to(device).double()
        total_loss += loss_func(output, target)
    return total_loss

# Take all the batched predictions and get individual labels
def combine_and_threshold_predictions(class_predictions):
    combined_tensor = [(class_predictions[label]) for label in class_predictions.keys()]
    combined_tensor = torch.stack(combined_tensor).T
    return combined_tensor.detach().cpu().apply_(lambda x: int(x > 0.5))

def training_epoch(model, optimizer, loss_function, trainLoader):
    epoch_loss = 0.0
    correct_predictions = 0
    total_examples = 0

    for tokens, targets in trainLoader:
        tokens = tokens.to(device)
        targets = targets.to(device)

        batch_size = tokens.shape[0]
        total_examples += batch_size

        model.train()
        optimizer.zero_grad()

        class_predictions = model(tokens)
        loss = calculate_loss(loss_function, class_predictions, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * batch_size

        combined_predictions = combine_and_threshold_predictions(class_predictions)

        correct_predictions += accuracy_score(combined_predictions.cpu().numpy(), targets.cpu().numpy())*batch_size
    return epoch_loss, correct_predictions, total_examples

epochs = 300

startTime = time.time()

for epoch in range(epochs):
    loss, correct, total = training_epoch(model, optimizer, loss_function, trainLoader)
        
    if epoch % 5 == 0 or epoch == epochs - 1:
            print(f'Time elapsed: {(time.time() - startTime) / 60:.1f} mins')
            print(f'Epoch: {epoch + 1}, Training loss: {loss / total:.4f}, Training accuracy: {correct / total * 100:.2f}%')

print(f'Total Training Time: {(time.time() - startTime) / 60:.4f} mins')

Time elapsed: 0.1 mins
Epoch: 1, Training loss: 866.9306, Training accuracy: 85.46%
Time elapsed: 0.3 mins
Epoch: 6, Training loss: 866.6493, Training accuracy: 86.16%
Time elapsed: 0.6 mins
Epoch: 11, Training loss: 866.6069, Training accuracy: 86.81%
Time elapsed: 0.8 mins
Epoch: 16, Training loss: 866.5367, Training accuracy: 86.93%
Time elapsed: 1.1 mins
Epoch: 21, Training loss: 866.4456, Training accuracy: 87.15%
Time elapsed: 1.4 mins
Epoch: 26, Training loss: 866.5102, Training accuracy: 87.22%
Time elapsed: 1.6 mins
Epoch: 31, Training loss: 866.4626, Training accuracy: 87.51%
Time elapsed: 1.9 mins
Epoch: 36, Training loss: 866.5276, Training accuracy: 87.55%
Time elapsed: 2.2 mins
Epoch: 41, Training loss: 866.4223, Training accuracy: 87.50%
Time elapsed: 2.5 mins
Epoch: 46, Training loss: 866.3824, Training accuracy: 87.68%
Time elapsed: 2.7 mins
Epoch: 51, Training loss: 866.3660, Training accuracy: 87.79%
Time elapsed: 3.0 mins
Epoch: 56, Training loss: 866.2569, Training

In [11]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 6.630MB


In [142]:
torch.save(model.state_dict(), './models/lstm.pt')

In [12]:
test = pd.read_csv('./processed-data/test.csv', converters={'TokensWithPadding': pd.eval})
test.head()

testingTokens = dataframeColumnToTensor(test['TokensWithPadding'])
# Sanity check for tensors
print(type(testingTokens))
print(type(testingTokens[0]))

testLoader = DataLoader(dataset=testingTokens, shuffle=False)

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [37]:
# Get predictions in usable format
def extractOneHotEncoding(prediction):
    labelProbabilities = []
    rawPrediction = []
    for label in prediction.keys():
        newPrediction = prediction[label]
        labelProbabilities.append(int(newPrediction > 0.5))
        rawPrediction.append(float(newPrediction))
    return [i for i, x in enumerate(labelProbabilities) if x == 1], rawPrediction

model.eval()
count = 0
predictions = {}
rawPredictions = []
for tokens in testLoader:
    # Move tokens to the same device as the model
    tokens = tokens.to(device)
    with torch.no_grad():
        oneHot, rawPrediction = extractOneHotEncoding(model(tokens))
        predictions[f'{count + 30000}.jpg'] = oneHot
        rawPredictions.append(rawPrediction)
    count+=1


In [145]:
# Cater for the missing class 12
def classMapping(prediction):
    if prediction == 0:
      return 1
    elif prediction == 1:
      return 2
    elif prediction == 2:
      return 3
    elif prediction == 3:
      return 4
    elif prediction == 4:
      return 5
    elif prediction == 5:
      return 6
    elif prediction == 6:
      return 7
    elif prediction == 7:
      return 8
    elif prediction == 8:
      return 9
    elif prediction == 9:
      return 10
    elif prediction == 10:
      return 11
    elif prediction == 11:
      return 13
    elif prediction == 12:
      return 14
    elif prediction == 13:
      return 15
    elif prediction == 14:
      return 16
    elif prediction == 15:
      return 17
    elif prediction == 16:
      return 18
    elif prediction == 17:
      return 19
    else:
      return 0

filepath = 'predictions/' + datetime.now().strftime("%d-%m-%Y-%H-%M") + '-510369965-490424191-490299418.csv'

with open(filepath, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ImageID', 'Labels'])
    for key in predictions.keys():
        row = []
        for predictedClass in predictions[key]:
            mapped = classMapping(predictedClass)
            if mapped > 0:
              row.append(mapped)
        row = list(set(row))
        writer.writerow([key, " ".join(str(label) for label in row)])

In [38]:

# raw predictions are used for the ensemble
with open('./processed-data/rawPredictions-lstm.json', 'w', encoding='utf-8') as f:
    json.dump(rawPredictions, f, ensure_ascii=False, indent=4)