In [584]:
import re
import pandas as pd
from io import StringIO
import nltk
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

lemma = nltk.wordnet.WordNetLemmatizer()

ps = PorterStemmer()

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/matt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [585]:
def extractDataFrame(fileName):
  with open(fileName) as file:
    lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")
  return df

In [586]:
def splitLabels(labels):
  return [int(label) for label in labels.split(' ')]

In [587]:
def tokenise(caption):
    # Replace non-alphabetic characters with single whitespace
    caption = re.sub(r'[^a-zA-Z\s]', ' ', caption.lower())
    # Remove any whitespace that appears in sequence
    caption = re.sub(r"\s+", " ", caption)
    # Remove new leading and trailing whitespace
    caption = caption.strip()
    # Apply stemmer to each remaining word in sentence
    caption = lemma.lemmatize(caption)
    caption = ps.stem(caption)

    # Tokenize
    word_tokens = word_tokenize(caption)
    # Remove stop words
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    # Replace value with tokenised data
    return filtered_sentence

In [588]:
train = extractDataFrame('data/train.csv')

train['Tokens'] = train['Caption'].apply(tokenise)
train['Labels'] = train['Labels'].apply(splitLabels)

uniqueLabels = set()
for labels in train['Labels']:
  uniqueLabels.update(labels)

numUniqueLabels = len(uniqueLabels)
maxLabel = max(uniqueLabels)

print(uniqueLabels)
print('Unique labels:', numUniqueLabels)
print('Max label value:', maxLabel)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}
Unique labels: 18
Max label value: 19


In [589]:
def labelMapping(uLabels):
  labelToIdMapping = {}
  idToLabelMapping = {}
  for idx, label in enumerate(uLabels):
    labelToIdMapping[label] = idx
    idToLabelMapping[idx] = label
  return labelToIdMapping, idToLabelMapping

In [590]:
labelToIdMapping, idToLabelMapping = labelMapping(uniqueLabels)

print(idToLabelMapping)

def mapLabelToId(labels):
  ids = []
  for label in labels:
    ids.append(labelToIdMapping[label])
  return ids

{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19}


In [591]:
def oneHotEncodeLabel(labels):
  oneHotClasses = np.zeros([numUniqueLabels, numUniqueLabels])
  for label in labels:
    # Onehot encode each label in the list
    oneHotClasses[label][label] = 1.
  return oneHotClasses

In [592]:
train['LabelIds'] = train['Labels'].apply(mapLabelToId)
train['EncodedLabels'] = train['LabelIds'].apply(oneHotEncodeLabel)

print(train['Labels'][3])
print(train['LabelIds'][3])
print(train['EncodedLabels'][3])

[8, 3, 13]
[7, 2, 11]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0

In [593]:
tokenLengths = []
for x in range(train.shape[0]):
    tokenLengths.append(len(train["Tokens"][x]))

maxTokenLength = max(tokenLengths)
print('Max token Length:', maxTokenLength)

Max token Length: 28


In [594]:
# Initialise vocabulary including padding and unknown word tokens
# Is end of sentence needed? '[EOS]'
vocabulary = set(['[PAD]', '[UNK]'])

for tokens in train["Tokens"]:
  vocabulary.update(tokens)

print(vocabulary)
print('Unique words:', len(vocabulary))

Unique words: 7330


In [595]:
dictionary = {}
idToDictionary = {}
for idx, word in enumerate(vocabulary):
  dictionary[word] = idx
  idToDictionary[idx] = word

# print(dictionary)

In [596]:
# Add dictionary IDs column to dataframe
def addDictionaryIds(tokens):
    ids = []
    for token in tokens:
      if token in dictionary:
        ids.append(dictionary[token])
      else:
        ids.append(dictionary['[UNK]'])
    return ids

In [597]:
train['DictionaryIds'] = train['Tokens'].apply(addDictionaryIds)

In [598]:
def addTokenPadding(tokens):
  padding = maxTokenLength - len(tokens)
  return tokens + padding*[dictionary['[PAD]']]

In [599]:
train['TokensWithPadding'] = train['DictionaryIds'].apply(addTokenPadding)
train.head()

Unnamed: 0,ImageID,Labels,Caption,Tokens,LabelIds,EncodedLabels,DictionaryIds,TokensWithPadding
0,0.jpg,[1],Woman in swim suit holding parasol on sunny day.,"[woman, swim, suit, holding, parasol, sunny, day]",[0],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[4899, 4656, 5733, 365, 2191, 3385, 2802]","[4899, 4656, 5733, 365, 2191, 3385, 2802, 3925..."
1,1.jpg,"[1, 19]",A couple of men riding horses on top of a gree...,"[couple, men, riding, horses, top, green, field]","[0, 17]","[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[6938, 5148, 742, 3540, 5184, 1839, 3062]","[6938, 5148, 742, 3540, 5184, 1839, 3062, 3925..."
2,2.jpg,[1],They are brave for riding in the jungle on tho...,"[brave, riding, jungle, eleph]",[0],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[713, 742, 4532, 4256]","[713, 742, 4532, 4256, 3925, 3925, 3925, 3925,..."
3,3.jpg,"[8, 3, 13]",a black and silver clock tower at an intersect...,"[black, silver, clock, tower, intersection, ne...","[7, 2, 11]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[6880, 7121, 5664, 5544, 7318, 5848, 6545]","[6880, 7121, 5664, 5544, 7318, 5848, 6545, 392..."
4,4.jpg,"[8, 3, 7]",A train coming to a stop on the tracks out side.,"[train, coming, stop, tracks, sid]","[7, 2, 6]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[3105, 161, 4425, 3010, 2582]","[3105, 161, 4425, 3010, 2582, 3925, 3925, 3925..."


In [600]:
# Setup test data

# Get test CSV dataframe
test = extractDataFrame('data/test.csv')

# Tokenise
test['Tokens'] = test['Caption'].apply(tokenise)
# Apply dictionary IDs
test['DictionaryIds'] = test['Tokens'].apply(addDictionaryIds)
# Add padding
test['TokensWithPadding'] = test['DictionaryIds'].apply(addTokenPadding)

test.head()

Unnamed: 0,ImageID,Caption,Tokens,DictionaryIds,TokensWithPadding
0,30000.jpg,A little girl waring a krispy kreme hat holdin...,"[little, girl, waring, krispy, kreme, hat, hol...","[241, 4577, 4157, 1828, 4802, 765, 365, 2468, ...","[241, 4577, 4157, 1828, 4802, 765, 365, 2468, ..."
1,30001.jpg,A beautiful young woman holding an orange fris...,"[beautiful, young, woman, holding, orange, fri...","[1353, 1685, 4899, 365, 2240, 7161]","[1353, 1685, 4899, 365, 2240, 7161, 3925, 3925..."
2,30002.jpg,A group of people sitting on couch next to a c...,"[group, people, sitting, couch, next, coffee]","[610, 4101, 1495, 4485, 5804, 6481]","[610, 4101, 1495, 4485, 5804, 6481, 3925, 3925..."
3,30003.jpg,A person on a snowboard rides on the hill.,"[person, snowboard, rides, hil]","[3718, 5041, 1316, 71]","[3718, 5041, 1316, 71, 3925, 3925, 3925, 3925,..."
4,30004.jpg,A man riding a skateboard with a helmet on in ...,"[man, riding, skateboard, helmet, con]","[298, 742, 2713, 3483, 4922]","[298, 742, 2713, 3483, 4922, 3925, 3925, 3925,..."


In [601]:
train.to_csv('./processed-data/train.csv')
test.to_csv('./processed-data/test.csv')