## Generate corpus and ground-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs, corpus], where:
    - corpus_widxs is a list of lists with the index of words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the index of POS tags in the POS tagging vocabulary
    - corpus is the list of the original caption strings
1. val_data: same format as train_data.
2. test_data: in format [None, vidxs, None] (only the video indexes are stored).
3. vocabulary: in format {'word': count}.
4. idx2word: the vocabulary in format {idx: 'word'}.
5. word_embeddings: the vectors of each word. The i-th row is the word vector of the word with index i in idx2word.
6. idx2pos: the vocabulary of POS tags in format {idx: 'POSTAG'}.

### Generate split for training and validation

In [2]:
import pandas as pd

# Load the annotation table (tab-separated). Only columns 0, 1, 2, 6 and 9 are
# read, and they are given the explicit names listed below.
# `sep` is passed by keyword: since pandas 2.0 every read_csv argument after
# the path is keyword-only, so a positional '\t' raises TypeError there.
data = pd.read_csv(
    '../../../data/TACoS-Multi-Level/corpus/annosDetailed-processed.csv',
    sep='\t',
    usecols=[0, 1, 2, 6, 9],
    names=['video-id', 'start-frame', 'end-frame', 'sentence', 'label'],
    engine='python',
)
data

Unnamed: 0,video-id,start-frame,end-frame,sentence,label
0,s13-d21,185,224,the person walked into the kitchen,enter
1,s13-d21,185,224,the person entered the kitchen,enter
2,s13-d21,185,224,the person entered the kitchen,enter
3,s13-d21,185,224,the person entered a kitchen,enter
4,s13-d21,185,224,the person walked into the kitchen,enter
...,...,...,...,...,...
52588,s37-d74,11013,11041,the person unplugged the machine,unplug
52589,s37-d74,11013,11041,the person unplugged the coffee maker,unplug
52590,s37-d74,11013,11041,the person unplugged the machine,unplug
52591,s37-d74,11013,11041,the person unplugged the coffeemaker,unplug


In [3]:
# Every annotated sentence goes into the training corpus; each caption is
# paired (by position) with the id of the video it belongs to.
train_vidxs = list(data['video-id'])
train_corpus = list(data['sentence'])

# NOTE(review): valid_vidxs, valid_corpus and test_vidxs appear only in the
# commented-out lines below, but later cells read them. They have to be
# defined before those cells can run.
# valid_vidxs, valid_corpus = zip(*[(int(d['id']), d['label']) for d in valid_data])
# test_vidxs = [(int(d['id'])) for d in test_data]

### Get pretrained embeddings

In [4]:
import os
import numpy as np

# Map every GloVe token to its vector. A valid line holds the token followed
# by 300 space-separated floats (301 fields); any other line is skipped.
wordvectors = {}
# Alternative embeddings file: './glove.42B.300d.txt'
# The encoding is stated explicitly so that parsing does not depend on the
# platform's default text encoding.
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split(' ')
        if len(fields) == 301:
            wordvectors[fields[0]] = np.array(fields[1:], dtype=float)

print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [5]:
import nltk
nltk.download('punkt')

# Count how often each lower-cased token occurs in the training captions.
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        # dict.get replaces a bare try/except, which would also have hidden
        # unrelated errors (including KeyboardInterrupt).
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Words without a pretrained vector cannot get an embedding row, so they are
# removed from the vocabulary (they will be encoded as <unk> later on).
to_del = [w for w in vocab if w not in wordvectors]
for w in to_del:
    print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Indexes 0 and 1 are reserved for the special tokens; real words start at 2,
# in the order in which they were first seen.
EOS, UNK = 0, 1
vocab_words = ['<eos>', '<unk>'] + list(vocab)
idx2word = dict(enumerate(vocab_words))
word2idx = {word: idx for idx, word in idx2word.items()}

print(len(vocab), len(idx2word), len(word2idx))

# Row i of the matrix is the vector of idx2word[i]. The special tokens reuse
# the pretrained vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
word_embeddings[EOS] = wordvectors['eos']
word_embeddings[UNK] = wordvectors['unk']
for idx, word in idx2word.items():
    if idx not in (EOS, UNK):
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 8.27064438233225
Count of unique words:  2863
missing word: refriderator
missing word: medium-thick
missing word: counter-top
missing word: re-closed
missing word: re-wrapped
missing word: straight-edged
missing word: cup-board
missing word: cubbard
missing word: dishtowel
missing word: grapfruit
missing word: hhe
missing word: pieceshe
missing word: stove-top
missing word: drainer
missing word: longways
missing word: pluged
missing word: re-sealed
missing word: adusted
missing word: timer/settings
missing word: -lrb-
missing word: -rrb-
missing word: outter
missing word: choped
missing word: untoed
missing word: un-potted
missing word: wrraped
missing word: diffferent
missing word: width-wise
missing word: length-wise
missing word: drainboard
missing word: potaoes
missing word: evenly-sized
missing word: egg-whites
missing word: outershell
missing word: snowpeas
missing word: hand-washed
missing word: de-stemmed
missing word: flowerets
missing word: ov

### Determine POS-tagging vocabulary from train split

In [52]:
import nltk

# pos_vocab:        {tag: number of tokens that received this tag}
# pos_unique_words: {tag: {word: number of times the word received this tag}}
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        # Explicit get/setdefault replaces nested bare try/except blocks, in
        # which any error raised by the inner handler would have silently
        # reset the tag's count to 1.
        pos_vocab[tag] = pos_vocab.get(tag, 0) + 1
        words_for_tag = pos_unique_words.setdefault(tag, {})
        words_for_tag[word] = words_for_tag.get(word, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags; real tags follow in the
# order in which they were first seen.
pos_tags = ['eos', 'unk'] + list(pos_vocab)
idx2pos = dict(enumerate(pos_tags))
pos2idx = {tag: idx for idx, tag in idx2pos.items()}
EOS, UNK = 0, 1
print(len(idx2pos))

Unique words per tag:
 DT:	19
 NN:	1178
 VBD:	577
 IN:	76
 NNS:	347
 VBG:	198
 CC:	6
 JJ:	582
 TO:	1
 VB:	306
 RP:	15
 ,:	1
 JJR:	28
 RB:	196
 PRP$:	5
 VBN:	172
 CD:	22
 PRP:	11
 FW:	6
 VBP:	65
 WDT:	3
 RBR:	11
 VBZ:	80
 PDT:	6
 POS:	2
 MD:	5
 JJS:	9
 EX:	1
 WRB:	2
 WP:	3
 NNP:	6
 ::	3
 ``:	2
 .:	2
 RBS:	1
 '':	1
38


### Determine Universal POS-tagging from train split

In [53]:
import nltk
nltk.download('universal_tagset')

# upos_vocab:        {universal tag: number of tokens that received this tag}
# upos_unique_words: {universal tag: {word: times the word received this tag}}
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        # Explicit get/setdefault replaces nested bare try/except blocks, in
        # which any error raised by the inner handler would have silently
        # reset the tag's count to 1.
        upos_vocab[tag] = upos_vocab.get(tag, 0) + 1
        words_for_tag = upos_unique_words.setdefault(tag, {})
        words_for_tag[word] = words_for_tag.get(word, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags; real tags follow in the
# order in which they were first seen.
upos_tags = ['eos', 'unk'] + list(upos_vocab)
idx2upos = dict(enumerate(upos_tags))
upos2idx = {tag: idx for idx, tag in idx2upos.items()}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 DET:	26
 NOUN:	1475
 VERB:	1178
 ADP:	76
 CONJ:	6
 ADJ:	609
 PRT:	18
 .:	9
 ADV:	207
 PRON:	19
 NUM:	22
 X:	6
14


### Generate ground-truth references files

In [20]:
# Write the validation references: one "<video id>\t<lower-cased caption>"
# line per caption.
# NOTE(review): valid_vidxs and valid_corpus are not assigned anywhere in the
# visible notebook (only inside a commented-out line), so this cell raises
# NameError until a validation split is defined.
with open('../results/20B-SS-v2_val_references.txt', 'w') as f:
    f.writelines(
        '{}\t{}\n'.format(vidx, cap.lower())
        for vidx, cap in zip(valid_vidxs, valid_corpus)
    )

### Generate corpus.pkl file

In [21]:
import pickle


def _encode_captions(captions):
    """Convert raw captions into word-index and POS-index sequences.

    Each caption is lower-cased and tokenized once; the same tokens feed both
    the vocabulary lookup and the POS tagger. Words missing from `vocab` and
    tags missing from `pos_vocab` are mapped to UNK, and every sequence is
    terminated with EOS.

    Returns a pair (widxs, pidxs) of lists holding one inner list per caption.
    """
    widxs, pidxs = [], []
    for cap in captions:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[tag] if tag in pos_vocab else UNK for _, tag in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


# NOTE(review): valid_corpus, valid_vidxs and test_vidxs are not assigned
# anywhere in the visible notebook; this cell raises NameError until the
# validation/test splits are defined.
train_corpus_widxs, train_corpus_pidxs = _encode_captions(train_corpus)
valid_corpus_widxs, valid_corpus_pidxs = _encode_captions(valid_corpus)

# Sanity checks: every caption must have exactly one video id and one encoding.
assert len(train_corpus_widxs) == len(train_vidxs) and len(train_vidxs) == len(train_corpus_pidxs) and len(train_vidxs) == len(train_corpus)
assert len(valid_corpus_widxs) == len(valid_vidxs) and len(valid_vidxs) == len(valid_corpus_pidxs) and len(valid_vidxs) == len(valid_corpus)

# Each split is stored as [word indexes, video ids, POS indexes, raw captions];
# the test split only carries video ids.
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [None, test_vidxs, None]

# NOTE(review): the captions are read from a TACoS-Multi-Level file, but the
# pickle is written into a Something-Something-v2 directory. Confirm the
# destination path is intended.
with open('../../../data/Something-Something-v2/20b-ss-v2_corpus_pos.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)