In [25]:
import os
import random

from tqdm import tqdm_notebook as tqdm
import torch
import torchtext
from torchtext.vocab import GloVe
from transformers import DistilBertTokenizer
from tqdm import tqdm, tqdm_notebook
import numpy as np
import TextNet

# Record the library versions this notebook was executed with.
for label, module in (("Torch Version: ", torch), ("Torchtext Version: ", torchtext)):
    print(label, module.__version__)

Torch Version:  1.3.0+cu92
Torchtext Version:  0.4.0


In [26]:
# Runtime settings.
BATCH_SIZE = 8
DEVICE = torch.device("cpu")

# Filesystem locations; expanduser() only rewrites paths that start with "~".
EMB_CACHE = os.path.expanduser("../../words/glove/")
DATASET_CACHE = os.path.expanduser("./")

In [27]:
# Training split: images, caption token ids and the matching mask tensor.
train_img, train_cap, train_mask = (
    torch.load("../cached_data/train_img"),
    torch.load("../cached_data/train_cap"),
    torch.load("../cached_data/train_mask"),
)

# Validation split, stored with the same three-file layout.
val_img, val_cap, val_mask = (
    torch.load("../cached_data/val_img"),
    torch.load("../cached_data/val_cap"),
    torch.load("../cached_data/val_mask"),
)

# Sanity check: show the size of every tensor that was just loaded.
print("Loaded train data", *(t.size() for t in (train_img, train_cap, train_mask)))
print("Loaded val data", *(t.size() for t in (val_img, val_cap, val_mask)))


Loaded train data torch.Size([10000, 3, 224, 224]) torch.Size([10000, 52]) torch.Size([10000, 52])
Loaded val data torch.Size([5000, 3, 224, 224]) torch.Size([5000, 43]) torch.Size([5000, 43])


In [28]:
def build_vocab(train_cap):
    """Collect the unique words that appear in the decoded training captions.

    Every caption is decoded back to text with the pretrained
    ``distilbert-base-uncased`` tokenizer and split on single spaces.

    Args:
        train_cap: Iterable of captions (for example a 2-D tensor of token
            ids). Each item must provide ``.numpy()``; the result is passed
            to ``tokenizer.decode``.

    Returns:
        list of str: Unique words in first-seen order, followed by the
        special tokens ``'[CLS]'``, ``'[PAD]'`` and ``'[SEP]'``. The special
        tokens are skipped while scanning and appended exactly once at the
        end.
    """
    special_tokens = ('[CLS]', '[PAD]', '[SEP]')
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # Membership is tracked in a set so each lookup is O(1); the previous
    # `word not in vocab` list scan made the whole pass quadratic.
    # Seeding the set with the special tokens skips them during the scan.
    seen = set(special_tokens)
    vocab = []
    for caption in train_cap:
        for word in tokenizer.decode(caption.numpy()).split(' '):
            if word not in seen:
                seen.add(word)
                vocab.append(word)

    vocab.extend(special_tokens)
    return vocab

# Build the word list from the training captions and print how many entries
# it contains. The two commented lines below are leftover debug prints.
#print(vocab)
#print(train_cap[0][0])
vocab = build_vocab(train_cap)
print(len(vocab))


5837


In [34]:
def build_embed(vocab):
    """Build a GloVe embedding matrix whose row ``i`` holds the vector of ``vocab[i]``.

    The vectors are read from ``../../words/glove/glove.6B.300d.txt`` and the
    embedding dimensionality is printed once they are loaded.

    Args:
        vocab: Sequence of words; the position of a word is its row index.

    Returns:
        numpy.ndarray: Array of shape ``(len(vocab) + 1, dim)``. Words that
        have no GloVe entry keep an all-zero row. The extra final row is
        never written and therefore also stays zero.
    """
    glove = torchtext.vocab.Vectors('../../words/glove/glove.6B.300d.txt')
    # Report the width from the loaded vectors. Probing `vocab[10]` raised
    # IndexError for any vocabulary with fewer than 11 entries.
    print(glove.dim)

    embedding_matrix = np.zeros((len(vocab) + 1, glove.dim))
    # `glove[word]` never returns None (torchtext falls back to `unk_init`,
    # zeros by default), so the old `is not None` check could not detect
    # missing words. Test membership explicitly, and take the row index from
    # enumerate() so it always matches the word's position in `vocab`.
    for row, word in enumerate(vocab):
        if word in glove.stoi:
            embedding_matrix[row] = glove[word]
    return embedding_matrix

# Materialise the GloVe matrix for the caption vocabulary and show its shape.
embedding_matrix = build_embed(vocab)
print(embedding_matrix.shape)

300
(5838, 300)


In [50]:
def build_train(train_cap):
    """Decode every caption back into a list of space-separated words.

    Args:
        train_cap: Iterable of captions; each item must provide ``.numpy()``,
            whose result is handed to the ``distilbert-base-uncased``
            tokenizer's ``decode``.

    Returns:
        list of list of str: One word list per caption, in input order.
        Special tokens produced by the decoder are kept.
    """
    bert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    return [
        bert_tokenizer.decode(caption.numpy()).split(' ')
        for caption in train_cap
    ]

# Decode all training captions into per-caption word lists and print how many
# captions were processed. The two commented lines below are leftover debug
# prints.
#print(vocab)
#print(train_cap[0][0])
train = build_train(train_cap)
print(len(train))


10000


In [51]:
# torchtext field for caption text: spaCy tokenisation, lower-casing,
# batch-first tensors, with sequence lengths returned alongside each batch.
TEXT = torchtext.data.Field(
    tokenize='spacy',
    sequential=True,
    lower=True,
    batch_first=True,
    include_lengths=True,
)
print(TEXT)

<torchtext.data.field.Field object at 0x7f6b74551b38>


In [53]:
# Build the field vocabulary from the decoded captions and attach the 300-d
# GloVe 6B vectors (cached under EMB_CACHE), then show the vector table size.
TEXT.build_vocab(
    train,
    vectors=GloVe(name='6B', dim=300, cache=EMB_CACHE),
)
vocab_new = TEXT.vocab
print(vocab_new.vectors.shape)

torch.Size([5839, 300])


In [14]:
# vocab = TEXT.build_vocab(vocab, vectors=GloVe(name='6B', dim=300, cache=EMB_CACHE))
# vocab = TEXT.vocab

In [15]:
# print(vocab.vectors.size())

In [16]:
#tokenize = lambda x: x.split()
#TEXT = torchtext.data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)

In [17]:
# TEXT = torchtext.data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True, tokenize = 'spacy')

In [18]:
# vocab = TEXT.build_vocab(vocab, vectors=GloVe(name='6B', dim=300, cache=EMB_CACHE))
# print(vocab.itos)

In [19]:
# from torch import nn
# embedding = nn.Embedding(1000,128)
# embedding(torch.LongTensor([3,4]))


In [20]:
# import json

# with open('../dataset/annotations/captions_train2014.json') as json_file:
#     data = json.load(json_file)
#     print('Caption: ' + p['caption'])
      

In [21]:
# import pickle
# with open('../../words/vocab.pkl', 'rb') as f:
#         vocab = pickle.load(f)
            

In [22]:
# def load_glove(word_index):
#     EMBEDDING_FILE = '../../words/glove/glove.6B.300d.txt'
#     def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
#     embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    
#     all_embs = np.stack(embeddings_index.values())
#     emb_mean,emb_std = -0.005838499,0.48782197
#     embed_size = all_embs.shape[1]

#     # word_index = tokenizer.word_index
#     nb_words = min(max_features, len(word_index))
#     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#     for word, i in word_index.items():
#         if i >= max_features: continue
#         embedding_vector = embeddings_index.get(word)
#         #ALLmight
#         if embedding_vector is not None: 
#             embedding_matrix[i] = embedding_vector
#         else:
#             embedding_vector = embeddings_index.get(word.capitalize())
#             if embedding_vector is not None: 
#                 embedding_matrix[i] = embedding_vector
#     return embedding_matrix 
    
            

In [23]:
# def load_fasttext(word_index):    
#     EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
#     def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
#     embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

#     all_embs = np.stack(embeddings_index.values())
#     emb_mean,emb_std = all_embs.mean(), all_embs.std()
#     embed_size = all_embs.shape[1]

#     # word_index = tokenizer.word_index
#     nb_words = min(max_features, len(word_index))
#     embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
#     for word, i in word_index.items():
#         if i >= max_features: continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None: embedding_matrix[i] = embedding_vector

#     return embedding_matrix

In [None]:
# class Captions(torchtext.data.TabularDataset):

#     @classmethod
#     def splits(annotations.captions, root='../dataset/annotations', 
#                train='captions_train2014.json', **kwargs):
    
#         fields = {'annotations': annotations.captions}
        
        
#         return super(Captions, cls).splits(
#             fields=fields, root=root, train=train,
#             format='json',**kwargs)

# TEXT = torchtext.data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True, tokenize='spacy')
# train = Captions.splits(root='../dataset/annotations', info =None, images = None, licenses = None, annotations=('annotations',TEXT))

In [None]:
# fields = {'annotations':TEXT}
# train_data = torchtext.data.TabularDataset.splits(
#                             path = '../dataset/annotations',
#                             train = 'captions_train2014.json',
#                             format = 'json',
#                             fields = fields
# )