# 0. Set up

In [4]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import pickle
from pickle_utils import pickle_load, pickle_dump

In [6]:
# Load the raw captions, silently dropping malformed CSV rows.
# `error_bad_lines` / `warn_bad_lines` were deprecated in pandas 1.3 and removed
# in pandas 2.0; `on_bad_lines="skip"` is the equivalent (skip without warning).
try:
    captions = pd.read_csv("captions.csv", on_bad_lines="skip")
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy flags.
    captions = pd.read_csv("captions.csv", error_bad_lines=False, warn_bad_lines=False)
print("Skipping bad lines - return to this later")
print(captions.shape)
# Fixed seed so the displayed sample is the same on every run.
captions.sample(10, random_state=22)

Skipping bad lines - return to this later
(155392, 3)


Unnamed: 0,image,above_text,below_text
54053,My Precious Gollum,Hello TARA...,HELLO PRECIOUS
67160,Ecstatic Michael Phelps,tHERE'S A POT OF THE STUFF?,i LOVE POT.
49626,katt williams shocked,What,you actually thought you were getting rp?
9727,Okay Guy,TOOK AN ARROW TO THE KNEE,OKAY..
22047,Rich Men Laughing,and then we told them,their health insurance premiums wouldnt go up
150193,kim jong un,they see me rulin',they hatin'
121915,The Olympic Queen,vodka,
132328,Paperclip,it looks like you're having trouble,fapping to this meme
120444,Honey BooBoo,happy birthday,ali boo boo
15655,Not sure if troll,not sure if nicki minaj,or a mutant from mortal combat


# 1. Load GloVe and save objects for later use

In [7]:
def loadGloveModel(gloveFile, shape=(1917494, 300), downweight_factor=1.):
    '''
    Load GloVe pre-trained word vectors

    Reads at most shape[0] non-blank lines from gloveFile. Each line is split
    on whitespace: the first token is the word, the remaining tokens are the
    vector components. Rows of the matrix that are not filled (because the
    file holds fewer than shape[0] vectors) are set to zero.

    INPUT
    =====
    gloveFile: file with GloVe word vectors
        .txt file
    shape: (vocabulary size, number of latent dimensions)
        tuple
    downweight_factor: rescaling factor for GloVe embeddings
        float

    OUTPUTS
    =======
    glove_index_dict: dictionary w/ keys=words and values=row index in glove_embedding_weights
        dict
    glove_embedding_weights: matrix w/ n_row = vocabulary size and n_col = latent dimension
        np.array
    '''
    max_rows = shape[0]
    glove_index_dict = {}
    glove_embedding_weights = np.empty(shape)
    n_loaded = 0
    with open(gloveFile, 'r', encoding="utf8") as fp:
        for line in fp:
            # Stop as soon as every row is filled; the bound is shape[0] itself
            # so the final row of the matrix is populated too.
            if n_loaded >= max_rows:
                break
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            glove_vector = [float(x) for x in fields[1:]]
            glove_index_dict[fields[0]] = n_loaded
            glove_embedding_weights[n_loaded, :] = glove_vector
            n_loaded += 1
    # np.empty leaves memory uninitialised; zero any rows the file did not cover.
    glove_embedding_weights[n_loaded:] = 0.
    glove_embedding_weights *= downweight_factor

    return glove_index_dict, glove_embedding_weights

In [9]:
# Load GloVe with the function's default shape; the result is the
# (glove_index_dict, glove_embedding_weights) tuple returned by loadGloveModel.
glove_objs = loadGloveModel("glove.txt")

In [12]:
# Persist the loaded GloVe tuple to disk with the pickle_utils helper so later
# runs can reload it instead of re-parsing the text file.
pickle_dump(glove_objs, "glove_objs.pkl")

writing total_bytes=4632882161...
writing bytes [0, 1073741824)... done.
writing bytes [1073741824, 2147483648)... done.
writing bytes [2147483648, 3221225472)... done.
writing bytes [3221225472, 4294967296)... done.
writing bytes [4294967296, 4632882161)... done.


# 2. Removing bad captions and labels

## a. Where there are NaNs in the captions or labels

In [None]:
# Missing values per column. Calling the DataFrame method directly keeps the
# per-column reduction explicit instead of relying on how np.sum forwards
# `axis=None` to pandas.
captions.isna().sum()

In [None]:
# Show the rows whose image label is missing.
captions[captions.image.isna()]

In [None]:
# Drop rows with no image label, then turn remaining missing caption text into
# empty strings. `fillna` is the direct way to do this; NaN is not a pattern,
# so the previous `replace(..., regex=True)` gained nothing from the regex flag.
captions = captions[captions.image.notna()]
captions = captions.fillna('')

In [None]:
# Re-check missing values per column after cleaning (every count should be 0).
# The DataFrame method keeps the per-column reduction explicit instead of
# relying on how np.sum forwards `axis=None` to pandas.
captions.isna().sum()

## b. Next thing

# 3. Creating word mappings

In [None]:
# Reserved indices: real vocabulary words are numbered from start_idx upward.
empty = 0  # padding slot, used as the RNN mask for "no data"
eos = 1  # end-of-sentence marker
start_idx = eos + 1  # index given to the first real word


def get_idx(vocab, vocabcount):
    '''
    Build word <-> index lookup tables for a vocabulary.

    INPUT
    =====
    vocab: words, in the order they should be numbered
        iterable of str
    vocabcount: not used by this function; kept so existing calls still work

    OUTPUTS
    =======
    word2idx: keys=words and values=index (vocab words start at start_idx;
              '<empty>' and '<eos>' map to the reserved indices)
        dict
    idx2word: the inverse mapping, keys=index and values=words
        dict
    '''
    word2idx = {token: position for position, token in enumerate(vocab, start=start_idx)}
    # Reserved tokens are written last, so they override a clashing vocab entry.
    word2idx['<empty>'] = empty
    word2idx['<eos>'] = eos

    idx2word = {position: token for token, position in word2idx.items()}

    return word2idx, idx2word

In [None]:
# word2idx, idx2word = get_idx(vocab, vocabcount)