# Tokenize captions and get embeddings

### Imports

In [46]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pickle

In [2]:
# Load the scraped captions, silently skipping rows that fail to parse.
# `on_bad_lines="skip"` is the replacement for the
# `error_bad_lines=False, warn_bad_lines=False` pair, which pandas deprecated
# in 1.3 and removed in 2.0.
# TODO: return to the skipped lines later and find out why they are malformed.
captions = pd.read_csv("captions.csv", on_bad_lines="skip")
print("Skipping bad lines - return to this later")
print(captions.shape)
captions.sample(10, random_state=22)

Skipping bad lines - return to this later
(155392, 3)


Unnamed: 0,image,above_text,below_text
54053,My Precious Gollum,Hello TARA...,HELLO PRECIOUS
67160,Ecstatic Michael Phelps,tHERE'S A POT OF THE STUFF?,i LOVE POT.
49626,katt williams shocked,What,you actually thought you were getting rp?
9727,Okay Guy,TOOK AN ARROW TO THE KNEE,OKAY..
22047,Rich Men Laughing,and then we told them,their health insurance premiums wouldnt go up
150193,kim jong un,they see me rulin',they hatin'
121915,The Olympic Queen,vodka,
132328,Paperclip,it looks like you're having trouble,fapping to this meme
120444,Honey BooBoo,happy birthday,ali boo boo
15655,Not sure if troll,not sure if nicki minaj,or a mutant from mortal combat


There seems to be a non-negligible number of captions written in Spanish.

In [3]:
# Count missing values per column. Using the DataFrame methods keeps the
# per-column reduction explicit instead of depending on how np.sum reduces a
# DataFrame when no axis is given.
captions.isna().sum()

image           13
above_text    6137
below_text    7199
dtype: int64

In [4]:
# Show the rows whose meme label (`image` column) is missing.
captions[captions["image"].isna()]

Unnamed: 0,image,above_text,below_text
18546,,several people get up and leave as they can se...,
43899,,teacher is even later than you,
57525,,Ekki málið :),
100719,,Ert þú starfsmaður þarna eða eigandi?,
100723,,uppiskorpi!!!,
100725,,Eða bara eldisfiskur. LOL.,
100728,,Takk kærlega fyrir þetta :),
105241,,makes us strong,
105243,,makes us strong,
114690,,Nei þá nærðu í rauðvín,


NA values for labels (the `image` column) appear to occur when the text is in a different language. I think it is safe to say that we can drop these rows. For `above_text` and `below_text`, an NA indicates that the meme did not contain text above or below the picture, respectively. We can't throw these rows out, so we just replace the missing values with an empty string.

**IS THIS THE RIGHT THING TO DO?**

In [5]:
# Discard rows that have no meme label (`image`); all other rows are kept.
captions = captions.dropna(subset=["image"])

In [6]:
# A missing above/below text means the meme had no text in that position, so
# represent it as an empty string. `fillna` is the dedicated API for this;
# the regex flag on `replace` has no meaning when the target is NaN.
captions = captions.fillna('')

In [7]:
# Re-check the per-column missing-value counts after cleaning (all should be 0).
# Using the DataFrame methods keeps the per-column reduction explicit instead
# of depending on how np.sum reduces a DataFrame when no axis is given.
captions.isna().sum()

image         0
above_text    0
below_text    0
dtype: int64

### Set up vocabulary dictionary

In Dank Learning, it looks like they create a vocabulary dictionary from all words in the captions and labels, i.e., meme format names. See [here](https://github.com/alpv95/MemeProject/blob/master/im2txt/MemeNote.ipynb) for their exact process.

In [8]:
# One flat array holding every label, then every above-text, then every below-text.
all_phrases = np.concatenate([captions.image, captions.above_text, captions.below_text])

In [9]:
# Print ten randomly chosen phrases as a quick sanity check.
# randint's upper bound is exclusive, so the bound must be len(all_phrases)
# itself (not len - 1) for the final phrase to be reachable.
rand_inds = np.random.randint(len(all_phrases), size=10)
for phrase in all_phrases[rand_inds]:
    print(phrase)

its all over youtube
Progressive Guitarist
this is madness
i like the way you flatshot that .25" plate.
Oh you think war is your ally? But you merely adopted the Uniwar.
i'm going to make a ton of threads on /b/
Advice Polack
That would make me soooo happy.
your people is just to stupid
#HASHTAGS EVERYWHERE


### Tokenize

In [10]:
# A token is a maximal run of word characters and apostrophes, so contractions
# such as "you're" stay in one piece.
tokenizer = RegexpTokenizer(pattern=r'[\w\']+')

Just look at all (unique) words to inspect if anything looks wrong.

In [11]:
# Flatten every phrase into its tokens, then derive the sorted vocabulary.
all_words = [token for phrase in all_phrases for token in tokenizer.tokenize(phrase)]
unique_words = sorted(set(all_words))
all_words.sort()

In [12]:
# Report the corpus size (total tokens) and the vocabulary size (distinct tokens).
for description, words in (
    ("Number of words:", all_words),
    ("Number of unique words:", unique_words),
):
    print(description, len(words))

Number of words: 1845822
Number of unique words: 117496


In [13]:
# Map each vocabulary word to its position in the sorted vocabulary.
word2idx = {word: idx for idx, word in enumerate(unique_words)}

In [14]:
# Peek at the first few vocabulary entries.
# NOTE(review): these are apostrophe-only / apostrophe-prefixed tokens, not
# empty strings - presumably because the tokenizer pattern accepts `'` as a
# token character. Worth confirming whether they should be filtered out.
list(word2idx)[:6]

["'", "''", "'''angry", "''50", "''BFF's''", "''Busy"]

Tokenize each caption and label.

In [15]:
# Tokenize each column separately; every list is aligned row-for-row with `captions`.
list_of_tokens_label = [tokenizer.tokenize(text) for text in captions.image]

list_of_tokens_above_text = [tokenizer.tokenize(text) for text in captions.above_text]

list_of_tokens_below_text = [tokenizer.tokenize(text) for text in captions.below_text]

Access GloVe pre-trained vectors from [Stanford's GitHub repo](https://github.com/stanfordnlp/GloVe). This is the Common Crawl one with 42 billion tokens.

In [17]:
def loadGloveModel(gloveFile):
    """Load GloVe vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector
    components.

    Parameters
    ----------
    gloveFile : str
        Path to the GloVe text file.

    Returns
    -------
    dict
        Maps each token to a 1-D float ``np.ndarray`` of its components.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # Read as UTF-8 explicitly rather than with the platform default so that
    # non-ASCII tokens decode the same everywhere; `with` guarantees the
    # handle is closed even if parsing fails part-way through.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. stray newline): nothing to parse.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [18]:
# Read the Common Crawl (42B tokens, 300-d) GloVe vectors into memory.
glove_path = "glove.42B.300d.txt"
glove_model = loadGloveModel(glove_path)

Loading Glove Model
Done. 613311  words loaded!


In [84]:
# For now, only process a small sample of 5 captions; each value is a
# position into the list_of_tokens_* lists.
sample_inds = [223, 51788, 2112, 777, 1120]

In [89]:
# For each sampled caption keep the raw tokens (real_*) next to their GloVe
# vectors (*_embeddings), one inner list per caption.
# Lookups are lowercased because the glove.42B vectors are uncased; without
# this, tokens such as "TOOK" could never match and would always map to None.
# Tokens with no vector still yield None (the default of dict.get).
real_label = [list_of_tokens_label[si] for si in sample_inds]
real_above_text = [list_of_tokens_above_text[si] for si in sample_inds]
real_below_text = [list_of_tokens_below_text[si] for si in sample_inds]
label_embeddings = [
    [glove_model.get(token.lower()) for token in tokens] for tokens in real_label
]
above_text_embeddings = [
    [glove_model.get(token.lower()) for token in tokens] for tokens in real_above_text
]
below_text_embeddings = [
    [glove_model.get(token.lower()) for token in tokens] for tokens in real_below_text
]

In [47]:
# Bundle the sampled tokens and their embeddings and persist them with pickle.
# Order: label / above / below tokens, then label / above / below embeddings.
sample_tuple = (
    real_label,
    real_above_text,
    real_below_text,
    label_embeddings,
    above_text_embeddings,
    below_text_embeddings,
)
with open("sample_data", "wb") as sample_file:
    pickle.dump(sample_tuple, sample_file)

### Handle Spanish words

When using GloVe embeddings, we'll map out-of-vocabulary words to an unknown tag. We can then naively remove every caption that contains an unknown tag, since some of these will likely correspond to Spanish words.