In [104]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
%matplotlib inline

In [100]:
# Directory expected to contain the CIFAR-100 `meta` file, relative to the
# current working directory.
data_dir = os.path.join(os.getcwd(), 'Data/cifar-100-python/')

# NOTE: unpickling can execute arbitrary code, so only load a `meta` file that
# comes from a trusted source. The context manager closes the file handle as
# soon as loading finishes (the bare open() call previously left it open).
with open(os.path.join(data_dir, 'meta'), "rb") as meta_file:
    meta = pickle.load(meta_file, encoding='latin1')

# Fine-grained class names, as stored under the 'fine_label_names' key.
fine_label_names = meta['fine_label_names']

## GloVe

In [108]:
# Settings for the GloVe section: embedding family and vector dimensionality.
embedding_type, dim = 'glove', 100

# Folder (relative to the working directory) that holds the raw vectors; the
# pickles produced in this section are written to the same place.
save_dir = 'Data/Embeddings/Full/{}/'.format(embedding_type)

# Note: this rebinds `data_dir`, which earlier pointed at the CIFAR folder.
data_dir = os.path.join(os.getcwd(), save_dir)

In [109]:
def load_glove(dim=300, directory=None):
    """Read a GloVe text file into a dictionary of word vectors.

    The file must be named ``glove.6B.<dim>d.txt``. Each non-blank line is
    split on whitespace: the first field is the token, the remaining fields
    are its vector components.

    Parameters
    ----------
    dim : int, optional
        Dimensionality used to build the file name (default 300).
    directory : str, optional
        Folder containing the file. Falls back to the notebook-level
        ``data_dir`` when omitted, which is what the original code always used.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``numpy.float64`` array.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a component cannot be parsed as a float.
    """
    if directory is None:
        directory = data_dir
    file_path = os.path.join(directory, 'glove.6B.%sd.txt' % (dim))

    glove_dict = {}

    # Be explicit about the encoding so the platform default cannot break
    # tokens containing non-ASCII characters.
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to store (indexing it would raise IndexError).
                continue
            # `np.float` was removed in NumPy 1.24; it was only an alias of the
            # builtin float, i.e. float64, so the resulting dtype is unchanged.
            glove_dict[fields[0]] = np.array(fields[1:]).astype(np.float64)

    return glove_dict

In [110]:
# Parse the GloVe file for the configured dimensionality. Despite its name,
# `model` is the plain dict returned by load_glove (token -> vector).
model = load_glove(dim)

In [111]:
# Collect and print the CIFAR fine labels that have no entry of their own in
# the loaded vocabulary. The vectors live in `model`; `glove_dict` is a local
# variable of load_glove() and does not exist at notebook level.
missing_labels = [label for label in fine_label_names if label not in model]
for label in missing_labels:
    print(label)

aquarium_fish
lawn_mower
maple_tree
oak_tree
palm_tree
pickup_truck
pine_tree
sweet_pepper
willow_tree


In [112]:
# Build three lookups for the CIFAR fine labels:
#   words_to_embeddings  : label name  -> vector
#   labels_to_words      : label index -> label name
#   labels_to_embeddings : label index -> vector
words_to_embeddings = {}
labels_to_words = {}
labels_to_embeddings = {}

# Multi-word labels that are represented by a single vocabulary word.
substitute_words = {
    'lawn_mower': 'lawnmower',
    'pickup_truck': 'truck',
    'sweet_pepper': 'peppers',
}

for i, label in enumerate(fine_label_names):
    if label == 'aquarium_fish':
        # Two-word label: use the mean of the vectors of its two words.
        embedding = (model['aquarium'] + model['fish']) / 2
    elif '_tree' in label:
        # e.g. 'maple_tree' -> 'maple'
        embedding = model[label.replace('_tree', '')]
    else:
        # Use the substitute when one is listed, otherwise the label itself.
        embedding = model[substitute_words.get(label, label)]

    # Resolve the vector once and store it under both keys, instead of
    # repeating the lookup (or the averaging) for each dictionary.
    labels_to_embeddings[i] = embedding
    words_to_embeddings[label] = embedding
    labels_to_words[i] = label

In [113]:
# Save the files
# Each mapping is pickled to '<embedding_type>_<dim>_<name>.pk' inside data_dir.
# The context managers make sure every file is flushed and closed after the
# dump; the bare open() calls used before never closed their handles.
with open(os.path.join(data_dir, '%s_%s_words_to_embeddings.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(words_to_embeddings, out_file)
with open(os.path.join(data_dir, '%s_%s_labels_to_words.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(labels_to_words, out_file)
with open(os.path.join(data_dir, '%s_%s_labels_to_embeddings.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(labels_to_embeddings, out_file)

## Word2Vec

In [89]:
# Settings for the Word2Vec section: embedding family and vector dimensionality.
embedding_type, dim = 'w2v', 300

# Folder (relative to the working directory) that holds the binary vectors;
# the pickles produced in this section are written to the same place.
save_dir = 'Data/Embeddings/Full/{}/'.format(embedding_type)

# Note: this rebinds `data_dir`, replacing the value set by earlier sections.
data_dir = os.path.join(os.getcwd(), save_dir)

In [93]:
# Load the word2vec vectors stored in binary format in data_dir.
# This rebinds `model`, replacing the GloVe dict loaded earlier.
model = gensim.models.KeyedVectors.load_word2vec_format(
    os.path.join(data_dir, 'GoogleNews-vectors-negative300.bin'),
    binary=True,
)

In [94]:
# Build three lookups for the CIFAR fine labels:
#   words_to_embeddings  : label name  -> vector
#   labels_to_words      : label index -> label name
#   labels_to_embeddings : label index -> vector
words_to_embeddings = {}
labels_to_words = {}
labels_to_embeddings = {}

# Multi-word labels that are represented by a single vocabulary word.
substitute_words = {
    'lawn_mower': 'lawnmower',
    'pickup_truck': 'truck',
    'sweet_pepper': 'peppers',
}

for i, label in enumerate(fine_label_names):
    if label == 'aquarium_fish':
        # Two-word label: use the mean of the vectors of its two words.
        embedding = (model['aquarium'] + model['fish']) / 2
    elif '_tree' in label:
        # e.g. 'maple_tree' -> 'maple'
        embedding = model[label.replace('_tree', '')]
    else:
        # Use the substitute when one is listed, otherwise the label itself.
        embedding = model[substitute_words.get(label, label)]

    # Resolve the vector once and store it under both keys, instead of
    # repeating the lookup (or the averaging) for each dictionary.
    labels_to_embeddings[i] = embedding
    words_to_embeddings[label] = embedding
    labels_to_words[i] = label

In [95]:
# Save the files
# Each mapping is pickled to '<embedding_type>_<dim>_<name>.pk' inside data_dir.
# The context managers make sure every file is flushed and closed after the
# dump; the bare open() calls used before never closed their handles.
with open(os.path.join(data_dir, '%s_%s_words_to_embeddings.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(words_to_embeddings, out_file)
with open(os.path.join(data_dir, '%s_%s_labels_to_words.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(labels_to_words, out_file)
with open(os.path.join(data_dir, '%s_%s_labels_to_embeddings.pk' % (embedding_type, dim)), "wb") as out_file:
    pickle.dump(labels_to_embeddings, out_file)

## Wordnet

In [44]:
# Get an array of CIFAR Labels
# The label file holds names separated by ', ', possibly spread over several
# lines. The context manager closes the file even if reading fails part-way.
words = []
with open("Data/Labels/CIFAR_100_labels_wordnet.txt", "r") as infile:
    for line in infile:
        # Drop the line ending and any trailing comma before splitting.
        row = line.rstrip().rstrip(',').split(', ')
        # A blank line splits into [''], which is not a label: skip empty entries.
        words.extend(word for word in row if word)

# The sorted position of each word is used as its numeric label further down.
words = sorted(words)

In [49]:
def read_embeddings_from_txt(filename, working_dict=None):
    """Load word vectors from a text file into a dictionary.

    The first line of the file is skipped (presumably a header; it is never
    parsed as an entry). Every following non-blank line is split on
    whitespace: the first field is the key and the remaining fields are
    converted to a ``numpy.float32`` vector.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    working_dict : dict, optional
        Existing dictionary to extend. It is updated in place and entries for
        repeated keys are overwritten. A new dictionary is created when this
        argument is omitted.

    Returns
    -------
    dict
        ``working_dict`` (or the newly created dictionary) with the entries
        from the file added.
    """
    # A literal `{}` default is created once and shared by every call that
    # omits the argument, so entries from earlier calls would leak into later
    # ones. Create the dictionary per call instead.
    embeddings = {} if working_dict is None else working_dict

    with open(filename, "r") as infile:
        infile.readline()  # skip the first line
        for line in infile:
            row = line.rstrip().split()
            if not row:
                # Blank line: nothing to store.
                continue
            # One assignment per line is enough; looping over every field
            # only repeated the same conversion.
            embeddings[row[0]] = np.array(row[1:]).astype(np.float32)
    return embeddings

def read_multiple(filenames):
    """Combine the embeddings from several text files into one dictionary.

    The files are read in the given order with read_embeddings_from_txt,
    starting from an empty dictionary, so a key that appears in more than one
    file keeps the vector from the last file that contains it.
    """
    merged = {}
    for path in filenames:
        merged = read_embeddings_from_txt(path, merged)
    return merged


In [50]:
# Merge the four WordNet relation files into a single lookup, in this order
# (later files overwrite earlier ones for repeated keys). read_multiple starts
# from a fresh dict, so re-running this cell cannot pick up entries left in
# read_embeddings_from_txt's default argument by an earlier call.
embeddings = read_multiple([
    "Data/Embeddings/Full/wordnet/wordnet.100d/domain_topic.100d.txt",
    "Data/Embeddings/Full/wordnet/wordnet.100d/hypernym_noun.100d.txt",
    "Data/Embeddings/Full/wordnet/wordnet.100d/hypernym_verb.100d.txt",
    "Data/Embeddings/Full/wordnet/wordnet.100d/member_holonyms.100d.txt",
])

In [51]:
# Word to Embedding Dictionary
words_to_embeddings = {}
for word in words:
    if word == 'aquarium fish':
        # Two-word label: use the mean of the vectors of its two words.
        words_to_embeddings[word] = (embeddings['aquarium'] + embeddings['fish']) / 2
    else:
        words_to_embeddings[word] = embeddings[word]

# Label Number to Words and Label Number to Embedding Dictionaries
# A label number is the word's position in the sorted `words` list.
# NOTE(review): 100 is presumably the CIFAR-100 class count and is assumed to
# equal len(words) -- confirm against the label file.
labels_to_words = {}
labels_to_embeddings = {}
for i in range(100):
    labels_to_words[i] = words[i]
    labels_to_embeddings[i] = words_to_embeddings[words[i]]

# Save the files
# The context managers make sure every file is flushed and closed after the
# dump; the bare open() calls used before never closed their handles.
with open("Data/Embeddings/CIFAR/CIFAR_100_word_to_embedding_wordnet.pk", "wb") as out_file:
    pickle.dump(words_to_embeddings, out_file)
with open("Data/Embeddings/CIFAR/CIFAR_100_label_to_word_wordnet.pk", "wb") as out_file:
    pickle.dump(labels_to_words, out_file)
with open("Data/Embeddings/CIFAR/CIFAR_100_label_to_embedding_wordnet.pk", "wb") as out_file:
    pickle.dump(labels_to_embeddings, out_file)
        