In [11]:
import h5py

# Open mini.h5 in read-only mode ('r') and load the vocabulary together with
# the embedding matrix stored under the 'mat' group.
with h5py.File('mini.h5', 'r') as f:
    mat = f['mat']
    # The words are stored as bytes, so decode each one to a Python string.
    all_words = [raw.decode('utf-8') for raw in mat['axis1'][:]]
    all_embeddings = mat['block0_values'][:]

print("All words dimension = {0}".format(len(all_words)))
print("all_embeddings dimensions: {0}".format(all_embeddings.shape))

print(all_words[133])

# Keep only the English entries, i.e. the words that start with '/c/en/'.
# First collect the row positions of those entries, then strip the prefix from
# the matching words and select the same rows of the embedding matrix.
english_prefix = '/c/en/'
english_word_indices = [row for row, entry in enumerate(all_words) if entry.startswith(english_prefix)]
english_words = [all_words[row][len(english_prefix):] for row in english_word_indices]
english_embeddings = all_embeddings[english_word_indices]

print("all_words dimensions: {0}".format(len(english_words)))
print("all_embeddings dimensions: {0}".format(english_embeddings.shape))

print(english_words[1337])



All words dimension = 362891
all_embeddings dimensions: (362891, 300)
/c/de/abhängen
all_words dimensions: 150875
all_embeddings dimensions: (150875, 300)
activated_carbon


In [14]:
import numpy as np

# Normalize every embedding vector to unit length.
# The norm of a row is its Euclidean length; dividing the row by that length
# leaves its direction unchanged and makes its magnitude 1.
norms = np.linalg.norm(english_embeddings, axis=1)
unit_scale = norms.astype(np.float32)[:, np.newaxis]  # column vector, one norm per row
normalized_embeddings = english_embeddings.astype(np.float32) / unit_scale

# Lookup table from each English word to its row number in the embedding matrix.
index = dict(zip(english_words, range(len(english_words))))

def similarity_score(w1, w2):
    """Return the dot product of the normalized embeddings of w1 and w2.

    Both words are looked up through `index`; a word that is not in the
    vocabulary raises KeyError.
    """
    first_vector = normalized_embeddings[index[w1]]
    second_vector = normalized_embeddings[index[w2]]
    return np.dot(first_vector, second_vector)

def closest_to_vector(v, n):
    """Return the n vocabulary words whose embeddings score highest against v.

    The score of a word is the dot product between its normalized embedding
    and `v`. Words are returned best first.

    Args:
        v: query vector with the same length as one embedding row.
        n: maximum number of words to return. Values larger than the
            vocabulary size return every word instead of raising
            StopIteration; values <= 0 return an empty list.

    Returns:
        A list of at most n words, ordered from highest to lowest score.
    """
    all_scores = np.dot(normalized_embeddings, v)
    # argsort is ascending, so reverse it to get the best matches first.
    # Slicing clamps n to the vocabulary size; max() keeps a negative n from
    # being interpreted as "everything but the last |n| entries".
    best_indices = np.argsort(all_scores)[::-1][:max(n, 0)]
    return [english_words[i] for i in best_indices]

def most_similar(w, n):
    """Return the n words closest to w, as ranked by closest_to_vector.

    Raises KeyError when w is not in the vocabulary.
    """
    query_vector = normalized_embeddings[index[w]]
    return closest_to_vector(query_vector, n)

def solve_analogy(a1, b1, a2):
    """Answer "a1 is to b1 as a2 is to ?" using embedding arithmetic.

    The target vector is embedding(b1) - embedding(a1) + embedding(a2); the
    single closest vocabulary word is returned in a one-element list.
    Raises KeyError when any of the three words is not in the vocabulary.
    """
    b1_vector = normalized_embeddings[index[b1]]
    a1_vector = normalized_embeddings[index[a1]]
    a2_vector = normalized_embeddings[index[a2]]
    b2 = b1_vector - a1_vector + a2_vector
    return closest_to_vector(b2, 1)

# Pairwise similarity checks; each entry is (printed label, first word, second word).
#   * A word is as similar with itself as possible.
#   * Closely related words still get high scores.
#   * Unrelated words, not so much.
#   * Antonyms are still considered related, sometimes more so than synonyms.
similarity_checks = [
    ('cat\tcat\t', 'cat', 'cat'),
    ('cat\tfeline\t', 'cat', 'feline'),
    ('cat\tdog\t', 'cat', 'dog'),
    ('cat\tmoo\t', 'cat', 'moo'),
    ('cat\tfreeze\t', 'cat', 'freeze'),
    ('antonyms\topposites\t', 'antonym', 'opposite'),
    ('antonyms\tsynonyms\t', 'antonym', 'synonym'),
]
for label, left_word, right_word in similarity_checks:
    print(label, similarity_score(left_word, right_word))

# Ten nearest neighbours for a few query words.
for query_word in ('cat', 'dog', 'duke'):
    print(most_similar(query_word, 10))

# Analogies of the form "first is to second as third is to ?".
analogy_queries = [
    ("man", "brother", "woman"),
    ("man", "husband", "woman"),
    ("spain", "madrid", "france"),
]
for first, second, third in analogy_queries:
    print(solve_analogy(first, second, third))

cat	cat	 1.0000001
cat	feline	 0.8199548
cat	dog	 0.590724
cat	moo	 0.0039538303
cat	freeze	 -0.030225191
antonyms	opposites	 0.3941065
antonyms	synonyms	 0.46883982
['cat', 'humane_society', 'kitten', 'feline', 'colocolo', 'cats', 'kitty', 'maine_coon', 'housecat', 'sharp_teeth']
['dog', 'dogs', 'wire_haired_dachshund', 'doggy_paddle', 'lhasa_apso', 'good_friend', 'puppy_dog', 'bichon_frise', 'woof_woof', 'golden_retrievers']
['duke', 'dukes', 'duchess', 'duchesses', 'ducal', 'dukedom', 'duchy', 'voivode', 'princes', 'prince']


In [15]:
import tensorflow as tf
import string
import numpy as np

# Translation table that deletes every ASCII punctuation character.
remove_punct = str.maketrans('', '', string.punctuation)

def convert_line_to_example(line):
    """Convert one line of the data file into a training example.

    The first character of the line is the label and the review text starts
    at offset 2. The text is stripped of punctuation, lower-cased and split
    on whitespace; words missing from the pretrained vocabulary are ignored.

    Args:
        line: one raw line of the data file.

    Returns:
        A dict {'x': x, 'y': y} where x is the mean of the embeddings of the
        known words in the review and y is the integer label. When the review
        contains no known word, x is a zero vector of the embedding width.

    Raises:
        ValueError: if the first character of the line is not a digit.
        IndexError: if the line is empty.
    """
    # Pull out the first character: that's our label (0 or 1)
    y = int(line[0])

    # Split the line into words using Python's split() function
    words = line[2:].translate(remove_punct).lower().split()

    # Look up the embeddings of each word, ignoring words not
    # in our pretrained vocabulary.
    embeddings = [normalized_embeddings[index[w]] for w in words if w in index]

    if embeddings:
        # Take the mean of the embeddings
        x = tf.reduce_mean(tf.convert_to_tensor(embeddings), axis=0)
    else:
        # Averaging an empty list would yield a NaN scalar rather than a
        # vector, so the example could not be stacked into a batch with the
        # others. Use an all-zero vector of the right width instead.
        x = tf.zeros([normalized_embeddings.shape[1]], dtype=normalized_embeddings.dtype)
    return {'x': x, 'y': y}

# Convert every line of the review file into an example.
# Bytes that are not valid UTF-8 are dropped (errors='ignore').
with open("movie-simple.txt", "r", encoding='utf-8', errors='ignore') as f:
    dataset = list(map(convert_line_to_example, f))

print(len(dataset))


1411


In [18]:
import random
import numpy as np
# Shuffle the examples in place, then split them on a batch boundary:
# three quarters of the full batches go to training, everything else
# (including the leftover partial batch) goes to testing.
random.shuffle(dataset)

batch_size = 100
total_batches = len(dataset) // batch_size
train_batches = 3 * total_batches // 4
split_point = train_batches * batch_size
train = dataset[:split_point]
test = dataset[split_point:]

import tensorflow as tf

# Model: two ReLU hidden layers (100 and 20 units) followed by a single-unit
# output layer without activation, so the model emits raw logits.
hidden_layers = [tf.keras.layers.Dense(units, activation=tf.nn.relu) for units in (100, 20)]
output_layer = tf.keras.layers.Dense(1)
model = tf.keras.Sequential(hidden_layers + [output_layer])

# Loss: binary cross-entropy applied directly to the logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Metric: binary accuracy (fed with sigmoid probabilities, not logits).
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

# Optimizer: plain stochastic gradient descent.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.05)

# Training loop: mini-batch gradient descent over the training split, which is
# reshuffled at the start of every epoch.
num_epochs = 250

# Keras renamed Metric.reset_states() to reset_state(), and newer releases no
# longer provide the old spelling. Use whichever this installation offers so
# the loop runs on both.
reset_accuracy = getattr(accuracy_metric, "reset_state", None) or accuracy_metric.reset_states

for epoch in range(num_epochs):
    random.shuffle(train)
    for batch_start in range(0, len(train), batch_size):
        batch_data = train[batch_start:batch_start + batch_size]
        reviews = np.array([sample['x'] for sample in batch_data])
        labels = np.array([sample['y'] for sample in batch_data])
        labels = labels.reshape(-1, 1)  # Reshape labels to match logits shape

        # Record the forward pass so the loss can be differentiated with
        # respect to the model's trainable variables.
        with tf.GradientTape() as tape:
            logits = model(reviews)
            loss_value = loss_fn(labels, logits)

        gradients = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # The metric expects probabilities, so squash the logits first.
        accuracy_metric.update_state(labels, tf.math.sigmoid(logits))

    if epoch % 10 == 0:
        # loss_value is the loss of the last batch of this epoch; the accuracy
        # is accumulated over every batch of the epoch.
        print("Epoch: {0} \t Loss: {1} \t Acc: {2}".format(epoch, loss_value, accuracy_metric.result()))

    # Start the next epoch (and the later evaluation) from a clean metric.
    reset_accuracy()

# Evaluation: run the whole held-out split through the model in one pass and
# score the sigmoid probabilities against the labels.
test_reviews = np.array([example['x'] for example in test])
test_labels = np.array([example['y'] for example in test]).reshape(-1, 1)  # column vector, like the logits
logits = model(test_reviews)
test_predictions = tf.math.sigmoid(logits)
# Calling the metric updates its state with this data and returns the result.
test_accuracy = accuracy_metric(test_labels, test_predictions)
print("Final accuracy: {0}".format(test_accuracy))

# Probe the classifier with single words: feed each word's embedding as a
# batch of one and print the resulting sigmoid probability.
words_to_test = ["exciting", "hated", "boring", "loved"]

for word in words_to_test:
    single_word_batch = normalized_embeddings[index[word]].reshape(1, 300)
    word_logit = model(single_word_batch)
    print(word, tf.math.sigmoid(word_logit))


Epoch: 0 	 Loss: 0.6860820055007935 	 Acc: 0.5460000038146973
Epoch: 10 	 Loss: 0.6760073900222778 	 Acc: 0.5519999861717224
Epoch: 20 	 Loss: 0.6886350512504578 	 Acc: 0.5519999861717224
Epoch: 30 	 Loss: 0.665454626083374 	 Acc: 0.5540000200271606
Epoch: 40 	 Loss: 0.6349549293518066 	 Acc: 0.5889999866485596
Epoch: 50 	 Loss: 0.6492026448249817 	 Acc: 0.6140000224113464
Epoch: 60 	 Loss: 0.6269775629043579 	 Acc: 0.6779999732971191
Epoch: 70 	 Loss: 0.5658120512962341 	 Acc: 0.7570000290870667
Epoch: 80 	 Loss: 0.5489456653594971 	 Acc: 0.8090000152587891
Epoch: 90 	 Loss: 0.4851008653640747 	 Acc: 0.8659999966621399
Epoch: 100 	 Loss: 0.40472617745399475 	 Acc: 0.8859999775886536
Epoch: 110 	 Loss: 0.324491411447525 	 Acc: 0.9049999713897705
Epoch: 120 	 Loss: 0.24773387610912323 	 Acc: 0.9150000214576721
Epoch: 130 	 Loss: 0.268240749835968 	 Acc: 0.9150000214576721
Epoch: 140 	 Loss: 0.2238071858882904 	 Acc: 0.9269999861717224
Epoch: 150 	 Loss: 0.1866254210472107 	 Acc: 0.93699