<a href="https://colab.research.google.com/github/King-of-Haskul/Machine-Learning-And-Data-Science/blob/main/word_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## One-hot encoding of tokens

#### Word-level one-hot encoding (toy example)

In [None]:
import numpy as np

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Vocabulary: every distinct whitespace-separated token receives the next free
# integer, starting at 1, so index 0 is never assigned to a word.
token_index = {}
for sample in samples:
    for word in sample.split():
        token_index.setdefault(word, len(token_index) + 1)

max_length = 10  # only the first max_length words of each sample are encoded

# One slot per (sample, word position, token index). The last axis needs
# max index + 1 entries because the indices start at 1.
results = np.zeros(shape=(len(samples), max_length, max(token_index.values()) + 1))

for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        results[i, j, token_index.get(word)] = 1

#### Using Keras for word-level one-hot encoding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# num_words=1000: the tokenizer only takes the 1000 most common words into account.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# Word -> integer index mapping held by the fitted tokenizer.
word_index = tokenizer.word_index

# Binary one-hot style matrix for the samples, produced directly by the tokenizer.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


#### One-hot hashing trick

In [None]:
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

dimensionality = 1000  # size of the hashing space: each word becomes a vector of this length
max_length = 10        # only the first max_length words of each sample are encoded

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # The built-in hash() is salted per interpreter process for str, so the
        # same word would land in a different slot after every kernel restart.
        # CRC32 over the UTF-8 bytes gives the same index on every run.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality  # in [0, dimensionality - 1]
        results[i, j, index] = 1

In [None]:
# Expected: (len(samples), max_length, dimensionality), as allocated by np.zeros in the previous cell.
results.shape

(2, 10, 1000)

### Word embeddings

#### Learning word embeddings with the Embedding layer on IMDB dataset

In [None]:
from tensorflow.keras import preprocessing
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding

# Embedding(number of possible tokens, embedding dimensionality):
# here 1000 possible tokens, each mapped to a 64-dimensional vector.
embedding_layer = Embedding(1000, 64)

# Loading the IMDB data and preprocessing it
max_features = 10000  # number of words to consider as features
maxlen = 20           # cut each text off after this many words (among the max_features most common words)

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Turn each list of integer sequences into a 2D integer tensor of shape (samples, maxlen).
x_train, x_test = [
    preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


Using an Embedding layer and classifier on the IMDB data

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

# The Embedding vocabulary size must match the num_words passed to
# imdb.load_data in the data-loading cell. Using max_features (defined there
# next to maxlen) instead of a literal 10000 keeps the two in step, so changing
# max_features cannot yield word indices outside the embedding table.
model = Sequential()
model.add(Embedding(max_features, 8, input_length=maxlen))  # output shape (samples, maxlen, 8): 8-dimensional embeddings
model.add(Flatten())                                        # 3D embeddings -> 2D tensor of shape (samples, maxlen * 8)
model.add(Dense(1, activation='sigmoid'))                   # single-unit sigmoid classifier on top

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs in batches of 32, holding out 20% of the training data for validation.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x24c08cc81c0>

## Using GloVe word embeddings on IMDB dataset

#### Preprocessing the labels of the raw IMDB data

In [None]:
import os

# Location of the extracted aclImdb dataset. The IMDB_DIR environment variable
# overrides the default so the notebook can run on another machine unedited.
imdb_dir = os.environ.get('IMDB_DIR', '/Users/pc/Downloads/aclImdb')
train_dir = os.path.join(imdb_dir, 'train')

labels = []  # 0 for a review read from 'neg', 1 for a review read from 'pos'
texts = []   # raw review text, index-aligned with labels

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # sorted(): os.listdir order is arbitrary; sorting makes the sample order
    # reproducible and matches how the test-set cell reads its files.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # `with` closes the file even if read() raises (e.g. on a decode error).
            with open(os.path.join(dir_name, fname), encoding="utf8") as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

#### Tokenizing the text

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100               # cuts off reviews after 100 words
training_samples = 2000    # number of reviews used for training
validation_samples = 1000  # number of reviews used for validation
max_words = 10000          # considers only the top 10,000 words in the dataset
split_seed = 42            # fixes the shuffle so the train/validation split is reproducible

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)                    # builds the word index
sequences = tokenizer.texts_to_sequences(texts)  # turns strings into lists of integer indices

word_index = tokenizer.word_index
length = len(word_index)
print(f"Found {length} unique tokens.")

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor: ', data.shape)
print('Shape of lavbel tensor: ', labels.shape)

# Shuffle before splitting: the texts were read class by class ('neg' first,
# then 'pos'), so an unshuffled head slice would contain a single class.
# A seeded generator yields the same permutation on every run, which the
# unseeded global np.random.shuffle did not.
indices = np.random.default_rng(split_seed).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples : training_samples + validation_samples]
y_val = labels[training_samples : training_samples + validation_samples]

Found 87393 unique tokens.
Shape of data tensor:  (25000, 100)
Shape of lavbel tensor:  (25000,)


#### Parsing and preparing the GloVe word-embeddings:

In [None]:
# Location of the unpacked glove.6B files. The GLOVE_DIR environment variable
# overrides the default so the notebook can run on another machine unedited.
glove_dir = os.environ.get('GLOVE_DIR', '/Users/pc/Downloads/glove.6B')

# Maps each word to its embedding vector (float32 array).
embeddings_index = {}
# `with` closes the file even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        # Each line is parsed as: first field = word, remaining fields = coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
num = len(embeddings_index)
print(f"Found {num} word vectors.")

Found 400000 word vectors.


In [None]:
embedding_dim = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words missing from embeddings_index stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue  # outside the max_words vocabulary kept by the tokenizer
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue  # no pretrained vector for this word
    embedding_matrix[i] = embedding_vector

#### Building, training and evaluating Model

In [None]:
#Model definition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid) binary classifier.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_4 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Load the pretrained word embeddings into the first layer (the Embedding
# layer) and mark it non-trainable so fitting does not update those vectors.
glove_layer = model.layers[0]
glove_layer.set_weights([embedding_matrix])
glove_layer.trainable = False

In [None]:
# Training and evaluation
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

# Persist the trained weights; the test-set evaluation cell reloads this file.
model.save_weights('pre_trained_glove_model.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Training the same model without pretrained word embeddings

In [None]:
# Same architecture as the GloVe model, but the Embedding layer keeps its
# default initial weights and stays trainable.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Evaluating on the test set

In [None]:
# Tokenizing the data of the test set
test_dir = os.path.join(imdb_dir, 'test')

# NOTE: these rebind `labels` and `texts`, replacing the training-set values.
labels = []  # 0 for a review read from 'neg', 1 for a review read from 'pos'
texts = []   # raw review text, index-aligned with labels

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # `with` closes the file even if read() raises (e.g. on a decode error).
            with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

# Reuse the tokenizer fitted on the training texts so word indices line up
# with the ones the model was trained on.
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

In [None]:
# Evaluating the model on the test set.
# The weights file was written by the GloVe training cell; loading it replaces
# whatever weights `model` currently holds before evaluation.
weights_path = 'pre_trained_glove_model.h5'
model.load_weights(weights_path)
model.evaluate(x_test, y_test)



[0.8871650099754333, 0.5060799717903137]

Abysmal 50% accuracy, basically no improvement over the random baseline :(