
## Pre-trained word embeddings: case Blog Gender


In [None]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import pandas as pd

# fix random seeds for reproducibility
# np.random.seed only covers NumPy; Keras weight initialisation and Dropout
# draw from TensorFlow's own generator, so that one is seeded as well.
seed = 2020
np.random.seed(seed)
tf.random.set_seed(seed)

import sklearn as sk
from sklearn.model_selection import train_test_split

from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Embedding, Conv1D,  MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.models import load_model


import nltk


In [None]:
# Visualisation helper used for the training run in this notebook.
def plot_history(history):
  """Plot per-epoch training curves in a 1x2 figure.

  Left panel: accuracy; right panel: the loss minimised by the model.
  Train curves are drawn in green, validation curves in red.

  `history` must expose `history.epoch` and a `history.history` mapping with
  the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
  """
  # (y-axis label, metric key, train legend, validation legend) per panel
  panels = (
      ('Accuracy', 'accuracy', 'Train accuracy', 'Validation accuracy'),
      ('Loss minimised by model', 'loss', 'Train loss', 'Validation loss'),
  )

  plt.figure(figsize=(12, 4))
  for position, (y_label, metric, train_label, val_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)

    train_curve = np.array(history.history[metric])
    plt.plot(history.epoch, train_curve, 'g-', label=train_label)

    val_curve = np.array(history.history['val_' + metric])
    plt.plot(history.epoch, val_curve, 'r-', label=val_label)

    plt.legend()

In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab).
# The GloVe file read later in this notebook is expected under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the blog gender dataset straight from GitHub into a DataFrame
url = 'https://raw.githubusercontent.com/HOGENT-Databases/DB3-Workshops/master/data/blog-gender-dataset.csv'
df_dataset = pd.read_csv(url)
# A notebook cell only renders its last expression, so show dtypes:
# it lists every column name together with its type.
df_dataset.dtypes

In [None]:
# Normalise the label spelling: trim surrounding whitespace, then upper-case
df_dataset["gender"] = df_dataset["gender"].str.strip().str.upper()

# Row count per distinct label value
df_dataset.groupby("gender").count()

In [None]:
# Replace every missing value in the frame by an empty string
df_dataset = df_dataset.fillna(value='')

In [None]:
# Word count per row: split the text on whitespace and take the token-list length
df_dataset['numberOfWords'] = df_dataset['text'].str.split().map(len)
df_dataset.tail(10)

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the numeric columns,
# which include the numberOfWords column added above
df_dataset.describe()

In [None]:
# Encode the gender label as an integer: "M" -> 0, every other value -> 1
# NOTE(review): anything that is not exactly "M" (including the '' written by
# fillna) becomes 1, and re-running this cell maps every row to 1 because the
# column then no longer contains the string "M".
df_dataset['gender'] = np.where(df_dataset['gender'] == "M", 0, 1)
df_dataset.head()

In [None]:
# Extract a training & validation split
from sklearn.model_selection import train_test_split
# Features: every column except the label and the helper word count
# (presumably only the text column remains -- verify against df_dataset.columns)
X = df_dataset.drop(['gender','numberOfWords'],axis=1)
y = df_dataset['gender']
# 70% train / 30% test. random_state is pinned to the notebook seed so that
# re-running this cell on its own always yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=seed)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

In [None]:
# Turn both feature frames into plain NumPy arrays
X_train, X_test = np.asarray(X_train), np.asarray(X_test)

print('X_train shape:', X_train.shape)
print(type(X_train))

In [None]:
# The labels are the integers 0 and 1.
# They are one-hot encoded: each label becomes a vector of length num_classes
# with a single '1' at the position of the right class.
num_classes = 2
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)


# look at the new label of the first test sample and at the resulting shapes
print(y_test[0])
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

### Create a vocabulary index

Let's use the TextVectorization to index the vocabulary found in the dataset. Later, we'll use the same layer instance to vectorize the samples.

Our layer will only consider the top 20,000 words, and will truncate or pad sequences to be exactly 500 tokens long.

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Keep at most 20,000 vocabulary entries; every sample is padded or truncated
# to exactly 500 token ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
# Build the vocabulary from the training texts only, fed in batches of 128
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)


In [None]:
# You can retrieve the computed vocabulary via vectorizer.get_vocabulary().
# Let's show its first 5 entries.
# NOTE(review): depending on the TensorFlow version the list may start with the
# reserved padding ('') and out-of-vocabulary ('[UNK]') entries -- confirm.
vectorizer.get_vocabulary()[:5]

In [None]:
# Let's vectorize a test sentence (a batch holding one single string):
output = vectorizer([["i saw the cat sat on the mat"]])
# Show the first 8 token ids of that sample, one per word of the sentence;
# the vectorizer pads the output up to its output_sequence_length of 500.
output.numpy()[0, :8]

As you can see, "i" gets represented as "2". Why not 0, given that "i" was the first word in the vocabulary? That's because index 0 is reserved for padding and index 1 is reserved for "out of vocabulary" tokens.

Here's a dict mapping words to their indices:

In [None]:
# Vocabulary list and its inverse lookup: word -> position in the vocabulary
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

As you can see, we obtain the same encoding as above for our test sentence:

In [None]:
# Look the words of the test sentence up by hand in word_index
test = "i saw the cat sat on the mat".split()
[word_index[token] for token in test]

### Load pre-trained word embeddings


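The GloVe archive (`glove.6B.zip`) contains text-encoded vectors of various sizes: 50-dimensional, 100-dimensional, 200-dimensional, 300-dimensional. We'll use the 100D ones (`glove.6B.100d.txt`), which this notebook expects to find in your Google Drive.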

Let's make a dict mapping words (strings) to their NumPy vector representation:

In [None]:
path_to_glove_file = '/content/gdrive/My Drive/glove.6B.100d.txt'

# Maps each word (str) to its vector (float32 NumPy array)
embeddings_index = {}
# The GloVe file is UTF-8 text; state the encoding explicitly so that reading
# it does not depend on the locale default of the machine running the notebook.
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> <coef_2> ... separated by whitespace
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Sanity check: show the vector that was loaded for the word "cat"
print(embeddings_index['cat'])

Now, let's prepare a corresponding embedding matrix that we can use in a Keras Embedding layer. It's a simple NumPy matrix where entry at index i is the pre-trained vector for the word of index i in our vectorizer's vocabulary.

In [None]:
num_tokens = len(voc) + 2
# each word is represented by a vector of 100 floats (glove.6B.100d.txt)
embedding_dim = 100
hits = 0
misses = 0
missed_words = []

# Prepare embedding matrix: row i will hold the pre-trained vector of the
# word whose index in word_index is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
# word_index is a dictionary that maps each word to an index
# we loop through all the words of word_index.items()
for word, i in word_index.items():
    # we try to retrieve the vector of 100 floats for this word out of embeddings_index
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # found: we put the vector on position i of embedding_matrix
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embeddings_index keep their all-zeros row.
        misses += 1
        missed_words.append(word)

print("Converted %d words (%d misses)" % (hits, misses))

# The missed words come from word_index itself; what they lack is a
# pre-trained vector in embeddings_index.
print("*** Missed words = words of word_index without a vector in embeddings_index ***")
print(missed_words[0:10])
print()
print("*** i has which index in word_index? ***")
index_i = word_index['i']
print(index_i)
print()
print("*** the vector of 100 floats representing i ***")
print(embedding_matrix[index_i])
print()
print("*** cat has which index in word_index? ***")
index_cat = word_index['cat']
print(index_cat)
print()
print("*** the vector of 100 floats representing cat ***")
print(embedding_matrix[index_cat])

### Build the model

A simple 1D convnet with global max pooling and a classifier at the end.
We load the pre-trained word embeddings matrix into an Embedding layer.

Note that we set trainable=False so as to keep the embeddings fixed (we don't want to update them during training).

In [None]:
num_classes = 2

def initial_model():
    """Build and compile the 1D-convnet text classifier.

    The network starts with an Embedding layer initialised from
    `embedding_matrix` and frozen (trainable=False), followed by two Conv1D
    layers with pooling, a small dense layer with dropout and a softmax over
    `num_classes` outputs.

    Reads the notebook-level names num_tokens, embedding_dim,
    embedding_matrix and num_classes.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        Adam optimizer with learning rate 0.001, accuracy metric).
    """
    # Pre-trained vectors are loaded as constant initial weights and not updated.
    frozen_embedding = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    # Same layer stack as a Sequential built layer by layer, given as one list.
    model = Sequential([
        frozen_embedding,
        Conv1D(64, activation='relu', kernel_size=3),
        MaxPooling1D(3),
        Conv1D(64, activation='relu', kernel_size=3),
        GlobalMaxPooling1D(),
        Dense(16, activation='relu', kernel_initializer='he_uniform'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])

    # A model must be compiled before training: compile() receives the loss
    # function, the optimizer and the metrics to report.
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=adam,
        metrics=['accuracy'],
    )
    return model


### Train the model
First, convert our list-of-strings data to NumPy arrays of integer indices. The arrays are right-padded.

In [None]:
# Run both text splits through the vectorizer to obtain integer index arrays
X_train_final = vectorizer(np.array(list(X_train))).numpy()
X_test_final = vectorizer(np.array(list(X_test))).numpy()

# The labels are only wrapped as NumPy arrays
y_train_final = np.array(y_train)
y_test_final = np.array(y_test)

# Shapes before and after vectorization, then the label shapes
for arr in (X_train, X_test, X_train_final, X_test_final, y_train_final, y_test_final):
    print(arr.shape)

In [None]:
# Build a fresh model and print its layer overview
model_1 = initial_model()
model_1.summary()


# We now add batch size to the mix of training parameters
# If you don't specify batch_size, Keras falls back to its default of
# 32 samples per gradient update
batch_size = 32
epochs = 10

# Train for `epochs` passes over the training data; the test split is passed
# as validation data, so its loss/accuracy are reported after every epoch.
history_1 = model_1.fit(X_train_final, y_train_final,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test_final, y_test_final)
                    )



In [None]:
# model_1 holds the weights reached at the end of the training run.
# Score it on the training split and on the held-out split:

train_loss, train_accuracy = model_1.evaluate(X_train_final, y_train_final, verbose=0)
print("Training set Accuracy:{:7.2f}".format(train_accuracy))
print("Training set Loss:{:7.4f}\n".format(train_loss))

val_loss, val_accuracy = model_1.evaluate(X_test_final, y_test_final, verbose=0)
print("Validation set Accuracy:{:7.2f}".format(val_accuracy))
print("Validation set Loss:{:7.4f}\n".format(val_loss))

# Finally, visualise how accuracy and loss evolved during training
plot_history(history_1)

In [None]:
# Vectorize one hand-written sentence and show the two class probabilities
# (column 0 = label 0 = "M", column 1 = label 1)
X_example = vectorizer(np.array(["My new dress is awesome"])).numpy()
pred = model_1.predict([X_example])
print(pred)

In [None]:
# Second hand-written sentence: vectorize it and show the two class probabilities
# (column 0 = label 0 = "M", column 1 = label 1)
X_example = vectorizer(np.array(["Last night I was playing the FIFA soccer game"])).numpy()
pred = model_1.predict([X_example])
print(pred)