In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Masking, Dense, LSTM, GRU, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

2022-06-27 18:50:33.705966: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-27 18:50:33.705989: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# for a start, let's use the imdb reviews dataset
from tensorflow.keras.datasets import imdb

In [3]:
def load_data(percentage_of_sentences=None):
    '''
    Load the IMDB training reviews and decode them back into plain text.

    Parameters
    ----------
    percentage_of_sentences : int or float, optional
        Share of the training reviews to keep, with 0 < value <= 100. The
        reviews are taken from the start of the dataset. None keeps them all.

    Returns
    -------
    list of str
        One string of space-separated words per kept training review.

    Raises
    ------
    AssertionError
        If percentage_of_sentences is given but is not in (0, 100].
    '''
    # Only the encoded training reviews are needed: the labels and the test
    # split returned by imdb.load_data() are never part of the return value.
    (sentences_train, _), _ = imdb.load_data()

    # Take only a given percentage of the training data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences > 0 and percentage_of_sentences <= 100)

        # Multiply before dividing: `p / 100 * n` can land just below the exact
        # value in floating point (e.g. 29 / 100 * 25000), in which case int()
        # would truncate to one review too few.
        len_train = int(len(sentences_train) * percentage_of_sentences / 100)
        sentences_train = sentences_train[:len_train]

    # Build the {integer: word} representation. Word ids are shifted by 3 and
    # ids 0-3 are assigned to the special tokens below.
    word_to_id = {k: (v + 3) for k, v in imdb.get_word_index().items()}
    for i, w in enumerate(['<PAD>', '<START>', '<UNK>', '<UNUSED>']):
        word_to_id[w] = i

    id_to_word = {v: k for k, v in word_to_id.items()}

    # Convert each list of integers to a string of words. The first token of
    # every review is skipped (presumably the start marker — confirm against
    # the Keras imdb documentation).
    return [' '.join(id_to_word[token] for token in sentence[1:])
            for sentence in sentences_train]


### Just run this cell to load the data
data = load_data(percentage_of_sentences=10)

In [4]:
def get_X_y(sentence, length=20):
    '''
    Draw one random (context, next word) pair from a sentence.

    returns None when the sentence has `length` words or fewer, otherwise
    a tuple containing:
    - a list of `length` consecutive words taken from the sentence
    - the word immediately following those words
    '''
    tokens = sentence.split()
    # number of admissible start positions; none means the sentence is too short
    n_starts = len(tokens) - length
    if n_starts <= 0:
        return None
    # the upper bound is exclusive, so `stop` is always a valid index
    start = np.random.randint(0, n_starts)
    stop = start + length
    return tokens[start:stop], tokens[stop]

In [5]:
# check the output of the function
get_X_y(data[0])

(['fly',
  'fishing',
  'was',
  'amazing',
  'really',
  'cried',
  'at',
  'the',
  'end',
  'it',
  'was',
  'so',
  'sad',
  'and',
  'you',
  'know',
  'what',
  'they',
  'say',
  'if'],
 'you')

In [6]:
def create_dataset(sentences, number_of_samples = 50000):
    '''
    builds a dataset (X and y) by drawing number_of_samples sentences at
    random (with replacement) and calling get_X_y on each of them

    sentences that get_X_y rejects (it returns None) are skipped, so the
    returned lists can hold fewer than number_of_samples observations
    '''
    # draw every sentence index up front, then sample one pair per index
    picks = np.random.randint(0, len(sentences), size=number_of_samples)
    drawn = [get_X_y(sentences[pick]) for pick in picks]
    kept = [pair for pair in drawn if pair is not None]
    X = [context for context, _ in kept]
    y = [target for _, target in kept]
    return X, y

In [7]:
X, y = create_dataset(data)

In [8]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [9]:
len(X_train), len(X_test), len(y_train), len(y_test)

(34969, 14988, 34969, 14988)

In [10]:
# fit a word2vec model
word2vec = Word2Vec(sentences=X_train, vector_size=20, min_count=10, window=10)
wv = word2vec.wv

In [11]:
# check embedding size
wv["movie"].size

20

In [12]:
wv.most_similar("movie")

[('film', 0.8951712846755981),
 ('flick', 0.8017658591270447),
 ('thing', 0.7820267677307129),
 ('show', 0.7294928431510925),
 ('episode', 0.7083120942115784),
 ('shame', 0.7069810032844543),
 ('crap', 0.6890944838523865),
 ('everything', 0.6858541369438171),
 ('remake', 0.66377192735672),
 ('fun', 0.6494260430335999)]

In [13]:
wv["movie"]

array([-2.5588408 , -2.3808193 ,  3.109312  , -1.3963088 , -0.86325824,
        5.4049573 , -1.891605  ,  1.2675676 ,  0.6402838 , -2.7112646 ,
       -0.8533588 , -1.773583  ,  0.89358914, -1.0737842 ,  1.3911701 ,
       -0.57071954,  3.5721765 ,  1.614322  , -1.8713921 , -0.13573104],
      dtype=float32)

In [14]:
wv.similar_by_vector(wv["movie"])

[('movie', 1.0),
 ('film', 0.8951712250709534),
 ('flick', 0.8017658591270447),
 ('thing', 0.7820267677307129),
 ('show', 0.7294928431510925),
 ('episode', 0.7083120942115784),
 ('shame', 0.7069809436798096),
 ('crap', 0.6890944838523865),
 ('everything', 0.6858541369438171),
 ('remake', 0.6637718677520752)]

In [15]:
vocab_size = len(wv.key_to_index)
vocab_size

5350

In [16]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence(word2vec, sentence):
    '''
    returns an array with one row per word of `sentence` that is known to
    word2vec.wv, in sentence order; unknown words are silently dropped
    '''
    vectors = word2vec.wv
    return np.array([vectors[token] for token in sentence if token in vectors])

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    '''
    returns a list holding, for each sentence of `sentences` (in order),
    the matrix produced by embed_sentence
    '''
    return [embed_sentence(word2vec, words) for words in sentences]

# Embed the training and test sentences
# (embed_sentence drops words that are not in the word2vec vocabulary, so an
# embedded sentence can have fewer rows than the sentence has words)
X_train_embed = embedding(word2vec, X_train)
X_test_embed = embedding(word2vec, X_test)


# Pad the training and test embedded sentences
# padding='post' appends the padding after the real timesteps; maxlen=40 fixes
# the number of timesteps of every padded sequence to 40
X_train_pad = pad_sequences(X_train_embed, dtype='float32', padding='post', maxlen=40)
X_test_pad = pad_sequences(X_test_embed, dtype='float32', padding='post', maxlen=40)

In [17]:
# check X_train_pad and X_test_pad
type(X_train_pad)

numpy.ndarray

In [18]:
X_train_pad.shape

(34969, 40, 20)

In [19]:
X_test_pad.shape

(14988, 40, 20)

In [20]:
wv.vector_size

20

In [21]:
len(X_train)

34969

In [22]:
len(X_test)

14988

In [23]:
# check y_train and y_test
type(y_train)

list

In [24]:
len(y_train)

34969

In [25]:
# check how many words from y_train are in wv
sum([word in wv for word in y_train]) / len(y_train)

0.9123223426463439

In [26]:
# check how many words from y_test are in wv
sum([word in wv for word in y_test]) / len(y_test)

0.9133973845743262

In [27]:
X_train_pad.shape, X_test_pad.shape, len(y_train), len(y_test)

((34969, 40, 20), (14988, 40, 20), 34969, 14988)

In [28]:
# keep only the observations for which the target word (y_train / y_test) is in wv
# The same mask is applied to the padded arrays, the word-level inputs and the
# targets, so X_train / X_train_pad / y_train (and the test equivalents) stay
# aligned row for row. Later cells look at X_test[0] next to y_test[0] and
# X_test_pad[:1], which only makes sense if all three describe the same
# observation.
mask_train = [word in wv for word in y_train]
X_train_pad = X_train_pad[mask_train, :, :]
X_train = [words for words, keep in zip(X_train, mask_train) if keep]
y_train = [word for word, keep in zip(y_train, mask_train) if keep]

mask_test = [word in wv for word in y_test]
X_test_pad = X_test_pad[mask_test, :, :]
X_test = [words for words, keep in zip(X_test, mask_test) if keep]
y_test = [word for word, keep in zip(y_test, mask_test) if keep]

In [29]:
X_train_pad.shape, X_test_pad.shape, len(y_train), len(y_test)

((31903, 40, 20), (13690, 40, 20), 31903, 13690)

In [30]:
# transform y_train and y_test into vectors
# Each target word is replaced by its word2vec vector, giving one row per
# observation; the lookups rely on every remaining target word being in wv.
y_train_vec = np.array([wv[word] for word in y_train])
y_test_vec = np.array([wv[word] for word in y_test])

In [31]:
y_train_vec.shape, y_test_vec.shape

((31903, 20), (13690, 20))

In [32]:
y_train_vec[0]

array([-0.29732156, -1.3444813 ,  1.3773881 ,  1.7681061 ,  0.50167847,
        1.2660348 ,  2.9678817 , -0.7449177 , -0.1939574 ,  0.08014315,
       -0.6463477 ,  1.7013825 ,  3.7770576 ,  2.0104382 , -0.78710914,
       -0.25413918,  0.17603111,  2.5254753 , -5.7983294 ,  0.7528816 ],
      dtype=float32)

In [33]:
# build a model
def init_model():
    '''
    builds and compiles the recurrent network used to predict the next word

    the stack is: a Masking layer, two GRU layers (20 then 16 units, the
    first one returning the full sequence), a Dense layer of 8 relu units and
    a linear Dense output of 20 values (one per target-vector component)

    the model is compiled with the mae loss, the rmsprop optimizer and mse
    as an extra metric
    '''
    layers = [
        Masking(),
        GRU(20, activation="tanh", return_sequences=True),
        GRU(16, activation="tanh", return_sequences=False),
        Dense(8, activation="relu"),
        Dense(20, activation="linear"),
    ]
    network = Sequential(layers)
    network.compile(optimizer='rmsprop', loss='mae', metrics=['mse'])
    return network

model = init_model()

2022-06-27 18:50:47.148035: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-06-27 18:50:47.148070: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-27 18:50:47.148091: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (think): /proc/driver/nvidia/version does not exist
2022-06-27 18:50:47.148344: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
# Early stopping with a patience of 3 epochs; restore_best_weights=True asks
# Keras to put back the weights of the best epoch once training stops.
es = EarlyStopping(patience=3, restore_best_weights=True)

# Train on the padded embeddings against the target word vectors.
# validation_split=0.3 holds part of the training data out for validation, and
# epochs=100 is only an upper bound since the callback may stop training earlier.
model.fit(X_train_pad, y_train_vec, 
          batch_size = 16,
          epochs=100,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


<keras.callbacks.History at 0x7fb465fe0f70>

In [35]:
model.evaluate(X_test_pad, y_test_vec)



[0.9244070053100586, 1.8006491661071777]

In [36]:
X_test[0], y_test[0]

(['film',
  'designed',
  'to',
  'appeal',
  'to',
  'the',
  'deepest',
  'darkest',
  'parts',
  'of',
  'our',
  'being',
  'and',
  'if',
  'the',
  'movie',
  "wasn't",
  'so',
  'boring',
  'this'],
 'film')

In [37]:
X_test_pad[:1, :, :].shape

(1, 40, 20)

In [38]:
y_pred = model.predict(X_test_pad[:1, :, :])
y_pred



array([[-1.5774165 , -1.1677458 ,  1.5750263 , -0.34980276, -0.34538257,
         0.78398883, -0.15256947,  1.0021098 , -0.2668214 , -0.7212622 ,
         0.48466468, -0.5917636 ,  0.4909073 , -0.47223043,  0.80382305,
        -0.74209684,  2.4024122 ,  1.0991554 , -0.66273177,  0.05816066]],
      dtype=float32)

In [39]:
y_pred[0].shape

(20,)

In [40]:
wv.similar_by_vector(y_pred[0])

[('flick', 0.8876231908798218),
 ('picture', 0.864725649356842),
 ('film', 0.8458181023597717),
 ('shame', 0.8452079892158508),
 ('entertaining', 0.83879154920578),
 ('script', 0.8369504809379578),
 ('movie', 0.836172878742218),
 ('terrible', 0.8344402313232422),
 ('case', 0.8285846710205078),
 ('overall', 0.8186780214309692)]

In [41]:
# now we need to append y_pred to X_test[0] and predict over and over to predict multiple words

In [42]:
new_word = wv.similar_by_vector(y_pred[0], topn=1)[0][0]
new_word

'flick'

In [43]:
# Slide the context window by one word: drop the first word (the slice makes a
# copy, so X_test[0] itself is left untouched) and append the predicted word.
X_new = X_test[0][1:]
X_new.append(new_word)
# embedding() expects a list of sentences, so wrap the single sentence in a list
X_new = [X_new]

In [44]:
len(X_new)

1

In [45]:
len(X_new[0]), len(X_test[0])

(20, 20)

In [46]:
X_new_embed = embedding(word2vec, X_new)
X_new_pad = pad_sequences(X_new_embed, dtype='float32', padding='post', maxlen=40)

In [47]:
X_new_pad.shape

(1, 40, 20)

In [48]:
y_pred = model.predict(X_new_pad)



In [49]:
wv.similar_by_vector(y_pred[0])

[('holes', 0.9149927496910095),
 ('entertainment', 0.9103526473045349),
 ('mood', 0.9066997170448303),
 ('watchable', 0.9061463475227356),
 ('dated', 0.9026296734809875),
 ('missed', 0.9013277292251587),
 ('scary', 0.9009276032447815),
 ('disbelief', 0.9006526470184326),
 ('subtitles', 0.9005107879638672),
 ('remotely', 0.8999220132827759)]

In [50]:
# let's put this into a function
def repeat_prediction(sentence, repetition=10):
    '''
    extends `sentence` with `repetition` predicted words and returns the
    resulting string (the original sentence followed by the new words)

    relies on the notebook-level `word2vec`, `wv` and `model` objects

    at each step the current window of words is embedded, padded and fed to
    the model; the vocabulary word closest to the predicted vector is
    appended to the window while the oldest word is dropped, so the window
    keeps the same number of words
    '''
    window = sentence.split()
    generated = []
    for _ in range(repetition):
        padded = pad_sequences(embedding(word2vec, [window]),
                               dtype='float32', padding='post', maxlen=40)
        predicted_vector = model.predict(padded)[0]
        # similar_by_vector returns (word, similarity) pairs; keep the top word
        best_match = wv.similar_by_vector(predicted_vector, topn=1)[0][0]
        # slide the window: forget the oldest word, add the predicted one
        del window[0]
        window.append(best_match)
        generated.append(best_match)
    return sentence + "".join(" " + word for word in generated)

sentence = "he doesn't realize that his behavior should change and continues to act as he had before he listens to rap music sings along and plays the stereotypical part of an urban black man the real humor in this"
new_sentence = repeat_prediction(sentence)
new_sentence



"he doesn't realize that his behavior should change and continues to act as he had before he listens to rap music sings along and plays the stereotypical part of an urban black man the real humor in this flick pc disbelief entirely threw fairly ironic ironic ironic ironic"

In [51]:
sentence = "i like this movie and the actor"
new_sentence = repeat_prediction(sentence)
new_sentence



'i like this movie and the actor dated nevertheless curious dated entirely offensive entirely disbelief entirely disbelief'