<a href="https://colab.research.google.com/github/Lisaaa2021/NLP-practice/blob/main/NLP_with_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP with CNN

### Exercise objectives:

- Use CNN instead of RNN for NLP


In [1]:
# Load the data

import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def load_data(percentage_of_sentences=None):
    train_data, test_data = tfds.load(name="imdb_reviews", split=["train", "test"], batch_size=-1, as_supervised=True)

    train_sentences, y_train = tfds.as_numpy(train_data)
    test_sentences, y_test = tfds.as_numpy(test_data)
    
    # Take only a given percentage of the entire data
    if percentage_of_sentences is not None:
        assert(percentage_of_sentences> 0 and percentage_of_sentences<=100)
        
        len_train = int(percentage_of_sentences/100*len(train_sentences))
        train_sentences, y_train = train_sentences[:len_train], y_train[:len_train]
  
        len_test = int(percentage_of_sentences/100*len(test_sentences))
        test_sentences, y_test = test_sentences[:len_test], y_test[:len_test]
    
    X_train = [text_to_word_sequence(_.decode("utf-8")) for _ in train_sentences]
    X_test = [text_to_word_sequence(_.decode("utf-8")) for _ in test_sentences]
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_data(percentage_of_sentences=10)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFJR4IW/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFJR4IW/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFJR4IW/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m
Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


Instructions for updating:
Use `tf.data.Dataset.get_single_element()`.


In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tk = Tokenizer()
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index)

X_train_sequences = tk.texts_to_sequences(X_train)
X_test_sequences = tk.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_sequences, dtype='float32', padding='post', maxlen=150)
X_test_pad = pad_sequences(X_test_sequences, dtype='float32', padding='post',maxlen=150)

In [21]:
from tensorflow.keras.layers import Conv1D
from tensorflow.keras import layers, Sequential
def init_model():
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(
    input_dim=vocab_size+1,
    input_length=150,
    output_dim=100,
    mask_zero=True, # Included masking layer :)
    ))
  model_cnn.add(layers.Conv1D(16, kernel_size = 5))
  model_cnn.add(layers.Flatten())
  model_cnn.add(layers.Dense(5,))
  model_cnn.add(layers.Dense(1, activation="sigmoid"))

  model_cnn.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
  return model_cnn

In [22]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=5, restore_best_weights=True)

model_cnn = init_model()

model_cnn.fit(X_train_pad, y_train, 
          epochs=20, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es]
         )


res = model_cnn.evaluate(X_test_pad, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
The accuracy evaluated on the test set is of 75.520%


# Learn a Word2Vec representation, and then feed it to a NN with a `Conv1D`

In [23]:
import gensim.downloader as api
import numpy as np

word2vec = api.load("glove-wiki-gigaword-50")

In [24]:
from gensim.models import Word2Vec
word2vec = Word2Vec(sentences=X_train,size=100)

In [25]:
def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence: #for each word in one sentence
        if word in word2vec.wv: 
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences: #for sentence in 2500 sentences
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed #has all words in the X_train
  

In [26]:
X_train_embed_word2vec = embedding(word2vec, X_train)
X_test_embed_word2vec = embedding(word2vec, X_test)

In [27]:
X_train_pad_word2vec = pad_sequences(X_train_embed_word2vec, dtype='float32', padding='post', maxlen=150)
X_test_pad_word2vec = pad_sequences(X_test_embed_word2vec, dtype='float32', padding='post', maxlen=150)

In [34]:
model_word2vec = Sequential()
#model_word2vec.add(layers.Masking(mask_value=0))
model_word2vec.add(layers.Conv1D(16, kernel_size = 5))
model_word2vec.add(layers.Flatten())
model_word2vec.add(layers.Dense(5,))
model_word2vec.add(layers.Dense(1, activation="sigmoid"))

model_word2vec.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [35]:
es_2 = EarlyStopping(patience=5, restore_best_weights=True)

model_word2vec.fit(X_train_pad_word2vec, y_train, 
          epochs=20, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es_2]
         )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x7f25c7cc7790>

In [36]:
res = model_word2vec.evaluate(X_test_pad_word2vec, y_test, verbose=0)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

The accuracy evaluated on the test set is of 53.360%
