<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Assignment_Feedforward_Network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import numpy as np
import shutil
import tensorflow as tf
import string
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
import pandas as pd
import glob
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
print(tf.__version__)


2.6.0


# Part 1: IMDB sentiment analysis

## Loading the dataset

In [2]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
data = tf.keras.utils.get_file ('aclImdb_v1',
                                url,
                                untar = True,
                                cache_dir = '.',
                                cache_subdir = '')
data_dir = os.path.join (os.path.dirname(data), 'aclImdb')
print(os.listdir(data_dir))

['imdb.vocab', 'test', 'README', 'imdbEr.txt', 'train']


In [3]:
train_dir = os.path.join (data_dir, 'train')
test_dir = os.path.join (data_dir, 'test')
print(os.listdir(train_dir))

['neg', 'urls_neg.txt', 'urls_pos.txt', 'unsup', 'pos', 'urls_unsup.txt', 'unsupBow.feat', 'labeledBow.feat']


In [4]:
# We only use files in the two folders: pos, and neg, so let's remove other files
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [5]:
# Loading data from the directory
batch_size = 32
seed = 42
raw_train = tf.keras.utils.text_dataset_from_directory ('aclImdb/train',
                                                        batch_size =batch_size,
                                                        validation_split = 0.2,
                                                        subset = 'training',
                                                        seed = seed)
raw_val = tf.keras.utils.text_dataset_from_directory ('aclImdb/train',
                                                      batch_size = batch_size,
                                                      validation_split = 0.2,
                                                      subset = 'validation',
                                                      seed = seed)
raw_test = tf.keras.utils.text_dataset_from_directory ('aclImdb/test',
                                                       batch_size = batch_size)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


## Text representation

In [6]:
def custom_preprocessing (text):
  lowercase = tf.strings.lower (text)
  stripped_html = tf.strings.regex_replace (lowercase,'<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), 
                                  '')
  
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(standardize = custom_preprocessing,
                                           max_tokens = max_features,
                                           output_mode = 'int',
                                           output_sequence_length = sequence_length)
# Extracting features for vectorizing using training set
train_text = raw_train.map (lambda x, y: x)
vectorize_layer.adapt(train_text)

# Defining a function for fitting vectorizer function/layer to vectorize text (review)
def fitting_vectorizer (text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer (text), label

# storing text batch and label batch
text_batch, label_batch = next(iter(raw_train))

## print an instance with vectorized review and label for observing
print ('REVIEW:', text_batch[0])
print('LABEL:', raw_train.class_names[label_batch[0]] )


REVIEW: tf.Tensor(b'Silent Night, Deadly Night 5 is the very last of the series, and like part 4, it\'s unrelated to the first three except by title and the fact that it\'s a Christmas-themed horror flick.<br /><br />Except to the oblivious, there\'s some obvious things going on here...Mickey Rooney plays a toymaker named Joe Petto and his creepy son\'s name is Pino. Ring a bell, anyone? Now, a little boy named Derek heard a knock at the door one evening, and opened it to find a present on the doorstep for him. Even though it said "don\'t open till Christmas", he begins to open it anyway but is stopped by his dad, who scolds him and sends him to bed, and opens the gift himself. Inside is a little red ball that sprouts Santa arms and a head, and proceeds to kill dad. Oops, maybe he should have left well-enough alone. Of course Derek is then traumatized by the incident since he watched it from the stairs, but he doesn\'t grow up to be some killer Santa, he just stops talking.<br /><br />

In [7]:
# print an example of vectorized data
print ('Vocabulary size: ', len(vectorize_layer.get_vocabulary()))
for i in range (90, 100):
  print ('{} ------> {}'.format(i, vectorize_layer.get_vocabulary()[i]))

Vocabulary size:  10000
90 ------> made
91 ------> movies
92 ------> then
93 ------> them
94 ------> films
95 ------> way
96 ------> make
97 ------> any
98 ------> could
99 ------> too


In [8]:
train = raw_train.map(fitting_vectorizer)
val = raw_val.map(fitting_vectorizer)
test = raw_test.map(fitting_vectorizer)


In [9]:
# Configure the dataset for performance
autotune = tf.data.AUTOTUNE
train = train.cache().prefetch (buffer_size = autotune)
val = val.cache().prefetch (buffer_size = autotune)
test = test.cache().prefetch (buffer_size = autotune)

## Building a neural network classifier

In [32]:
# Defining an evaluation metric function
def printing_eval_scores (y_true, y_pred, report=''):
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  precision = sklearn.metrics.precision_score(y_true, y_pred, average='binary')
  recall = sklearn.metrics.recall_score(y_true, y_pred, average='binary')
  f1 = sklearn.metrics.f1_score(y_true, y_pred , average='binary')
  print('accuracy score: {:.3f}'.format(accuracy))
  print('precision score: {:.3f}'.format(precision))
  print('recall score: {:.3f}'.format(recall))
  print('F1 score: {:.3f}'.format(f1))
  if report is True:
    print(classification_report(y_true, y_pred))
  else:
    pass
  return accuracy, precision, recall, f1

### With different numbers of embedding dimentions

In [29]:
# Creating the model
embedding_dim = [16, 28, 50, 100]
for n in embedding_dim:
  print ("========= embedding vectors'size= %s ============" %n)
  model = tf.keras.Sequential([layers.Embedding(max_features + 1, n, name="embedding"),
                              layers.Dropout(0.2),
                              layers.GlobalAveragePooling1D(),
                              layers.Dropout(0.2),
                              layers.Dense(1, activation = 'sigmoid')])
  print(model.summary())

  # configure the model uisng optimizer and loss function
  model.compile(loss = losses.BinaryCrossentropy(from_logits = True),
                optimizer = 'adam',
                metrics = tf.metrics.BinaryAccuracy(threshold = 0.0 )) ## Why threshold = 0.0??
  # training the model
  epochs = 10
  history = model.fit(train,
                      validation_data = val,
                      epochs = epochs)
  # testing the model
  # pred_label = tf.argmax(model.predict(test),1)
  pred_label = (model.predict(test) > 0.5).astype("int32")
  true_label = np.concatenate([y for x, y in test], axis=0)

  loss, accuracy = model.evaluate(test)
  print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
  printing_eval_scores (true_label, pred_label, report=True)

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout_44 (Dropout)         (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_22  (None, 16)                0         
_________________________________________________________________
dropout_45 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/

### With different dropout 

In [12]:
# Creating the model
embedding_dim = 16
dropouts = [0.0, 0.1, 0.2, 0.3]
for i in dropouts:
  print ("========= dropout = %s ============" %i)
  model = tf.keras.Sequential([layers.Embedding(max_features + 1,embedding_dim,  name="embedding"),
                              layers.Dropout(i),
                              layers.GlobalAveragePooling1D(),
                              layers.Dropout(0.2),
                              layers.Dense(1)])
  print(model.summary())

  # configure the model uisng optimizer and loss function
  model.compile(loss = losses.BinaryCrossentropy(from_logits = True),
                optimizer = 'adam',
                metrics = tf.metrics.BinaryAccuracy(threshold = 0.0 )) ## Why threshold = 0.0??
  # training the model
  epochs = 10
  history = model.fit(train,
                      validation_data = val,
                      epochs = epochs)
  # testing the model
  pred_label = (model.predict(test) > 0.5).astype("int32")
  true_label = np.concatenate([y for x, y in test], axis=0)

  loss, accuracy = model.evaluate(test)
  print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
  printing_eval_scores (true_label, pred_label, report=True)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout_8 (Dropout)          (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_4 ( (None, 16)                0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing performance:
 Loss: 0.309 - Accuracy: 0.874
accuracy score: 0.500
precision score: 0.250
recall score: 0.500
F1 score: 0.333
              precision    recall  f1-score   support

           0       0.50      1.00      0.67     12500
           1       0.00      0.00      0.00     12500

    accuracy                           0.50     25000
   macro avg       0.25      0.50      0.33     25000
weighted avg       0.25      0.50      0.33     25000

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout_12 (Dropout)         (None, None, 16)          0         
_________________________________________________________________
global_average_poolin

### Adding a Dense layer

In [13]:
# Creating the model
embedding_dim = 16
dropout =  0.1
activations = ['relu', 'softmax', 'sigmoid']
for f in activations:
  print ("========= activation function = %s ============" %f)
  model = tf.keras.Sequential([layers.Embedding(max_features + 1,embedding_dim,  name="embedding"),
                              layers.Dropout(dropout),
                              layers.GlobalAveragePooling1D(),
                              layers.Dropout(dropout),
                              layers.Dense(32, activation= f),
                              layers.Dense(1)])
  print(model.summary())

  # configure the model uisng optimizer and loss function
  model.compile(loss = losses.BinaryCrossentropy(from_logits = True),
                optimizer = 'adam',
                metrics = tf.metrics.BinaryAccuracy(threshold = 0.0 )) ## Why threshold = 0.0??
  # training the model
  epochs = 10
  history = model.fit(train,
                      validation_data = val,
                      epochs = epochs)
  # testing the model
  pred_label = (model.predict(test) > 0.5).astype("int32")
  true_label = np.concatenate([y for x, y in test], axis=0)

  loss, accuracy = model.evaluate(test)
  print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
  printing_eval_scores (true_label, pred_label, report=True)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout_16 (Dropout)         (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_8 ( (None, 16)                0         
_________________________________________________________________
dropout_17 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 160,593
Trainable params: 160,593
Non-trainable params: 0
________________________________________________

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing performance:
 Loss: 0.330 - Accuracy: 0.872
accuracy score: 0.500
precision score: 0.250
recall score: 0.500
F1 score: 0.333
              precision    recall  f1-score   support

           0       0.50      1.00      0.67     12500
           1       0.00      0.00      0.00     12500

    accuracy                           0.50     25000
   macro avg       0.25      0.50      0.33     25000
weighted avg       0.25      0.50      0.33     25000

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          160016    
_________________________________________________________________
dropout_20 (Dropout)         (None, None, 16)          0         
_________________________________________________________________
global_average_pooli

### With different Batch size

In [33]:
# Loading data from the directory
batch_size = 64
seed = 42
raw_train = tf.keras.utils.text_dataset_from_directory ('aclImdb/train',
                                                        batch_size =batch_size,
                                                        validation_split = 0.2,
                                                        subset = 'training',
                                                        seed = seed)
raw_val = tf.keras.utils.text_dataset_from_directory ('aclImdb/train',
                                                      batch_size = batch_size,
                                                      validation_split = 0.2,
                                                      subset = 'validation',
                                                      seed = seed)
raw_test = tf.keras.utils.text_dataset_from_directory ('aclImdb/test',
                                                       batch_size = batch_size)

# storing text batch and label batch
text_batch, label_batch = next(iter(raw_train))

## print an instance with vectorized review and label for observing
print ('REVIEW:', text_batch[0])
print('LABEL:', raw_train.class_names[label_batch[0]] )


# Creating the model
embedding_dim = 16
dropout =  0.1
activation =  'softmax'

print ("======== activation function = {}, dropout = {}, batch size = {} ============".format(activation, dropout, batch_size ))
model = tf.keras.Sequential([layers.Embedding(max_features + 1,embedding_dim, name="embedding"),
                            layers.Dropout(dropout),
                            layers.GlobalAveragePooling1D(),
                            layers.Dropout(dropout),
                            layers.Dense(32, activation= activation),
                            layers.Dense(1)])
print(model.summary())

# configure the model uisng optimizer and loss function
model.compile(loss = losses.BinaryCrossentropy(from_logits = True),
              optimizer = 'adam',
              metrics = tf.metrics.BinaryAccuracy(threshold = 0.0 )) ## Why threshold = 0.0??
# training the model
epochs = 10
history = model.fit(train,
                    validation_data = val,
                    epochs = epochs)
# testing the model
pred_label = (model.predict(test) > 0.5).astype("int32")
true_label = np.concatenate([y for x, y in test], axis=0)

loss, accuracy = model.evaluate(test)
print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
printing_eval_scores (true_label, pred_label, report=True)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
REVIEW: tf.Tensor(b"First of all, I liked very much the central idea of locating the '' intruders'', Others in the fragile Self, on various levels - mainly subconscious but sometimes more allegorical. In fact the intruders are omnipresent throughout the film : in the Swiss-French border where the pretagonist leads secluded life; in the his recurring daydream and nightmare; inside his ailing body after heart transplantation.... In the last half of the film, he becomes intruder himself, returning in ancient french colony in the hope of atoning for the past. <br /><br />The overall tone is bitter rather than pathetic, full of regrets and guilts, sense of failure being more or less dominant. This is a quite grim picture of an old age, ostensibly self-dependent but hopelessly void and lonely inside. The

(0.87028, 0.8943511970690977, 0.83976, 0.8661963114246812)

### With different training algorithm
Here we mostly focus on adapting learning rate method

In [None]:
# configure the model uisng optimizer and loss function
optimizers = ['adagrad', 'rmsprop', 'adam']

print ("======== activation function = {}, dropout = {}, batch size = {} ============".format(activation, dropout, batch_size ))
for opt in optimizers:
  print( '========== optimizer = %s' %opt)
  model.compile(loss = losses.BinaryCrossentropy(from_logits = True),
                optimizer = opt,
                metrics = tf.metrics.BinaryAccuracy(threshold = 0.0 )) ## Why threshold = 0.0??
  # training the model
  epochs = 10
  history = model.fit(train,
                      validation_data = val,
                      epochs = epochs)
  # testing the model
  pred_label = (model.predict(test) > 0.5).astype("int32")
  true_label = np.concatenate([y for x, y in test], axis=0)

  loss, accuracy = model.evaluate(test)
  print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
  printing_eval_scores (true_label, pred_label, report=True)



## Word Embedding

In [None]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:

from sklearn.metrics.pairwise import cosine_similarity
def Find_similar_w (word, n):
  all_cos_sim = {}
  idx = vocab.index(word)
  weight = weights[idx]
  for i in range(len(weights)-1):
    cosine_sim = cosine_similarity(weight.reshape(1, -1), weights[i].reshape(1, -1))
    all_cos_sim[vocab[i]] = cosine_sim
  # Sorting the dictionary in descending order
  sorted_cos = {k:v for k, v in sorted(all_cos_sim.items(), key = lambda item: item[1], reverse=True)}
  print ("Top {} most similar with '{}' \n".format(n, word))
  for k, v in list(sorted_cos.items())[:n]:
    print ('{} =====> {}'. format(k,v))
  return  sorted_cos
most_similar_w = Find_similar_w (word = 'boring', n = 20)


## Comparing with a Logistic Regression model

In [None]:
# loading the ds
def Getdata(dir):
  review, label = [], []
  for folder in glob.glob (dir+'/*'):
    for file in glob.glob(folder+'/*'): 
      fo = open(file)
      doc = fo.read()
      review.append (doc)
      if 'pos' in file:
        label.append(1)
      elif 'neg' in file:
        label.append(0)
  df = pd.DataFrame(zip(review,label), columns = ['review', 'label'])
  return df
  
train_df = Getdata(train_dir)
test_df = Getdata(test_dir)
# Spliting the dataset for training and testing
X_train, X_val, y_train, y_val = train_test_split (train_df['review'],train_df['label'], train_size = 0.8, random_state = 42, shuffle = True)
X_test, y_test = test_df['review'], test_df['label']
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_val, y_val: ', X_val.shape, y_val.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape) 

In [None]:

def printing_eval_scores (y_true, y_pred, report=''):
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  precision = sklearn.metrics.precision_score(y_true, y_pred)
  recall = sklearn.metrics.recall_score(y_true, y_pred)
  f1 = sklearn.metrics.f1_score(y_true, y_pred)
  print('accuracy score: {:.3f}'.format(accuracy))
  print('precision score: {:.3f}'.format(precision))
  print('recall score: {:.3f}'.format(recall))
  print('F1 score: {:.3f}'.format(f1))
  if report is True:
    print(classification_report(y_true, y_pred))
  else:
    pass
  return accuracy, precision, recall, f1

### With Countvectorizer text presentation

In [None]:

# Vectorizing the documents
vectorizer = CountVectorizer(binary = True)
X_train_count = vectorizer.fit_transform(X_train.to_list())
X_val_count = vectorizer.transform(X_val.to_list())
X_test_count = vectorizer.transform(X_test.to_list())
print ('Shapes of X_train, y_train: ', X_train_count.shape, y_train.shape)
print ('Shapes of X_val, y_val: ', X_val_count.shape, y_val.shape)
print ('Shapes of X_test, y_test: ', X_test_count.shape, y_test.shape)

# Sklearn Logistic Regression Model
sk_lr_1 = LogisticRegression(solver='lbfgs', max_iter=500).fit(X_train_count, y_train )
y_predict = sk_lr_1.predict(X_test_count)

# Model performing
## on training set
print('Model performance with Countvectorizer: \non validation set:')
printing_eval_scores (y_val, sk_lr_1.predict(X_val_count))

## on test set
print('\n===========================')
print('on test set:')
printing_eval_scores (y_test, y_predict, report = True)

### With tf-idf text presentation

In [None]:
# Vectorizing the documents
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train.to_list())
X_val_tfidf = tfidf.transform(X_val.to_list())
X_test_tfidf = tfidf.transform(X_test.to_list())
print ('Shapes of X_train, y_train: ', X_train_tfidf.shape, y_train.shape)
print ('Shapes of X_val, y_val: ', X_val_tfidf.shape, y_val.shape)
print ('Shapes of X_test, y_test: ', X_test_tfidf.shape, y_test.shape)

# Sklearn Logistic Regression Model
sk_lr_2 = LogisticRegression(solver='lbfgs', max_iter=500).fit(X_train_tfidf, y_train )
y_predict = sk_lr_2.predict(X_test_tfidf)

# Model performing
## on training set
print('Model performance with tfidf: \non validation set:')
printing_eval_scores (y_val, sk_lr_2.predict(X_val_tfidf))

## on test set
print('\n===========================')
print('on test set:')
printing_eval_scores (y_test, y_predict, report = True)

# Part 2: Multiclass classification - Stackoverflow DS

## Loading the dataset

In [None]:
url_2 = 'http://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
train_dir = tf.keras.utils.get_file ('train',
                                url_2,
                                untar = True,
                                cache_dir = '.',
                                cache_subdir = '')
test_dir = tf.keras.utils.get_file ('test',
                                url_2,
                                untar = True,
                                cache_dir = '.',
                                cache_subdir = '')

print(os.listdir(train_dir))
print(os.listdir(test_dir))


In [None]:
# Loading data from the directory
batch_size = 32
seed = 42
raw_train = tf.keras.utils.text_dataset_from_directory ('train',
                                                        batch_size =batch_size,
                                                        validation_split = 0.2,
                                                        subset = 'training',
                                                        seed = seed)
raw_val = tf.keras.utils.text_dataset_from_directory ('train',
                                                      batch_size = batch_size,
                                                      validation_split = 0.2,
                                                      subset = 'validation',
                                                      seed = seed)
raw_test = tf.keras.utils.text_dataset_from_directory ('test',
                                                       batch_size = batch_size)

## Text representation

In [None]:
def custom_preprocessing (text):
  lowercase = tf.strings.lower (text)
  stripped_html = tf.strings.regex_replace (lowercase,'<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), 
                                  '')
  
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(standardize = custom_preprocessing,
                                           max_tokens = max_features,
                                           output_mode = 'int',
                                           output_sequence_length = sequence_length)
# Extracting features for vectorizing using training set
train_text = raw_train.map (lambda x, y: x)
vectorize_layer.adapt(train_text)

# Defining a function for fitting vectorizer function/layer to vectorize text (review)
def fitting_vectorizer (text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer (text), label

# storing text batch and label batch
text_batch, label_batch = next(iter(raw_train))

## print an instance with vectorized review and label for observing
print ('text:', text_batch[0])
print('label:', raw_train.class_names[label_batch[0]] )

In [None]:
train = raw_train.map(fitting_vectorizer)
val = raw_val.map(fitting_vectorizer)
test = raw_test.map(fitting_vectorizer)

In [None]:
# Configure the dataset for performance
autotune = tf.data.AUTOTUNE
train = train.cache().prefetch (buffer_size = autotune)
val = val.cache().prefetch (buffer_size = autotune)
test = test.cache().prefetch (buffer_size = autotune)

In [None]:
for i, j in test:
  print(j)

## Building a neural network multiclass classifier

In [None]:
# Defining an evaluation metric function
def printing_eval_scores (y_true, y_pred, report=''):
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  precision = sklearn.metrics.precision_score(y_true, y_pred, average='macro')
  recall = sklearn.metrics.recall_score(y_true, y_pred, average='macro')
  f1 = sklearn.metrics.f1_score(y_true, y_pred , average='macro')
  print('accuracy score: {:.3f}'.format(accuracy))
  print('precision score: {:.3f}'.format(precision))
  print('recall score: {:.3f}'.format(recall))
  print('F1 score: {:.3f}'.format(f1))
  if report is True:
    print(classification_report(y_true, y_pred))
  else:
    pass
  return accuracy, precision, recall, f1

In [None]:
# Creating the model
embedding_dim = 16
dropout =  0.1
activation = 'relu'

print ("======== activation function = {}, dropout = {}, batch size = {} ============".format(activation, dropout, batch_size ))
model = tf.keras.Sequential([layers.Embedding(max_features + 1,embedding_dim, name="embedding_2"),
                            layers.Dropout(dropout),
                            layers.GlobalAveragePooling1D(),
                            layers.Dropout(dropout),
                            layers.Dense(32, activation= activation),
                            layers.Dense(4)])
print(model.summary())

# configure the model uisng optimizer and loss function
model.compile(loss = losses.SparseCategoricalCrossentropy(from_logits = True),
              optimizer = 'adam',
              metrics = 'accuracy') 
# training the model
epochs = 10
history = model.fit(train,
                    validation_data = val,
                    epochs = epochs)
# testing the model
pred_label = tf.argmax(model.predict(test),1)
true_label = np.concatenate([y for x, y in test], axis=0)

loss, accuracy = model.evaluate(test)
print('\nTesting performance:\n Loss: {:.3f} - Accuracy: {:.3f}'. format(loss, accuracy))
printing_eval_scores (true_label, pred_label, report=True)