## Importing Library
Kindly change all paths before executing this code

In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Data Preparation

### Read BBC news data and get texts and labels
* label 0: 'business'
* label 1: 'entertainment'
* label 2: 'politics'
* label 3: 'sport'
* label 4: 'tech'

In [2]:
# Root folder that contains one sub-folder per news category.
data_dir = '/datasets/'
labels = []
texts = []
# Integer class id assigned to the category currently being read
# (0 = business, 1 = entertainment, 2 = politics, 3 = sport, 4 = tech).
label_count = 0
for label_type in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    dir_name = os.path.join(data_dir, label_type)
    print(dir_name)
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # `texts` stable across machines, so the seeded train/test split that
    # follows selects the same documents everywhere.
    for fname in sorted(os.listdir(dir_name)):
        # `with` guarantees the handle is closed even when read() raises.
        with open(os.path.join(dir_name, fname), encoding="utf8", errors='ignore') as f:
            texts.append(f.read())
        labels.append(label_count)
    label_count += 1

/datasets/business
/datasets/entertainment
/datasets/politics
/datasets/sport
/datasets/tech


## Divide the data into training and testing samples

In [3]:
# Hold out 20% of the documents for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.20,
    random_state=42,
)
# Labels are converted from Python lists to NumPy arrays; the texts stay lists.
y_train = np.array(y_train)
y_test = np.array(y_test)
print("Training set: ", len(x_train))
print("Testing set: ", len(x_test))

Training set:  1780
Testing set:  445


# Data Preprocessing

# How are you going to tokenize the text?





Tokenization is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms. Each of these smaller units is called a token.


1. Before we start tokenization we must perform some preprocessing on the text. We will remove stopwords, punctuation, links and other non-alphanumeric symbols from all the documents, which helps to increase the accuracy of the model.
2. Code given below will remove all unwanted pieces of information and tokenize the text for each document.

3. After that we will tokenize the text. I will use the default tokenizer for further preprocessing, such as converting all words to lower case and removing spaces.

4. The tokenizer will only consider the most frequent words, which are the top 1000 in my case. Each document is then padded or truncated to 100 tokens (with the default settings, the last 100 tokens of a longer document are kept). I am considering single-word tokens only.

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import re
nltk.download('stopwords')
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Raw string: "\S" is not a valid Python string escape, so the non-raw form
# relies on deprecated behaviour. The pattern text itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
# Compiled once and reused for every document.
_TEXT_CLEANING_PATTERN = re.compile(TEXT_CLEANING_RE)
# Set copy of the stop-word list so each membership test is a hash lookup
# instead of a scan over the whole list.
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text, stem=False):
    """Lower-case and clean one document, then drop English stop words.

    Every match of TEXT_CLEANING_RE (@-mentions, http/https links and any run
    of characters other than ASCII letters and digits) is replaced by a single
    space. Digits are kept. The cleaned text is split on whitespace and tokens
    found in the NLTK English stop-word list are removed.

    Args:
        text: The document; it is converted with ``str()`` before cleaning.
        stem: When True, each remaining token is passed through the Snowball
            stemmer before being emitted.

    Returns:
        The remaining tokens joined by single spaces.
    """
    cleaned = _TEXT_CLEANING_PATTERN.sub(' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in _STOP_WORD_SET)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)


[nltk_data] Downloading package stopwords to C:\Users\Muhammad
[nltk_data]     Muneeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Run preprocess() over every training and test document. Slice assignment
# replaces the contents of the existing lists instead of rebinding the names.
x_train[:] = [preprocess(document) for document in x_train]
x_test[:] = [preprocess(document) for document in x_test]



In [6]:
# Use this code if you are testing this code on GPU.
import tensorflow as tf

# Build the session config in a single expression: allow_growth=True is set on
# the GPU options, then a TF1-compatibility session is created with that config.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
session = tf.compat.v1.Session(config=config)
      

# What do we mean by sequence padding? For your analysis, what is the maximum length of padding you used?

Padding sequences is used to ensure that all sequences in a list have the same length. By default this is done by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence. 

In my case the padding length is 100 tokens per document (`maxlen = 100`); the vocabulary is limited to the 1000 most frequent words (`max_words = 1000`).

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
# Public import path; keras.utils.np_utils is a private module that is not
# available in every Keras release.
from keras.utils import to_categorical

maxlen = 100               # tokens kept per document after padding/truncation
training_samples = 1500    # leading rows of the training split used for fitting
validation_samples = 280   # rows that follow, used for validation
max_words = 1000           # tokenizer keeps only the most frequent words
num_classes = 5            # business, entertainment, politics, sport, tech

# NOTE: this cell rebinds x_train / x_test / y_* and is therefore not
# idempotent; re-run the notebook from the train/test split before re-running it.

# The vocabulary is learned from the training documents only; the fitted
# tokenizer is then applied to both the training and the test documents.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Every sequence is brought to exactly `maxlen` integer ids.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
datax = x_train
datay = y_train

# Slicing past the end would silently shrink the validation set, so fail loudly.
assert training_samples + validation_samples <= len(datax), (
    "training_samples + validation_samples exceeds the number of training documents"
)

x_train = datax[:training_samples]
y_train = datay[:training_samples]
x_val = datax[training_samples:training_samples + validation_samples]
y_val = datay[training_samples:training_samples + validation_samples]

# One-hot encode the labels. num_classes is passed explicitly so every split
# gets the same number of columns even if one of them happens not to contain
# the highest class id (otherwise the width is inferred per split).
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)


Found 26649 unique tokens.


# What do we mean by word embeddings, and what are the advantages of using it?


Word embeddings are distributed representations of text in an n-dimensional space. These are essential for solving most NLP problems.
Advantage
Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation. You can either train a new embedding or use a pre-trained embedding for your natural language processing task.

1. Easy to visualize and find correlations between words
2. Good for numerical calculation
3. Compact

There are 3 main algorithms for learning a word embedding from text data.
1. Embedding Layer 
2. Word2Vec (Skip-gram, CBOW)
3. GloVe 



In [8]:
# After that we will use embedding.

# Raw string so the Windows backslashes can never be read as escape sequences
# (the resulting path text is identical to the original literal).
glove_dir = r'D:\MSSEMESTER3\Deeplearning\Assignments\Assignment3'
# Maps each word in the GloVe file to its vector (float32 NumPy array).
embeddings_index = {}
# `with` closes the file even if a malformed line raises part-way through.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..." separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

      
embedding_dim = 100

# Row r holds the pre-trained vector of the word whose tokenizer index is r.
# Rows stay all-zero for indices without a word in range and for words that
# have no entry in embeddings_index.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if row >= max_words or vector is None:
        continue
    embedding_matrix[row] = vector

Found 400000 word vectors.


In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def plotting(x_test, y_test, clf=None):
    """Print the confusion matrix and accuracy of a classifier on a data set.

    Args:
        x_test: Inputs passed to ``predict``.
        y_test: One-hot encoded true labels; the class id is taken as the
            argmax along axis 1.
        clf: Object with a ``predict`` method returning per-class scores.
            When omitted, the notebook-level variable ``model`` is used, which
            keeps existing ``plotting(x, y)`` calls working unchanged.

    Returns:
        None. The confusion matrix and the accuracy are printed.
    """
    if clf is None:
        # Backward-compatible fallback to the most recently assigned global model.
        clf = model
    predicted_classes = np.argmax(clf.predict(x_test), axis=1)
    true_classes = np.argmax(y_test, axis=1)
    print(confusion_matrix(true_classes, predicted_classes))
    print(accuracy_score(true_classes, predicted_classes))
    
    
    
    
    

In [10]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense


def _train_dense_classifier(pretrained_embeddings=None):
    """Build, compile and fit the Embedding -> Flatten -> Dense classifier.

    Args:
        pretrained_embeddings: Optional weight matrix for the embedding layer.
            When given, it is loaded into the embedding layer and that layer is
            frozen; when None, the embedding layer keeps its initial weights
            and stays trainable.

    Returns:
        A ``(model, history)`` tuple: the fitted model and the object returned
        by ``fit``.
    """
    net = Sequential()
    net.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    net.add(Flatten())
    net.add(Dense(32, activation='relu'))
    net.add(Dense(5, activation='softmax'))
    # The summary is printed before the embedding layer is frozen, so it
    # reports every parameter as trainable.
    net.summary()
    if pretrained_embeddings is not None:
        net.layers[0].set_weights([pretrained_embeddings])
        net.layers[0].trainable = False
    net.compile(optimizer='RMSprop',
                loss='categorical_crossentropy',
                metrics=['acc'])
    fit_history = net.fit(x_train, y_train,
                          epochs=30,
                          batch_size=10,
                          validation_data=(x_val, y_val))
    return net, fit_history


##First model Embedding layer is non trainable
# `model` is assigned at notebook level because plotting() reads it from there.
model, history = _train_dense_classifier(pretrained_embeddings=embedding_matrix)
model.evaluate(x_test, y_test)
plotting(x_test,y_test)

##Second model Embedding layer is trainable
model, history = _train_dense_classifier()
model.evaluate(x_test, y_test)
plotting(x_test,y_test)




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          100000    
_________________________________________________________________
flatten (Flatten)            (None, 10000)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                320032    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 420,197
Trainable params: 420,197
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch

Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[109   1   4   0   1]
 [  1  67   1   2   1]
 [  2   0  73   0   1]
 [  0   1   0 101   0]
 [  3   2   2   1  72]]
0.9483146067415731


# List three Recurrent Neural Network-specific hyperparameters you used for training your models? What alternative values those parameters can replace with?

There are the hyper parameters that we can change.

1. Number of Layers for RNN (It can be changed in my case it is 1)
2. Model of RNN
3. Batch Size (It can be changed; in my case it is 10 for the RNN models)
4. Sequence Length
5. Number of Epochs
6. Clip Gradients at Value
7. Learning Rate (It can be changed; in my case the Keras RMSprop default of 0.001 is used)
8. Decay Rate (By default it is 0.97) It can be changed.

I am using 3 hyper parameters. Number of layers for every RNN can be changed depending on the complexity of data.
Batch size can be 5,10,20.
Number of epochs can be changed to improve the accuracy of model on training data. In my case it is 30.

In [11]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding, SimpleRNN
from keras.layers import LSTM


def _train_lstm_classifier(pretrained_embeddings=None):
    """Build, compile and fit the Embedding -> LSTM(32) -> Dense classifier.

    Args:
        pretrained_embeddings: Optional weight matrix for the embedding layer.
            When given, it is loaded into the embedding layer and that layer is
            frozen; when None, the embedding layer keeps its initial weights
            and stays trainable.

    Returns:
        A ``(model, history)`` tuple: the fitted model and the object returned
        by ``fit``.
    """
    net = Sequential()
    net.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    net.add(LSTM(32))
    net.add(Dense(5, activation='softmax'))
    if pretrained_embeddings is not None:
        net.layers[0].set_weights([pretrained_embeddings])
        net.layers[0].trainable = False
    net.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    fit_history = net.fit(x_train, y_train,
                          epochs=30,
                          batch_size=10,
                          validation_data=(x_val, y_val))
    return net, fit_history


##First model Embedding layer is non trainable
# `model` is assigned at notebook level because plotting() reads it from there.
model, history = _train_lstm_classifier(pretrained_embeddings=embedding_matrix)
model.evaluate(x_test, y_test)
plotting(x_test,y_test)

##Second model Embedding layer is trainable
model, history = _train_lstm_classifier()
model.evaluate(x_test, y_test)
plotting(x_test,y_test)
 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[105   1   6   1   2]
 [  1  67   0   2   2]
 [  0   0  74   1   1]
 [  0   0   0 101   1]
 [  2   1   0   0  77]]
0.952808988764045
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[[107   1   5   0   2]
 [  1  64   5   1   1]
 [  2   0  73   0   1]
 [  1   1   1  99   0]
 [  2   2   2   0  74]]
0.9370786516853933


In [12]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding, SimpleRNN
from keras.layers import GRU


def _train_gru_classifier(pretrained_embeddings=None):
    """Build, compile and fit the Embedding -> GRU(32) -> Dense classifier.

    Args:
        pretrained_embeddings: Optional weight matrix for the embedding layer.
            When given, it is loaded into the embedding layer and that layer is
            frozen; when None, the embedding layer keeps its initial weights
            and stays trainable.

    Returns:
        A ``(model, history)`` tuple: the fitted model and the object returned
        by ``fit``.
    """
    net = Sequential()
    net.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    net.add(GRU(32))
    net.add(Dense(5, activation='softmax'))
    if pretrained_embeddings is not None:
        net.layers[0].set_weights([pretrained_embeddings])
        net.layers[0].trainable = False
    net.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
    fit_history = net.fit(x_train, y_train,
                          epochs=30,
                          batch_size=10,
                          validation_data=(x_val, y_val))
    return net, fit_history


##First model Embedding layer is non trainable
# `model` is assigned at notebook level because plotting() reads it from there.
model, history = _train_gru_classifier(pretrained_embeddings=embedding_matrix)
model.evaluate(x_test, y_test)
plotting(x_test,y_test)

##Second model Embedding layer is trainable
model, history = _train_gru_classifier()
model.evaluate(x_test, y_test)
plotting(x_test,y_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[[105   2   6   0   2]
 [  0  71   1   0   0]
 [  0   1  74   0   1]
 [  0   1   0 101   0]
 [  3   1   0   0  76]]
0.9595505617977528
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[[103   1   2   3   6]
 [  0  66   5   0   1]
 [  7   2  66   0   1]
 [  1   0   1 100   0]
 [  2   3   1   0  74]]
0.9191011235955057


# Compare the three models’ performance in the testing set. Explain your observation?

Accuracy
Non Trainable Embedding
1. Fully Connected 92
2. LSTM 95
3. GRU 95

Trainable Embedding

4. Fully Connected 94
5. LSTM 94
6. GRU 91

The GRU with a non-trainable embedding layer performs better than the other classifiers. The performance of the LSTM and GRU is good compared to the fully connected network. This is because we are dealing with text sequences, and RNN models perform well on sequence data.


GRUs use fewer trainable parameters and therefore use less memory, execute faster and train faster than LSTMs, whereas an LSTM is more accurate on datasets with longer sequences. In our case we do not have long sequences, so we can use a GRU.


A GRU has two gates (reset and update gates) whereas an LSTM has three gates (namely input, output and forget gates), which helps the LSTM to work better on long sequences.



In [19]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention layer.

    The input is projected to queries, keys and values by three Dense layers,
    split into ``num_heads`` heads of width ``embed_dim // num_heads``,
    attended per head, merged again and passed through a final Dense layer.
    """

    def __init__(self, embed_dim, num_heads=8):
        """Create the projection layers.

        Raises:
            ValueError: if ``embed_dim`` is not divisible by ``num_heads``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of scaled dot-product attention."""
        # Scores are divided by sqrt(size of the last axis of `key`).
        key_width = tf.cast(tf.shape(key)[-1], tf.float32)
        logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(key_width)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, value), weights

    def separate_heads(self, x, batch_size):
        """Reshape to (batch, -1, heads, projection_dim) and move heads to axis 1."""
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim)
        )
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` of shape [batch_size, seq_len, embedding_dim]."""
        batch_size = tf.shape(inputs)[0]

        # Project, then split: each becomes (batch_size, num_heads, seq_len, projection_dim).
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)

        attended, _ = self.attention(query, key, value)

        # Back to (batch_size, seq_len, num_heads, projection_dim) ...
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        # ... and merge the heads: (batch_size, seq_len, embed_dim).
        merged = tf.reshape(attended, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)
    
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        Args:
            embed_dim: Width of the input and output feature axis.
            num_heads: Number of attention heads.
            ff_dim: Hidden width of the feed-forward network.
            rate: Dropout rate used after both sub-layers.
        """
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor fed to the attention sub-layer and to the first
                residual connection.
            training: Forwarded to both dropout layers. It defaults to None so
                the layer can also be invoked without an explicit flag (for
                example when the framework does not inject one); callers that
                pass ``training`` behave exactly as before.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
    
    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        """Create one embedding table for token ids and one for positions."""
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the ids in ``x`` and add the embedding of each position."""
        # Positions 0 .. (size of the last axis of x) - 1, looked up in pos_emb.
        sequence_length = tf.shape(x)[-1]
        position_vectors = self.pos_emb(
            tf.range(start=0, limit=sequence_length, delta=1)
        )
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors


num_heads = 5  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# The padded sequences contain `maxlen` token ids each, so the input length is
# maxlen. (max_words is the vocabulary cap, not a sequence length; declaring it
# here made the model's input shape disagree with the data it is fitted on and
# with the maxlen-row position embedding.)
inputs = layers.Input(shape=(maxlen,))
# The tokenizer is capped at max_words, so max_words rows cover every token id
# that can occur; this also matches the Embedding(max_words, ...) layers of the
# other models instead of a hard-coded vocabulary count.
embedding_layer = TokenAndPositionEmbedding(maxlen, max_words, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis, then classify with a small dense head.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(5, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.compile("rmsprop", "categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=30, epochs=5, validation_data=(x_val, y_val)
)
model.evaluate(x_test, y_test)
plotting(x_test,y_test)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[[105   4   4   1   1]
 [  0  70   1   1   0]
 [  2   0  73   0   1]
 [  0   0   0 102   0]
 [  2   2   1   0  75]]
0.9550561797752809
