In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tqdm import tqdm
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Implementing Transformer block as a layer

In [2]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads = num_heads,
                                            key_dim = embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-06)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-06)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)
        
    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

### Implement embedding layer

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim = vocab_size, output_dim = embed_dim)
        self.pos_emb = layers.Embedding(input_dim = maxlen, output_dim = embed_dim)
    
    def call(self,x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [4]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['sent_bin'] = df['sentiment'].map({'positive':0, 'negative':1})

In [6]:
df.head()

Unnamed: 0,review,sentiment,sent_bin
0,One of the other reviewers has mentioned that ...,positive,0
1,A wonderful little production. <br /><br />The...,positive,0
2,I thought this was a wonderful way to spend ti...,positive,0
3,Basically there's a family where a little boy ...,negative,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0


In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [8]:
cleaned_text = []
for i in tqdm(range(len(df))):
    text = re.sub('<.*?>','',df['review'][i])
    tokens = text.split()
    table = str.maketrans(' ',' ',string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word not in stop_words]
    tokens = ' '.join(tokens)
    cleaned_text.append(tokens)

100%|██████████| 50000/50000 [00:43<00:00, 1144.48it/s]


In [9]:
df['cleaned_review'] = cleaned_text
df.head()

Unnamed: 0,review,sentiment,sent_bin,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,0,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,0,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,positive,0,i thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,negative,1,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,0,petter matteis love time money visually stunni...


In [10]:
from sklearn.model_selection import train_test_split
X = df['cleaned_review']
y = df['sent_bin']

In [11]:
vocab_size = 20000
maxlen = 200
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.33,random_state = 0)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(33500,) (16500,) (33500,) (16500,)


In [12]:
tokenizer = keras.preprocessing.text.Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X_train)
sequence_train = tokenizer.texts_to_sequences(X_train)
sequence_test = tokenizer.texts_to_sequences(X_test)
x_train = keras.preprocessing.sequence.pad_sequences(sequence_train, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(sequence_test, maxlen=maxlen)

In [13]:
embed_dim = 32
num_heads = 2
ff_dim = 32
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen,vocab_size,embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs = inputs, outputs = outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
token_and_position_embedding (None, 200, 32)           646400    
_________________________________________________________________
transformer_block (Transform (None, 200, 32)           10656     
_________________________________________________________________
global_average_pooling1d (Gl (None, 32)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                660       
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0     

In [14]:
model.compile(loss = "sparse_categorical_crossentropy",
             optimizer = "adam",
             metrics = ['accuracy'])

In [15]:
history = model.fit(
    x_train,y_train,batch_size = 32, epochs = 3, validation_data = (x_test,y_test)
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
y_pred = model.predict(x_test,verbose=1)
y_pred



array([[0.530455  , 0.46954495],
       [0.00102405, 0.998976  ],
       [0.9863089 , 0.01369111],
       ...,
       [0.9988656 , 0.00113445],
       [0.0026451 , 0.99735487],
       [0.00519826, 0.99480176]], dtype=float32)

In [18]:
pred_labels = np.argmax(y_pred, axis=1)
pred_labels

array([0, 1, 0, ..., 0, 1, 1])

In [19]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [21]:
print(confusion_matrix(y_test, pred_labels))

[[7055 1164]
 [ 813 7468]]


In [22]:
print(classification_report(y_test, pred_labels))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      8219
           1       0.87      0.90      0.88      8281

    accuracy                           0.88     16500
   macro avg       0.88      0.88      0.88     16500
weighted avg       0.88      0.88      0.88     16500

