In [20]:
import pandas as pd
import pathlib
import sys
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
from keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from keras.layers import GRU, Dense, Embedding, Flatten, Dropout
from keras.activations import softmax

In [7]:
# Load the fitted tokenizer (configuration + vocabulary) from its JSON file.
# A raw string keeps the Windows backslashes literal; in a normal string,
# sequences such as "\p" or "\d" are invalid escapes and trigger a
# SyntaxWarning on recent Python versions.
with open(r'D:\project\AWS\Emotional_analysis_NLP\data\interim\tokenizer.json', 'r', encoding='utf-8') as f:
    tokenizer_json = json.load(f)
# Rebuild the Tokenizer object from the loaded JSON configuration.
tokenizer = tokenizer_from_json(tokenizer_json)

In [9]:
# Load the processed training split.
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences ("\p", "\A", "\E", "\d") being passed through.
train = pd.read_csv(r'D:\project\AWS\Emotional_analysis_NLP\data\processed\train.csv')
train.head()

Unnamed: 0,text,label
0,ive blabbed enough tonight im tired ive feelin...,0
1,woke really early morning drove feel ecstatic ...,1
2,feel never gave rest day megabrick feeling stu...,3
3,feeling restless teary flat sad strange today,4
4,feel like im doomed ive even began,0


In [10]:
# Separate the training frame into model input (text, forced to str) and target (label)
X_train, y_train = train['text'].astype(str), train['label']

In [11]:
# Load the processed test split.
# Raw string: keeps the Windows backslashes literal instead of relying on
# invalid escape sequences ("\p", "\A", "\E", "\d") being passed through.
test = pd.read_csv(r'D:\project\AWS\Emotional_analysis_NLP\data\processed\test.csv')
test.head()

Unnamed: 0,text,label
0,id say maybe made feel foolish would reeeeeeal...,0
1,joined lds church admit feeling somewhat asham...,0
2,must admit didnt feel like hugging angry disgu...,3
3,hate still feel nerve damaged badly enough oft...,0
4,im actually feeling little smug,1


In [12]:
# Separate the test frame into model input (text, forced to str) and target (label)
X_test, y_test = test['text'].astype(str), test['label']

In [13]:
# Report the size of every feature/target split, one per line
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train shape: (333447,)
X_test shape: (83362,)
y_train shape: (333447,)
y_test shape: (83362,)


In [14]:
# Convert both text splits to lists of integer token ids with the loaded tokenizer
X_train_sequences, X_test_sequences = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [15]:
# Length of the longest tokenized training example; used below as the padded length
maxlen = max(map(len, X_train_sequences))
print("Maximum sequence length (maxlen):", maxlen)

Maximum sequence length (maxlen): 79


In [16]:
# Bring every sequence in both splits to exactly `maxlen` ids, appending the
# padding at the end of each sequence (padding='post')
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=maxlen, padding='post')
    for seqs in (X_train_sequences, X_test_sequences)
)

In [17]:
# Show both padded id matrices (numpy abbreviates large arrays when printed)
print(
    "X_train_padded:",
    X_train_padded,
    "\nX_test_padded:",
    X_test_padded,
    sep="\n",
)

X_train_padded:
[[   20 26719    73 ...     0     0     0]
 [  275     5   738 ...     0     0     0]
 [    1    45   735 ...     0     0     0]
 ...
 [    1   606     0 ...     0     0     0]
 [    4    18   476 ...     0     0     0]
 [    1   149  1686 ...     0     0     0]]

X_test_padded:
[[  123    33   172 ...     0     0     0]
 [ 2941 12893   748 ...     0     0     0]
 [  194   310    48 ...     0     0     0]
 ...
 [  851   276  1128 ...     0     0     0]
 [ 1874  6225   645 ...     0     0     0]
 [   83     1   401 ...     0     0     0]]


In [18]:
# Confirm both padded matrices have shape (n_samples, maxlen)
print('X_train shape:', X_train_padded.shape)
print('X_test shape:', X_test_padded.shape)

X_train shape: (333447, 79)
X_test shape: (83362, 79)


# Vocabulary size

In [19]:
# Embedding vocabulary size.
# The Embedding layer's input_dim must be (largest token id + 1). Counting the
# distinct ids that happen to occur in the training matrix can fall short of
# that: any id from the tokenizer's vocabulary that is missing from the training
# rows (but present in the test rows or at inference) would then be out of range.
# Derive the size from the tokenizer itself instead; index 0 is reserved for padding.
vocabulary_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    # With a num_words cap the tokenizer only emits ids below num_words.
    vocabulary_size = min(vocabulary_size, tokenizer.num_words)
vocabulary_size

49510

# Model building
- Sequential: This initializes a sequential model, allowing you to build the model layer by layer in a linear fashion.
- Embedding layer: This layer converts integer-encoded words into dense vectors of fixed size. Its key parameters are input_dim, which specifies the size of the vocabulary (in this case, vocabulary_size), output_dim, which determines the dimension of the dense embedding (in this case, 100), and input_length, which specifies the length of input sequences (maxlen). The code below passes only input_dim and output_dim, so the sequence length is taken from the padded input.
- Bidirectional GRU layer: This layer adds a bidirectional Gated Recurrent Unit (GRU) layer with 128 units. GRU is a type of recurrent neural network (RNN) that is similar to LSTM but has fewer parameters. The bidirectional wrapper allows the GRU to process input sequences in both forward and backward directions, enhancing the model's ability to capture long-range dependencies in the data.
- Batch normalization layer: Batch normalization is a technique used to improve the training speed and stability of neural networks by normalizing the activations of each layer. It helps mitigate the internal covariate shift problem.
- Dropout regularization: Dropout is a regularization technique that randomly sets a fraction of input units to zero during training, which helps prevent overfitting by reducing the reliance on specific neurons.
- Dense layer with ReLU activation: This layer adds a fully connected dense layer with 64 units and Rectified Linear Unit (ReLU) activation function. ReLU is a non-linear activation function that introduces non-linearity to the model, allowing it to learn complex patterns in the data.
- Dropout regularization: Another dropout layer is added after the dense layer to further prevent overfitting.
- Output layer with softmax activation: The final layer is a dense layer with 6 units (assuming there are 6 classes) and softmax activation function. Softmax activation normalizes the output vector into a probability distribution over the classes, making it suitable for multi-class classification problems.

In [27]:
# Build the classifier as one ordered stack of layers
model = Sequential([
    # Token ids -> 100-dimensional dense vectors; vocabulary size comes from vocabulary_size
    Embedding(input_dim=vocabulary_size, output_dim=100),
    # Bidirectional GRU with 128 units per direction
    Bidirectional(GRU(128)),
    # Normalize the recurrent features
    BatchNormalization(),
    # Dropout regularization (rate 0.5)
    Dropout(0.5),
    # Fully connected layer: 64 units, ReLU activation
    Dense(64, activation='relu'),
    # Dropout regularization (rate 0.5)
    Dropout(0.5),
    # Output layer: 6 units (one per label), softmax activation
    Dense(6, activation='softmax'),
])

# Compile with Adam; sparse categorical cross-entropy takes integer class labels
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print model summary
model.summary()

In [30]:
# Train for 5 epochs in batches of 1500 samples.
# The test split is passed as validation data, so the val_* metrics are computed on it.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=1500,
    epochs=5,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/5
223/223 ━━━━━━━━━━━━━━━━━━━━ 263s 1s/step - accuracy: 0.6455 - loss: 0.9351 - val_accuracy: 0.3067 - val_loss: 1.7166
Epoch 2/5
223/223 ━━━━━━━━━━━━━━━━━━━━ 339s 2s/step - accuracy: 0.9121 - loss: 0.2213 - val_accuracy: 0.7026 - val_loss: 1.9129
Epoch 3/5
223/223 ━━━━━━━━━━━━━━━━━━━━ 198s 890ms/step - accuracy: 0.9216 - loss: 0.1629 - val_accuracy: 0.8372 - val_loss: 1.3122
Epoch 4/5
223/223 ━━━━━━━━━━━━━━━━━━━━ 206s 924ms/step - accuracy: 0.9337 - loss: 0.1235 - val_accuracy: 0.8371 - val_loss: 1.1761
Epoch 5/5
223/223 ━━━━━━━━━━━━━━━━━━━━ 206s 925ms/step - accuracy: 0.9394 - loss: 0.1063 - val_accuracy: 0.8151 - val_loss: 1.6056
