In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam


from layers.embedding import EmbeddingLayer
from layers.rnn.bidirectionalRNN import BidirectionalRNN
from layers.rnn.unidirectionalRNN import UnidirectionalRNN
from layers.dense import DenseLayer

from something.model import Model
from something.rnn import RNN
from utils.evaluate import evaluate_model

from sklearn.metrics import f1_score
import random

In [2]:
# Single source of truth for the label encoding, so the three splits cannot
# drift apart.
LABEL_TO_ID = {'neutral': 0, 'positive': 1, 'negative': 2}


def load_split(path):
    """Read one CSV split and encode its string `label` column as class ids.

    Parameters
    ----------
    path : str
        Path to a CSV file that has a `label` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, with `label` replaced by float32 class ids taken
        from `LABEL_TO_ID`.

    Raises
    ------
    ValueError
        If any label is not a key of `LABEL_TO_ID`. `Series.map` turns such
        rows into NaN without complaint, so they are rejected here instead of
        leaking into training or evaluation.
    """
    frame = pd.read_csv(path)
    encoded = frame['label'].map(LABEL_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(frame.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(f"Unexpected label values in {path}: {unknown}")
    # float32 is kept so downstream cells see the same dtype as before.
    frame['label'] = encoded.astype(np.float32)
    return frame


train = load_split('data/train.csv')
test = load_split('data/test.csv')
valid = load_split('data/valid.csv')

# Set random seed for reproducibility
seed = 42
random.seed(seed)          # python random
np.random.seed(seed)       # numpy
tf.random.set_seed(seed)   # tensorflow

In [3]:
# Build the word index from the training texts only; "<UNK>" is the token
# reserved for unknown words.
tokenizer = Tokenizer(oov_token="<UNK>")
train_texts = train['text'].values
tokenizer.fit_on_texts(train_texts)

# Index 0 is reserved for padding, hence the extra slot on top of the word index.
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 2796


In [4]:
embedding_dim = 100  # Dimension of the embedding layer
max_length = 100     # Maximum length of input sequences


def encode_texts(texts):
    """Convert raw texts to integer sequences and a fixed-width padded matrix.

    Uses the already-fitted module-level `tokenizer` and `max_length`.

    Parameters
    ----------
    texts : array-like of str
        Raw documents to encode.

    Returns
    -------
    tuple
        `(sequences, padded)`: the ragged list of token-id sequences, and the
        array produced by `pad_sequences`, where each row is padded or
        truncated at the end to exactly `max_length` ids.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return sequences, padded


# Same encoding for every split; only the input texts differ.
train_sequences, train_padded = encode_texts(train['text'].values)
valid_sequences, valid_padded = encode_texts(valid['text'].values)
test_sequences, test_padded = encode_texts(test['text'].values)



In [5]:
# Report the dimensions of each padded split.
for split_name, padded_split in (
    ("Train", train_padded),
    ("Valid", valid_padded),
    ("Test", test_padded),
):
    print(f"{split_name} padded shape: {padded_split.shape}")

Train padded shape: (500, 100)
Valid padded shape: (100, 100)
Test padded shape: (400, 100)


In [6]:
# Hyperparameters shared by the Keras model and its from-scratch mirror.
# They are named once so the two definitions cannot silently diverge.
RNN_UNITS = 16    # hidden size of the recurrent layer
NUM_CLASSES = 3   # neutral / positive / negative
BATCH_SIZE = 32   # samples per gradient update
EPOCHS = 10       # passes over the training data

# Build the model
# NOTE(review): the inputs are post-padded and this Embedding does not mask
# index 0, so the RNN also steps over the padding positions. Enabling masking
# would have to be mirrored in the scratch layers — confirm this is intended.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    SimpleRNN(units=RNN_UNITS, activation='tanh', return_sequences=False),
    Dense(NUM_CLASSES, activation='softmax')  # one probability per sentiment class
])

# Compile and train the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    train_padded,
    train['label'].values,
    validation_data=(valid_padded, valid['label'].values),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=0          # verbosity mode, 0 = silent, 1 = progress bar, 2 = one line per epoch
)

# Evaluate the Keras model
evaluate_model(model, test_padded, test['label'])

# Pull the trained parameters out of each Keras layer so they can be loaded
# into the from-scratch layers below.
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]
rnn_layer = model.layers[1]
rnn_weights = rnn_layer.get_weights()
dense_layer = model.layers[2]
dense_weights = dense_layer.get_weights()

print(f"Embedding weights shape: {embedding_weights.shape}")
print(f"RNN weights shape: {rnn_weights[0].shape}, {rnn_weights[1].shape}, {rnn_weights[2].shape}")
print(f"Dense weights shape: {dense_weights[0].shape}, {dense_weights[1].shape}")

# Rebuild the same architecture from the project's own layers. The value
# returned by each chained `.load_weights(...)` call is what is used as the
# layer afterwards.
# NOTE(review): the positional arguments keep their original values; the
# constant names assume (units, ?, batch size) for UnidirectionalRNN and
# (outputs, inputs) for DenseLayer — confirm against the layer definitions.
# The literal 100 is left as-is because it equals both `embedding_dim` and
# `max_length` here, and which one the layer expects is not visible.
embedding_layer_scratch = EmbeddingLayer(vocab_size, embedding_dim).load_weights(embedding_weights)
rnn_layer_scratch = UnidirectionalRNN(RNN_UNITS, 100, BATCH_SIZE, "tanh").load_weights(rnn_weights)
dense_layer_scratch = DenseLayer(NUM_CLASSES, RNN_UNITS, activation='softmax', init_method='zeros').load_weights(dense_weights)

modelScratch = RNN([
    embedding_layer_scratch,
    rnn_layer_scratch,
    dense_layer_scratch
], BATCH_SIZE)

# Evaluate the from-scratch model on the same test split for comparison
evaluate_model(modelScratch, test_padded, test['label'])

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Macro F1 Score: 0.2503
Embedding weights shape: (2796, 100)
RNN weights shape: (100, 16), (16, 16), (16,)
Dense weights shape: (16, 3), (3,)
Macro F1 Score: 0.2503


In [7]:
# Run only the first test example through the from-scratch layers.
first_example = test_padded[:1]

# Forward pass through the scratch embedding layer
embedding_result_scratch = embedding_layer_scratch.forward(first_example)

# Forward pass through the scratch RNN, converted to a NumPy array
rnn_output_scratch = rnn_layer_scratch.forward(embedding_result_scratch)
rnn_layer_scratch_result = rnn_output_scratch.data.numpy()
print(rnn_layer_scratch_result)

[[-0.3671592  -0.13667315  0.452211    0.68996036 -0.32589558 -0.18146661
  -0.4594127   0.00197553  0.31480086  0.24719322 -0.17510526  0.13378109
  -0.2850381  -0.1010542  -0.5650211   0.28710228]]


In [8]:
# Forward pass of the same single example through the trained Keras
# embedding and RNN layers; this is the reference output.
rnn_layer_keras_result = rnn_layer(embedding_layer(test_padded[:1])).numpy()

# Fail loudly if the from-scratch RNN output drifts from the Keras reference,
# instead of relying on eyeballing two printed arrays. The tolerances allow
# for float32 rounding differences between the two implementations.
np.testing.assert_allclose(rnn_layer_scratch_result, rnn_layer_keras_result, rtol=1e-4, atol=1e-5)

rnn_layer_keras_result

array([[-0.36715928, -0.1366732 ,  0.45221096,  0.6899603 , -0.32589522,
        -0.18146665, -0.45941257,  0.00197551,  0.31480086,  0.24719319,
        -0.17510548,  0.13378106, -0.28503802, -0.10105424, -0.565021  ,
         0.28710228]], dtype=float32)

In [9]:
# Class probabilities from the Keras model for the first test example.
keras_input = test_padded[:1]
model.predict(keras_input)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


array([[0.17294578, 0.16387613, 0.6631781 ]], dtype=float32)

In [10]:
# Class probabilities from the from-scratch model for the same single example.
scratch_prediction = modelScratch.predict(test_padded[:1])
scratch_prediction.data

tensor([[0.1729, 0.1639, 0.6632]])