In [2]:
# %pip install pandas numpy tensorflow scikit-learn torch

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense # type: ignore
from tensorflow.keras.models import Sequential, load_model # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout # type: ignore
from tensorflow.keras.optimizers import Adam # type: ignore


from layers.embedding import EmbeddingLayer
from layers.rnn.bidirectionalRNN import BidirectionalRNN
from layers.rnn.unidirectionalRNN import UnidirectionalRNN
from layers.lstm.unidirectionalLSTM import UnidirectionalLSTM
from layers.dense import DenseLayer

from something.model import Model
from something.rnn import RNN
from something.lstm import LSTM
from utils.evaluate import evaluate_model

from sklearn.metrics import f1_score
import random

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
valid = pd.read_csv('data/valid.csv')
train['label'] = train['label'].map({'neutral': 0, 'positive': 1, 'negative': 2}).astype(np.float32)
test['label'] = test['label'].map({'neutral': 0, 'positive': 1, 'negative': 2}).astype(np.float32)
valid['label'] = valid['label'].map({'neutral': 0, 'positive': 1, 'negative': 2}).astype(np.float32)

# Set random seed for reproducibility
seed = 42
random.seed(seed)                         # python random
np.random.seed(seed)                      # numpy
tf.random.set_seed(seed)   

In [5]:
tokenizer = Tokenizer(oov_token="<UNK>")  # Reserve a token for unknown words
tokenizer.fit_on_texts(train['text'].values) 
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding (index 0 is reserved)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 2796


In [6]:
embedding_dim = 100
max_length = 100  # Maximum length of input sequences

train_sequences = tokenizer.texts_to_sequences(train['text'].values)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
valid_sequences = tokenizer.texts_to_sequences(valid['text'].values)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding='post', truncating='post')
test_sequences = tokenizer.texts_to_sequences(test['text'].values)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

vocab_size = len(tokenizer.word_index) + 1  # +1 for padding (index 0 is reserved)



In [7]:
# Build the model
from tensorflow.keras.layers import LSTM as TF_LSTM

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    TF_LSTM(units=16, return_sequences=False),
    Dropout(0.5),
    Dense(3, activation='softmax')  # for sentiment classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    train_padded,
    train['label'].values,
    validation_data=(valid_padded, valid['label'].values),
    epochs=10,          # number of passes over the data, adjust as needed
    batch_size=32,      # number of samples per batch, adjust as needed
    verbose=0          # verbosity mode, 0 = silent, 1 = progress bar, 2 = one line per epoch
)

# Evaluate the model
evaluate_model(model, test_padded, test['label'])

embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]
lstm_layer = model.layers[1]
lstm_weights = lstm_layer.get_weights()
dense_layer = model.layers[3]
dense_weights = dense_layer.get_weights()

print(f"Embedding weights shape: {embedding_weights.shape}")
print(f"LSTM weights shape: {[w.shape for w in lstm_weights]}")
print(f"Dense weights shape: {dense_weights[0].shape}, {dense_weights[1].shape}")

# Create LSTM model from scratch using custom implementation
modelScratch = LSTM([
    EmbeddingLayer(vocab_size, embedding_dim, 'zeros').load_weights(embedding_weights),
    UnidirectionalLSTM(16, 100, 32, "tanh").load_weights(lstm_weights),
    DenseLayer(3, 16, activation='softmax', init_method='zeros').load_weights(dense_weights)
], 32)

# Evaluate the model
evaluate_model(modelScratch, test_padded, test['label'])

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Macro F1 Score: 0.1844
Embedding weights shape: (2796, 100)
LSTM weights shape: [(100, 64), (16, 64), (64,)]
Dense weights shape: (16, 3), (3,)
Initializing EmbeddingLayer with vocab_size=2796, embedding_dim=100, initializer=zeros
Predicting with batch size: 32, input shape: (400, 100)
Using EmbeddingLayer, converting indices to long tensor
Forward pass in EmbeddingLayer with indices=tensor([[ 247,    7,  288,  ...,    0,    0,    0],
        [ 495,  132,  534,  ...,    0,    0,    0],
        [2236,   85,   26,  ...,    0,    0,    0],
        ...,
        [  14,    3,   20,  ...,    0,    0,    0],
        [ 107,    1,    1,  ...,    0,    0,    0],
        [ 534,  637, 1806,  ...,    0,    0,    0]]), shape=torch.Size([32, 100]), data type=torch.int64
Using EmbeddingLayer, converting indices to long tensor
Forward pass in EmbeddingLayer wi