In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import time

In [2]:
def load_dataset(path):
    df = pd.read_csv(path)
    texts = df.iloc[:, 0].dropna().tolist()
    return texts

def tokenize(texts):
    tokens = []
    for line in texts:
        tokens.extend(line.lower().split())
    return tokens

def build_vocab(tokens):
    vocab = sorted(set(tokens))
    word_to_idx = {w: i for i, w in enumerate(vocab)}
    idx_to_word = {i: w for w, i in word_to_idx.items()}
    return vocab, word_to_idx, idx_to_word

def create_sequences(tokens, word_to_idx, seq_len):
    sequences, targets = [], []
    for i in range(len(tokens) - seq_len):
        seq = tokens[i:i+seq_len]
        target = tokens[i+seq_len]
        sequences.append([word_to_idx[w] for w in seq])
        targets.append(word_to_idx[target])
    return torch.tensor(sequences), torch.tensor(targets)

In [None]:
DATA_PATH = ""
SEQ_LEN = 5

texts = load_dataset(DATA_PATH)
tokens = tokenize(texts)
vocab, word_to_idx, idx_to_word = build_vocab(tokens)

vocab_size = len(vocab)
X, y = create_sequences(tokens, word_to_idx, SEQ_LEN)

print("Vocabulary size:", vocab_size)
print("Input shape:", X.shape)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 3-4: truncated \UXXXXXXXX escape (4242190986.py, line 1)

In [None]:
class RNN_Numpy:
    def __init__(self, vocab_size, hidden_size):
        self.hidden_size = hidden_size
        self.Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01
        self.h = np.zeros((hidden_size, 1))

    def forward(self, x):
        self.h = np.tanh(self.Wxh @ x + self.Whh @ self.h)
        return self.Why @ self.h

In [None]:
rnn_test = RNN_Numpy(vocab_size, hidden_size=50)
x_test = np.zeros((vocab_size, 1))
x_test[word_to_idx[tokens[0]]] = 1
print("Output shape:", rnn_test.forward(x_test).shape)

In [None]:
class OneHotRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = torch.nn.functional.one_hot(x, vocab_size).float()
        out, _ = self.rnn(x)
        return self.fc(out[:, -1, :])

In [None]:
class EmbeddingRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        return self.fc(out[:, -1, :])

In [None]:
def train_model(model, X, y, epochs=20, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    losses = []
    start_time = time.time()

    for epoch in range(epochs):
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

        if epoch % 5 == 0:
            print(f"Epoch {epoch} | Loss: {loss.item():.4f}")

    training_time = time.time() - start_time
    return losses, training_time

In [None]:
# One-Hot Model
model_oh = OneHotRNN(vocab_size, hidden_size=128)
onehot_losses, onehot_time = train_model(model_oh, X, y)

# Embedding Model
model_emb = EmbeddingRNN(vocab_size, embed_dim=50, hidden_size=128)
embedding_losses, embedding_time = train_model(model_emb, X, y)

In [None]:
def generate_text(model, start_word, word_to_idx, idx_to_word, seq_len, length=20):
    model.eval()
    words = [start_word]

    for _ in range(length):
        seq = torch.tensor([[word_to_idx[w] for w in words[-seq_len:]]])
        with torch.no_grad():
            output = model(seq)
            next_word = torch.argmax(output, dim=1).item()
        words.append(idx_to_word[next_word])

    return " ".join(words)

In [None]:
print("OneHot:\n", generate_text(model_oh, tokens[0], word_to_idx, idx_to_word, SEQ_LEN))
print("\nEmbedding:\n", generate_text(model_emb, tokens[0], word_to_idx, idx_to_word, SEQ_LEN))

In [None]:
plt.figure()
plt.bar(["One-Hot", "Embedding"], [onehot_time, embedding_time])
plt.ylabel("Training Time (seconds)")
plt.title("Training Time Comparison")
plt.show()

In [None]:
plt.figure()
plt.plot(onehot_losses, label="One-Hot")
plt.plot(embedding_losses, label="Embedding")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
comparison_df = pd.DataFrame({
    "Method": ["One-Hot Encoding", "Trainable Embeddings"],
    "Final Loss": [onehot_losses[-1], embedding_losses[-1]],
    "Training Time (sec)": [onehot_time, embedding_time]
})

comparison_df