<code>Recurrent Neural Network</code>

In [6]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize , word_tokenize
import torch as t
from torch.nn.utils.rnn import pad_sequence
import re
from pprint import pprint

In [7]:
df = pd.read_csv("IMDB.csv")
final = []
for sentences in df["review"]:
    cleaned = []
    txt = re.sub(r'(<br \/>|\.\.\.|\'s|\'m)' , "" , sentences)
    cleaned.append(txt.lower())
    final.append(cleaned)
    
    
needed_size = 500
    
sent =[ ]
for arr in final[0:needed_size]: 
    for sentence in arr : 
        tokenized = word_tokenize(sentence)
        sent.append(tokenized)

all_words = [ word for arr in sent for word in arr]
all_words

vocab = {word:idx for idx,word in enumerate(set(all_words))}

full = []
for arr in final[0:needed_size] : 
    ids = []
    for sentence in arr:
        wrds = word_tokenize(sentence)
        for word in wrds : 
            if word in vocab:
                ids.append(vocab[word])
    full.append(ids)
    
len(vocab)
vocab_size = len(vocab) + 1

In [8]:
import torch as t
from torch.utils.data import TensorDataset, DataLoader

# Example: pad sequences to same length
from torch.nn.utils.rnn import pad_sequence

# Convert list of index lists to tensors
seq_tensors = [t.tensor(seq, dtype=t.long) for seq in full]

# Pad sequences
padded_seqs = pad_sequence(seq_tensors, batch_first=True)  # shape: (num_reviews, max_seq_len)

# Dummy target (for testing autocomplete you usually predict next word)
# Here we just shift the sequence by 1 as a simple next-word prediction
inputs = padded_seqs[:, :-1]
targets = padded_seqs[:, 1:]

# Create DataLoader
dataset = TensorDataset(inputs, targets)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [9]:
class Model(t.nn.Module):
    def __init__(self , vocab_size , embed_dim , hidden_dim):
        super().__init__()
        self.embedding = t.nn.Embedding(vocab_size , embed_dim)
        self.rnn = t.nn.LSTM(embed_dim , hidden_dim , batch_first=True)
        self.fc = t.nn.Linear(hidden_dim , vocab_size)

    def forward(self, x):
        x = self.embedding(x)           # (batch, seq_len, embed_dim)
        out, _ = self.rnn(x)            # (batch, seq_len, hidden_dim)
        out = self.fc(out)              # (batch, seq_len, vocab_size)
        return out

In [10]:
model = Model(vocab_size=vocab_size, embed_dim=128, hidden_dim=256)
optimizer = t.optim.Adam(model.parameters(), lr=0.0005)
criterion = t.nn.CrossEntropyLoss(ignore_index=0)

model.train()

for epoch in range(20):
    total_loss = 0
    
    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        
        outputs = model(batch_x)
        loss = criterion(outputs.view(-1, vocab_size), batch_y.view(-1))
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch} Loss: {total_loss:.4f}")

Epoch 0 Loss: 151.3994
Epoch 1 Loss: 124.0467
Epoch 2 Loss: 108.9184
Epoch 3 Loss: 107.9385
Epoch 4 Loss: 107.3122
Epoch 5 Loss: 106.9587
Epoch 6 Loss: 106.4666
Epoch 7 Loss: 105.9576


KeyboardInterrupt: 

In [None]:
id_to_word = {idx: word for word, idx in vocab.items()}

In [None]:
import torch.nn.functional as F

def generate(model, start_text, max_len=20, temperature=1.0):
    model.eval()
    
    words = word_tokenize(start_text.lower())
    input_ids = [vocab.get(w, 0) for w in words]
    
    for _ in range(max_len):
        input_tensor = t.tensor(input_ids, dtype=t.long).unsqueeze(0)
        
        with t.no_grad():
            output = model(input_tensor)
        
        logits = output[0, -1] / temperature
        probs = F.softmax(logits, dim=0)
        
        next_token = t.multinomial(probs, 1).item()
        
        if next_token == 0:
            break
        
        input_ids.append(next_token)
    
    return " ".join([id_to_word.get(i, "") for i in input_ids])

In [None]:
print(generate(model, "this movie is ", temperature=0.8))

this movie is the the , the the , the the , the
