In [46]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import random
from tqdm import tqdm

import os
from torch.utils.tensorboard.writer import SummaryWriter
import numpy as np
import time
import math

In [49]:

class RNN(nn.Module):
    """Token-level recurrent language model built from ``num_layers`` parallel cells.

    Every cell keeps its own hidden state and reads the *same* embedded token
    at each time step (the cells are side by side, not stacked on top of each
    other). After each step the hidden states of all cells are concatenated
    and projected to ``output_size`` logits.

    Args:
        input_size: Width of the token embedding fed to every cell.
        hidden_size: Width of each cell's hidden state.
        output_size: Number of output classes. It is also used as the number
            of rows of the embedding table, i.e. as the vocabulary size.
        num_layers: Number of parallel recurrent cells (must be >= 1).
        activation: Zero-argument callable returning the non-linearity module
            applied to the pre-activation hidden state (default ``nn.Tanh``).
        dropout: Dropout probability applied to the hidden state after the
            activation at every step.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, activation = nn.Tanh, dropout=0.0):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        # Sub-modules are created in a fixed order so that seeded parameter
        # initialisation is reproducible.
        self.embedding = nn.Embedding(output_size, input_size)
        self.Wyh = nn.Linear(num_layers * hidden_size, output_size)
        self.Whh = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)])
        self.Whx = nn.ModuleList([nn.Linear(input_size, hidden_size) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        self.activation = activation()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

    def forward(self, input, target=None):
        """Run the model over a batch of token-id sequences.

        Args:
            input: Integer tensor of shape ``(batch_size, seq_len)`` holding
                token ids.
            target: Optional integer tensor with ``batch_size * seq_len``
                class indices (any shape that flattens to that length).

        Returns:
            A ``(output, loss)`` tuple.

            * ``target is None``: ``output`` holds the raw logits with shape
              ``(batch_size, seq_len, output_size)`` and ``loss`` is ``None``.
            * otherwise: ``output`` holds the raw logits flattened to
              ``(batch_size * seq_len, output_size)`` and ``loss`` is the mean
              cross-entropy against the flattened ``target``.
        """
        batch_size, seq_len = input.shape

        embedded = self.embedding(input)  # (batch_size, seq_len, input_size)

        # Initial hidden state; dtype and device follow the embedding output.
        h = embedded.new_zeros((self.num_layers, batch_size, self.hidden_size))

        step_logits = []
        for step in range(seq_len):
            x = embedded[:, step]  # (batch_size, input_size)

            # Build a brand-new tensor for the next hidden state instead of
            # assigning into `h[layer]`: an in-place write would overwrite
            # values that autograd saved for the backward pass and make
            # `loss.backward()` fail.
            h = torch.stack(
                [
                    whh(h_prev) + whx(x)
                    for whh, whx, h_prev in zip(self.Whh, self.Whx, h.unbind(0))
                ],
                dim=0,
            )  # (num_layers, batch_size, hidden_size)

            h = self.dropout(self.activation(h))

            # Concatenate the per-layer states of each sample, layer 0 first.
            aggregation = h.transpose(0, 1).reshape(
                batch_size, self.num_layers * self.hidden_size
            )
            step_logits.append(self.Wyh(aggregation))  # (batch_size, output_size)

        if step_logits:
            output = torch.stack(step_logits, dim=1)  # (batch_size, seq_len, output_size)
        else:
            # Zero-length sequences produce an empty logits tensor.
            output = embedded.new_zeros((batch_size, 0, self.output_size))

        if target is None:
            return output, None

        output = output.reshape(batch_size * seq_len, self.output_size)
        loss = F.cross_entropy(output, target.reshape(-1))
        return output, loss

In [50]:

# Smoke test: push random token ids through a freshly initialised model and
# check that the forward pass yields logits of the expected shape and a loss.
model = RNN(input_size=64, hidden_size=16, output_size=31, num_layers=5)
tensor1 = torch.randint(0, 31, (8, 5))  # input token ids, (batch_size, seq_len)
tensor2 = torch.randint(0, 31, (8, 5))  # target token ids, same shape

output, loss = model(tensor1, tensor2)
# With a target supplied, the logits come back flattened to
# (batch_size * seq_len, output_size).
assert output.shape == (8 * 5, 31), output.shape
# For reference: uniform guessing over 31 classes gives ln(31) ~= 3.43.
print(loss.item())

3.4314284324645996
