# Lab 04: LSTM yourself - exercise

In [None]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/CS5242_2025_codes/labs_lecture06/lab04_lstm_yourself'
    print(path_to_file)
    # change current path to the folder containing "file_name"
    os.chdir(path_to_file)
    !pwd

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### With or without GPU?

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on CPU : 274 sec ( 4.56 min)<br> 
* Time for 1 epoch on GPU : 10.1 sec w/ GeForce GTX 1080 Ti <br>

In [2]:
device= torch.device("cuda")
device= torch.device("cpu")
print(device)

if torch.cuda.is_available():
    print('cuda available with GPU:',torch.cuda.get_device_name(0))

cpu


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [3]:
from utils import check_ptb_dataset_exists
data_path=check_ptb_dataset_exists()

train_data  =  torch.load(data_path+'ptb/train_data.pt')
test_data   =  torch.load(data_path+'ptb/test_data.pt')

print(  train_data.size()  )
print(  test_data.size()   )

torch.Size([46479, 20])
torch.Size([4121, 20])


### Some constants associated with the data set

In [4]:
bs = 20

vocab_size = 10000


### Make a LTSM class

Implement the vanilla RNN network without PyTorch:

$$
\begin{aligned}
&\tilde{h}_t = \tanh ( Rh_{t-1} + V g_t) \\
&c_t = \theta_t \odot c_{t-1} + \eta_t \odot \tilde{h}_t \\
&h_t = \psi_t \odot \tanh (c_t) \\
&\textrm{with}\\
&\theta_t = \textrm{sigmoid}(Ah_{t-1}+Bg_t)\\
&\eta_t = \textrm{sigmoid}(Ch_{t-1}+Dg_t)\\
&\psi_t = \textrm{sigmoid}(Eh_{t-1}+Fg_t)\\
\end{aligned}
$$



In [None]:
class my_LSTM(nn.Module):
    
    def __init__(self, hidden_size):
        super().__init__()
        self.R = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, hidden_size)
        self.A = nn.Linear(hidden_size, hidden_size)
        self.B = nn.Linear(hidden_size, hidden_size)
        self.C = nn.Linear(hidden_size, hidden_size)
        self.D = nn.Linear(hidden_size, hidden_size)
        self.E = nn.Linear(hidden_size, hidden_size)
        self.F = nn.Linear(hidden_size, hidden_size)
        # COMPLETE HERE 
        
    def forward(self, g_seq , hc_init ):
        # COMPLETE HERE 
        h_seq_ls = []
        h_prev, c_prev = hc_init
        t = len(g_seq)
        for i in range(t):
            h_tail_t = torch.tanh(self.R(h_prev) + self.V(g_seq[i]))
            theta_t = torch.sigmoid(self.A(h_prev) + self.B(g_seq[i]))
            yita_t = torch.sigmoid(self.C(h_prev) + self.D(g_seq[i]))
            phi_t = torch.sigmoid(self.E(h_prev) + self.F(g_seq[i]))
            c_t = theta_t * c_prev + yita_t * h_tail_t
            h_t = phi_t * torch.tanh(c_t)
            h_seq_ls.append(h_t)
            c_prev = c_t
        h_final = h_t
        c_final = c_t
        h_seq = torch.stack(h_seq_ls)        
        return h_seq , (h_final,c_final)


class my_LSTM_answer(nn.Module):
    
    def __init__(self, hidden_size):
        super().__init__()
        self.R = nn.Linear( hidden_size , hidden_size )
        self.V = nn.Linear( hidden_size , hidden_size )
        self.A = nn.Linear( hidden_size , hidden_size )
        self.B = nn.Linear( hidden_size , hidden_size )
        self.C = nn.Linear( hidden_size , hidden_size )
        self.D = nn.Linear( hidden_size , hidden_size )
        self.E = nn.Linear( hidden_size , hidden_size )
        self.F = nn.Linear( hidden_size , hidden_size )
        
    def forward(self, g_seq , hc_init ):
        h_init, c_init = hc_init
        h_t_pre = h_init
        c_t_pre = c_init
        V_g_all_t = self.V(g_seq) 
        B_g_all_t = self.B(g_seq) 
        D_g_all_t = self.D(g_seq) 
        F_g_all_t = self.F(g_seq) 
        h_seq = []
        c_seq = []
        num_t = g_seq.size(0)
        for t in range(num_t):
            R_h_t_pre = self.R(h_t_pre) 
            A_h_t_pre = self.A(h_t_pre) 
            C_h_t_pre = self.C(h_t_pre) 
            E_h_t_pre = self.E(h_t_pre) 
            theta_t = torch.sigmoid( A_h_t_pre + B_g_all_t[t] )
            eta_t   = torch.sigmoid( C_h_t_pre + D_g_all_t[t] )
            psi_t   = torch.sigmoid( E_h_t_pre + F_g_all_t[t] )
            h_tilde_t = torch.tanh( R_h_t_pre + V_g_all_t[t] )
            c_t = theta_t * c_t_pre + eta_t * h_tilde_t
            h_t = psi_t * torch.tanh( c_t )
            h_seq.append(h_t)
            c_seq.append(c_t)
            h_t_pre = h_t
            c_t_pre = c_t
        h_seq = torch.cat(h_seq) 
        h_final = h_seq[-1,:,:].unsqueeze(0) 
        c_final = c_t
        return h_seq , (h_final,c_final)
    
class three_layer_recurrent_net(nn.Module):

    def __init__(self, hidden_size):
        super(three_layer_recurrent_net, self).__init__()
        
        self.layer1 = nn.Embedding( vocab_size  , hidden_size  )
        self.layer2 = my_LSTM( hidden_size )
        self.layer3 = nn.Linear(    hidden_size , vocab_size   )

        
    def forward(self, word_seq, h_init, c_init ):
        
        g_seq                      =   self.layer1( word_seq )  
        h_seq , (h_final,c_final)  =   self.layer2( g_seq , (h_init,c_init) )     
        score_seq                  =   self.layer3( h_seq )
        
        return score_seq,  h_final , c_final


### Build the net. Choose the hidden size to be 300. How many parameters in total?

In [13]:
hidden_size=300

net = three_layer_recurrent_net( hidden_size )

print(net)

utils.display_num_param(net)

three_layer_recurrent_net(
  (layer1): Embedding(10000, 300)
  (layer2): my_LSTM(
    (R): Linear(in_features=300, out_features=300, bias=True)
    (V): Linear(in_features=300, out_features=300, bias=True)
    (A): Linear(in_features=300, out_features=300, bias=True)
    (B): Linear(in_features=300, out_features=300, bias=True)
    (C): Linear(in_features=300, out_features=300, bias=True)
    (D): Linear(in_features=300, out_features=300, bias=True)
    (E): Linear(in_features=300, out_features=300, bias=True)
    (F): Linear(in_features=300, out_features=300, bias=True)
  )
  (layer3): Linear(in_features=300, out_features=10000, bias=True)
)
There are 6732400 (6.73 million) parameters in this neural network


### Send the weights of the networks to the GPU

In [14]:
net = net.to(device)

### Set up manually the weights of the embedding module and Linear module

In [15]:
net.layer1.weight.data.uniform_(-0.1, 0.1)

net.layer3.weight.data.uniform_(-0.1, 0.1)

print('')




### Choose the criterion, as well as the following important hyperparameters: 
* initial learning rate = 5
* sequence length = 35

In [16]:
criterion = nn.CrossEntropyLoss()

my_lr = 5

seq_length = 35

### Function to evaluate the network on the test set

In [17]:
def eval_on_test_set():

    running_loss=0
    num_batches=0    
       
    h = torch.zeros(1, bs, hidden_size)
    c = torch.zeros(1, bs, hidden_size)
   
    h=h.to(device)
    c=c.to(device)
       
    for count in range( 0 , 4120-seq_length ,  seq_length) :
               
        minibatch_data =  test_data[ count   : count+seq_length   ]
        minibatch_label = test_data[ count+1 : count+seq_length+1 ]
        
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
                                  
        scores, h, c  = net( minibatch_data, h , c)
        
        minibatch_label =   minibatch_label.view(  bs*seq_length ) 
        scores          =            scores.view(  bs*seq_length , vocab_size)
        
        loss = criterion(  scores ,  minibatch_label )    
        
        h=h.detach()
        c=c.detach()
            
        running_loss += loss.item()
        num_batches += 1        
    
    total_loss = running_loss/num_batches 
    print('test: exp(loss) = ', math.exp(total_loss)  )
        

### Do 8 passes through the training set.

In [18]:
start=time.time()

for epoch in range(8):
    
    # divide the learning rate by 3 except after the first epoch
    if epoch >= 2:
        my_lr = my_lr / 3
    
    # create a new optimizer at the beginning of each epoch: give the current learning rate.   
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )
        
    # set the running quatities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
       
    # set the initial h and c to be the zero vector
    h = torch.zeros(1, bs, hidden_size)
    c = torch.zeros(1, bs, hidden_size)

    # send them to the gpu    
    h=h.to(device)
    c=c.to(device)
    
    for count in range( 0 , 46478-seq_length ,  seq_length):
        
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]        
        
        # send them to the gpu
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)
        
        # Detach to prevent from backpropagating all the way to the beginning
        # Then tell Pytorch to start tracking all operations that will be done on h and c
        h=h.detach()
        c=c.detach()
        h=h.requires_grad_()
        c=c.requires_grad_()
                       
        # forward the minibatch through the net        
        scores, h, c  = net( minibatch_data, h , c)
        
        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)  
        minibatch_label =   minibatch_label.view(  bs*seq_length )       
        
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )
        
        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        utils.normalize_gradient(net)
        optimizer.step()   
            
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
        
        
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set() 



epoch= 0 	 time= 528.7225732803345 	 lr= 5 	 exp(loss)= 314.15647016072296
test: exp(loss) =  199.53680626532625

epoch= 1 	 time= 799.5920858383179 	 lr= 5 	 exp(loss)= 148.54531935927398
test: exp(loss) =  153.88531839551482

epoch= 2 	 time= 1072.3411841392517 	 lr= 1.6666666666666667 	 exp(loss)= 97.92192220499167
test: exp(loss) =  132.694059097334

epoch= 3 	 time= 1339.2665040493011 	 lr= 0.5555555555555556 	 exp(loss)= 82.1814306374303
test: exp(loss) =  127.1877796745336

epoch= 4 	 time= 1662.5641515254974 	 lr= 0.1851851851851852 	 exp(loss)= 76.74766216848371
test: exp(loss) =  124.93756630029408

epoch= 5 	 time= 1935.4638359546661 	 lr= 0.0617283950617284 	 exp(loss)= 74.76642236009587
test: exp(loss) =  124.04009122828424

epoch= 6 	 time= 2201.931361913681 	 lr= 0.0205761316872428 	 exp(loss)= 74.04428061736624
test: exp(loss) =  123.74754522711976

epoch= 7 	 time= 2466.529081583023 	 lr= 0.006858710562414266 	 exp(loss)= 73.7893102487923
test: exp(loss) =  123.664338

### Choose one sentence (taken from the test set)

In [19]:
# SENTENCES FROM TEST SET

sentence1 = "some analysts expect oil prices to remain relatively"

sentence2 = "over the next days and weeks they say investors should look for stocks to"

sentence2 = "some analysts expect oil prices to remain relatively"

sentence3 = "prices averaging roughly $ N a barrel higher in the third"

sentence4 = "i think my line has been very consistent mrs. hills said at a news"

sentence5 = "this appears particularly true at gm which had strong sales in"

# OR MAKE YOUR OWN SENTENCE. 
# NO CAPITAL LETTER ALLOWED. EACH WORD MUST BE IN THE ALLOWED VOCABULARY OF 10,000 WORDS

sentence6= "he was very"


# CHOOSE THE SENTENCE
mysentence = sentence3

### Convert the sentence into a vector, then send to GPU

In [20]:
minibatch_data=utils.sentence2vector(mysentence)
      
minibatch_data=minibatch_data.to(device)

print(minibatch_data)

tensor([[1786],
        [8705],
        [3246],
        [ 416],
        [  27],
        [  35],
        [2664],
        [ 209],
        [ 108],
        [  32],
        [3017]])


### Set the initial hidden state to zero, then run the LSTM.

In [None]:
h = torch.zeros(1, 1, hidden_size)
c = torch.zeros(1, 1, hidden_size)

# Note: my code assumes there is not the extra dimension
h = torch.zeros(1, hidden_size)
c = torch.zeros(1, hidden_size)
h=h.to(device)
c=c.to(device)

scores , h, c = net(minibatch_data , h, c)

### Display the network prediction for the next word

In [22]:
print(mysentence, '... \n')

utils.show_next_word(scores)

prices averaging roughly $ N a barrel higher in the third ... 

81.1%	 quarter
4.2%	 world
2.7%	 <eos>
1.9%	 and
1.4%	 year
0.8%	 period
0.7%	 of
0.7%	 game
0.7%	 consecutive
0.4%	 area
0.3%	 market
0.2%	 sector
0.2%	 quarters
0.2%	 process
0.1%	 week
0.1%	 they
0.1%	 business
0.1%	 term
0.1%	 parties
0.1%	 hour
0.1%	 range
0.1%	 category
0.1%	 largest
0.1%	 amount
0.1%	 <unk>
0.1%	 session
0.1%	 or
0.1%	 month
0.1%	 time
0.1%	 fall
