## Import Libraries

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import time
import copy

## EDA

In [2]:
# Location of the training corpus.
# NOTE(review): this is a hard-coded absolute local path; point it at your own copy of input.txt.
filepath='/Users/vedanttripathi/Documents/20CSCE5218/Project/Data/input.txt'
# Read through a context manager so the file handle is closed as soon as the text is loaded.
with open(filepath, 'r', encoding='utf-8') as corpus_file:
    raw_data = corpus_file.read()
print(raw_data[:50])
# Only the first 5000 characters of the corpus are used from here on.
raw_data=raw_data[:5000]
# Vocabulary: the distinct characters of the truncated text, in sorted order.
chars = sorted(list(set(raw_data)))
n_chars, unique_chars = len(raw_data), len(chars)
print ('Total number of characters: ',n_chars,'and number of unique characters:',unique_chars) 
print('Unique characters are: ',''.join(chars))

First Citizen:
Before we proceed any further, hear
Total number of characters:  5000 and number of unique characters: 53
Unique characters are:  
 !',-.:;?ABCDEFHILMNORSTUVWYabcdefghijklmnoprstuvwyz


## Data Preprocessing

In [3]:
# Two-way lookup tables between characters and integer ids; ids follow the order of `chars`.
chars_to_idx = {symbol: position for position, symbol in enumerate(chars)}
idx_to_chars = dict(enumerate(chars))

### Deciding Sequence Length

In [4]:
# Measure a long sentence taken from the opening lines to help choose the sequence length
line = 'First, you know Caius Marcius is chief enemy to the people.'
print('Number of cahracters in a long sentence', len(line))

# The text is organised in verses, so measure one verse to determine its length as well
verse = '''
First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?
'''
print('Number of cahracters in a verse', len(verse))

seq_len = 120 # context length: roughly two long sentences, and longer than the verse measured above

Number of cahracters in a long sentence 59
Number of cahracters in a verse 87


### Input, output encoding as integers

In [5]:
# Slide a window of seq_len characters over the text one step at a time:
# each window (as integer ids) is an input, the character right after it is the target.
window_starts = range(n_chars - seq_len)
dataX = [
    [chars_to_idx[symbol] for symbol in raw_data[begin:begin + seq_len]]
    for begin in window_starts
]
dataY = [chars_to_idx[raw_data[begin + seq_len]] for begin in window_starts]
n_patterns = len(dataX)
print('Total Patterns or number of elements in list dataX:', n_patterns)
print('dataX[0]:', dataX[0])
print('Length of every element of dataX:' ,len(dataX[0]))
print('dataY[0]:', dataY[0])

Total Patterns or number of elements in list dataX: 4880
dataX[0]: [15, 37, 45, 46, 47, 1, 12, 37, 47, 37, 52, 33, 42, 7, 0, 11, 33, 34, 43, 45, 33, 1, 50, 33, 1, 44, 45, 43, 31, 33, 33, 32, 1, 29, 42, 51, 1, 34, 48, 45, 47, 36, 33, 45, 4, 1, 36, 33, 29, 45, 1, 41, 33, 1, 46, 44, 33, 29, 39, 6, 0, 0, 10, 40, 40, 7, 0, 23, 44, 33, 29, 39, 4, 1, 46, 44, 33, 29, 39, 6, 0, 0, 15, 37, 45, 46, 47, 1, 12, 37, 47, 37, 52, 33, 42, 7, 0, 28, 43, 48, 1, 29, 45, 33, 1, 29, 40, 40, 1, 45, 33, 46, 43, 40, 49, 33, 32, 1, 45, 29]
Length of every element of dataX: 120
dataY[0]: 47


### Reshape Input Output

In [6]:
# Targets: one integer class id per pattern.
y = torch.tensor(dataY)
# Inputs: float tensor of shape (n_patterns, seq_len, 1), with ids divided by the vocabulary size.
X = torch.tensor(dataX, dtype=torch.float32).view(n_patterns, seq_len, 1).div(float(unique_chars))
print('X[0]:', X[0])
print('y[0]:', y[0])

X[0]: tensor([[0.2830],
        [0.6981],
        [0.8491],
        [0.8679],
        [0.8868],
        [0.0189],
        [0.2264],
        [0.6981],
        [0.8868],
        [0.6981],
        [0.9811],
        [0.6226],
        [0.7925],
        [0.1321],
        [0.0000],
        [0.2075],
        [0.6226],
        [0.6415],
        [0.8113],
        [0.8491],
        [0.6226],
        [0.0189],
        [0.9434],
        [0.6226],
        [0.0189],
        [0.8302],
        [0.8491],
        [0.8113],
        [0.5849],
        [0.6226],
        [0.6226],
        [0.6038],
        [0.0189],
        [0.5472],
        [0.7925],
        [0.9623],
        [0.0189],
        [0.6415],
        [0.9057],
        [0.8491],
        [0.8868],
        [0.6792],
        [0.6226],
        [0.8491],
        [0.0755],
        [0.0189],
        [0.6792],
        [0.6226],
        [0.5472],
        [0.8491],
        [0.0189],
        [0.7736],
        [0.6226],
        [0.0189],
        [0.8679],
    

## Defining a LSTM Model

In [7]:
class CharModel(nn.Module):
    """Character-level LSTM that scores the next character given a sequence.

    Args:
        unique_chars: Vocabulary size; also the number of output logits.
        input_size: Number of features per time step fed to the LSTM.
        hidden_size: Hidden-state size of every LSTM layer.
        num_layers: Number of stacked LSTM layers.

    Notes:
        No ``dropout=`` argument is passed to ``nn.LSTM``, so there is no
        dropout between stacked LSTM layers; the only dropout is applied to
        the last time step's output before the linear layer.
    """

    def __init__(self,unique_chars, input_size, hidden_size, num_layers):
        super().__init__()
        # NOTE(review): this embedding is never called in forward(). It is kept
        # so that state_dict keys stay the same as in already-saved checkpoints.
        self.embedding = nn.Embedding(unique_chars, input_size, padding_idx=0)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        # Size the output layer from hidden_size (it used to be hard-coded to 256,
        # which broke every model built with a different hidden size).
        self.linear = nn.Linear(hidden_size, unique_chars)

    def forward(self, x):
        """Return next-character logits for a batch of sequences.

        Args:
            x: Float tensor laid out batch-first, i.e. (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, unique_chars) holding unnormalised scores.
        """
        x, _ = self.lstm(x)
        # Keep only the output of the final time step.
        x = x[:, -1, :]
        # Dropout, then project to one logit per character.
        x = self.linear(self.dropout(x))
        return x


## Defining the Train and Save function

In [9]:
def fitt(model, model_name, X_batch, y_batch, optimizer, loss_fn, n_epochs, batch_size=128):
    """Train ``model`` and save the parameters of its lowest-loss epoch.

    Args:
        model: The ``nn.Module`` to train. Training runs on the device the
            model's parameters already live on.
        model_name: Prefix of the checkpoint file, ``'<model_name>_best.pth'``.
        X_batch: Full tensor of training inputs (first dimension = samples).
        y_batch: Full tensor of training targets, aligned with ``X_batch``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        n_epochs: Number of passes over the data.
        batch_size: Mini-batch size used to iterate over ``X_batch``/``y_batch``.

    Returns:
        dict with two lists of length ``n_epochs``: ``'losses'`` (summed batch
        losses per epoch) and ``'times'`` (seconds spent per epoch), so they
        can be plotted and compared between models.

    Side effects:
        Prints one progress line per epoch and, if at least one epoch ran with
        a finite best loss, writes ``[best_state_dict, chars_to_idx]`` to disk.
        ``chars_to_idx`` is read from the notebook's global scope.
    """
    # Run on whatever device the model was moved to instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Build the mini-batch iterator from the tensors that were actually passed in.
    train_loader = data.DataLoader(
        data.TensorDataset(X_batch, y_batch), shuffle=True, batch_size=batch_size
    )

    lowest_loss = float('inf')  # Initialize with a high value
    best_model_params = None  # Store parameters of the best model
    # The history lists live outside the epoch loop so they accumulate across epochs.
    losses_ep = []
    times_ep = []

    for epoch in range(n_epochs):
        loss_ep = 0.0
        start_time = time.time()
        # Train loop
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            y_pred = model(inputs.to(run_device))
            loss = loss_fn(y_pred, targets.to(run_device))
            loss_ep += loss.item()
            loss.backward()
            optimizer.step()
        # Record training time and loss for this epoch
        time_epoch = time.time() - start_time
        # list.append returns None, so its result must not be assigned back.
        losses_ep.append(loss_ep)
        times_ep.append(time_epoch)
        print(f'Epoch {epoch+1}/{n_epochs}: Train time: {time_epoch:.2f}seconds, Loss over epoch: {loss_ep:.4f}')

        # Check for lowest loss and keep a snapshot of the parameters if applicable
        if loss_ep < lowest_loss:
            lowest_loss = loss_ep
            best_model_params = copy.deepcopy(model.state_dict())  # Deep copy for safety

    # Save the best model parameters after all epochs
    if best_model_params is not None:
        filename = f'{model_name}_best.pth'
        torch.save([best_model_params, chars_to_idx], filename)

    return {'losses': losses_ep, 'times': times_ep}
    



**TODO:** Plot the loss per epoch and the training time per epoch for each model.
These plots can then be used to compare how the loss and the time per epoch differ across the three models.

## Defining the Text Generating function

In [10]:
def generate_text(raw_data, unique_chars, seq_len, model, n_generate=1000):
    """Print a random prompt from ``raw_data`` followed by greedily generated text.

    Args:
        raw_data: Source text the prompt is sampled from.
        unique_chars: Vocabulary size used to scale character ids, exactly as
            the training inputs were scaled.
        seq_len: Number of characters in the prompt / model context window.
        model: Trained model mapping a (1, seq_len, 1) tensor to character logits.
        n_generate: How many characters to generate.

    Raises:
        ValueError: If ``raw_data`` is shorter than ``seq_len``.

    Notes:
        Reads ``chars_to_idx`` and ``idx_to_chars`` from the notebook's global
        scope. Output goes to stdout; nothing is returned.
    """
    if len(raw_data) < seq_len:
        raise ValueError('raw_data must contain at least seq_len characters')

    model.eval()
    # Feed inputs to the device the model lives on, so a freshly reloaded
    # (CPU) model works even when a GPU is available.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # randint's upper bound is exclusive; +1 makes the last full window reachable
    # and allows raw_data whose length equals seq_len.
    start = np.random.randint(0, len(raw_data) - seq_len + 1)
    prompt = raw_data[start:start+seq_len]
    print(f'Prompt: \n "{prompt}"')
    print()
    pattern = [chars_to_idx[c] for c in prompt]

    with torch.no_grad():
        for _ in range(n_generate):
            # Scale in float32, the same way the training tensor was built.
            x = torch.tensor(pattern, dtype=torch.float32).reshape(1, len(pattern), 1) / float(unique_chars)
            # generate logits as output from the model
            prediction = model(x.to(run_device))
            # greedy decoding: pick the highest-scoring character
            index = int(prediction.argmax())
            print(idx_to_chars[index], end='')
            # slide the window: append the new character and drop the oldest one
            pattern.append(index)
            pattern = pattern[1:]
    print()
    print('Done.')


## Parameters common to all models

In [11]:
# Hyperparameters shared by every model in this notebook.
unique_chars=unique_chars  # no-op self-assignment; the value is computed in the EDA cell
input_size=1  # each time step carries a single scaled character id
hidden_size=256

n_epochs = 2
batch_size = 128
lr=.01

In [12]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# reduction='sum' makes each batch loss the sum over its samples rather than the mean.
loss_fn = nn.CrossEntropyLoss(reduction='sum')
# Shuffled mini-batches over the full (X, y) tensors.
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=batch_size)

## Small Model named model_0

### Number of LSTM layers

In [13]:
# The small model uses a single LSTM layer.
num_layers=1

### Instantiating the Model

In [14]:
# Build the small model from the shared hyperparameters and the current num_layers.
model_0=CharModel(unique_chars,input_size,hidden_size,num_layers)

# Adam over this model's parameters; `optimizer` is rebound for each model in the notebook.
optimizer = optim.Adam(model_0.parameters(),lr=lr)

# Move the model to the selected device (the cell output shows the module summary).
model_0.to(device)

CharModel(
  (embedding): Embedding(53, 1, padding_idx=0)
  (lstm): LSTM(1, 256, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=256, out_features=53, bias=True)
)

### Training Small Model

In [15]:
# Train the small model; fitt saves the best epoch's parameters to 'model_0_best.pth'.
fitt(model=model_0, model_name='model_0', X_batch=X, y_batch=y, optimizer=optimizer,loss_fn=loss_fn,n_epochs=n_epochs) 


Epoch 1/2: Train time: 69.16seconds, Loss over epoch: 16282.3426
Epoch 2/2: Train time: 70.79seconds, Loss over epoch: 15678.1654


### Generating Text using Small Model

#### Reloading Saved Model

In [16]:
# Rebuild the architecture, then restore the best parameters written by fitt().
# num_layers must still hold the value this model was trained with.
model_0 = CharModel(unique_chars, input_size, hidden_size, num_layers)

# map_location lets a checkpoint saved on one device be loaded on another (e.g. GPU -> CPU).
best_model_param, chars_to_idx = torch.load('model_0_best.pth', map_location=device)
# Keep the reloaded model on the same device as the rest of the notebook.
model_0.to(device)
model_0.load_state_dict(best_model_param)

<All keys matched successfully>

In [17]:
# Print a random prompt from raw_data followed by the reloaded small model's continuation.
generate_text(raw_data=raw_data, unique_chars=unique_chars,seq_len=seq_len, model=model_0 )

Prompt: 
 "ple.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?
"

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

### Checking Structure of Small Model (model_0)

In [18]:
# List every parameter tensor of the small model with its shape.
for name, param in model_0.named_parameters():
    print(f"Parameter Name: {name}\nParameter Shape: {param.shape}")

Parameter Name: embedding.weight
Parameter Shape: torch.Size([53, 1])
Parameter Name: lstm.weight_ih_l0
Parameter Shape: torch.Size([1024, 1])
Parameter Name: lstm.weight_hh_l0
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.bias_ih_l0
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.bias_hh_l0
Parameter Shape: torch.Size([1024])
Parameter Name: linear.weight
Parameter Shape: torch.Size([53, 256])
Parameter Name: linear.bias
Parameter Shape: torch.Size([53])


### Saving Parameters of Small Model

In [19]:
# Name -> tensor lookup of the small model's parameters.
# The tensors are references to the model's own storage, not copies.
model_0_params = {name: param.data for name, param in model_0.named_parameters()}

## Large Model named Model_0
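The only change from the small model is the `num_layers` argument passed to `nn.LSTM()`: it is set to 2 instead of 1, which stacks a second LSTM layer on top of the first. Note that `CharModel` does not pass `dropout=` to `nn.LSTM`, so no dropout is applied between the two LSTM layers; the only dropout is the `nn.Dropout(0.2)` applied to the last LSTM output before the linear layer. Everything else (data, loss, optimizer settings, training loop) is unchanged.
We call this model `Model_0`.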

### Number of LSTM layers

In [20]:
# The large models (Model_0 and Model_1) stack two LSTM layers.
num_layers=2

### Instantiating the Model

In [21]:
# Build the large model; it differs from the small one only through the current num_layers.
Model_0 = CharModel(unique_chars, input_size, hidden_size, num_layers)

# Fresh Adam optimizer bound to the large model's parameters.
optimizer = optim.Adam(Model_0.parameters(),lr=lr)

# Move the model to the selected device (the cell output shows the module summary).
Model_0.to(device)

CharModel(
  (embedding): Embedding(53, 1, padding_idx=0)
  (lstm): LSTM(1, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=256, out_features=53, bias=True)
)

### Training and Saving Large Model

In [22]:
# Train the large model; fitt saves the best epoch's parameters to 'Model_0_best.pth'.
fitt(model=Model_0, model_name='Model_0',X_batch=X, y_batch=y, optimizer=optimizer,loss_fn=loss_fn,n_epochs=n_epochs) 

Epoch 1/2: Train time: 176.61seconds, Loss over epoch: 16346.9978
Epoch 2/2: Train time: 142.82seconds, Loss over epoch: 15800.0993


### Generating Text using Large Model

#### Reloading Saved Model

In [23]:
# Rebuild the large architecture, then restore the best parameters written by fitt().
Model_0 = CharModel(unique_chars, input_size, hidden_size, num_layers)

# map_location lets a checkpoint saved on one device be loaded on another (e.g. GPU -> CPU).
best_model_param, chars_to_idx = torch.load('Model_0_best.pth', map_location=device)
# Keep the reloaded model on the same device as the rest of the notebook.
Model_0.to(device)
Model_0.load_state_dict(best_model_param)

<All keys matched successfully>

In [24]:
# Print a random prompt from raw_data followed by the reloaded large model's continuation.
generate_text(raw_data=raw_data, unique_chars=unique_chars,seq_len=seq_len, model=Model_0 )

Prompt: 
 "e is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger fo"

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

### Checking Model Structure of Large Model

In [25]:
# List every parameter tensor of the large model with its shape.
for name, param in Model_0.named_parameters():
    print(f"Parameter Name: {name}\nParameter Shape: {param.shape}")

Parameter Name: embedding.weight
Parameter Shape: torch.Size([53, 1])
Parameter Name: lstm.weight_ih_l0
Parameter Shape: torch.Size([1024, 1])
Parameter Name: lstm.weight_hh_l0
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.bias_ih_l0
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.bias_hh_l0
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.weight_ih_l1
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.weight_hh_l1
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.bias_ih_l1
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.bias_hh_l1
Parameter Shape: torch.Size([1024])
Parameter Name: linear.weight
Parameter Shape: torch.Size([53, 256])
Parameter Name: linear.bias
Parameter Shape: torch.Size([53])


## Large Model (Model_1) initialised using weights of Small Model

### Instantiating another Instance of the Large Model

In [26]:
# Second instance of the large architecture; its weights are partly overwritten further below.
Model_1 = CharModel(unique_chars, input_size, hidden_size, num_layers)

# Fresh Adam optimizer bound to Model_1's parameters.
optimizer = optim.Adam(Model_1.parameters(),lr=lr)

# Move the model to the selected device (the cell output shows the module summary).
Model_1.to(device)

CharModel(
  (embedding): Embedding(53, 1, padding_idx=0)
  (lstm): LSTM(1, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=256, out_features=53, bias=True)
)

### Checking Model Structure of Large Model (Model_1)

In [27]:
# List every parameter tensor of Model_1 with its shape.
for name, param in Model_1.named_parameters():
    print(f"Parameter Name: {name}\nParameter Shape: {param.shape}")

Parameter Name: embedding.weight
Parameter Shape: torch.Size([53, 1])
Parameter Name: lstm.weight_ih_l0
Parameter Shape: torch.Size([1024, 1])
Parameter Name: lstm.weight_hh_l0
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.bias_ih_l0
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.bias_hh_l0
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.weight_ih_l1
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.weight_hh_l1
Parameter Shape: torch.Size([1024, 256])
Parameter Name: lstm.bias_ih_l1
Parameter Shape: torch.Size([1024])
Parameter Name: lstm.bias_hh_l1
Parameter Shape: torch.Size([1024])
Parameter Name: linear.weight
Parameter Shape: torch.Size([53, 256])
Parameter Name: linear.bias
Parameter Shape: torch.Size([53])


### Loading Parameters of Small Model into Model_1

In [28]:
# Overwrite each Model_1 parameter that has a same-named entry saved from the small model;
# parameters without a match (the second LSTM layer) keep their fresh initialisation.
with torch.no_grad():
    for name, param in Model_1.named_parameters():
        if name not in model_0_params:
            continue
        param.copy_(model_0_params[name])

### Training and Saving Model_1

In [None]:
# Train the warm-started large model; fitt saves the best epoch's parameters to 'Model_1_best.pth'.
fitt(model=Model_1, model_name='Model_1',X_batch=X, y_batch=y, optimizer=optimizer,loss_fn=loss_fn,n_epochs=n_epochs)

Epoch 1/2: Train time: 177.86seconds, Loss over epoch: 16042.9848


### Generating Text using Model_1

#### Reloading Saved Model

In [None]:
# Rebuild the large architecture, then restore the best parameters written by fitt().
Model_1 = CharModel(unique_chars, input_size, hidden_size, num_layers)

# map_location lets a checkpoint saved on one device be loaded on another (e.g. GPU -> CPU).
best_model_param, chars_to_idx = torch.load('Model_1_best.pth', map_location=device)
# Keep the reloaded model on the same device as the rest of the notebook.
Model_1.to(device)
Model_1.load_state_dict(best_model_param)

<All keys matched successfully>

In [None]:
# Print a random prompt from raw_data followed by the reloaded Model_1's continuation.
generate_text(raw_data=raw_data, unique_chars=unique_chars,seq_len=seq_len, model=Model_1 )

Prompt: 
 " Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us"

  ii   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   aa   a

KeyboardInterrupt: 