## Cross Entropy Loss and Adam Optimizer

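How do cross-entropy loss and the Adam optimizer behave? This notebook reproduces the value of `nn.CrossEntropyLoss` by hand and inspects the model parameters before and after `loss.backward()`, `optimizer.step()`, and `optimizer.zero_grad()`.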

In [1]:
INPUT_TEXT = ['the quick brown fox jumped over the lazy dog']

### Calamancy

In [2]:
import calamancy

Calamancy = calamancy.load("tl_calamancy_md-0.1.0")

Calamancy



<spacy.lang.tl.Tagalog at 0x7f5e3c1f7cd0>

In [3]:
import pandas as pd
import numpy as np
import tqdm
from torch import Tensor
from collections import Counter

def get_calamancy_tokens(data):
  """Tokenize every text sample with the module-level ``Calamancy`` pipeline.

  Args:
    data: A ``pd.Series`` of strings or a plain sequence of strings. A
      Series is unwrapped to its underlying values first.

  Returns:
    A list with one entry per sample; each entry is the list of token
    objects obtained by iterating the processed sample.
  """
  # Unwrap a Series so the pipeline receives a plain array of strings.
  texts = data.values if isinstance(data, pd.Series) else data

  tokenized = []
  bar = tqdm.tqdm(total=len(texts))

  for doc in Calamancy.pipe(texts):
    bar.update(1)
    tokenized.append([token for token in doc])

  bar.close()

  return tokenized

def get_token_vectors(tokens, vector_size=200):
  """Convert tokenized samples into one 2-D tensor of token vectors per sample.

  Tokens whose ``has_vector`` is falsy are skipped. If a sample ends up with
  no vectors at all -- because it was empty to begin with, or because none of
  its tokens has a vector -- a single all-zero row is used instead, so every
  returned tensor has shape ``(n, vector_size)`` with ``n >= 1``.

  Args:
    tokens: Sequence of samples; each sample is a sequence of token objects
      exposing ``has_vector`` and ``vector``.
    vector_size: Width of the zero placeholder row. Should match the width
      of the token vectors (200 for the vectors used in this notebook).

  Returns:
    A list with one ``Tensor`` per sample.
  """
  vectors = []

  # The context manager closes the progress bar even if a sample raises.
  with tqdm.tqdm(total=len(tokens)) as progress_bar:
    for sample in tokens:
      progress_bar.update(1)

      token_vectors = [
        token.vector
        for token in (sample or ())
        if token.has_vector
      ]

      # Checked after filtering: a non-empty sample can still yield no
      # vectors, which would otherwise produce a tensor of shape (0,).
      if not token_vectors:
        token_vectors.append(np.zeros(vector_size))

      vectors.append(Tensor(np.array(token_vectors)))

  return vectors

def data_remove_stopwords(data, stopwords_path='./src/stopwords-tl.txt'):
  """Drop stopwords from every text sample.

  Each sample is split on whitespace, words found in the stopword file are
  removed, and the remaining words are re-joined with single spaces.

  Args:
    data: Iterable of strings.
    stopwords_path: Path to a newline-separated stopword list. Defaults to
      the Tagalog list this notebook uses.

  Returns:
    A list of strings, one per input sample, in the same order.
  """
  # `with` closes the file handle; the explicit encoding keeps the result
  # independent of the platform's default locale.
  with open(stopwords_path, 'r', encoding='utf-8') as stopwords_file:
    stopwords = set(stopwords_file.read().split('\n'))

  return [
    ' '.join(word for word in sample.split() if word not in stopwords)
    for sample in data
  ]

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
input_tokenized = get_calamancy_tokens(INPUT_TEXT)

100%|██████████| 1/1 [00:00<00:00, 34.18it/s]


In [5]:
input_tokenized

[[the, quick, brown, fox, jumped, over, the, lazy, dog]]

In [6]:
input_vectorized = get_token_vectors(input_tokenized)

100%|██████████| 1/1 [00:00<00:00, 813.01it/s]


In [7]:
input_vectorized

[tensor([[-0.0448,  0.2607,  0.1448,  ...,  0.4243, -0.1153, -0.0620],
         [ 0.5258, -0.3996,  0.0114,  ...,  0.4171, -0.3223,  0.1630],
         [-0.0630,  0.6724,  1.0455,  ...,  0.2032, -0.4797,  0.5314],
         ...,
         [-0.0448,  0.2607,  0.1448,  ...,  0.4243, -0.1153, -0.0620],
         [-0.1454,  0.5539,  0.2981,  ...,  1.6336, -0.2340, -0.1128],
         [ 0.1416, -0.5068,  0.7797,  ...,  0.2225, -0.7088, -0.3081]])]

In [8]:
input_vectorized[0].shape

torch.Size([9, 200])

### LSTM

In [9]:
import torch
from torch import optim 

# Width of each token vector fed to the LSTM (the vectorized inputs above
# have shape [n_tokens, 200]).
INPUT_SIZE = 200
# Size of the LSTM hidden state.
NUM_OF_HIDDEN_NODES = 50
# Number of logits produced by the final linear layer (one per class).
OUTPUT_SIZE = 2

# Step size passed to the optimizer as `lr`.
LEARNING_RATE = 0.01

# Optimizer class (not an instance); it is instantiated later with the
# model's parameters.
OPTIMIZER = optim.Adam

# Use the first CUDA GPU when one is available, otherwise the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [10]:
from torch import nn

class LstmModel(nn.Module):
  """Single-layer LSTM followed by a linear layer on the last time step.

  Layer sizes come from the module-level constants INPUT_SIZE,
  NUM_OF_HIDDEN_NODES and OUTPUT_SIZE. After each forward pass the full LSTM
  output and the final hidden/cell states are kept on the instance so they
  can be inspected.
  """

  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(
      input_size=INPUT_SIZE,
      hidden_size=NUM_OF_HIDDEN_NODES,
      batch_first=True,
    )
    self.linear = nn.Linear(
      in_features=NUM_OF_HIDDEN_NODES,
      out_features=OUTPUT_SIZE,
    )

    # Filled in by forward(); None until the model is first called.
    self.lstm_output = self.lstm_hidden_state = self.lstm_cell_state = None

  def forward(self, input):
    """Return one row of OUTPUT_SIZE logits per sequence in the batch.

    Args:
      input: Batch-first tensor of sequences (batch, sequence, INPUT_SIZE).
    """
    sequence_output, final_states = self.lstm(input)
    self.lstm_output = sequence_output
    self.lstm_hidden_state, self.lstm_cell_state = final_states

    # Only the LSTM output at the last time step feeds the classifier.
    last_step = self.lstm_output[:, -1]

    return self.linear(last_step)

# Seed the global RNG so the randomly initialised weights -- and therefore the
# losses and parameter values printed below -- start from the same values on
# every Restart & Run All.
torch.manual_seed(0)

Lstm = LstmModel()

# nn.Module.to() moves the parameters in place, so no reassignment is needed.
Lstm.to(DEVICE)

Lstm

LstmModel(
  (lstm): LSTM(200, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

### Loss

In [11]:
# nn.CrossEntropyLoss takes raw logits and integer class indices. With its
# default settings it averages -log(softmax(logits)[target]) over the batch,
# which the "Solve for Loss" sections below reproduce by hand.
loss_function = nn.CrossEntropyLoss()

In [12]:
# input_vectorized holds a single [9, 200] tensor; stacking adds the leading
# batch dimension the batch_first LSTM expects, giving shape [1, 9, 200].
input_tensor = torch.stack(input_vectorized).to(DEVICE)

# Forward pass: one row of OUTPUT_SIZE logits per sample in the batch.
output = Lstm(input_tensor)

In [13]:
output

tensor([[-0.0580, -0.1203]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [14]:
from torch import LongTensor

# Cross-entropy targets are integer class indices, one per sample; the single
# sample here is labelled as class 1. torch.tensor creates the target directly
# on DEVICE instead of using the legacy LongTensor constructor plus a copy.
output_loss = loss_function(
  output,
  torch.tensor([1], dtype=torch.long, device=DEVICE),
)

In [15]:
output_loss

tensor(0.7248, device='cuda:0', grad_fn=<NllLossBackward0>)

### Solve for Loss

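Assume that the true class label is 1.

Hence, the target probability distribution is the one-hot vector [0, 1], and the cross-entropy loss reduces to $-\log(p_1)$, where $p_1$ is the softmax probability the model assigns to class 1.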

In [16]:
output_probabilities = torch.softmax(
  output,
  dim=1,
)

In [17]:
output_probabilities

tensor([[0.5156, 0.4844]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [18]:
# One-hot target distribution for class 1.
actual_probabilities = [0, 1]

# Dot product of the predicted and target distributions. Because the target is
# one-hot, this is simply the predicted probability of the true class.
predicted_probabilities = output_probabilities[0][0] * actual_probabilities[0] + output_probabilities[0][1] * actual_probabilities[1]

# Negative log of that probability; compare with the nn.CrossEntropyLoss
# value displayed above.
-np.log(predicted_probabilities.cpu().detach().numpy())

0.7247954

### Loss for Multiple Inputs

In [19]:
# Three sentences of five whitespace-separated words each.
MULTIPLE_INPUTS = [
  'the quick brown fox jumped',
  'over the lazy dog near',
  'the bank of the river',
]

# Class index for each sentence in MULTIPLE_INPUTS, in the same order.
TARGET_INPUTS = [1, 0, 1]

In [20]:
multiple_input_tokens = get_calamancy_tokens(MULTIPLE_INPUTS)

100%|██████████| 3/3 [00:00<00:00, 62.61it/s]


In [21]:
multiple_input_tokens

[[the, quick, brown, fox, jumped],
 [over, the, lazy, dog, near],
 [the, bank, of, the, river]]

In [22]:
multiple_input_vectors = get_token_vectors(multiple_input_tokens)

100%|██████████| 3/3 [00:00<00:00, 2097.85it/s]


In [23]:
multiple_input_vectors

[tensor([[-4.4816e-02,  2.6072e-01,  1.4476e-01,  1.3985e-01,  8.2186e-01,
          -1.8453e-01,  3.9610e-01, -3.3248e-01, -1.8321e-03, -3.2702e-01,
          -1.4955e-01, -7.9797e-01,  3.2212e-01,  5.0457e-01, -1.4466e-02,
          -1.5588e-01, -5.8039e-01, -8.2334e-01, -1.2749e-01,  1.2681e-02,
           1.7029e-01,  2.0639e-01,  1.8592e-01,  9.1905e-02,  7.9670e-01,
           3.7350e-03, -1.1508e-01, -9.2960e-02, -5.0474e-02, -5.2370e-01,
           2.4938e-01,  5.0046e-02, -2.2510e-01,  8.2782e-01, -8.8028e-02,
          -3.1285e-01,  2.1367e-01, -1.2164e-01,  6.2164e-02,  3.9969e-02,
          -1.9242e-01,  5.9761e-01,  8.7907e-01,  2.6625e-01, -6.2292e-01,
           3.5723e-01, -5.6910e-01, -2.2975e-02, -4.4734e-01, -2.3245e-01,
          -1.6014e-02,  2.4307e-02, -2.9628e-01,  3.4902e-03,  2.8123e-01,
          -3.6569e-01, -5.6186e-02, -8.3109e-02,  3.7591e-01, -3.0810e-01,
          -3.9157e-01, -5.8018e-01,  4.4514e-02, -3.4728e-01, -1.4072e-02,
           9.6888e-02,  4

In [24]:
multiple_input_vectors[0].shape

torch.Size([5, 200])

In [25]:
# torch.stack needs every tensor to have the same shape. It works here because
# each sentence produced five token vectors; sentences of different lengths
# would have to be padded or packed first.
multiple_input_tensor = torch.stack(multiple_input_vectors).to(DEVICE)

# Batched forward pass: one row of logits per sentence.
multiple_output = Lstm(multiple_input_tensor)

In [26]:
multiple_output

tensor([[-0.0153,  0.0258],
        [-0.0802, -0.0404],
        [ 0.0309,  0.0406]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [27]:
# One integer class index per sentence. torch.tensor creates the targets
# directly on DEVICE instead of using the legacy LongTensor constructor plus
# a copy.
multiple_loss = loss_function(
  multiple_output,
  torch.tensor(TARGET_INPUTS, dtype=torch.long, device=DEVICE),
)

In [28]:
multiple_loss

tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward0>)

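### Solving for Loss

For a batch, the loss is the mean over the samples of $-\log(p_{\text{target}})$, the negative log of the softmax probability assigned to each sample's target class.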

In [29]:
multiple_softmax = torch.softmax(
  multiple_output,
  dim=1,
)

In [30]:
multiple_softmax

tensor([[0.4897, 0.5103],
        [0.4901, 0.5099],
        [0.4976, 0.5024]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [31]:
# Element-wise -log of every softmax probability. The tensor is moved to the
# CPU and detached from the autograd graph so NumPy can read it.
negative_log = -np.log(
  multiple_softmax.cpu().detach().numpy(),
)

In [32]:
negative_log

array([[0.71389526, 0.67282087],
       [0.71321046, 0.67347854],
       [0.698017  , 0.6883009 ]], dtype=float32)

In [33]:
# Pick -log(p) of the target class in each row (row i, column TARGET_INPUTS[i])
# and add them up. The indices are derived from TARGET_INPUTS rather than
# hard-coded, so the cell stays correct if the targets change.
negative_log_sum = negative_log[np.arange(len(TARGET_INPUTS)), TARGET_INPUTS].sum()

In [34]:
negative_log_sum

2.0743322

In [35]:
# Mean over the batch; compare with the multiple_loss value displayed above.
# The batch size is taken from TARGET_INPUTS instead of a literal 3.
negative_log_sum / len(TARGET_INPUTS)

0.6914440790812174

## loss.backward() and optimizer.step()

In [36]:
# Instantiate the configured optimizer class (optim.Adam) over every parameter
# of the model, using LEARNING_RATE as the step size.
optimizer = OPTIMIZER(Lstm.parameters(), lr=LEARNING_RATE)

### Getting Parameter Values

In [37]:
# Dump every trainable parameter: name, current values, gradient, shape.
# No backward pass has run yet, so each gradient prints as None.
print("-----")
for name, param in Lstm.named_parameters():
  if not param.requires_grad:
    continue
  print(
    name,
    "Data:",
    param.data,
    "Grad:",
    param.grad,
    f"Shape: {param.data.shape}",
    "-----",
    sep="\n",
  )

-----
lstm.weight_ih_l0
Data:
tensor([[-0.1239,  0.1066, -0.0891,  ..., -0.0025,  0.0033, -0.1201],
        [-0.0176,  0.0893, -0.0530,  ...,  0.1127, -0.1001,  0.1381],
        [-0.0037, -0.0664, -0.0285,  ..., -0.0991,  0.1128,  0.1191],
        ...,
        [ 0.0917, -0.1298, -0.0994,  ..., -0.1303, -0.0218, -0.0057],
        [ 0.1219,  0.0866, -0.0026,  ..., -0.1135, -0.0781, -0.1379],
        [-0.0120,  0.1171,  0.0788,  ...,  0.0240, -0.0921,  0.0717]],
       device='cuda:0')
Grad:
None
Shape: torch.Size([200, 200])
-----
lstm.weight_hh_l0
Data:
tensor([[-0.0472, -0.0489, -0.0697,  ...,  0.0132, -0.0693, -0.0331],
        [ 0.0785, -0.0701,  0.0671,  ...,  0.0086,  0.1316,  0.0077],
        [ 0.0520,  0.0476,  0.0237,  ..., -0.0411,  0.0880, -0.0962],
        ...,
        [ 0.0227,  0.0427, -0.0436,  ...,  0.0777,  0.0803, -0.1196],
        [-0.0443, -0.0592, -0.1309,  ..., -0.1297, -0.0988,  0.0421],
        [ 0.1303, -0.0453, -0.0733,  ..., -0.0631,  0.0065, -0.0610]],
       

In [38]:
# Back-propagate the single-sentence loss (output_loss, not multiple_loss).
# backward() accumulates d(loss)/d(param) into each parameter's .grad.
output_loss.backward()

In [39]:
# Note that each parameter has a grad value

print("---------")
for name, param in Lstm.named_parameters():
  if param.requires_grad:
    print(name)
    print("Data:")
    print(param.data)
    print(f"Data Shape: {param.data.shape}")
    print("-----")
    print("Grad:")
    print(param.grad)
    # A parameter that took no part in the loss keeps grad=None; report that
    # instead of raising AttributeError on `.shape`.
    grad_shape = None if param.grad is None else param.grad.shape
    print(f"Grad Shape: {grad_shape}")
    print("---------")

---------
lstm.weight_ih_l0
Data:
tensor([[-0.1239,  0.1066, -0.0891,  ..., -0.0025,  0.0033, -0.1201],
        [-0.0176,  0.0893, -0.0530,  ...,  0.1127, -0.1001,  0.1381],
        [-0.0037, -0.0664, -0.0285,  ..., -0.0991,  0.1128,  0.1191],
        ...,
        [ 0.0917, -0.1298, -0.0994,  ..., -0.1303, -0.0218, -0.0057],
        [ 0.1219,  0.0866, -0.0026,  ..., -0.1135, -0.0781, -0.1379],
        [-0.0120,  0.1171,  0.0788,  ...,  0.0240, -0.0921,  0.0717]],
       device='cuda:0')
Data Shape: torch.Size([200, 200])
-----
Grad:
tensor([[-1.1116e-04,  6.3557e-04,  4.2248e-05,  ...,  6.5106e-04,
          4.7354e-05,  1.4595e-04],
        [ 1.9653e-04, -5.4470e-04,  1.1752e-03,  ...,  3.9320e-04,
         -1.0222e-03, -3.5215e-04],
        [ 1.4098e-03, -3.6519e-03,  3.9341e-03,  ..., -3.2020e-04,
         -3.5482e-03, -1.2275e-03],
        ...,
        [-5.5501e-04,  2.1539e-03, -3.4086e-03,  ..., -1.2783e-03,
          3.1127e-03,  1.4112e-03],
        [-1.6210e-04,  6.7285e-04, -

In [40]:
# Apply one Adam update using the gradients stored in .grad. In the printout
# below every displayed weight moves by 0.01 (LEARNING_RATE) against the sign
# of its gradient, regardless of the gradient's magnitude: on the very first
# step Adam's bias-corrected update is approximately lr * sign(grad).
optimizer.step()

In [42]:
# Note that the parameter data has changed

print("---------")
for name, param in Lstm.named_parameters():
  if param.requires_grad:
    print(name)
    print("Data:")
    print(param.data)
    print(f"Data Shape: {param.data.shape}")
    print("-----")
    print("Grad:")
    print(param.grad)
    # A parameter that took no part in the loss keeps grad=None; report that
    # instead of raising AttributeError on `.shape`.
    grad_shape = None if param.grad is None else param.grad.shape
    print(f"Grad Shape: {grad_shape}")
    print("---------")

---------
lstm.weight_ih_l0
Data:
tensor([[-0.1139,  0.0966, -0.0991,  ..., -0.0125, -0.0067, -0.1301],
        [-0.0276,  0.0993, -0.0630,  ...,  0.1027, -0.0901,  0.1481],
        [-0.0137, -0.0564, -0.0385,  ..., -0.0891,  0.1228,  0.1291],
        ...,
        [ 0.1017, -0.1398, -0.0894,  ..., -0.1203, -0.0318, -0.0157],
        [ 0.1319,  0.0766,  0.0074,  ..., -0.1235, -0.0881, -0.1479],
        [-0.0020,  0.1071,  0.0888,  ...,  0.0340, -0.1021,  0.0617]],
       device='cuda:0')
Data Shape: torch.Size([200, 200])
-----
Grad:
tensor([[-1.1116e-04,  6.3557e-04,  4.2248e-05,  ...,  6.5106e-04,
          4.7354e-05,  1.4595e-04],
        [ 1.9653e-04, -5.4470e-04,  1.1752e-03,  ...,  3.9320e-04,
         -1.0222e-03, -3.5215e-04],
        [ 1.4098e-03, -3.6519e-03,  3.9341e-03,  ..., -3.2020e-04,
         -3.5482e-03, -1.2275e-03],
        ...,
        [-5.5501e-04,  2.1539e-03, -3.4086e-03,  ..., -1.2783e-03,
          3.1127e-03,  1.4112e-03],
        [-1.6210e-04,  6.7285e-04, -

In [43]:
# Reset the gradients so they do not accumulate into the next backward pass.
# As the printout below shows, .grad becomes None here rather than a zero
# tensor.
optimizer.zero_grad()

In [44]:
# Gradients are gone after zero_grad(); the data keeps its post-step values.

print("---------")
for name, param in Lstm.named_parameters():
  if not param.requires_grad:
    continue
  print(
    name,
    "Data:",
    param.data,
    f"Data Shape: {param.data.shape}",
    "-----",
    "Grad:",
    param.grad,
    "---------",
    sep="\n",
  )

---------
lstm.weight_ih_l0
Data:
tensor([[-0.1139,  0.0966, -0.0991,  ..., -0.0125, -0.0067, -0.1301],
        [-0.0276,  0.0993, -0.0630,  ...,  0.1027, -0.0901,  0.1481],
        [-0.0137, -0.0564, -0.0385,  ..., -0.0891,  0.1228,  0.1291],
        ...,
        [ 0.1017, -0.1398, -0.0894,  ..., -0.1203, -0.0318, -0.0157],
        [ 0.1319,  0.0766,  0.0074,  ..., -0.1235, -0.0881, -0.1479],
        [-0.0020,  0.1071,  0.0888,  ...,  0.0340, -0.1021,  0.0617]],
       device='cuda:0')
Data Shape: torch.Size([200, 200])
-----
Grad:
None
---------
lstm.weight_hh_l0
Data:
tensor([[-0.0572, -0.0389, -0.0597,  ...,  0.0232, -0.0593, -0.0431],
        [ 0.0685, -0.0801,  0.0771,  ...,  0.0186,  0.1216, -0.0023],
        [ 0.0420,  0.0376,  0.0337,  ..., -0.0311,  0.0780, -0.1062],
        ...,
        [ 0.0327,  0.0527, -0.0536,  ...,  0.0677,  0.0903, -0.1096],
        [-0.0343, -0.0493, -0.1409,  ..., -0.1397, -0.0888,  0.0521],
        [ 0.1403, -0.0353, -0.0833,  ..., -0.0731,  0.0165,