# Fix perplexity
The way that we compute perplexity in the test code appears to be wrong.

Let's debug it.

In [1]:
## load the evaluation examples (the file on disk is named as the validation split)
import torch
test_data_file = '../../data/reddit_data/combined_data_val_data.pt'
test_data = torch.load(test_data_file)
# sanity check: how many examples were loaded
print(len(test_data))

43092


In [2]:
## build the base BART architecture, then overwrite it with fine-tuned weights
from transformers import AutoModelForSeq2SeqLM
model_cache_dir = '../../data/model_cache/'
weight_file = '../../data/reddit_data/text_only_model/question_generation_model/checkpoint-171000/pytorch_model.bin'
model = AutoModelForSeq2SeqLM.from_pretrained(
    'facebook/bart-base',
    cache_dir=model_cache_dir,
)
# read the checkpoint's state dict from disk and copy it into the model
weights = torch.load(weight_file)
model.load_state_dict(weights)
# switch to inference mode; left as the last expression so the cell displays the model
model.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
   

## Test perplexity

In [43]:
# Score the first example with the model, feeding its target ids as labels.
sample_data = test_data[0]
# unsqueeze(0) adds a leading dimension of size 1 to each tensor before the forward pass
batch_inputs = {
    'input_ids': sample_data['source_ids'].unsqueeze(0),
    'attention_mask': sample_data['attention_mask'].unsqueeze(0),
    'labels': sample_data['target_ids'].unsqueeze(0),
}
with torch.no_grad():
    sample_model_output = model(**batch_inputs)
# first element of the output; treated here as the negative log-likelihood
neg_ll = sample_model_output[0]
print(neg_ll)
print(torch.exp(neg_ll))
# normalize by sequence length?
# NOTE(review): len(target_ids) counts every position of the target tensor as-is;
# whether that is the right normalizer is what the following cells investigate.
target_length = len(sample_data['target_ids'])
print(torch.exp(neg_ll) / target_length)

tensor(13.1937)
tensor(536956.9375)
tensor(8389.9521)


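That seems pretty high for the negative log-likelihood of the whole sequence.
Dividing the exponentiated value by the sequence length does not help either: the result (~8390) is still far too high to be a plausible perplexity.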

What if we take out the padding labels (token id 1)?

In [40]:
# Drop every label equal to the pad id and show the tensor before and after.
pad_id = 1
padded_labels = sample_data['target_ids'].unsqueeze(0)
print(padded_labels)
# masked_select keeps only the entries where the mask is True and returns a 1-D tensor
sample_labels = torch.masked_select(padded_labels, padded_labels.ne(pad_id))
print(sample_labels)

tensor([[    0,   894,    18,    45,  1996,    47,     7,   109,   932,    47,
           214,    45,  7217, 30256,     9,   608,     6,    50,  1996,    47,
           109,   402, 32316,    50,  2439,   116,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]])
tensor([    0,   894,    18,    45,  1996,    47,     7,   109,   932,    47,
          214,    45,  7217, 30256,     9,   608,     6,    50,  1996,    47,
          109,   402, 32316,    50,  2439,   116,     2])


In [38]:
# Re-score the same example using the labels with pad ids removed.
# The forward pass is executed here (it was previously commented out), so the
# cell defines `sample_model_output_2` itself instead of relying on a value
# left over in the kernel from an earlier run.
with torch.no_grad():
    # `sample_labels` is the 1-D pad-free label tensor; unsqueeze(0) adds a leading dimension of size 1
    sample_model_output_2 = model(input_ids=sample_data['source_ids'].unsqueeze(0), 
                                  attention_mask=sample_data['attention_mask'].unsqueeze(0), 
                                  labels=sample_labels.unsqueeze(0))
# first output element is the loss; its exponential is the perplexity estimate
print(sample_model_output_2[0])
print(torch.exp(sample_model_output_2[0]))

tensor(3.5857)
tensor(36.0773)


That seems much more within the "normal" perplexity range (10-50).