In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [None]:
# Create the "HF_TOKEN" environment variable holding your Hugging Face access token

In [None]:
# Download (or read from the local cache) the tiny-shakespeare dataset from the Hugging Face Hub.
dataset = load_dataset(path="Trelis/tiny-shakespeare")

In [None]:
# Show the (rows, columns) shape of every split in the loaded dataset.
dataset.shape

{'train': (472, 1), 'test': (49, 1)}

In [None]:
# Display the summary of the train split (feature names and number of rows).
dataset["train"]

Dataset({
    features: ['Text'],
    num_rows: 472
})

In [None]:
# Device that the model inputs are moved to during evaluation.
# GPU auto-detection is kept below for reference but is disabled: everything runs on the CPU.
# NOTE(review): presumably forced because the model does not fit on the available GPU — confirm.
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

In [None]:
# Load the causal language model whose perplexity is evaluated below.
# The model is placed on `device` explicitly: the evaluation loop moves its inputs to
# `device`, so the weights must live there too or the forward pass fails as soon as
# `device` is switched away from "cpu".
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct").to(device)

OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Meta-Llama-3-8B is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

# The encoding cell below calls the tokenizer with `padding=True`, which raises a
# ValueError when the tokenizer defines no padding token. Fall back to the EOS token
# in that case; tokenizers that already define a pad token are left untouched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Tokenize the first two test texts into one padded batch of PyTorch tensors.
encodings = tokenizer(
    text=dataset["test"]["Text"][:2],
    padding=True,
    return_tensors="pt",
)

In [None]:
# Resolve and display the raw configuration dictionary published for the checkpoint.
# The call returns a (config_dict, unused_kwargs) tuple, which is what the cell displays.
# NOTE(review): the saved output below reports 'model_type': 'mamba', which does not match
# this checkpoint name — re-run the cell to refresh the output.
model.config.get_config_dict(
    pretrained_model_name_or_path="meta-llama/Meta-Llama-3-8B-Instruct"
)

({'architectures': ['MambaForCausalLM'],
  'bos_token_id': 0,
  'conv_kernel': 4,
  'd_inner': 160,
  'd_model': 1024,
  'dt_rank': 'auto',
  'eos_token_id': 0,
  'expand': 2,
  'fused_add_norm': True,
  'hidden_act': 'silu',
  'hidden_size': 1024,
  'initializer_range': 0.1,
  'intermediate_size': 2048,
  'layer_norm_epsilon': 1e-05,
  'model_type': 'mamba',
  'n_layer': 48,
  'num_hidden_layers': 48,
  'pad_token_id': 0,
  'pad_vocab_size_multiple': 8,
  'rescale_prenorm_residual': False,
  'residual_in_fp32': True,
  'rms_norm': True,
  'ssm_cfg': {},
  'state_size': 16,
  'time_step_floor': 0.0001,
  'time_step_init_scheme': 'random',
  'time_step_max': 0.1,
  'time_step_min': 0.001,
  'time_step_rank': 64,
  'time_step_scale': 1.0,
  'torch_dtype': 'float32',
  'transformers_version': '4.39.0.dev0',
  'use_bias': False,
  'use_cache': True,
  'use_conv_bias': True,
  'vocab_size': 50280,
  '_commit_hash': 'b519127f5bfaaa1c27dd938dad051ec360972b23'},
 {})

In [None]:
def calculate_perplexity(max_length, stride, seq_len):
    """Compute per-window negative log-likelihoods with a sliding window.

    The function reads the notebook-level ``encodings`` (tokenized batch),
    ``model`` (causal LM returning an object with a ``loss`` attribute) and
    ``device``. Each window covers at most ``max_length`` positions and the
    window start advances by ``stride``; positions already scored by a
    previous window are used as context only (their labels are set to -100).

    Padded positions (``attention_mask == 0``), when ``encodings`` provides an
    attention mask, are also excluded from the loss so that padding added to
    batch the texts does not distort the result.

    Args:
        max_length: Maximum number of token positions per forward pass.
        stride: Number of positions the window start advances each step.
        seq_len: Number of token positions of ``encodings.input_ids`` to score.

    Returns:
        A list with one scalar loss tensor per window that contained at least
        one scorable label. ``torch.exp(torch.stack(result).mean())`` gives
        the perplexity estimate.

    Raises:
        ValueError: If ``stride`` or ``max_length`` is not positive.
    """
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}")
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    # The tokenizer output normally carries an attention mask; tolerate its absence.
    full_attention_mask = getattr(encodings, "attention_mask", None)

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)

        # Only the last `trg_len` positions are new; earlier ones are context.
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        model_kwargs = {}
        if full_attention_mask is not None:
            attention_mask = full_attention_mask[:, begin_loc:end_loc].to(device)
            # Never score padding tokens.
            target_ids[attention_mask == 0] = -100
            model_kwargs["attention_mask"] = attention_mask

        # The model shifts the labels left by one internally, so the first
        # position of the window is never a target. Skip windows with nothing
        # left to score: their mean loss would be NaN.
        if (target_ids[:, 1:] != -100).any():
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids, **model_kwargs)

            # The loss is a cross-entropy averaged over the valid labels of the window.
            nlls.append(outputs.loss)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break
    return nlls

In [None]:
# Size of the evaluation window, in tokens.
# Use the context length declared by the model config: `max_position_embeddings`
# (e.g. Llama-style configs) or `n_positions` (e.g. GPT-2-style configs).
# `d_model` is the hidden width rather than a context length and is absent from
# many configs; it is kept only as a last-resort fallback for configs that
# declare no context length at all.
max_length = (
    getattr(model.config, "max_position_embeddings", None)
    or getattr(model.config, "n_positions", None)
    or model.config.d_model
)
# Number of token positions (second dimension of the tokenized batch) to evaluate.
seq_len = encodings.input_ids.size(1)

In [None]:
# Evaluate with a coarse stride: the window start advances 512 tokens per step.
stride = 512
nlls = calculate_perplexity(max_length=max_length, stride=stride, seq_len=seq_len)

# Perplexity is the exponential of the mean per-window negative log-likelihood.
ppl = torch.stack(nlls).mean().exp()
print(f"Perplexity: {ppl.item()}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [01:05<?, ?it/s]

Perplexity: 24.20311164855957





In [None]:
# Evaluate again with a fine stride: the window start advances 2 tokens per step.
stride = 2
nlls = calculate_perplexity(max_length=max_length, stride=stride, seq_len=seq_len)

# Perplexity is the exponential of the mean per-window negative log-likelihood.
ppl = torch.stack(nlls).mean().exp()
print(f"Perplexity: {ppl.item()}")

  0%|          | 0/483 [01:09<?, ?it/s]

Perplexity: 24.20311164855957



