In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from evaluate import load

In [2]:
torch.cuda.set_device(2)

In [3]:
torch.cuda.current_device()

2

In [4]:
dataset = load_dataset("Trelis/tiny-shakespeare")

In [5]:
dataset.shape

{'train': (472, 1), 'test': (49, 1)}

In [6]:
dataset["train"]

Dataset({
    features: ['Text'],
    num_rows: 472
})

In [7]:
dataset["test"]["Text"][0]

"TRANIO:\nIs this your speeding? nay, then, good night our part!\n\nPETRUCHIO:\nBe patient, gentlemen; I choose her for myself:\nIf she and I be pleased, what's that to you?\n'Tis bargain'd 'twixt us twain, being alone,\nThat she shall still be curst in company.\nI tell you, 'tis incredible to believe\nHow much she loves me: O, the kindest Kate!\nShe hung about my neck; and kiss on kiss\nShe vied so fast, protesting oath on oath,\nThat in a twink she won me to her love.\nO, you are novices! 'tis a world to see,\nHow tame, when men and women are alone,\nA meacock wretch can make the curstest shrew.\nGive me thy hand, Kate: I will unto Venice,\nTo buy apparel 'gainst the wedding-day.\nProvide the feast, father, and bid the guests;\nI will be sure my Katharina shall be fine.\n\nBAPTISTA:\nI know not what to say: but give me your hands;\nGod send you joy, Petruchio! 'tis a match.\n\nGREMIO:\nAmen, say we: we will be witnesses.\n\nPETRUCHIO:\nFather, and wife, and gentlemen, adieu;\nI will 

In [8]:
len(dataset["test"]["Text"][0])

2859

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"
print(device)

cuda


In [10]:
model_size = "8B"
# model_size = "70B"

In [11]:
model = AutoModelForCausalLM.from_pretrained(f"meta-llama/Meta-Llama-3-{model_size}", token="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")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(f"meta-llama/Meta-Llama-3-{model_size}", token="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")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [14]:
encodings = tokenizer(dataset["test"]["Text"], return_tensors="pt", padding=True)

In [15]:
model.config.get_config_dict(f"meta-llama/Meta-Llama-3-{model_size}")



({'architectures': ['LlamaForCausalLM'],
  'attention_bias': False,
  'attention_dropout': 0.0,
  'bos_token_id': 128000,
  'eos_token_id': 128001,
  'hidden_act': 'silu',
  'hidden_size': 4096,
  'initializer_range': 0.02,
  'intermediate_size': 14336,
  'max_position_embeddings': 8192,
  'model_type': 'llama',
  'num_attention_heads': 32,
  'num_hidden_layers': 32,
  'num_key_value_heads': 8,
  'pretraining_tp': 1,
  'rms_norm_eps': 1e-05,
  'rope_scaling': None,
  'rope_theta': 500000.0,
  'tie_word_embeddings': False,
  'torch_dtype': 'bfloat16',
  'transformers_version': '4.40.0.dev0',
  'use_cache': True,
  'vocab_size': 128256,
  '_commit_hash': '62bd457b6fe961a42a631306577e622c83876cb6'},
 {})

In [16]:
perplexity = load("perplexity", module_type="metric")

In [17]:
predictions = []
with torch.no_grad():
    model = model.to(device)
    for text in dataset["test"]["Text"]:
        encodings = tokenizer(text, return_tensors= "pt", padding=True)
        input_ids = encodings.input_ids.to(device)
        outputs = model(input_ids)
        p = tokenizer.decode(outputs.logits.argmax(dim=-1)[0], skip_special_tokens=True)
        predictions.append(p)

RuntimeError: CUDA out of memory. Tried to allocate 44.00 MiB (GPU 2; 31.75 GiB total capacity; 30.25 GiB already allocated; 19.50 MiB free; 30.38 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [18]:
dataset["test"]["Text"][0]

"TRANIO:\nIs this your speeding? nay, then, good night our part!\n\nPETRUCHIO:\nBe patient, gentlemen; I choose her for myself:\nIf she and I be pleased, what's that to you?\n'Tis bargain'd 'twixt us twain, being alone,\nThat she shall still be curst in company.\nI tell you, 'tis incredible to believe\nHow much she loves me: O, the kindest Kate!\nShe hung about my neck; and kiss on kiss\nShe vied so fast, protesting oath on oath,\nThat in a twink she won me to her love.\nO, you are novices! 'tis a world to see,\nHow tame, when men and women are alone,\nA meacock wretch can make the curstest shrew.\nGive me thy hand, Kate: I will unto Venice,\nTo buy apparel 'gainst the wedding-day.\nProvide the feast, father, and bid the guests;\nI will be sure my Katharina shall be fine.\n\nBAPTISTA:\nI know not what to say: but give me your hands;\nGod send you joy, Petruchio! 'tis a match.\n\nGREMIO:\nAmen, say we: we will be witnesses.\n\nPETRUCHIO:\nFather, and wife, and gentlemen, adieu;\nI will 

In [19]:
tokenizer(dataset["test"]["Text"][0], return_tensors= "pt", padding=True)

{'input_ids': tensor([[128000,  49873,   3895,    512,   3957,    420,    701,  58675,     30,
         103091,     11,   1243,     11,   1695,   3814,   1057,    961,   2268,
          80504,     49,  30593,   3895,    512,   3513,   8893,     11,  58909,
             26,    358,   5268,   1077,    369,   7182,    512,   2746,   1364,
            323,    358,    387,  18949,     11,   1148,    596,    430,    311,
            499,   5380,  17773,    285,  45663,   4265,    364,  15930,    953,
             83,    603,   4483,    467,     11,   1694,   7636,    345,   4897,
           1364,   4985,   2103,    387,   2917,    267,    304,   2883,    627,
             40,   3371,    499,     11,    364,     83,    285,  15400,    311,
           4510,    198,   4438,   1790,   1364,  16180,    757,     25,    507,
             11,    279,  24890,   5086,  30301,   4999,   8100,  18799,    922,
            856,  13272,     26,    323,  21735,    389,  21735,    198,   8100,
            34

In [20]:
predictions[0]

IndexError: list index out of range

In [None]:
results = perplexity.compute(predictions=predictions, model_id='gpt2', add_start_token=False)

In [None]:
results

Resultados perplexity:

· Trelis/tiny-shakespeare (test):

    8B ->  points
    70B ->  points