In [27]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, OPTForCausalLM, AutoTokenizer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model

In [28]:
# Model identifier handed to AutoTokenizer.from_pretrained in the next cell.
BASE_MODEL = "facebook/opt-125m"
# Token budget per example: used both as the tokenizer's model_max_length
# and as the truncation max_length when the prompts are tokenized.
CUTOFF_LEN = 256

In [29]:
# Load the slow (non-fast) tokenizer for BASE_MODEL, padding on the right
# and capping its maximum length at CUTOFF_LEN.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    use_fast=False,
    padding_side="right",
    model_max_length=CUTOFF_LEN,
)

In [30]:
# Sanity check: show which pad token the loaded tokenizer is configured with.
tokenizer.pad_token

'<pad>'

In [31]:
# Read the local alpaca_data.json file through the generic "json" loader.
data = load_dataset(
    "json",
    data_files="alpaca_data.json",
)


def generate_prompt(data_point):
    """Render one instruction record as a single prompt string.

    Reads the "instruction", "input" and "output" keys of ``data_point``.
    When "input" is truthy, the preamble mentions the paired input and an
    ``### Input:`` section is emitted between the instruction and the
    response. When it is falsy (empty string, None, ...) that section is
    omitted and the shorter preamble is used.

    Sections are separated by single newlines; the returned string ends with
    the output text and has no trailing newline.
    """
    # Guard clause: records without extra context use the short template.
    if not data_point["input"]:
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n"
            "### Instruction:\n"
            "{instruction}\n"
            "### Response:\n"
            "{output}"
        ).format(
            instruction=data_point["instruction"],
            output=data_point["output"],
        )

    # Records with context get the additional "### Input:" section.
    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. "
        "Write a response that appropriately completes the request.\n"
        "### Instruction:\n"
        "{instruction}\n"
        "### Input:\n"
        "{input}\n"
        "### Response:\n"
        "{output}"
    ).format(
        instruction=data_point["instruction"],
        input=data_point["input"],
        output=data_point["output"],
    )


# Fixed seed so the shuffled example order is the same on every
# Restart & Run All; an unseeded shuffle() produces a different order
# (and different downstream results) each run.
SHUFFLE_SEED = 42

# Shuffle, then tokenize one formatted prompt per record, truncating each
# to at most CUTOFF_LEN tokens. map() keeps the original columns and adds
# the tokenizer's output columns.
# NOTE(review): the tokenizer is called with a single prompt at a time, so
# padding="longest" presumably adds no padding here — confirm that batches
# are padded later (e.g. by the data collator).
data = data.shuffle(seed=SHUFFLE_SEED).map(
    lambda data_point: tokenizer(
        generate_prompt(data_point),
        padding="longest",
        max_length=CUTOFF_LEN,
        truncation=True,
    )
)

Found cached dataset json (/home/manuel/.cache/huggingface/datasets/json/default-19891df62e56ad99/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 1/1 [00:00<00:00, 494.49it/s]
                                                                   

In [32]:
# Show the train split's summary (column names and row count) to confirm
# the tokenizer columns were added by the map() step.
data["train"]

Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 52002
})

In [33]:
# Spot check: map a single token id back to its token string.
tokenizer.convert_ids_to_tokens(1263)

'Ġresponse'

In [34]:
# Inspect one processed record: the raw text fields plus the tokenizer outputs.
data["train"][1]

{'instruction': 'Given the following two segments of text, find the differences.',
 'input': 'Text 1: She went to the store\nText 2: She walked to the store',
 'output': 'The difference between the two segments is that the first one states "She went to the store", while the second one states "She walked to the store".',
 'input_ids': [2,
  45943,
  16,
  41,
  15741,
  14,
  7448,
  10,
  3685,
  6,
  11153,
  19,
  41,
  8135,
  14,
  1639,
  617,
  5377,
  4,
  21062,
  10,
  1263,
  14,
  16574,
  25830,
  5,
  2069,
  4,
  50118,
  48134,
  41241,
  35,
  50118,
  18377,
  5,
  511,
  80,
  5561,
  9,
  2788,
  6,
  465,
  5,
  5550,
  4,
  50118,
  48134,
  41327,
  35,
  50118,
  39645,
  112,
  35,
  264,
  439,
  7,
  5,
  1400,
  50118,
  39645,
  132,
  35,
  264,
  3203,
  7,
  5,
  1400,
  50118,
  48134,
  19121,
  35,
  50118,
  133,
  2249,
  227,
  5,
  80,
  5561,
  16,
  14,
  5,
  78,
  65,
  982,
  22,
  2515,
  439,
  7,
  5,
  1400,
  1297,
  150,
  5,
  200,
  65