In [None]:
import os

import torch
import transformers
from datasets import load_dataset
from transformers import LlamaTokenizer, LlamaForCausalLM

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# Bare expression so the notebook shows which device was selected.
DEVICE

In [None]:
# Hub identifier of the checkpoint used for both the tokenizer and the model.
model_path = 'openlm-research/open_llama_3b'

# Tokenizer: pad with token id 0 and add padding on the left side.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

# Model: request 8-bit loading with float16 as the torch dtype and let the
# library decide device placement (device_map="auto").
model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Path of the CSV dataset. Set the SENTIMENT_DATA_PATH environment variable to
# point at a copy elsewhere; when it is unset, the original machine-specific
# location is used so existing runs behave exactly as before.
data_path = os.environ.get(
    "SENTIMENT_DATA_PATH",
    "/Users/mac/Desktop/LLM-Sentimental-Analysis/data/processed_datasets/alpaca-news-sentiment-dataset.csv",
)
data = load_dataset("csv", data_files=data_path)

In [None]:
# Bare expression: the notebook renders the repr of the loaded dataset object.
data

In [None]:
def tokenize(prompt, add_eos_token=True, cutoff_len=256):
    """Tokenize ``prompt`` with the notebook-level ``tokenizer``.

    The text is truncated to at most ``cutoff_len`` tokens and is not padded.
    When there is still room, an EOS token is appended.

    Args:
        prompt: Text to tokenize.
        add_eos_token: When true, append ``tokenizer.eos_token_id`` if the
            sequence does not already end with it and is shorter than
            ``cutoff_len``.
        cutoff_len: Maximum number of tokens to keep. Defaults to 256, the
            value that used to be hard-coded in the function body.

    Returns:
        The tokenizer output, with ``input_ids`` and ``attention_mask``
        (possibly extended by the EOS token) plus a ``labels`` entry that is
        an independent copy of ``input_ids``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )

    input_ids = result["input_ids"]
    # An empty sequence has no last token to inspect; indexing [-1] on it
    # would raise IndexError, so treat it as "not terminated yet".
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        input_ids.append(tokenizer.eos_token_id)
        # Keep the mask the same length as input_ids; the EOS is a real token.
        result["attention_mask"].append(1)

    # Labels start as a copy of the inputs; the copy keeps later edits to one
    # list from affecting the other.
    result["labels"] = result["input_ids"].copy()

    return result

def generate_prompt(instruction, input, label):
    """Build the training prompt for one example.

    The prompt is a fixed preamble followed by ``### Instruction:``,
    ``### Input:`` and ``### Response:`` sections, one per line pair. It is
    assembled line by line so that the indentation of this source file does
    not end up inside the prompt, where it would cost tokens.

    Args:
        instruction: Text placed under ``### Instruction:``.
        input: Text placed under ``### Input:``.
        label: Text placed under ``### Response:``.

    Returns:
        The complete prompt string, ending with a newline.
    """
    return (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n"
        "### Instruction:\n"
        f"{instruction}\n"
        "### Input:\n"
        f"{input}\n"
        "### Response:\n"
        f"{label}\n"
    )
    
def generate_and_tokenize_prompt(data_point):
    """Turn one dataset row into a tokenized training example.

    Reads the ``instruction``, ``input`` and ``output`` fields of
    ``data_point``, formats them with ``generate_prompt`` and returns the
    result of ``tokenize`` on that prompt.
    """
    instruction, context, answer = (
        data_point[field] for field in ("instruction", "input", "output")
    )
    return tokenize(generate_prompt(instruction, context, answer))

In [None]:
# Hold out 10,000 shuffled rows (seed 0) as the test split, then convert every
# row of both splits into a tokenized prompt.
split_kwargs = {"test_size": 10000, "shuffle": True, "seed": 0}
train_val = data["train"].train_test_split(**split_kwargs)

train_data = train_val["train"].map(generate_and_tokenize_prompt)
val_data = train_val["test"].map(generate_and_tokenize_prompt)