In [1]:
# load necessary packages 
import io, os, sys, glob  
import numpy as np
import matplotlib.pyplot as plt

import transformers
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (AutoConfig, 
                          AutoModelForSequenceClassification, 
                          AutoTokenizer, AdamW, 
                          get_linear_schedule_with_warmup,
                          set_seed,
                          )

sys.path.append("../")

# custom imports
from utils.GetLowestGPU import GetLowestGPU

device = GetLowestGPU(verbose=2) # get lowest memory GPU

---------------------------
 GPU | Memory-usage    
---------------------------
  0  | 00003MiB / 40537MiB
  1  | 31388MiB / 40537MiB
  2  | 03222MiB / 40537MiB
  3  | 00003MiB / 40537MiB
  4  | 00484MiB / 40537MiB
  5  | 00484MiB / 40537MiB
  6  | 00484MiB / 40537MiB
  7  | 00484MiB / 40537MiB
---------------------------
 Device set to cuda:0
---------------------------


# Load the Base Model

In [2]:
# load the model
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
pipeline = transformers.pipeline('text-generation',
                                  model=model_id,
                                  model_kwargs={'torch_dtype': torch.bfloat16},
                                  device_map=device)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Load Example Dataset

In [4]:
# load example dataset
dataset_path = 'yahma/alpaca-cleaned' #random dataset from a medium article
raw_dataset = load_dataset(dataset_path)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [5]:
# examine dataset
print("Structure:", raw_dataset)
print("Example Entry:", raw_dataset['train'][0])

Structure: DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})
Example Entry: {'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'input': '', 'instruction': 'Give three tips for staying healthy.'}


In [7]:
pipeline(raw_dataset['train'][0]['input'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


KeyboardInterrupt: 

In [26]:
# instantiate a `preprocess_data` function that tokenizes the raw dataset
def preprocess_data(example):
    return pipeline.tokenizer(example['train'], truncation=True, padding='max_length', max_length=512)

# preprocess the raw dataset
preprocessed_dataset = raw_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

KeyError: 'train'