In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

In [3]:
# import sentencepiece 

In [2]:
data = {
    'instruction': [
        'Given this dataframe, make a scatter plot of Age and Weight with x label Age, y label Weight, marker size of 10 and highlight each point according to its city.',
        'Create a scatter plot with Age on x-axis and Weight on y-axis. Label the x-axis as Age and the y-axis as Weight. Use marker size 10 and color points by city.'
    ],
    'output': [
        "{'plot_type': 'scatter', 'x': 'Age', 'y': 'Weight', 'xlabel': 'Age', 'ylabel': 'Weight', 'marker_size': '10', 'hue': 'city'}",
        "{'plot_type': 'scatter', 'x': 'Age', 'y': 'Weight', 'xlabel': 'Age', 'ylabel': 'Weight', 'marker_size': '10', 'hue': 'city'}"
    ]
}

# Convert to dataset
dataset = Dataset.from_dict(data)

model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess data
def preprocess_function(examples):
    inputs = [f"translate English to JSON: {ex}" for ex in examples['instruction']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], max_length=512, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]



In [4]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
trainer.train()

ValueError: expected sequence of length 45 at dim 1 (got 58)

In [6]:
def preprocess_function(examples):
    inputs = [f"translate English to JSON: {ex}" for ex in examples['instruction']]
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], max_length=512, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [7]:
# Check lengths of tokenized inputs and labels
print("Tokenized input lengths:")
for input_ids in tokenized_dataset["train"]["input_ids"]:
    print(len(input_ids))

print("Tokenized label lengths:")
for label_ids in tokenized_dataset["train"]["labels"]:
    print(len(label_ids))


Tokenized input lengths:


KeyError: "Column train not in the dataset. Current columns in the dataset: ['instruction', 'output', 'input_ids', 'attention_mask', 'labels']"

In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset

# Example data
data = {
    'instruction': [
        'Given this dataframe, make a scatter plot of Age and Weight with x label Age, y label Weight, marker size of 10 and highlight each point according to its city.',
        'Create a scatter plot with Age on x-axis and Weight on y-axis. Label the x-axis as Age and the y-axis as Weight. Use marker size 10 and color points by city.'
    ],
    'output': [
        "{'plot_type': 'scatter', 'x': 'Age', 'y': 'Weight', 'xlabel': 'Age', 'ylabel': 'Weight', 'marker_size': '10', 'hue': 'city'}",
        "{'plot_type': 'scatter', 'x': 'Age', 'y': 'Weight', 'xlabel': 'Age', 'ylabel': 'Weight', 'marker_size': '10', 'hue': 'city'}"
    ]
}

# Convert to dataset
dataset = Dataset.from_dict(data)

# Load T5 model and tokenizer
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess data
def preprocess_function(examples):
    inputs = [f"translate English to JSON: {ex}" for ex in examples['instruction']]
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], max_length=512, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [12]:
# Check lengths of tokenized inputs and labels
print("Tokenized input lengths:")
for input_ids in tokenized_dataset["input_ids"]:
    print(len(input_ids))

print("Tokenized label lengths:")
for label_ids in tokenized_dataset["labels"]:
    print(len(label_ids))


Tokenized input lengths:
512
512
Tokenized label lengths:
512
512


In [13]:
# Split the dataset
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']


In [14]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Train model
trainer.train()


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,No log,11.278948
2,No log,11.205039
3,No log,11.154114


TrainOutput(global_step=3, training_loss=11.72619883219401, metrics={'train_runtime': 64.4474, 'train_samples_per_second': 0.047, 'train_steps_per_second': 0.047, 'total_flos': 406025404416.0, 'train_loss': 11.72619883219401, 'epoch': 3.0})

In [15]:
evaluation_results = trainer.evaluate()
print(evaluation_results)


{'eval_loss': 11.15411376953125, 'eval_runtime': 0.3522, 'eval_samples_per_second': 2.839, 'eval_steps_per_second': 2.839, 'epoch': 3.0}


In [21]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

new_instructions = [
    'Given this dataframe, create a bar chart of Sales and Month with x label Month, y label Sales, and color by Region.',
    'Generate a line plot with Date on the x-axis and Temperature on the y-axis. Label the x-axis as Date and the y-axis as Temperature. Use different colors for each City.'
]

# Encode the new inputs
# new_inputs = [f"Fill in the following JSON object based on the instruction: The JSON object is {'x': None, 'y': None, 'hue': None, 'xlabel': None, 'ylabel': None, 'title': None}. Based on the instruction, provide values for 'x' (variable for the x-axis), 'y' (variable for the y-axis), 'hue' (variable for color differentiation), 'xlabel' (label for the x-axis), 'ylabel' (label for the y-axis), and 'title' (title of the plot).: {instruction}" for instruction in new_instructions]
new_inputs = [f"Fill in the following JSON object based on the instruction: The JSON object is {{'x': None, 'y': None, 'hue': None, 'xlabel': None, 'ylabel': None, 'title': None}}. Based on the instruction, provide values for 'x' (variable for the x-axis), 'y' (variable for the y-axis), 'hue' (variable for color differentiation), 'xlabel' (label for the x-axis), 'ylabel' (label for the y-axis), and 'title' (title of the plot).: {instruction}" for instruction in new_instructions]
# new_inputs = [f"construct a python dictionary with plotting arguments from the instruction: {instruction}" for instruction in new_instructions]
tokenized_inputs = tokenizer(new_inputs, max_length=512, padding='max_length', truncation=True, return_tensors='pt')

# Move inputs to the correct device
tokenized_inputs = {key: value.to(device) for key, value in tokenized_inputs.items()}

# Generate outputs
outputs = model.generate(**tokenized_inputs, max_length=512, num_beams=4, early_stopping=True)

# Decode the outputs
decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

for input_text, output_text in zip(new_instructions, decoded_outputs):
    print(f"Instruction: {input_text}")
    print(f"Generated JSON: {output_text}")
    print()


Instruction: Given this dataframe, create a bar chart of Sales and Month with x label Month, y label Sales, and color by Region.
Generated JSON: Fill in the following JSON object based on the instruction: The JSON object is 'x': None, 'y': None, 'hue': None, 'xlabel': None, 'ylabel': None, 'title': None. Based on the instruction, provide values for 'x' (variable for the x-axis), 'y' (variable for the y-axis), 'hue' (

Instruction: Generate a line plot with Date on the x-axis and Temperature on the y-axis. Label the x-axis as Date and the y-axis as Temperature. Use different colors for each City.
Generated JSON: 'x': None, 'y': None, 'hue': None, 'ylabel': None, 'title': None. Based on the instruction, provide values for 'x' (variable for the x-axis), 'y' (variable for the y-axis), 'hue' (variable for color differentiation), 'xlabel' (label for the x-axis), 'yl



In [1]:
import seaborn as sns

In [6]:
x = help(sns.boxplot)
x

Help on function boxplot in module seaborn.categorical:

boxplot(data=None, *, x=None, y=None, hue=None, order=None, hue_order=None, orient=None, color=None, palette=None, saturation=0.75, width=0.8, dodge=True, fliersize=5, linewidth=None, whis=1.5, ax=None, **kwargs)
    Draw a box plot to show distributions with respect to categories.
    
    A box plot (or box-and-whisker plot) shows the distribution of quantitative
    data in a way that facilitates comparisons between variables or across
    levels of a categorical variable. The box shows the quartiles of the
    dataset while the whiskers extend to show the rest of the distribution,
    except for points that are determined to be "outliers" using a method
    that is a function of the inter-quartile range.
    
    .. note::
        This function always treats one of the variables as categorical and
        draws data at ordinal positions (0, 1, ... n) on the relevant axis,
        even when the data has a numeric or date type.

In [7]:
type(x)

NoneType

In [8]:
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [1]:
from llama_cpp import Llama


In [2]:
llm = Llama(
  model_path   = "/nfs/turbo/umms-indikar/shared/projects/RAG/models/Phi-3-mini-4k-instruct-q4.gguf",  # path to GGUF file
  n_ctx        = 4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads    = 8, # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers = 0, # The number of layers to offload to GPU, if you have GPU acceleration available. Set to 0 if no GPU acceleration is available on your system.
)


llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /nfs/turbo/umms-indikar/shared/projects/RAG/models/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  3072, 32064,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  3072,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [  8192,  3072,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.ffn_up.weight q4_K     [  3072, 16384,     1,     1 ]
llama_model_loader: - tensor    4:            blk.0.ffn_norm.weight f32      [  3072,     1,     1,     1 ]
llama_model_loader: - tensor    5:         blk.0.attn_output.weight q4_K     [  3072,  3072,     1,     1 ]
llama_model_loader: - tensor    6:            blk.0.attn_qkv.weight q5_K     [  3072,  9216,     1,     1 ]
llama_model_loader: - tensor    7:           blk.1.at

AssertionError: 

In [10]:
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

Defaulting to user installation because normal site-packages is not writeable


In [9]:
!pip freeze

aiohttp==3.9.5
aiosignal==1.3.1
alabaster @ file:///home/ktietz/src/ci/alabaster_1611921544520/work
anaconda-client==1.11.1
anaconda-navigator==2.4.0
anaconda-project @ file:///opt/conda/conda-bld/anaconda-project_1660339890420/work
anndata==0.10.7
annotated-types==0.6.0
anyio @ file:///tmp/build/80754af9/anyio_1644481695334/work/dist
appdirs==1.4.4
argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work
argon2-cffi-bindings @ file:///tmp/build/80754af9/argon2-cffi-bindings_1644553347904/work
array_api_compat==1.6
arrow @ file:///croot/arrow_1676588132104/work
arxivscraper==0.0.5
asgiref==3.8.1
astroid @ file:///croot/astroid_1676904296642/work
astropy @ file:///opt/conda/conda-bld/astropy_1657786094003/work
asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
async-timeout==4.0.3
atomicwrites==1.4.0
attrs==23.2.0
Automat @ file:///tmp/build/80754af9/automat_1600298431173/work
autopep8 @ file:///opt/conda/conda-bld/autopep8_1650463822033/work
Babel @ fi

# Didn't Work

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM  
#tokenizer = AutoTokenizer.from_pretrained("llmware/bling-1.4b-0.1")  
#model = AutoModelForCausalLM.from_pretrained("llmware/bling-1.4b-0.1")  

In [3]:
model_path = '/nfs/turbo/umms-indikar/shared/projects/RAG/models/'
model_name = 'pytorch_model.bin'
tokenizer = AutoTokenizer.from_pretrained(   model_path + model_name)  
model = AutoModelForCausalLM.from_pretrained(model_path + model_name)  

OSError: Incorrect path_or_model_id: '/nfs/turbo/umms-indikar/shared/projects/RAG/models/pytorch_model.bin'. Please provide either the path to a local folder or the repo_id of a model on the Hub.