In [1]:
import os
os.environ['HF_HOME'] = '/data1/malto/cache'

# Quantized GGUF Test with llama-cpp-python

In [5]:
from llama_cpp import Llama

model_path="/data1/malto/quantized/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path=model_path,  # Download the model file first
  n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=16,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=0         # The number of layers to offload to GPU, if you have GPU acceleration available
)

print("model instatiated...")

# Simple inference example
output = llm(
  "[INST] {prompt} [/INST]", # Prompt
  max_tokens=512,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

# Chat Completion API

llm = Llama(model_path=model_path, chat_format="llama-2")  # Set chat_format according to the model you are using
llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a story writing assistant."},
        {
            "role": "user",
            "content": "Write a story about llamas."
        }
    ]
)

llama_model_loader: loaded meta data with 26 key-value pairs and 995 tensors from /data1/malto/quantized/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:          blk.0.ffn_gate.0.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    2:          blk.0.ffn_down.0.weight q4_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_up.0.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:          blk.0.ffn_gate.1.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:          blk.0.ffn_down.1.weight q4_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    6:            blk.0.ffn_up.1.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:          blk.0.ffn_gate.2.weight q4_K 

model instatiated...



llama_print_timings:        load time =    1014.83 ms
llama_print_timings:      sample time =      26.88 ms /   185 runs   (    0.15 ms per token,  6882.95 tokens per second)
llama_print_timings: prompt eval time =    1014.80 ms /    12 tokens (   84.57 ms per token,    11.83 tokens per second)
llama_print_timings:        eval time =   35471.12 ms /   184 runs   (  192.78 ms per token,     5.19 tokens per second)
llama_print_timings:       total time =   36743.60 ms
llama_model_loader: loaded meta data with 26 key-value pairs and 995 tensors from /data1/malto/quantized/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:          blk.0.ffn_gate.0.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    2:          blk.0.ffn_down.0.weight q4_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3

{'id': 'chatcmpl-0da29388-160c-401a-98d9-f335169a6fc7',
 'object': 'chat.completion',
 'created': 1703156764,
 'model': '/data1/malto/quantized/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': ' Once upon a time, in the highlands of Peru, there was a peaceful valley where a group of llamas lived. The leader of the herd was an old and wise llama named Llucho. He was respected by all the other llamas for his knowledge and experience.\n\nLlucho\'s best friend was a young and energetic llama named Pisco. Pisco was always eager to explore new places and try new things, which often led him into trouble. But Llucho was always there to guide and protect him.\n\nOne day, while they were grazing in the valley, Pisco saw a group of humans approaching. He became curious and decided to go closer to take a look. Llucho tried to stop him, but Pisco was too fast. Before he knew it, Pisco was surrounded by the humans.\n\nThe humans we

# Quantized GPTQ Model Test

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

#model_name_or_path = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
model_name_or_path = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GPTQ"


#revision = "gptq-3bit-128g-actorder_True"
revision = "gptq-8bit-32g-actorder_True"

# To use a different branch, change revision
# For example: revision="gptq-4bit-128g-actorder_True"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision=revision)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

prompt = "Write a story about llamas"
system_message = "You are a story writing assistant"
prompt_template=f'''[INST] {prompt} [/INST]'''
#openorca prompt below
prompt_template=f'''SYSTEM: {system_message}
USER: {prompt}
ASSISTANT:
'''
# SOLAR-instruct format
prompt_template=f"{prompt}"

In [3]:
print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
print(tokenizer.decode(output[0]))



*** Generate:




<s> Write a story about llamas.
Once upon a time in the lush Andean mountains, there was a herd of llamas roaming freely. Led by the wise and old llama named Llama, the herd was known for their soft fur and gentle nature. They lived a simple life, grazing on the rich grasses and drinking from the crystal clear streams.

One day, a group of travelers came to the mountain seeking guidance from the wise Llama. The travelers, a young couple, were lost and searching for a special healing herb that could save the life of their sick child. Llama listened intently to their story, and he knew he had to help them.

He gathered the herd, and together they set out on a perilous journey through treacherous passes and deep valleys. The llamas navigated the mountain paths with their keen senses, and their soft padded feet made no sound as they moved through the wilderness. The young couple was amazed at their strength and loyalty.

As the sun began to set, Llama led them to a hidden valley they had n

In [4]:
print("Testing parallelism...")
input_ids = tokenizer(35*[prompt_template], return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)

Testing parallelism...


In [8]:
print(tokenizer.decode(output[10]))

<s> Write a story about llamas
The Sunlit Fields of Llama Lore

In the lush green valley, nestled between two soaring mountains, lay a quaint and peaceful village. The villagers lived a simple life, surrounded by nature's bounty, and their pride and joy were the herds of llamas that roamed the sunlit fields. The llamas were not only a source of income, but also beloved companions and protectors of the village.

Legend has it that the llamas had found their way to the village centuries ago. A group of explorers from a distant land had passed through the valley, seeking shelter from a raging storm. The villagers took them in, and in gratitude, the explorers gifted a few llamas to the community. Over time, the llamas had thrived and become an integral part of village life.

Each llama had its own unique personality. There was Percy, the brave and loyal guardian who would stand sentry at the village gates, alerting the villagers of any danger. Then there was Llama Lucy, known for her exqui

# Baseline (Text Classification)

In [2]:
from transformers import TrainingArguments
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
import numpy as np

os.environ['TOKENIZERS_PARALLELISM'] = "false"
os.environ["WANDB_DISABLED"] = "true"
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

BATCH_SIZE = 4
NUM_EPOCHS = 5
MULTI_STAGES = False
FREEZE = False
FROZEN_LAYERS = 15

In [3]:
#checkpoint = "microsoft/deberta-v2-xxlarge-mnli" # too big cannot train all of it and freezing stuff is suboptimal
#checkpoint = "bert-base-uncased"
checkpoint = "microsoft/deberta-xlarge-mnli"
#checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
from datasets import load_dataset

ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"]).shuffle()
ds2 = load_dataset("json", data_files=["/data1/malto/shroom/trial-v1.json"])
ds = ds['train'].train_test_split(train_size=0.8) # more representative, apparently trial is easier or something
#ds['test'] = ds2['train']
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
        num_rows: 399
    })
    test: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
        num_rows: 100
    })
})

In [5]:
from datasets import DatasetDict
ds_task = DatasetDict()
ds_not_task = DatasetDict()
ds_task['train'] = ds['train'].filter(lambda x: x['task'] == "MT")
ds_task['test'] = ds['test'].filter(lambda x: x['task'] == "MT")

ds_not_task['train'] = ds['train'].filter(lambda x: x['task'] != "MT")
ds_not_task['test'] = ds['test'].filter(lambda x: x['task'] != "MT")

ds_task, ds_not_task

Filter:   0%|          | 0/399 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/399 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

(DatasetDict({
     train: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 152
     })
     test: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 35
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 247
     })
     test: Dataset({
         features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
         num_rows: 65
     })
 }))

In [6]:
def preprocess_function(examples):
    texts = []
    for hyp, tgt, task in zip(examples["hyp"], examples['tgt'], examples['task']):
        texts.append(f"{hyp} {tokenizer.sep_token} {task} {tokenizer.sep_token} {tgt}")
    model_inputs = tokenizer(texts)
    model_inputs["label"] = [1 if t == "Hallucination" else 0 for t in examples['label']]
    return model_inputs

In [7]:
ds = ds.map(preprocess_function, batched=True)
ds = ds.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

if ds_task is not None:
    ds_task = ds_task.map(preprocess_function, batched=True)
    ds_task = ds_task.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

    ds_not_task = ds_not_task.map(preprocess_function, batched=True)
    ds_not_task = ds_not_task.remove_columns(['hyp', 'ref', 'task', 'p(Hallucination)', 'labels', 'tgt', 'model', 'src'])

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
import evaluate
import numpy as np

def compute_metrics(eval_pred):
    accuracy = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [10]:
id2label = {0: "Not Hallucination", 1: "Hallucination"}
label2id = {"Not Hallucination": 0, "Hallucination": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=2, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge-mnli and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
if FREEZE == True and checkpoint.startswith("microsoft"):
    print("freezing...")
    for param in model.deberta.embeddings.parameters():
        param.requires_grad = False
    for param in model.deberta.encoder.layer[:FROZEN_LAYERS].parameters():
        param.requires_grad = False

In [13]:
training_args = TrainingArguments(
    output_dir="/data1/malto/shroom/checkpoint/local_model",
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    logging_steps=1,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
def train_with_dataset(ds):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

In [15]:
if MULTI_STAGES == False:
    train_with_dataset(ds)
else:
    train_with_dataset(ds_not_task)
    train_with_dataset(ds_task)

***** Running training *****
  Num examples = 399
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 500


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5591,0.508947,0.79
2,0.3291,0.944425,0.79
3,0.0895,1.318534,0.75
4,0.0538,1.39008,0.78
5,0.025,1.475476,0.75


***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4
***** Running Evaluation *****
  Num examples = 100
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)


