In [1]:
import os
os.environ['HF_HOME'] = '/data1/malto/cache'

# Using a Quantized LLM and Few-Shot Learning to Generate Synthetic Labels

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from pathlib import Path
from random import random
from datasets import load_dataset

#model_name_or_path = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ"
#revision = "gptq-3bit-128g-actorder_True"

model_name_or_path = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GPTQ"
revision = "gptq-8bit-32g-actorder_True"

# tasks MT, DM, PG
TASK_TYPE = "MT"
BATCH_SIZE = 40
BASE_DIR = Path("/data1/malto/shroom")

# To use a different branch, change revision
# For example: revision="gptq-4bit-128g-actorder_True"

model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision=revision)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

if tokenizer.pad_token == None: # apparently Mixtral does not have a padding token in tokenizer
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

In [3]:
ds = load_dataset("json", data_files=["/data1/malto/shroom/val.model-agnostic.json"]).shuffle()
#ds['train'] = ds['train'].filter(lambda x: x['task'] == TASK_TYPE)
ds['train']

Dataset({
    features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src'],
    num_rows: 499
})

In [4]:
num_examples = 5
num_samples = len(ds['train'])
examples = "[INST] The following are examples of pairs of hyp and tgt sentences that can either be Hallucination or Not Hallucination depending on how similar they are [\INST]\n"

for i in range(num_examples):
    num = int(random() * num_samples)
    hyp = ds['train'][num]['hyp']
    tgt = ds['train'][num]['tgt']
    label = ds['train'][num]['label']
    examples += f"\nExample {i+1}\n<HYP>: {hyp}\n<TGT>: {tgt}\n<LABEL>: {label}\n"
print(examples)

[INST] The following are examples of pairs of hyp and tgt sentences that can either be Hallucination or Not Hallucination depending on how similar they are [\INST]

Example 1
<HYP>: Having a keen sense of humor.
<TGT>: Acrimonious, bitter, piercing.
<LABEL>: Hallucination

Example 2
<HYP>: (uncountable) Clothing.
<TGT>: (uncountable) (in combination) clothing
<LABEL>: Not Hallucination

Example 3
<HYP>: Are you certain that's the man?
<TGT>: Can you confirm it's him?
<LABEL>: Not Hallucination

Example 4
<HYP>: (informal) A fad.
<TGT>: A phenomenon that becomes popular for a very short time.
<LABEL>: Not Hallucination

Example 5
<HYP>: One square meter of solar battery produces about one watt of energy, so it’s not easy to use solar energy on a large scale right now.
<TGT>: The output power of a one square meter solar panel is about one watt, so it is difficult to use solar power on a large scale at present.
<LABEL>: Not Hallucination



In [5]:
def generate_prompt(mapped_ds):
    prompts = []
    for hyp, tgt in zip(mapped_ds["hyp"], mapped_ds["tgt"]):
        prompts.append(f"{examples}\n Example {num_examples+1}\n<HYP>: {hyp}\n<TGT>: {tgt}\n<LABEL>: ")
    return {'prompts' : prompts}

In [6]:
ds = ds.map(generate_prompt, batched=True)
ds

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src', 'prompts'],
        num_rows: 499
    })
})

In [7]:
def generate_prediction(mapped_ds):
    input_ids = tokenizer(mapped_ds['prompts'], padding=True, truncation=True, max_length=500, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=0.01, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=5)
    decoded_output = tokenizer.batch_decode(output)
    return {'output' : decoded_output}

In [8]:
ds = ds.map(generate_prediction, batched=True, batch_size=BATCH_SIZE)
ds

Map:   0%|          | 0/499 [00:00<?, ? examples/s]



DatasetDict({
    train: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src', 'prompts', 'output'],
        num_rows: 499
    })
})

In [9]:
def extract_synthetic_label(mapped_ds):
    syn_labels = []
    for item in mapped_ds['output']:
        if "Not" in item.splitlines()[-1]:
            syn_labels.append("Not Hallucination")
        else:
            syn_labels.append("Hallucination")
    return {'synthetic_labels' : syn_labels}

In [10]:
ds = ds.map(extract_synthetic_label, batched=True)
ds

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'label', 'model', 'ref', 'hyp', 'task', 'tgt', 'p(Hallucination)', 'src', 'prompts', 'output', 'synthetic_labels'],
        num_rows: 499
    })
})

In [11]:
correct = 0
for a, b in zip(ds['train']['label'], ds['train']['synthetic_labels']):
    if a == b:
        correct += 1
correct / num_samples

0.7755511022044088

In [12]:
ds['train'][0]

{'labels': ['Hallucination',
  'Hallucination',
  'Hallucination',
  'Hallucination',
  'Hallucination'],
 'label': 'Hallucination',
 'model': '',
 'ref': 'tgt',
 'hyp': 'Any of various insects of the family Mothidae, especially those of the genus Mothia.',
 'task': 'DM',
 'tgt': '(dated) A liver spot, especially an irregular or feathery one.',
 'p(Hallucination)': 1.0,
 'src': 'To remove <define> moth </define> patches , wash the spots with a solution of common bicarbonate of soda and water several times a day , until the patches are removed , which will usually be in forty - eight hours .',
 'prompts': "[INST] The following are examples of pairs of hyp and tgt sentences that can either be Hallucination or Not Hallucination depending on how similar they are [\\INST]\n\nExample 1\n<HYP>: Having a keen sense of humor.\n<TGT>: Acrimonious, bitter, piercing.\n<LABEL>: Hallucination\n\nExample 2\n<HYP>: (uncountable) Clothing.\n<TGT>: (uncountable) (in combination) clothing\n<LABEL>: Not H

## Create Labels for New Data Points

In [13]:
def create_empty_column():
    tr_ds = load_dataset("json", data_files=[str(BASE_DIR / "train.model-agnostic.json")])
    tr_ds = tr_ds.map(lambda x: {'labels' : []})
    tr_ds['train'].to_json(str(BASE_DIR / "train_labeled_SOLAR.model-agnostic.json"))

#create_empty_column()

In [14]:
tr_ds = load_dataset("json", data_files=[str(BASE_DIR / "train_labeled_SOLAR.model-agnostic.json")])

In [15]:
tr_ds = tr_ds.map(generate_prompt, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [16]:
tr_ds = tr_ds.map(generate_prediction, batched=True, batch_size=BATCH_SIZE)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [17]:
tr_ds = tr_ds.map(extract_synthetic_label, batched=True)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [20]:
def add_prediction(mapped_ds):
    mapped_ds['labels'].append(mapped_ds['synthetic_labels'])
    return mapped_ds

tr_ds = tr_ds.map(add_prediction)
tr_ds['train'][0]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

{'hyp': "Don't worry, it's only temporary.",
 'src': 'Не волнуйся. Это только временно.',
 'task': 'MT',
 'ref': 'either',
 'tgt': "Don't worry. It's only temporary.",
 'model': '',
 'labels': ['Not Hallucination'],
 'prompts': "[INST] The following are examples of pairs of hyp and tgt sentences that can either be Hallucination or Not Hallucination depending on how similar they are [\\INST]\n\nExample 1\n<HYP>: Having a keen sense of humor.\n<TGT>: Acrimonious, bitter, piercing.\n<LABEL>: Hallucination\n\nExample 2\n<HYP>: (uncountable) Clothing.\n<TGT>: (uncountable) (in combination) clothing\n<LABEL>: Not Hallucination\n\nExample 3\n<HYP>: Are you certain that's the man?\n<TGT>: Can you confirm it's him?\n<LABEL>: Not Hallucination\n\nExample 4\n<HYP>: (informal) A fad.\n<TGT>: A phenomenon that becomes popular for a very short time.\n<LABEL>: Not Hallucination\n\nExample 5\n<HYP>: One square meter of solar battery produces about one watt of energy, so it’s not easy to use solar ener

In [21]:
tr_ds['train'].to_json(str(BASE_DIR / "train_labeled_SOLAR.model-agnostic.json"))

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

79891157