In [1]:
import os
import torch
import tqdm
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    AutoTokenizer,
    pipeline,
)
import gc
from tqdm import tqdm
import json
import re

In [2]:
name = "StructLM-7B-luis-QTSUMM"
model_name = f"./qtsumm/{name}"

dataset_name = "yale-nlp/QTSumm"

In [3]:
test = load_dataset(dataset_name, split="test")

# Create a new column 'prompt' in the dataset
test = test.add_column("prompt", [f"""Given following json that contains specifications of a product, generate a review of the key characteristics with json format. Follow the values on {{Keys}} to write the Output
    ### Product: {x["table"]}
    ### Keys: {x["query"]}
    ### Output:""" for x in test])

In [4]:
opt = model_name + "-QTSUMM"
dic = {opt: []}

In [5]:
# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [6]:
# Activate 8-bit precision base model loading
use_8bit = True

# Compute dtype for 4-bit base models
bnb_8bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [7]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = f"./results/{model_name}/"

# Number of training epochs
num_train_epochs = 2

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 3

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 500

# Log every X updates steps
logging_steps = 25

In [8]:
# Maximum sequence length to use
max_seq_length = 1000

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [9]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_8bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_8bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [10]:

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(f"./tokenizers/{name}_tokenizer", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

for i, prompt in tqdm(enumerate(test)):
    tmp = {
        "Prompt": "",
        "Original": "",
        "Prediction": ""
    }
    resp = prompt["prompt"]
    prompt = resp
    #print(prompt)
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=2000)
    result = pipe(f"{prompt}")
    result = result[0]['generated_text']
    #print(result)
    tmp["Prompt"] = prompt
    tmp["Original"] = test["summary"][i]
    tmp["Prediction"] = result
    dic[opt].append(tmp)
del model
del tokenizer
gc.collect()
gc.collect()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

1078it [8:54:02, 29.72s/it]


0

In [11]:
results = {
    "Prompt": [],
    "Original": [],
    "Prediction": []
}

In [12]:
def format(val):
    val = val.replace('"', "'")
    val = '"' + val + '"'
    return val

In [13]:
def formatting_2(val):
    k = val.split("### Keys: ")[1].split("### Output: ")[1].split('\n')[0].replace('[', '').replace(']','').replace('\n', '').replace('\'', '')
    res = [k]
    return res

In [14]:
def formatting_3(val):
    out = val.split("'### Output':")[1].replace("'", "\"")
    k = val.split("### Keys: ")[1].split("'### Output':")[0].replace('[', '').replace(']','').replace('\n', '').replace('\'', '').split(", ")
    res = k
    return res

In [15]:
ommited = []
a = []
for i, t in enumerate(dic[opt]):
    try:
        val_p = t["Prediction"]
        print(val_p)
        if "llama" in model_name.lower():
            val_p = formatting_2(val_p)
            val_p = json.dumps(val_p)
        elif "Struct" in model_name:
            val_p = formatting_2(val_p) 
            val_p = json.dumps(val_p)
        else:
            val_p = formatting_2(val_p) 
            val_p = json.dumps(val_p)       
        
        #print(val_p, '\n aca termina la review')
        #val_p = format(val_p)
        #val_p = val_p + "\"}" if "\"}" not in val_p else val_p

        print(t["Original"])
        val = t["Original"]
        val = format(val)
        results["Original"].append(json.loads(val))
        results["Prediction"].append(json.loads(val_p))
        results["Prompt"].append(t["Prompt"])
        
    
    except Exception as e:
        ommited.append(i)
        print(e)
        a.append(t["Prediction"])
        

print(len(ommited), ommited)


Given following json that contains specifications of a product, generate a review of the key characteristics with json format. Follow the values on {Keys} to write the Output
    ### Product: {'header': ['Name', 'State', 'Birth', 'Death', 'Year appointed', 'Left office', 'Appointed by', 'Reason for termination'], 'rows': [['Roger B. Taney', 'Maryland', '1777', '1864', '1836', '1864', 'Jackson', 'death'], ['Edward Douglass White', 'Louisiana', '1845', '1921', '1894', '1921', 'Cleveland (associate) Taft (chief)', 'death'], ['Joseph McKenna', 'California', '1843', '1926', '1898', '1925', 'McKinley', 'retirement'], ['Pierce Butler', 'Minnesota', '1866', '1939', '1923', '1939', 'Harding', 'death'], ['Frank Murphy', 'Michigan', '1890', '1949', '1940', '1949', 'F. Roosevelt', 'death'], ['Sherman Minton', 'Indiana', '1890', '1965', '1949', '1956', 'Truman', 'death'], ['William J. Brennan, Jr.', 'New Jersey', '1906', '1997', '1956', '1990', 'Eisenhower', 'death'], ['Antonin Scalia', 'New Jersey

In [16]:
print(a, len(ommited))

["Given following json that contains specifications of a product, generate a review of the key characteristics with json format. Follow the values on {Keys} to write the Output\n    ### Product: {'header': ['Model', 'Qty', 'Built', 'First in', 'Last out', 'Ref(s)'], 'rows': [['Jodel D.120', '1', '1958', '1966', '1972', ''], ['Malmö MFI-9B', '2', '1964', '1968', '1974', ''], ['Beagle Pup', '1', '1969', '1974', '1977', ''], ['Robin DR400', '1', '1976', '1977', '1978', ''], ['Piper Cherokee', '2', '1976', '1977', '2001', ''], ['Piper Chieftain', '1', '1978', '1980', '1990', ''], ['Piper Tomahawk', '1', '1979', '1980', '1985', ''], ['Piper Seneca', '1', '1971', '1985', '1987', ''], ['Fuji FA-200 Aero Subaru', '1', '1985', '1986', '1996', ''], ['Socata Rallye', '1', '1980', '1987', '1993', ''], ['Grumman Cougar', '1', '1977', '1987', '?', ''], ['Bellanca Super Decathlon', '1', '1978', '1987', '1998', ''], ['Dornier Do 228', '2', '1984–85', '1989', '1992', '']], 'table_id': '7280d64e-9e4b-40

In [17]:
print(len(results["Original"]),len(results["Prompt"]),len(results["Prediction"]))

1075 1075 1075


In [18]:
print(len(a))

3


In [19]:
print(name)

StructLM-7B-luis-QTSUMM


In [20]:
with open(f'Outputs/Llama-2-7b-hf/{name}.json', 'w') as f:
    json.dump(results, f, indent=4)