In [1]:
import os
import torch
import torch.nn as nn
# import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [2]:
# 1) Set up the model
# Hub checkpoint id used for both the causal LM and its tokenizer.
checkpoint = "bigscience/bloomz-560m"
# Alternative checkpoint kept for reference (presumably a tiny model for quick smoke tests — TODO confirm):
# checkpoint = "bigscience/bigscience-small-testing"

# device_map='auto' leaves weight placement to the library; the 8-bit loading
# option (load_in_8bit=True) is left disabled here.
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# 2) Freeze the original weights
# Every existing parameter is frozen here; the adapters are added and trained later.
for weight in model.parameters():
    weight.requires_grad_(False)
    if weight.dim() == 1:
        # small 1-D parameters (e.g. layernorm) are cast to fp32 for stability
        weight.data = weight.data.float()

# Reduce the number of activations stored during training.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and return the result as a float32 tensor."""
        result = super().forward(x)
        return result.to(torch.float32)

# Wrap the LM head so that its output is returned as float32.
# NOTE(review): this wraps whatever `model.lm_head` currently is, so re-running
# this cell on the same model nests another wrapper around it.
model.lm_head = CastOutputToFloat(model.lm_head)

In [4]:
# 3) Setting up the LoRA Adapters
def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Sums the element counts of every tensor yielded by
    ``model.named_parameters()`` and, separately, of those whose
    ``requires_grad`` flag is set, then prints both totals together with the
    trainable share expressed as a percentage.
    """
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100*trainable_params/all_param}")
    
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters for the adapters added on top of the frozen base model.
config = LoraConfig(
    r=16, # rank of the low-rank update matrices (not the number of attention heads)
    lora_alpha=32, # LoRA scaling parameter; the adapter update is scaled by lora_alpha / r
    # target_modules=["q_proj", "v_proj"],  # left unset, so PEFT presumably falls back to its default targets for this architecture — TODO confirm
    lora_dropout=0.05, # dropout used by the LoRA layers
    bias="none", # bias setting passed to PEFT; "none" presumably keeps bias terms frozen — TODO confirm
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

# Wrap the base model with the adapters, then report how much of it is trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 560787456 || trainable%: 0.2804741766549072


In [5]:
# 4) Data
import transformers
from datasets import load_dataset
# Load the dataset from the local `./datasets/cleaned_queries/` directory.
# NOTE(review): the path is relative, so it depends on the notebook's working directory.
data = load_dataset(path="./datasets/cleaned_queries/")

Found cached dataset json (C:/Users/Alexis Strappazzon/.cache/huggingface/datasets/json/cleaned_queries-4d88e17e06e65763/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Preview the first five raw SPARQL queries. The rows are sliced before the
# column is selected so the whole `query` column is not materialised just to
# display five values; the displayed list is the same.
data['train'][:5]['query']

['SELECT ?property ?propertyType ?propertyLabel ?propertyDescription WHERE {\n?property wikibase:propertyType ?propertyType .\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n} ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), \'P\')))',
 'SELECT ?id ?idLabel ?idDescription ?new{\n?id wikibase:directClaim ?pid .\nminus{?id wikibase:propertyType wikibase:ExternalId}\nBIND(Replace(STR(?id),"http://www.wikidata.org/entity/P"," ") as ?new)\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }\n}\nORDER BY DESC(xsd:integer(?new))',
 'SELECT (COUNT(?article) AS ?count)\nWHERE {\n?article wdt:P31/wdt:P279* wd:Q13442814\n}',
 'SELECT (COUNT(DISTINCT ?article) AS ?count)\nWHERE {?article wdt:P31/wdt:P279* wd:Q95074}',
 'SELECT (COUNT(?item) AS ?count)\nWHERE { ?item wdt:P625 [] }']

In [7]:
# Preview the metadata (context + description) of the first five examples.
# The rows are sliced before the column is selected so the whole `metadata`
# column is not materialised just to display five values.
data["train"][:5]['metadata']

[{'context': 'Counting stuff on Wikidata\nAll Wikidata properties with label and description, ordered numerically\nAdapted from one of the Query Service Examples',
  'description': 'Wikidata properties in numerical order'},
 {'context': 'Counting stuff on Wikidata\nVariation of the above excluding external IDs (thanks Magnus Salgo)',
  'description': 'Wikidata properties excluding external IDs'},
 {'context': 'Counting stuff on Wikidata\nCount of scientific articles',
  'description': ''},
 {'context': 'Counting stuff on Wikidata\nCount of fictional characters',
  'description': 'Count of fictional characters'},
 {'context': 'Counting stuff on Wikidata\nCount of items with coordinate locations',
  'description': 'Count of items with coordinate locations'}]

In [8]:
def merge_columns(example):
    """
    Build the training text for one example and store it under "prediction".

    The text is the metadata context, a newline, the metadata description, the
    " ->: " separator and finally the query converted with ``str``. The example
    is updated in place and returned.
    """
    meta = example["metadata"]
    pieces = (meta["context"], "\n", meta["description"], " ->: ", str(example["query"]))
    example["prediction"] = "".join(pieces)
    return example

# Add the merged "prediction" text column to the training split.
data["train"] = data["train"].map(merge_columns)
# Preview five merged examples; the rows are sliced before the column is
# selected so the whole `prediction` column is not materialised for the display.
data["train"][:5]["prediction"]

Map:   0%|          | 0/6739 [00:00<?, ? examples/s]

['Counting stuff on Wikidata\nAll Wikidata properties with label and description, ordered numerically\nAdapted from one of the Query Service Examples\nWikidata properties in numerical order ->: SELECT ?property ?propertyType ?propertyLabel ?propertyDescription WHERE {\n?property wikibase:propertyType ?propertyType .\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n} ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), \'P\')))',
 'Counting stuff on Wikidata\nVariation of the above excluding external IDs (thanks Magnus Salgo)\nWikidata properties excluding external IDs ->: SELECT ?id ?idLabel ?idDescription ?new{\n?id wikibase:directClaim ?pid .\nminus{?id wikibase:propertyType wikibase:ExternalId}\nBIND(Replace(STR(?id),"http://www.wikidata.org/entity/P"," ") as ?new)\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }\n}\nORDER BY DESC(xsd:integer(?new))',
 'Counting stuff on Wikidata\nCount of scientific articles\n ->: S

In [9]:
# Inspect one complete record: the raw query, its metadata and the merged "prediction" text.
data['train'][0]

{'query': 'SELECT ?property ?propertyType ?propertyLabel ?propertyDescription WHERE {\n?property wikibase:propertyType ?propertyType .\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n} ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), \'P\')))',
 'metadata': {'context': 'Counting stuff on Wikidata\nAll Wikidata properties with label and description, ordered numerically\nAdapted from one of the Query Service Examples',
  'description': 'Wikidata properties in numerical order'},
 'prediction': 'Counting stuff on Wikidata\nAll Wikidata properties with label and description, ordered numerically\nAdapted from one of the Query Service Examples\nWikidata properties in numerical order ->: SELECT ?property ?propertyType ?propertyLabel ?propertyDescription WHERE {\n?property wikibase:propertyType ?propertyType .\nSERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n} ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), \'P\')))'}

In [10]:
def _tokenize_predictions(samples):
    """Tokenize the "prediction" texts of a batch of examples."""
    return tokenizer(samples['prediction'])


# batched=True passes a batch of examples to the function on each call.
# NOTE(review): no truncation/max_length is requested and nothing here
# explicitly appends an end-of-sequence token to the texts — confirm that this
# is intended for training.
data_processed = data.map(_tokenize_predictions, batched=True)

Map:   0%|          | 0/6739 [00:00<?, ? examples/s]

In [11]:
# Show the resulting splits and columns after tokenization.
data_processed

DatasetDict({
    train: Dataset({
        features: ['query', 'metadata', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 6739
    })
})

In [12]:
# class SparQLDataset(torch.utils.data.Dataset):
#     def __init__(self, )

In [13]:
# 5) Training

# Build the Trainer for the adapter-wrapped model on the tokenized training split.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data_processed['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2, # number of examples per batch on each device
        gradient_accumulation_steps=8, # batches accumulated before each gradient update -> 2 * 8 = 16 examples seen per update on each device
        warmup_steps=100, # the lr starts very low and rises linearly to the target lr over the first 100 steps
        max_steps=10000, # number of steps after which the training stops
        learning_rate=2e-4, # target learning rate
        fp16=True, # fp16 mixed-precision training
        logging_steps=1, # log at every step
        output_dir='outputs' # output directory used by the Trainer
    ),
    # mlm=False: the collator is used without masked-language-modelling masking.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()



  0%|          | 0/10000 [00:00<?, ?it/s]

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 4.1955, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 4.7607, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}
{'loss': 4.2144, 'learning_rate': 6e-06, 'epoch': 0.01}
{'loss': 5.26, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}
{'loss': 4.6033, 'learning_rate': 1e-05, 'epoch': 0.01}
{'loss': 4.6875, 'learning_rate': 1.2e-05, 'epoch': 0.01}
{'loss': 4.5105, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.02}
{'loss': 4.6486, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}
{'loss': 4.7938, 'learning_rate': 1.8e-05, 'epoch': 0.02}
{'loss': 4.7488, 'learning_rate': 2e-05, 'epoch': 0.02}
{'loss': 4.8069, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.03}
{'loss': 4.8849, 'learning_rate': 2.4e-05, 'epoch': 0.03}
{'loss': 4.3017, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


KeyboardInterrupt: 

In [19]:
# Save the model through the Trainer. No path is given; presumably it is written
# to the `outputs` output_dir set in the TrainingArguments — TODO confirm.
trainer.save_model()

In [None]:
# 6) Push model to hub
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook instead of hard-coding a token.
notebook_login()

In [None]:

# Upload the model to the Hugging Face Hub. The repository id has to be a string
# of the form "<namespace>/<repo>": the bare expression user/repoid would be
# evaluated as a division of two undefined names and raise NameError. The
# placeholder is the same id the inference cell loads via `peft_model_id`;
# replace both with the real repository id.
model.push_to_hub("username/repoid",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=False)

In [None]:
# Inference

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id of the trained adapter (placeholder — replace with the real id).
peft_model_id = "username/repoid"
# Read the adapter configuration; its `base_model_name_or_path` names the base checkpoint.
# NOTE(review): `config` and `model` rebind names used earlier in the notebook
# for the LoraConfig and the training model.
config = PeftConfig.from_pretrained(peft_model_id)
# Reload the base model and tokenizer named by the adapter config
# (8-bit loading via load_in_8bit=True is left disabled).
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA adapter weights on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)

In [28]:
# Prompt in the same shape as the training text: free text followed by the
# " ->: " separator, moved to the device the model lives on.
# NOTE(review): the prompt ends with a trailing space after the separator; that
# may tokenize differently from the training texts, where the query follows
# directly — worth comparing against a prompt without it.
batch = tokenizer('“Training models with PEFT and LoRA is cool” ->: ', return_tensors='pt').to(model.device)
print(batch)
model.eval()
# torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast()
# context manager; both enable CUDA mixed precision for the enclosed calls.
with torch.autocast("cuda"):
    output_tokens = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],  max_new_tokens=50)

print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


{'input_ids': tensor([[ 86753,   7508,    386,  20038,   1002,    426, 108045,    530,   9810,
          14062,    632,  35847,    982,  11953,     29,    210]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


 “Training models with PEFT and LoRA is cool” ->:                                                   
