In [1]:
%cd "AISocIMP23/Week 5/"

#imports
import os
import sys
from typing import List
import json
import warnings

import torch
import transformers
from datasets import load_dataset
import pandas as pd

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer

from utils.prompter import Prompter
warnings.filterwarnings('ignore')

/data/volume_2/AISocIMP23/Week 5


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyper-parameters and paths for the LoRA fine-tuning run.

# --- paths ---
# NOTE(review): base_model is not referenced by any cell visible in this
# notebook; the checkpoint actually loaded is set in the model-loading cell.
base_model = "/erasmian-lm/loopv3/checkpoint_3500000"
data_path = "/data/volume_2/datasets"
output_dir = "/data/volume_2/CHAT_ELM_7Mv3"

# --- training ---
batch_size = 32        # divided by micro_batch_size to get the accumulation steps
micro_batch_size = 4   # per-device batch size handed to the Trainer
num_epochs = 3
learning_rate = 2e-5
cutoff_len = 256       # max_length used when tokenising prompts
val_set_size = 0       # 0 -> no validation split, evaluation disabled

# --- LoRA ---
lora_r = 64
lora_alpha = 128
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]

# --- prompt / data handling ---
train_on_inputs = False  # if False, masks out inputs in loss
add_eos_token = False
group_by_length = True  # True = faster, but produces an odd training loss curve
resume_from_checkpoint = None  # either training checkpoint or final adapter
prompt_template_name = "alpaca"  # name of the prompt template handed to the Prompter


In [3]:
class CustomPrompter(Prompter):
    """Prompter whose response extraction also drops any follow-up turn.

    The model sometimes keeps generating after its answer and starts a new
    "### Instruction:" section; everything from that marker onwards is cut.
    """

    def get_response(self, output: str) -> str:
        """Extract the model's answer from a fully decoded generation.

        Args:
            output: Decoded text containing the prompt followed by the
                generated continuation.

        Returns:
            The text after the template's ``response_split`` marker, truncated
            at the first "### Instruction:" and stripped of surrounding
            whitespace. If the marker is absent, the whole ``output`` is
            treated as the answer instead of raising ``IndexError``.
        """
        pieces = output.split(self.template["response_split"])
        # pieces[1] is the text right after the first response marker; fall
        # back to the raw text when the marker never appeared.
        answer = pieces[1] if len(pieces) > 1 else output
        # Cut a hallucinated next turn, then drop the whitespace left before it.
        return answer.strip().split("### Instruction:")[0].rstrip()

prompter = CustomPrompter(prompt_template_name)

In [4]:
# WORLD_SIZE defaults to 1 when unset; any other value is treated as a
# distributed (DDP) launch.
world_size = int(os.getenv("WORLD_SIZE", 1))
ddp = world_size != 1

# Micro-batches accumulated per optimiser step to reach `batch_size`.
gradient_accumulation_steps = batch_size // micro_batch_size

device_map = "auto"

In [8]:
# LlamaTokenizer / LlamaForCausalLM are already imported in the first cell,
# so they are not re-imported here.

# The tokenizer and the base checkpoint live in separate directories.
# NOTE(review): model_path points at checkpoint_3250000 while the `base_model`
# hyper-parameter names checkpoint_3500000 - confirm which one is intended.
token_path="/data/volume_2/AISocIMP23/Week 4/Token/"
model_path="/data/volume_2/erasmian-lm/loopv3/checkpoint_3250000/"

tokenizer = LlamaTokenizer.from_pretrained(token_path)
model = LlamaForCausalLM.from_pretrained(model_path)

# Use token id 0 for padding on both the tokenizer and the model config.
# NOTE(review): presumably id 0 is <unk> in this vocabulary - confirm.
tokenizer.pad_token_id =  0
model.config.pad_token_id = 0
#tokenizer.padding_side = "right"  # Allow batched inference

In [9]:
def tokenize(prompt, add_eos_token=True):
    """Tokenise ``prompt`` into a training example.

    The text is truncated to ``cutoff_len`` tokens and left unpadded. When
    ``add_eos_token`` is true, an EOS token (with attention-mask value 1) is
    appended, but only if the sequence does not already end in EOS and is
    still shorter than ``cutoff_len``.

    Returns the tokenizer output with an extra ``"labels"`` entry that is a
    copy of the final ``input_ids``.
    """
    encoded = tokenizer(
        prompt,
        max_length=cutoff_len,
        truncation=True,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    ends_with_eos = token_ids[-1] == tokenizer.eos_token_id
    has_room = len(token_ids) < cutoff_len
    if not ends_with_eos and has_room and add_eos_token:
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # Labels start as a plain copy; callers may mask parts of them afterwards.
    encoded["labels"] = list(token_ids)
    return encoded

def generate_and_tokenize_prompt(data_point):
    """Turn one dataset record into a tokenised training example.

    ``data_point`` must provide the keys "instruction", "input" and "output".
    The full prompt (including the expected output) is tokenised; unless
    ``train_on_inputs`` is set, the label positions covering the prompt-only
    part are replaced by -100.
    """
    full_text = prompter.generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    example = tokenize(full_text)
    if train_on_inputs:
        return example

    # Measure how many leading tokens belong to the prompt without the answer.
    prompt_only = prompter.generate_prompt(
        data_point["instruction"], data_point["input"]
    )
    prompt_tokens = tokenize(prompt_only, add_eos_token=add_eos_token)
    n_prompt_tokens = len(prompt_tokens["input_ids"])
    if add_eos_token:
        # Do not count the EOS token added to the prompt-only text.
        n_prompt_tokens -= 1

    masked_prefix = [-100] * n_prompt_tokens
    example["labels"] = masked_prefix + example["labels"][n_prompt_tokens:]
    return example

In [10]:
# Wrap the base model with LoRA adapters, load the dataset, and optionally
# restore adapter weights from a previous run.

# NOTE(review): the model above was loaded with a plain from_pretrained() call
# (no 8-bit flag is passed) - confirm the int8 preparation step is intended.
model = prepare_model_for_int8_training(model)

# LoRA settings come from the hyper-parameter cell.
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

# A .json/.jsonl path is loaded as a single JSON data file; anything else is
# passed to load_dataset() as-is.
if data_path.endswith(".json") or data_path.endswith(".jsonl"):
    data = load_dataset("json", data_files=data_path)
else:
    data = load_dataset(data_path)

# NOTE(review): no seed is passed, so the order differs between runs.
data = data.shuffle()

if resume_from_checkpoint:
    # Check the available weights and load them
    checkpoint_name = os.path.join(
        resume_from_checkpoint, "pytorch_model.bin"
    )  # Full checkpoint
    if not os.path.exists(checkpoint_name):
        checkpoint_name = os.path.join(
            resume_from_checkpoint, "adapter_model.bin"
        )  # only LoRA model - LoRA config above has to fit
        # Rebinding the hyper-parameter here means the later
        # trainer.train(resume_from_checkpoint=...) call receives False.
        resume_from_checkpoint = (
            False  # So the trainer won't try loading its state
        )
    # The two files above have a different name depending on how they were saved, but are actually the same.
    if os.path.exists(checkpoint_name):
        print(f"Restarting from {checkpoint_name}")
        # NOTE(review): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)
    else:
        print(f"Checkpoint {checkpoint_name} not found")

model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

trainable params: 8,388,608 || all params: 911,280,128 || trainable%: 0.9205301138751486


In [11]:
# Tokenise the dataset (holding out a validation split only when
# val_set_size > 0) and assemble the Trainer.
use_validation = val_set_size > 0

if use_validation:
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt)
else:
    train_data = data["train"].shuffle().map(generate_and_tokenize_prompt)
    val_data = None

if not ddp and torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

training_args = transformers.TrainingArguments(
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    fp16=True,
    logging_steps=100,
    optim="adamw_torch",
    # Evaluation-related options are only active when a validation split exists.
    evaluation_strategy="steps" if use_validation else "no",
    save_strategy="steps",
    eval_steps=200 if use_validation else None,
    save_steps=200,
    output_dir=output_dir,
    save_total_limit=3,
    load_best_model_at_end=use_validation,
    ddp_find_unused_parameters=False if ddp else None,
    group_by_length=group_by_length,
)

# Pads each batch dynamically, to a multiple of 8 tokens.
batch_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=batch_collator,
)
model.config.use_cache = False



#if torch.__version__ >= "2" and sys.platform != "win32":
#    model = torch.compile(model)


Map: 100%|██████████| 103705/103705 [02:04<00:00, 833.71 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
import wandb

# Start a Weights & Biases run in the "ELM_Chat" project before training.
wandb.init(project='ELM_Chat')

[34m[1mwandb[0m: Currently logged in as: [33mferreiragoncalves[0m ([33melm-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
#tokenizer.pad_token = tokenizer.eos_token

# Run the fine-tuning; `resume_from_checkpoint` may have been reset to False by
# the checkpoint-loading cell when only adapter weights were found.
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

# Save into the configured output directory rather than a second hard-coded
# copy of the same path, so the two cannot drift apart.
model.save_pretrained(output_dir)

# Switch to inference mode and place the model on the first GPU.
model.eval()
model = model.to("cuda:0")

Step,Training Loss
100,3.7623
200,3.5919
300,3.5249
400,3.5125
500,3.4761
600,3.4827
700,3.4611
800,3.4495
900,3.4257
1000,3.4652


In [43]:
# Sample several candidate answers for one instruction and print them.
instruction = "schrijf een examenvraag over economische groei" #

# Generate a response:
model = model.to("cuda:0")
# No extra context is supplied with the instruction. The value is named
# `prompt_input` so the builtin `input` is not shadowed in the kernel.
prompt_input = None
prompt = prompter.generate_prompt(instruction, prompt_input)
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to("cuda:0")
input_ids = inputs["input_ids"]

#play around with generation strategies for better/diverse sequences. https://huggingface.co/docs/transformers/generation_strategies
temperature=0.4
top_p=0.95
top_k=25
num_beams=1
# num_beam_groups=num_beams #see: 'Diverse beam search decoding'
max_new_tokens=256
repetition_penalty = 2.0
do_sample = True # allow 'beam sample': do_sample=True, num_beams > 1
num_return_sequences = 4 #generate multiple candidates, takes longer..

generation_config = transformers.GenerationConfig(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    num_beams=num_beams,
    repetition_penalty=repetition_penalty,
    do_sample=do_sample,
    min_new_tokens=94,
    num_return_sequences=num_return_sequences,
    pad_token_id = 0
    # num_beam_groups=num_beam_groups
)

# The arguments are passed to generate() directly; the previous unused
# `generate_params` dict duplicating them has been dropped.
with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=max_new_tokens,
    )


print(f'Instruction: {instruction}')

# Each returned sequence contains the prompt as well; get_response() keeps only
# the answer part.
for i,s in enumerate(generation_output.sequences):
  output = tokenizer.decode(s,skip_special_tokens=True)
  # print(output)
  print(f'Output {i}: {prompter.get_response(output)}')

Instruction: schrijf een examenvraag over economische groei
Output 0: Hoe heeft de wereldwijde economie zich ontwikkeld in deze periode?
Output 1: Hoe zijn de financiële en niet-financiële ontwikkeling van Nederland in het afgelopen decennium?
Output 2: Wat was de invloed van het aantal banen op de werkgelegenheid in Nederland?
Output 3: Wat zijn de belangrijkste factoren die het succes van ons land kunnen beïnvloeden?


In [44]:
# Save the trained model through the Trainer to a separate directory.
# NOTE(review): `model` is PEFT-wrapped, so this presumably writes the adapter
# weights rather than a merged full model, despite the directory name - confirm.
trainer.save_model("/data/volume_2/full_model/full_model_chat3")

In [77]:
def evaluate(instruction):
    """Generate one sampled answer to ``instruction`` with the fine-tuned model.

    Used as the callback of the Gradio interface. Relies on the notebook-level
    ``prompter``, ``tokenizer`` and ``model`` objects and runs on "cuda:0".

    Args:
        instruction: The user's instruction text; no additional input/context
            is added to the prompt.

    Returns:
        The extracted answer, prefixed with a single space (kept from the
        original implementation).
    """
    # Build the prompt without extra context and move the tensors to the GPU.
    prompt = prompter.generate_prompt(instruction, None)
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to("cuda:0")
    input_ids = inputs["input_ids"]

    #play around with generation strategies for better/diverse sequences. https://huggingface.co/docs/transformers/generation_strategies
    temperature=0.3
    top_p=0.95
    top_k=25
    num_beams=1
    # num_beam_groups=num_beams #see: 'Diverse beam search decoding'
    max_new_tokens=256
    repetition_penalty = 2.0
    do_sample = True # allow 'beam sample': do_sample=True, num_beams > 1
    num_return_sequences = 1 # only the first sequence is returned below

    generation_config = transformers.GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        do_sample=do_sample,
        min_new_tokens=192,
        num_return_sequences=num_return_sequences,
        pad_token_id = 0
        # num_beam_groups=num_beam_groups
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    print(f'Instruction: {instruction}')

    # The original looped over the sequences but returned on the first
    # iteration; decode the first sequence directly to make that explicit.
    output = tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True)
    return f' {prompter.get_response(output)}'

In [62]:
import gradio as gr

In [79]:
# Serve `evaluate` through a small Gradio web UI: one instruction textbox in,
# one output textbox out.
# NOTE(review): server_name="0.0.0.0" together with share=True publishes the
# demo on a public gradio.live URL (see the cell output) - confirm this
# exposure is intended before running on a shared machine.
gr.Interface(
    fn=evaluate,
    inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder="Explain economic growth.",
            ),
        ],
        outputs=[
            gr.components.Textbox(
                lines=5,
                label="Output",
            )
        ],
    title="🌲 ELM - Erasmian Language Model",
    description="ELM is a 900M parameter language model finetuned to follow instructions. It is trained on Erasmus University academic outputs and the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset. For more information, please visit [the GitHub repository](https://github.com/Joaoffg/ELM).",  # noqa: E501
    ).queue().launch(server_name="0.0.0.0", share=True)
     # Old testing code follows.

Running on local URL:  http://0.0.0.0:7874
Running on public URL: https://de48d9a57be1bc1b63.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Instruction: explain economic growth

