In [1]:
%cd "AISocIMP23/Week 5/"

#imports
import os
import sys
from typing import List
import json
import warnings

import torch
import transformers
from datasets import load_dataset
import pandas as pd

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM

from utils.prompter import Prompter
warnings.filterwarnings('ignore')

/data/storage_elm_medium/AISocIMP23/Week 5


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameters and paths shared by the cells below.

# --- paths ---
# NOTE(review): unlike the other paths in this notebook, `base_model` has no
# leading "/", so it is resolved relative to the current working directory.
# Confirm that this is intended.
base_model = "data/storage_elm_medium/erasmian-lm/loopv3/checkpoint_3500000"
data_path = "/datasets"
output_dir = "/data/volume_2/CHAT_ELM_7M"

# --- training ---
batch_size, micro_batch_size = 32, 4
num_epochs = 3
learning_rate = 2e-5
cutoff_len = 256  # maximum number of tokens kept per tokenized prompt
val_set_size = 0

# --- LoRA ---
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]

# --- prompt / tokenization ---
# When False, the prompt part of each sample is masked out of the loss.
train_on_inputs = False
add_eos_token = False
# True = faster, but produces an odd training loss curve.
group_by_length = True
# Either a training checkpoint or a final adapter.
resume_from_checkpoint = None
# Name of the prompt template handed to the prompter.
prompt_template_name = "alpaca"


In [3]:
class CustomPrompter(Prompter):
    """Prompter whose response extraction also cuts off a follow-on instruction."""

    def get_response(self, output: str) -> str:
        """Return the text that follows the template's response marker.

        The text after the first response marker is stripped of surrounding
        whitespace and then truncated at the first "### Instruction:" marker,
        if one is present.

        Raises:
            IndexError: if the response marker does not occur in ``output``.
        """
        segments = output.split(self.template["response_split"])
        answer = segments[1].strip()
        before_next_instruction, _, _ = answer.partition("### Instruction:")
        return before_next_instruction


prompter = CustomPrompter(prompt_template_name)

In [4]:
# Number of micro-batches that fit into one full batch.
gradient_accumulation_steps = batch_size // micro_batch_size

# WORLD_SIZE is read from the environment (defaulting to 1); any value other
# than 1 switches the `ddp` flag on.
world_size = int(os.getenv("WORLD_SIZE", 1))
ddp = world_size != 1

device_map = "auto"

In [5]:
from transformers import LlamaTokenizer, LlamaForCausalLM

# Locations of the saved tokenizer and of the model checkpoint to load.
token_path = "/data/storage_elm_medium/AISocIMP23/Week 4/Token"
model_path = "/data/storage_elm_medium/CHAT_ELM_7Mv3"

# Tokenizer: token id 0 is used for padding.
tokenizer = LlamaTokenizer.from_pretrained(token_path)
tokenizer.pad_token_id = 0

# Model: keep its configured pad id in sync with the tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_path)
model.config.pad_token_id = 0
# tokenizer.padding_side = "right"  # Allow batched inference

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` and attach language-modelling labels.

    The prompt is truncated to ``cutoff_len`` tokens and is not padded. When
    ``add_eos_token`` is true, the EOS token is appended unless the sequence
    already ends with it or has already reached ``cutoff_len``.

    Args:
        prompt: Text to tokenize.
        add_eos_token: Whether to append the tokenizer's EOS token.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding a copy
        of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # Guard against an empty id list: indexing [-1] on it would raise.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels start as a copy so callers can mask them without touching the ids.
    result["labels"] = input_ids.copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Build and tokenize the full training prompt for one record.

    Args:
        data_point: Mapping with ``"instruction"``, ``"input"`` and
            ``"output"`` entries.

    Returns:
        The tokenized full prompt. When ``train_on_inputs`` is false, the
        labels covering the prompt part (everything before the response) are
        replaced by ``-100``.
    """
    full_prompt = prompter.generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if train_on_inputs:
        return tokenized_full_prompt

    # Tokenize the prompt without the response to find how many leading
    # tokens have to be masked.
    user_prompt = prompter.generate_prompt(
        data_point["instruction"], data_point["input"]
    )
    user_ids = tokenize(user_prompt, add_eos_token=add_eos_token)["input_ids"]
    user_prompt_len = len(user_ids)

    # Discount the EOS token only when it is really there: `tokenize` skips it
    # once the prompt has reached `cutoff_len`, and subtracting regardless
    # would leave the last prompt token unmasked.
    if add_eos_token and user_ids and user_ids[-1] == tokenizer.eos_token_id:
        user_prompt_len -= 1

    full_labels = tokenized_full_prompt["labels"]
    tokenized_full_prompt["labels"] = (
        [-100] * user_prompt_len + full_labels[user_prompt_len:]
    )
    return tokenized_full_prompt

In [7]:
# One-off sanity check: generate a response for a single instruction.
instruction = "Explain what is economic growth"

# Move the model to the GPU and build the prompt. The prompt has no extra
# input, so None is passed directly instead of binding the name `input`,
# which would shadow the builtin for the rest of the notebook.
model = model.to("cuda:0")
prompt = prompter.generate_prompt(instruction, None)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")
input_ids = inputs["input_ids"]

# Play around with generation strategies for better/diverse sequences:
# https://huggingface.co/docs/transformers/generation_strategies
temperature = 0.2
top_p = 0.95
top_k = 25
num_beams = 1
# num_beam_groups = num_beams  # see: 'Diverse beam search decoding'
max_new_tokens = 256
repetition_penalty = 2.0
do_sample = True  # allows 'beam sample': do_sample=True, num_beams > 1
num_return_sequences = 1  # generating multiple candidates takes longer

generation_config = transformers.GenerationConfig(
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
    num_beams=num_beams,
    repetition_penalty=repetition_penalty,
    do_sample=do_sample,
    min_new_tokens=32,
    num_return_sequences=num_return_sequences,
    pad_token_id=0,
    # num_beam_groups=num_beam_groups
)

# Single source of truth for the arguments handed to `generate`.
generate_params = {
    "input_ids": input_ids,
    "generation_config": generation_config,
    "return_dict_in_generate": True,
    "output_scores": True,
    "max_new_tokens": max_new_tokens,
}
with torch.no_grad():
    generation_output = model.generate(**generate_params)

print(f'Instruction: {instruction}')

for i, s in enumerate(generation_output.sequences):
    output = tokenizer.decode(s, skip_special_tokens=True)
    print(f'Output {i}: {prompter.get_response(output)}')

Instruction: Explain what is economic growth
Output 0: Economic growth refers to changes in GDP, employment and income levels over time as well as its determinants such as population size or age distribution. It can be defined by two main components of economic development; i) The rate at which people are able to work (i.e., their ability to perform tasks), including working hours per week, number of days on job, and other activities like hiring new employees for jobs or training, while 2) A change in consumption patterns due to changing environmental conditions, eg, increasing energy expenditure, decreasing food prices, etcetera.


In [8]:
def evaluate(
    instruction,
    input_text=None,
    temperature=0.2,
    top_p=0.95,
    top_k=25,
    num_beams=1,
    max_new_tokens=256,
):
    """Generate a response to ``instruction`` with the loaded model.

    The instruction is wrapped in the prompt template, tokenized, moved to the
    device the model currently lives on and passed to ``model.generate`` with
    sampling enabled. The instruction is also printed.

    Args:
        instruction: The instruction text to answer.
        input_text: Optional extra input handed to the prompt template.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Number of highest-probability tokens kept when sampling.
        num_beams: Number of beams.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The extracted response of the first generated sequence, prefixed with
        a single space.
    """
    prompt = prompter.generate_prompt(instruction, input_text)
    # Follow the model's device instead of assuming "cuda:0", so this function
    # does not depend on another cell having moved the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]

    # Play around with generation strategies for better/diverse sequences:
    # https://huggingface.co/docs/transformers/generation_strategies
    generation_config = transformers.GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
        num_beams=int(num_beams),
        repetition_penalty=2.0,
        do_sample=True,  # allows 'beam sample' when num_beams > 1
        min_new_tokens=32,
        num_return_sequences=1,
        pad_token_id=0,
    )

    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=int(max_new_tokens),
        )

    print(f'Instruction: {instruction}')

    # Exactly one sequence is requested, so only the first one is decoded.
    first_sequence = generation_output.sequences[0]
    output = tokenizer.decode(first_sequence, skip_special_tokens=True)
    return f' {prompter.get_response(output)}'

In [10]:
import gradio as gr

In [None]:
# Builds a Gradio interface around `evaluate` with eight input components
# (two text boxes, five sliders, one checkbox) and one output text box, then
# queues and launches it on all network interfaces with a public share link.
#
# NOTE(review): eight input components are declared here. Confirm that the
# function passed as `fn` accepts one argument per component before running
# this cell; otherwise submitting the form is expected to fail.
# NOTE(review): the title and description below describe the 7B Alpaca-LoRA
# LLaMA model; they look carried over from the upstream demo - confirm they
# match the model actually loaded in this notebook.
gr.Interface(
    fn=evaluate,
    inputs=[
            gr.components.Textbox(
                lines=2,
                label="Instruction",
                placeholder="Tell me about alpacas.",
            ),
            gr.components.Textbox(lines=2, label="Input", placeholder="none"),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.1, label="Temperature"
            ),
            gr.components.Slider(
                minimum=0, maximum=1, value=0.75, label="Top p"
            ),
            gr.components.Slider(
                minimum=0, maximum=100, step=1, value=40, label="Top k"
            ),
            gr.components.Slider(
                minimum=1, maximum=4, step=1, value=4, label="Beams"
            ),
            gr.components.Slider(
                minimum=1, maximum=2000, step=1, value=128, label="Max tokens"
            ),
            gr.components.Checkbox(label="Stream output"),
        ],
        outputs=[
            gr.components.Textbox(
                lines=5,
                label="Output",
            )
        ],
    title="🦙🌲 Alpaca-LoRA",
    description="Alpaca-LoRA is a 7B-parameter LLaMA model finetuned to follow instructions. It is trained on the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset and makes use of the Huggingface LLaMA implementation. For more information, please visit [the project's website](https://github.com/tloen/alpaca-lora).",  # noqa: E501
    ).queue().launch(server_name="0.0.0.0", share=True)
     # Old testing code follows.

In [11]:
# ELM demo: a single instruction box wired to `evaluate`, with one output box.
# The app is queued and served on all network interfaces with a public share
# link enabled.
gr.Interface(
    title="🌲 ELM - Erasmian Language Model",
    description="ELM is a 900M parameter language model finetuned to follow instruction. It is trained on Erasmus University academic outputs and the [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) dataset. For more information, please visit [the GitHub repository](https://github.com/Joaoffg/ELM).",  # noqa: E501
    fn=evaluate,
    inputs=[
        gr.components.Textbox(
            label="Instruction",
            placeholder="Explain economic growth.",
            lines=2,
        ),
    ],
    outputs=[
        gr.components.Textbox(label="Output", lines=5),
    ],
).queue().launch(share=True, server_name="0.0.0.0")

Running on local URL:  http://0.0.0.0:7860
Running on public URL: https://fc37f2edaf8ee7bf98.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Instruction: What is academic freedom?
Instruction: What is life
