In [None]:
%%capture
# Install the notebook's dependencies into the kernel's environment.
# Comment the %pip lines out if the packages are already installed.
# transformers is pinned because of the AttributeError described on its line below.
%pip install --upgrade accelerate peft bitsandbytes trl huggingface_hub
%pip install "transformers==4.38.2" # Bug occured in v4.39.1 - AttributeError: 'torch.dtype' object has no attribute 'itemsize'
%pip install flash-attn --no-build-isolation #Nvidia download guide - https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2

In [None]:
from os import path,chdir
import sys

# Make relative paths (the `scripts` package, "../dataset.jsonl") resolve from this
# file's folder when the code is run as a script.
# `sys.argv[0]` only names this file for script runs; inside a Jupyter kernel it is
# typically the kernel launcher, so using it would chdir into the launcher's install
# directory. `__file__` is defined for script runs only, so in a notebook the working
# directory chosen by the kernel is left untouched.
if "__file__" in globals():
    chdir(path.dirname(path.realpath(__file__)))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from scripts.improve_result import improve_result, generate_constraints
from scripts.jsonl_parser import read_jsonl, write_jsonl
from huggingface_hub import login

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): called without arguments, so the access token is presumably requested
# interactively rather than read from this notebook — confirm.
login()

In [None]:
# Quantization settings used when loading the model: 4-bit NF4 weights with double
# quantization, computing in float16.
compute_dtype = torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    # `attn_implementation` is deliberately not set here: it is an argument of
    # `from_pretrained()`, not a BitsAndBytesConfig option, so passing it to this
    # constructor had no effect on the attention kernel.
)
model_name = "Tony177/codellama-13b-dockerfile-generation"

In [None]:
import importlib.util

# FlashAttention-2 is requested only when it can actually run: the flash-attn package
# must be importable and the GPU must be Ampere or newer (compute capability >= 8.0).
# Otherwise the library's default attention implementation is used, as before.
flash_attention_kwargs = {}
if (
    importlib.util.find_spec("flash_attn") is not None
    and torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
):
    flash_attention_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
    torch_dtype=compute_dtype,  # explicit half precision; FlashAttention-2 only supports fp16/bf16
    **flash_attention_kwargs,
)
model.config.use_cache = True
model.config.pretraining_tp = 1 # Setting config.pretraining_tp to a value different than 1 will activate the more accurate but slower computation of the linear layers, which should better match the original logits.
model.enable_input_require_grads() # Warning about gradients during generation

In [None]:
# Load the tokenizer from Hugging Face and set padding_side to "right" to fix the issue with fp16.
# `device_map` is a model-loading option; a tokenizer has no device placement, so it is
# not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def return_forced_words_ids(prompt: str) -> list:
    """Build the ``force_words_ids`` constraint groups for a prompt.

    ``generate_constraints(prompt)`` supplies an image name and a ports string.
    Each group holds the token ids of two spellings (upper/lower case, or the two
    fence spellings) produced by the notebook-level ``tokenizer`` without special
    tokens.

    Groups, in order:
      1. ``FROM <image>`` / ``from <image>`` (bare ``FROM`` / ``from`` when the
         image name is the empty string),
      2. ``EXPOSE <ports>`` / ``expose <ports>`` (only when ports is not ``""``),
      3. the opening Markdown fence with ``dockerfile`` / ``Dockerfile``.

    Args:
        prompt: Text handed to ``generate_constraints``.

    Returns:
        A list with one entry per group; each entry is the tokenizer's
        ``input_ids`` for that group's alternatives.
    """

    def _alternatives(first: str, second: str):
        # Token ids for both spellings, without BOS/EOS markers.
        return tokenizer([first, second], add_special_tokens=False).input_ids

    image_name, ports = generate_constraints(prompt)

    from_group = (
        _alternatives("FROM", "from")
        if image_name == ""
        else _alternatives(f"FROM {image_name}", f"from {image_name}")
    )
    groups = [from_group]

    if ports != "":
        groups.append(_alternatives(f"EXPOSE {ports}", f"expose {ports}"))

    groups.append(_alternatives("```dockerfile", "```Dockerfile"))
    return groups

In [None]:
# Strings the model must not emit, tokenized (without special tokens) into the id
# sequences passed to `generate(bad_words_ids=...)`.
# The backslash in "\\begin(code)" is escaped: written as "\begin(code)" the literal
# starts with the "\b" backspace character and never matches the intended text.
# NOTE(review): LaTeX-style markers are usually spelled \begin{code} / \end{code};
# confirm that the parentheses are intended.
bad_words = ["apk", "\\begin(code)", "\\end(code)", "EOF", "exit", "ONBUILD"]
bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

In [None]:
def generate_text(tokenizer, model, prompt: str) -> str:
    """Generate a response for ``prompt`` using constrained beam search.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template, encoded, and
    generated with 5 beams and at most 512 new tokens. Generation is constrained by
    the notebook-level ``bad_words_ids`` and by ``return_forced_words_ids``. The
    first decoded sequence is post-processed by ``improve_result``.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``generate`` and ``device``.
        prompt: Request text without the instruction markers.

    Returns:
        The value returned by ``improve_result`` for the first decoded sequence.
    """
    prompt = "<s>[INST] " + prompt + " [/INST]"
    # Place the inputs on the model's own device instead of hard-coding "cuda".
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Constrained beam search (num_beams > 1 with force_words_ids); no sampling is requested.
    # The tensors are passed explicitly (the original note reports a KeyError: 'shape'
    # otherwise), and the attention mask is supplied because it cannot be inferred when
    # the pad token equals the EOS token.
    # NOTE(review): the constraints are derived from the wrapped prompt (including the
    # [INST] markers) — confirm generate_constraints expects that form.
    gen_tokens = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        bad_words_ids=bad_words_ids,
        force_words_ids=return_forced_words_ids(prompt),
        max_new_tokens=512,
        num_beams=5,
        pad_token_id=tokenizer.eos_token_id,
    )
    # batch_decode returns one string per sequence; only the first is used.
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return improve_result(result)

In [None]:
def generate_text_INST(tokenizer, model, prompt: str) -> str:
    """Generate a response for ``prompt`` with unconstrained beam search.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template, encoded, and
    generated with 5 beams, at most 512 new tokens and ``no_repeat_ngram_size=2``.
    No forced or banned word lists are applied. The first decoded sequence is
    post-processed by ``improve_result``.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``generate`` and ``device``.
        prompt: Request text without the instruction markers.

    Returns:
        The value returned by ``improve_result`` for the first decoded sequence.
    """
    prompt = "<s>[INST] " + prompt + " [/INST]"
    # Place the inputs on the model's own device instead of hard-coding "cuda".
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Beam search (num_beams > 1); sampling is not requested here.
    # The tensors are passed explicitly (the original note reports a KeyError: 'shape'
    # otherwise), and the attention mask is supplied because it cannot be inferred when
    # the pad token equals the EOS token.
    gen_tokens = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=512,
        num_beams=5,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    )
    # batch_decode returns one string per sequence; only the first is used.
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return improve_result(result)

In [None]:
def generate_text_FORCED(tokenizer, model, prompt: str) -> str:
    """Generate a response for ``prompt`` using constrained beam search.

    The prompt is wrapped in the ``<s>[INST] ... [/INST]`` template, encoded, and
    generated with 5 beams, at most 512 new tokens and ``early_stopping=False``
    set explicitly. Generation is constrained by the notebook-level
    ``bad_words_ids`` and by ``return_forced_words_ids``. The first decoded
    sequence is post-processed by ``improve_result``.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``generate`` and ``device``.
        prompt: Request text without the instruction markers.

    Returns:
        The value returned by ``improve_result`` for the first decoded sequence.
    """
    prompt = "<s>[INST] " + prompt + " [/INST]"
    # Place the inputs on the model's own device instead of hard-coding "cuda".
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Constrained beam search (num_beams > 1 with force_words_ids); no sampling is requested.
    # The tensors are passed explicitly (the original note reports a KeyError: 'shape'
    # otherwise), and the attention mask is supplied because it cannot be inferred when
    # the pad token equals the EOS token.
    # NOTE(review): the constraints are derived from the wrapped prompt (including the
    # [INST] markers) — confirm generate_constraints expects that form.
    gen_tokens = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        bad_words_ids=bad_words_ids,
        force_words_ids=return_forced_words_ids(prompt),
        max_new_tokens=512,
        num_beams=5,
        early_stopping=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    # batch_decode returns one string per sequence; only the first is used.
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0]
    return improve_result(result)

In [None]:
# Spot-check the generator with a WordPress 5.7 request.
print(
    generate_text(
        tokenizer=tokenizer,
        model=model,
        prompt="Generate a dockerfile of Wordpress 5.7",
    )
)

In [None]:
# Spot-check the generator with a Python 3.7 request.
print(
    generate_text(
        tokenizer=tokenizer,
        model=model,
        prompt="Generate a dockerfile of Python 3.7",
    )
)

In [None]:
# Spot-check the generator with a Ruby 3.2.1 request.
print(
    generate_text(
        tokenizer=tokenizer,
        model=model,
        prompt="Generate a dockerfile of Ruby 3.2.1",
    )
)

In [None]:
from tqdm import tqdm

# Read the two lists stored in the dataset file, run generate_text on every entry of
# the first list (with a progress bar), and write all three lists to a new file.
input_list, output_list = read_jsonl("../dataset.jsonl")
codellama_output_list = []
# Results are appended one at a time, so answers produced before a failure stay in the list.
codellama_output_list.extend(
    generate_text(tokenizer, model, request) for request in tqdm(input_list)
)
write_jsonl(input_list, output_list, codellama_output_list, "../dataset_llm.jsonl")