In [1]:
%%capture
# Uncomment if you haven't these packages
%pip install --upgrade accelerate peft bitsandbytes trl huggingface_hub
%pip install "transformers==4.38.2" # Bug occured in v4.39.1 - AttributeError: 'torch.dtype' object has no attribute 'itemsize'
%pip install flash-attn --no-build-isolation #Nvidia download guide - https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2

In [2]:
from os import path,chdir
import sys
chdir(path.dirname(path.realpath(sys.argv[0]))) # change working directory to script location

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from scripts.improve_result import improve_result, generate_constraints
from scripts.jsonl_parser import read_jsonl, write_jsonl
from huggingface_hub import login

In [4]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    attn_implementation="flash_attention_2",
)
model_name = "Tony177/codellama-13b-dockerfile-generation"

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)
model.config.use_cache = True
model.config.pretraining_tp = 1 # Setting config.pretraining_tp to a value different than 1 will activate the more accurate but slower computation of the linear layers, which should better match the original logits.
model.enable_input_require_grads() # Warning about gradients during generation

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Load the tokenizer from Hugginface and set padding_side to “right” to fix the issue with fp16
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [8]:
def return_forced_words_ids(prompt: str) -> list:
    image_name = generate_constraints(prompt)
    forced_words_ids = []

    if image_name != "":
        forced_words_ids.append(tokenizer(f"FROM {image_name}\n", add_special_tokens=True).input_ids)
    else:
        forced_words_ids.append(tokenizer("FROM", add_special_tokens=True).input_ids)
    forced_words_ids.append(tokenizer(["```dockerfile\n", "```Dockerfile\n"], add_special_tokens=True).input_ids)
    forced_words_ids.append(tokenizer(["ARG DEBIAN_FRONTEND noninteractive\n","ARG debian_frontend noninteractive\n"], add_special_tokens=True).input_ids)

    return forced_words_ids

In [9]:
bad_words = ["apk", "\begin(code)", "\\end(code)", "EOF", "exit", "ONBUILD", "alpine", "# FROM", "#FROM"]
bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

In [10]:
def generate_text(tokenizer, model, prompt: str) -> str:
    prompt = "<s>[INST] " + prompt + " [/INST]"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda") # Added last part to avoid crash to KeyError: 'shape'
    # beam-search multinomial sampling if num_beams>1 and do_sample=True
    gen_tokens = model.generate(input_ids, bad_words_ids=bad_words_ids , force_words_ids=return_forced_words_ids(prompt), max_new_tokens=512, no_repeat_ngram_size=7, num_beams=5, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] # One element list, just the response
    return improve_result(prompt, result)

In [11]:
def generate_text_INST(tokenizer, model, prompt: str) -> str:
    prompt = "<s>[INST] " + prompt + " [/INST]"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda") # Added last part to avoid crash to KeyError: 'shape'
    # beam-search multinomial sampling if num_beams>1
    gen_tokens = model.generate(input_ids, max_new_tokens=512, num_beams=5, no_repeat_ngram_size=2, pad_token_id=tokenizer.eos_token_id)
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] # One element list, just the response
    return improve_result(prompt, result)

In [12]:
def generate_text_FORCED(tokenizer, model, prompt: str) -> str:
    prompt = "<s>[INST] " + prompt + " [/INST]"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda") # Added last part to avoid crash to KeyError: 'shape'
    # beam-search multinomial sampling if num_beams>1
    gen_tokens = model.generate(input_ids, bad_words_ids=bad_words_ids , force_words_ids=return_forced_words_ids(prompt), max_new_tokens=512, num_beams=5, early_stopping=False,pad_token_id=tokenizer.eos_token_id)
    result = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] # One element list, just the response
    return improve_result(prompt, result)

In [13]:
print(generate_text(tokenizer, model, "Generate a dockerfile of Wordpress 5.7"))

[INST] Generate a dockerfile of Wordpress 5.7 [/INST] Here' ```dockerfile
 ARG DEBIAN_FRONTEND noninteractive

FROM FROM wordpress:5.7

# Install required packages
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Composer
RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

# Install Node.js
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get install -y nodejs

# Install Yarn
RUN npm install -g yarn

# Install PHP extensions
RUN docker-php-ext-install -j "$(nproc)" pdo pdo_mysql zip

# Install Composer dependencies
# COPY composer.json composer.lock /var/www/html/
WORKDIR /var/www/html
RUN composer install --no-autoloader --no-scripts

# Install NPM dependencies
# COPY package.json package-lock.json ./
RUN yarn install --frozen-lockfile

# Install Bower dependencies
# COPY bower.json bower.json
RUN b

In [14]:
print(generate_text(tokenizer, model, "Generate a dockerfile of Python 3.7"))

ARG DEBIAN_FRONTEND noninteractive
 FROM FROM python:3.7

 # Set the working directory to /app
 WORKDIR /app

 # Copy the current directory contents into the container at /app
 COPY . /app

 # Install any needed packages specified in requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt

 # Make port 5000 available to the world outside this container
 EXPOSE 5000

 # Define environment variable
 ENV NAME World

 # Run app.py when the container launches
 CMD ["python", "app.py"]


In [15]:
print(generate_text(tokenizer, model, "Generate a dockerfile of Ruby 3.2.1"))

ARG DEBIAN_FRONTEND noninteractive
 FROM ruby FROM ruby:3.2.1

 # Install necessary packages
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     libpq-dev \
     libxml2-dev \
     libxslt-dev \
     libcurl4-openssl-dev \
     libffi-dev \
     nodejs \
     yarn \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*

 # Install bundler
 RUN gem install bundler

 # Set working directory
 WORKDIR /app

 # Copy Gemfile and Gemfile.lock
 COPY Gemfile Gemfile.lock ./

 # Install gems
 RUN bundle install

 # Copy the rest of the application code
 COPY . .

 # Expose port
 EXPOSE 3000

 # Start the application
 CMD ["bundle", "exec", "rails", "server", "-b", "0.0.0.0"]


In [16]:
print(generate_text(tokenizer, model, "Generate a dockerfile of uvdesk v1.1.3"))

ARG DEBIAN_FRONTEND noninteractive

FROM FROM ubuntu:20.04

# Install required packages
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    git \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Composer
RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/local/bin --filename=composer

# Install Node.js
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get install -y nodejs

# Install Yarn
RUN npm install -g yarn

# Install uvdesk
RUN git clone https://github.com/uvdesk/core.git /uvdesk \
    && cd /uvdesk && git checkout v1.1.3 \
    && composer install --no-dev --optimize-autoloader \
    && yarn install \
    && yarn build

# Expose uvdesk port
EXPOSE 8080

# Set working directory
WORKDIR /uvdesk

# Start uvdesk
CMD ["php", "bin/console", "server:start", "0.0.0.0:8080"]


In [17]:
from tqdm import tqdm

input_list, output_list = read_jsonl("../dataset.jsonl")
codellama_output_list = []
for e in tqdm(input_list):
    codellama_output_list.append(generate_text(tokenizer, model, e))
write_jsonl(input_list, output_list, codellama_output_list, "../dataset_llm.jsonl")

100%|██████████| 80/80 [3:51:53<00:00, 173.92s/it]  
