In [1]:
%%capture
# Install the packages this notebook relies on (%%capture above hides pip's output).
# Comment the next line out if the packages are already installed.
%pip install accelerate datasets transformers torch bitsandbytes huggingface_hub langchain

In [2]:
from os import path
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    pipeline
)
from requests import get

In [3]:
# Hugging Face Hub identifier of the chat checkpoint; the same ID is reused
# further down when the text-generation pipeline is built.
model = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer from the Hugging Face Hub. A tokenizer has no device
# placement, so the `device_map` argument the original call passed is dropped.
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right; kept from the original notebook as its workaround for the fp16 issue.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [4]:
import tarfile
import warnings
from io import BytesIO

# Fetch and unpack the Dockerfile corpus only when the target directory is not
# already present, so re-running the cell does not download the archive again.
if not path.exists('deduplicated-sources'):
    # The archive is buffered fully in memory below (BytesIO), so a streamed
    # response brings nothing; the timeout keeps a stalled connection from
    # blocking the cell indefinitely.
    response = get(
        "https://github.com/jjhenkel/binnacle-icse2020/raw/00a8fb5220377211b059b35539be4594b703d106/datasets/0b-deduplicated-dockerfile-sources/github.tar.xz",
        timeout=60,
    )
    if response.status_code == 200:
        # Extract the tar.xz file into the current working directory.
        with tarfile.open(fileobj=BytesIO(response.content), mode='r:xz') as tar:
            # Use the "data" extraction filter when the running Python's tarfile
            # module provides it; otherwise fall back to the default extraction.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(filter="data")
            else:
                tar.extractall()
    else:
        # Report the failure instead of silently continuing without the dataset.
        warnings.warn(
            f"Dataset download failed with HTTP status {response.status_code}; "
            "the archive was not extracted."
        )

In [5]:
# Sample input for the chain: the "Analyze:" marker that the prompt template
# refers to, immediately followed by a two-stage Dockerfile (a Maven build stage
# and an OpenJDK JRE runtime stage that copies the built jar from stage 0).
dockerfile = "Analyze:" + '''FROM maven:3-jdk-8-slim

COPY ./src/main/resources/settings.xml settings.xml

RUN apt-get update && apt-get install -y git-core && rm -rf /var/lib/apt/lists/*

RUN git clone --depth 1 https://github.com/TelluIoT/ThingML.git

RUN cd ThingML && mvn -s ../settings.xml -DskipTests clean install

FROM openjdk:8-jre-slim

COPY --from=0 /ThingML/compilers/official-network-plugins/target/*-jar-with-dependencies.jar thingml.jar

RUN chmod +x thingml.jar


ENTRYPOINT ["java", "-jar", "thingml.jar"]
CMD ["-h"]'''

In [None]:

# Build the text-generation pipeline for the checkpoint named in `model`,
# reusing the tokenizer configured earlier and keeping everything on the CPU.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cpu",
    max_length=1000,
    # Generation settings are given here, on the pipeline itself: the LangChain
    # wrapper built later receives a ready-made pipeline, and its `model_kwargs`
    # are not forwarded to generation in that case. Temperature only takes
    # effect with sampling-based decoding, so sampling is enabled explicitly.
    do_sample=True,
    temperature=0.1,
    eos_token_id=tokenizer.eos_token_id
)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# HuggingFacePipeline is not imported in any of the earlier cells, so on a fresh
# kernel this cell would fail with NameError. Import it from the same package
# root the notebook uses for PromptTemplate and LLMChain.
from langchain import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain can drive it as an LLM.
# NOTE(review): with a pre-built pipeline, `model_kwargs` appears to be recorded
# on the wrapper but not forwarded to generation; confirm against the installed
# LangChain version. Generation settings are most reliably set on the pipeline.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0.1})

In [None]:
from langchain import LLMChain, PromptTemplate

# Instruction text sent to the model; `{query}` is the single placeholder and is
# filled with the "Analyze:"-prefixed Dockerfile when the chain runs.
template = """
              You are an intelligent bot that analyze Dockerfile.
              You return a brief response starting with "Generate a Dockerfile" followed by the instruction to build the principal service of the Dockerfile given after "Analyze:", with the relative version.
              {query}
           """

# Bind the template to its one input variable.
prompt = PromptTemplate(
    input_variables=["query"],
    template=template,
)

# Chain that renders the prompt and passes it to the wrapped Hugging Face LLM.
llm_chain = LLMChain(
    llm=llm,
    prompt=prompt,
)

In [None]:
# Run the chain on the sample Dockerfile and print the text it returns.
generated_text = llm_chain.run(dockerfile)
print(generated_text)