## This script is there to go through the Qwen model and see how it works.

Let's start by importing the necessary libraries.

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Starcoder2ForCausalLM, Qwen2ForCausalLM
from trl.commands.scripts.ppo import device_map

get the model and tokenizer from the checkpoint.

In [4]:
checkpoint = "Qwen/Qwen2.5-3B-Instruct"
device = "cuda"

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

In [9]:
# test_data = dataset["test"]
instruction ="You are a helpful programmer instruction"

messages = [{"role": "system", "content": instruction},
            {"role": "user", "content": "write a function that takes a list of integers and returns the sum of all the integers in the list in java."},
            {"role": "assistant", "content": "write a function that takes a list of integers and returns the sum of all the integers in the list in java."}]

Test the model with a simple input.

In [10]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
print(text)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

<|im_start|>system
You are a helpful programmer instruction<|im_end|>
<|im_start|>user
write a function that takes a list of integers and returns the sum of all the integers in the list in java.<|im_end|>
<|im_start|>assistant
write a function that takes a list of integers and returns the sum of all the integers in the list in java.<|im_end|>
<|im_start|>assistant



In [10]:
generated_ids0 = model.generate(
    **model_inputs,
    max_new_tokens=300
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids0)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)


Certainly! Below is a simple Java function that takes a list of integers and returns the sum of all the integers in the list:

```java
import java.util.ArrayList;
import java.util.Arrays;

public class SumCalculator {

    /**
     * Calculates the sum of all elements in the given list of integers.
     *
     * @param numbers An ArrayList of Integer values.
     * @return The sum of all elements in the list.
     */
    public static int calculateSum(ArrayList<Integer> numbers) {
        return numbers.stream().mapToInt(Integer::intValue).sum();
    }

    public static void main(String[] args) {
        // Example usage:
        ArrayList<Integer> exampleList = new ArrayList<>(Arrays.asList(1, 2, 3, 4, 5));
        System.out.println("The sum of the list is: " + calculateSum(exampleList));
        
        // Additional test cases
        ArrayList<Integer> emptyList = new ArrayList<>();
        System.out.println("The sum of the empty list is: " + calculateSum(emptyList));
        


In [9]:
#print(generated_ids)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

Certainly! Below is a simple Java function that takes a list of integers and returns the sum of all the integers in the list:

```java
import java.util.ArrayList;
import java.util.Arrays;

public class SumCalculator {
    /**
     * Calculates the sum of all elements in the given list.
     * 
     * @param numbers An ArrayList of Integers.
     * @return The sum of all elements in the list.
     */
    public static int calculateSum(ArrayList<Integer> numbers) {
        return numbers.stream().mapToInt(Integer::intValue).sum();
    }

    public static void main(String[] args) {
        // Example usage:
        ArrayList<Integer> exampleList = new ArrayList<>(Arrays.asList(1, 2, 3, 4, 5));
        System.out.println("The sum of the list is: " + calculateSum(exampleList));
        
        // Additional test cases
        ArrayList<Integer> emptyList = new ArrayList<>();
        System.out.println("The sum of an empty list is: " + calculateSum(emptyList));
        
        ArrayList<I

### LLMs for software Model Completion

In [1]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [2]:
checkpoint = "D:\\LLM\\thesisPractical\\fine_tuned_models\\gemma_instruct\\Qwen2.5-3B-Instruct-software-model_completion"

In [7]:
instruction = "You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format."

def format_chat_template_to_print(input):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{input}'}
    ] 
    return tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=True)

def format_chat_template(input):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{input}'}
    ] 
    return tokenizer.apply_chat_template(row_json, tokenize=True, return_tensors="pt", add_generation_prompt=True).to("cuda")

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
org_path = "D:\\LLM\\thesisPractical\\datasets_for_fine_tuning\\structural_removal_non_contiguous\\processed_4000"

test_dataset_url = org_path + "\\test.jsonl"

data_files = {
    'test': test_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
test_dataset = dataset['test']

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
data = test_dataset[0]['input']
text_to_print = format_chat_template_to_print(data)
print(text_to_print)
input_ids = format_chat_template(data)

<|im_start|>system
You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format.<|im_end|>
<|im_start|>user
Here is the incomplete UML model:
{"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"isSingleExecution":false,"isReadOnly":false,"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Activity","name":"Activity","id":2,"isActive":false,"isReentrant":true,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Activity"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow","name":"ControlFlow","id":3,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow2","name":"ControlFlow2","id":4,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow3","name":"Contro

In [9]:
output = test_dataset[0]['output']
print(output)

{"nodes":[{"visibility":"PUBLIC_LITERAL","id":1,"eClass":"PackageImport"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow10","name":"ControlFlow10","id":10,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow18","name":"ControlFlow18","id":17,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"DecisionNode","name":"DecisionNode","id":25,"isLeaf":false,"eClass":"DecisionNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode","name":"ActivityFinalNode","id":26,"isLeaf":false,"eClass":"ActivityFinalNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"DecisionNode2","name":"DecisionNode2","id":35,"isLeaf":false,"eClass":"DecisionNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode4","name":"ActivityFinalNode4","id":39,"isLeaf":false,"eClass":"ActivityFinalNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode5","name":"ActivityFinalNode5","i

In [10]:
outputs = model.generate(input_ids=input_ids, max_length=3500)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


system
You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format.
user
Here is the incomplete UML model:
{"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"isSingleExecution":false,"isReadOnly":false,"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Activity","name":"Activity","id":2,"isActive":false,"isReentrant":true,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Activity"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow","name":"ControlFlow","id":3,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow2","name":"ControlFlow2","id":4,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow3","name":"ControlFlow3","id":5,"isLeaf":false,"eCl