## This script is there to go through the Qwen model and see how it works.

Let's start by importing the necessary libraries.

In [166]:
from lamini import batch_size
from sympy.physics.units import temperature
from transformers import AutoModelForCausalLM, AutoTokenizer, Starcoder2ForCausalLM, Qwen2ForCausalLM

get the model and tokenizer from the checkpoint.

In [167]:
checkpoint = "Qwen/Qwen2.5-3B-Instruct"
device = "cuda"

In [168]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
print(tokenizer.pad_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<|endoftext|>


In [9]:
# test_data = dataset["test"]
instruction ="You are a helpful programmer instruction"

messages = [{"role": "system", "content": instruction},
            {"role": "user", "content": "write a function that takes a list of integers and returns the sum of all the integers in the list in java."},
            {"role": "assistant", "content": "write a function that takes a list of integers and returns the sum of all the integers in the list in java."}]

Test the model with a simple input.

In [10]:
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
print(text)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

<|im_start|>system
You are a helpful programmer instruction<|im_end|>
<|im_start|>user
write a function that takes a list of integers and returns the sum of all the integers in the list in java.<|im_end|>
<|im_start|>assistant
write a function that takes a list of integers and returns the sum of all the integers in the list in java.<|im_end|>
<|im_start|>assistant



In [10]:
generated_ids0 = model.generate(
    **model_inputs,
    max_new_tokens=300
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids0)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)


Certainly! Below is a simple Java function that takes a list of integers and returns the sum of all the integers in the list:

```java
import java.util.ArrayList;
import java.util.Arrays;

public class SumCalculator {

    /**
     * Calculates the sum of all elements in the given list of integers.
     *
     * @param numbers An ArrayList of Integer values.
     * @return The sum of all elements in the list.
     */
    public static int calculateSum(ArrayList<Integer> numbers) {
        return numbers.stream().mapToInt(Integer::intValue).sum();
    }

    public static void main(String[] args) {
        // Example usage:
        ArrayList<Integer> exampleList = new ArrayList<>(Arrays.asList(1, 2, 3, 4, 5));
        System.out.println("The sum of the list is: " + calculateSum(exampleList));
        
        // Additional test cases
        ArrayList<Integer> emptyList = new ArrayList<>();
        System.out.println("The sum of the empty list is: " + calculateSum(emptyList));
        


In [9]:
#print(generated_ids)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

Certainly! Below is a simple Java function that takes a list of integers and returns the sum of all the integers in the list:

```java
import java.util.ArrayList;
import java.util.Arrays;

public class SumCalculator {
    /**
     * Calculates the sum of all elements in the given list.
     * 
     * @param numbers An ArrayList of Integers.
     * @return The sum of all elements in the list.
     */
    public static int calculateSum(ArrayList<Integer> numbers) {
        return numbers.stream().mapToInt(Integer::intValue).sum();
    }

    public static void main(String[] args) {
        // Example usage:
        ArrayList<Integer> exampleList = new ArrayList<>(Arrays.asList(1, 2, 3, 4, 5));
        System.out.println("The sum of the list is: " + calculateSum(exampleList));
        
        // Additional test cases
        ArrayList<Integer> emptyList = new ArrayList<>();
        System.out.println("The sum of an empty list is: " + calculateSum(emptyList));
        
        ArrayList<I

### LLMs for software Model Completion

In [24]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [25]:
checkpoint = "D:\\LLM\\thesisPractical\\fine_tuned_models\\Qwen2.5-3B-Instruct-software-model_completion"

In [26]:
instruction = "Given an incomplete UML model represented in JSON format. Produce only one missing node."

def format_chat_template_to_print(input):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{input}'}
    ] 
    return tokenizer.apply_chat_template(row_json, tokenize=False, add_generation_prompt=True)

def format_chat_template(input):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{input}'}
    ] 
    return tokenizer.apply_chat_template(row_json, tokenize=True, return_tensors="pt", add_generation_prompt=True).to("cuda")

In [27]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
org_path = "D:\\LLM\\thesisPractical\\datasets_for_fine_tuning\\structural_removal_non_contiguous\\processed_4000"

test_dataset_url = org_path + "\\test.jsonl"

data_files = {
    'test': test_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
test_dataset = dataset['test']

In [29]:
data = test_dataset[2]['input']
text_to_print = format_chat_template_to_print(data)
print(text_to_print)
input_ids = format_chat_template(data)

<|im_start|>system
Given an incomplete UML model represented in JSON format. Produce only one missing node.<|im_end|>
<|im_start|>user
Here is the incomplete UML model:
{"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Bank Customer","name":"Bank Customer","id":2,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Actor"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Insert Card","name":"Insert Card","id":3,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Insert Pin","name":"Insert Pin","id":4,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Display Menu","name":"Display Menu","id":5,"isLeaf":false,"isAbstract":false,"isFinalSp

In [30]:
output = test_dataset[2]['output']
print(output)

{"nodes":[{"visibility":"PUBLIC_LITERAL","id":1,"eClass":"PackageImport"},{"isDerived":false,"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Bank Customer_Display Menu","name":"Bank Customer_Display Menu","id":8,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Association"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Deposit Cheque/Cash","name":"Deposit Cheque/Cash","id":9,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::ATM Bank Server","name":"ATM Bank Server","id":10,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Actor"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::deposits cash/cheque ","name":"deposits cash/cheque ","id":14,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":null,"name":null,"id":22,"value":1,"eClass":"LiteralInteger"},{

In [35]:
outputs = model.generate(input_ids=input_ids, max_new_tokens=80, do_sample=True)
# print only assistant text
decoded_output = tokenizer.decode(outputs, skip_special_tokens=True)
print(text.split("assistant")[1])


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

In [39]:
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text.split("assistant\n")[1])

{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Enter pin","name":"Enter pin","id":8,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Deposit Cash","name":"Deposit Cash","id":9,"isLeaf":false,"isAbstract":false,"isFinal


In [None]:
outputs = model.generate(input_ids=input_ids, max_new_tokens=50, temperature=0.6)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

system
Given an incomplete UML model represented in JSON format. Produce only one missing node.
user
Here is the incomplete UML model:
{"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Bank Customer","name":"Bank Customer","id":2,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Actor"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Insert Card","name":"Insert Card","id":3,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Insert Pin","name":"Insert Pin","id":4,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Display Menu","name":"Display Menu","id":5,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseC

## Evaluate LLM

In [1]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser
import torch
from dataclasses import dataclass, field

In [2]:
@dataclass
class ScriptArguments:
    """
    Arguments for evaluation
    """
    checkpoint: str = field(
        default= "D:\\LLM\\thesisPractical\\fine_tuned_models\\Qwen2.5-3B-Instruct-software-model_completion"
    )
    test_dataset: str = field(
        default="D:\\LLM\\thesisPractical\\datasets_for_fine_tuning\\structural_removal_one\\test.jsonl"
    )
    max_new_tokens: int = field(
        default=100,
        metadata={"help": "Maximum number of tokens to generate"}
    )
    device: str = field(
        default="cuda",
        metadata={"help": "Device to run the model on"}
    )
    max_length_input: int = field(
        default=4100
    )
    max_length_output: int = field(
        default=4100
    )
    
parser = HfArgumentParser(ScriptArguments)
# Parse the arguments, ignoring unrecognized ones
script_args, remaining_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)    


In [3]:
checkpoint = "D:\\LLM\\thesisPractical\\fine_tuned_models\\Qwen2.5-3B-Instruct-software-model_completion"

In [33]:
org_path = "D:\\LLM\\thesisPractical\\datasets_for_fine_tuning\\structural_removal_one"

test_dataset_url = org_path + "\\test.jsonl"

data_files = {
    'test': test_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
test_dataset = dataset['test']

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
instruction2 = "You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format."


def compute_length(row):
    # Tokenize the "output" text
    tokens_output = tokenizer(row['output'], truncation=False)  # En
    
    tokens_input= tokenizer(row['input'], truncation=False) 
    
    # Compute the length (number of tokens)
    return {
        'output_length': len(tokens_output['input_ids']),
        'input_length': len(tokens_input['input_ids'])
    }

In [7]:
# Apply the function to the dataset
test_dataset = test_dataset.map(compute_length)
max_length = test_dataset['output_length']
max_length_output = max(max_length)
max_length = test_dataset['input_length']
max_length_input = max(max_length)
print(max_length_input)
print(max_length_output)


3709
183


In [9]:
print(tokenizer.pad_token)

<|endoftext|>


In [35]:
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction2},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{row["input"]}'}
    ]
    tokenized_input  = tokenizer.apply_chat_template(row_json, tokenize=True, padding=True, padding_side="right", add_generation_prompt=True ,max_length=4100)
    print(len(tokenized_input))
    tokenized_output = tokenizer(row['output'], truncation=False)
    #print(len(tokenized_output['input_ids']))
    
    row['input_ids'] = tokenized_input
    row['labels'] = tokenized_output['input_ids']
    return row

In [36]:
# Apply the function to the dataset
test_dataset = test_dataset.map(format_chat_template)
test_dataset = test_dataset.remove_columns(["input", "output"])
test_dataset.set_format(type='torch', columns=['input_ids','labels'])

print(test_dataset)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

2815
3461
3436
3085
1546
3221
3105
2393
2829
3101
1652
3495
2930
1701
2273
2157
1973
2953
2868
1876
3088
2536
1972
2836
2738
3565
2188
2307
3123
1829
3603
3101
3765
3414
3101
1712
1472
1810
1510
3492
1547
3219
3169
1109
2769
1771
2842
2887
3388
Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 49
})


In [37]:
#print(type(test_dataset['input_ids']))
print(test_dataset)
#for i in test_dataset['input_ids']:
    #print(len(i))

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 49
})


In [38]:
eval_dataloader = DataLoader(test_dataset, batch_size=1)

In [108]:
#print(encode)
def align_predictions_labels(generated_ids, labels, pad_token_id, max_length):
    """
    Aligns the lengths of predictions and labels by padding or truncating.

    Args:
        generated_ids (Tensor): Tensor of shape (batch_size, pred_seq_length)
        labels (Tensor): Tensor of shape (batch_size, label_seq_length)
        pad_token_id (int): The token ID used for padding.

    Returns:
        Tuple[Tensor, Tensor]: Aligned predictions and labels.
    """
    global true_predictions
    
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    predicted_text = generated_text.split("assistant\n")[1]
    predicted_tokens = tokenizer(predicted_text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)['input_ids']
    
    batch_size, pred_len = predicted_tokens.size()
    _, label_len = labels.size()
    print("prediction length: ")
    print(pred_len)
    print("label length: ")
    print(label_len)

    if pred_len < label_len:
        # Pad predictions
        padding = torch.full((batch_size, label_len - pred_len), pad_token_id, dtype=predicted_tokens.dtype).to( predicted_tokens.device)
        true_predictions = torch.cat([predicted_tokens, padding], dim=1)
    elif pred_len > label_len:
        # Truncate predictions
        true_predictions =  predicted_tokens[:, :label_len]
    else:
        true_predictions = predicted_tokens
        
    
    batch_size, pred_len = true_predictions.size()
    print("true prediction length: ")
    print(pred_len)
    if  pred_len!= label_len:
        raise ValueError("The size of the predictions and labels should match.")

    return true_predictions, labels

In [109]:
import evaluate
device = 'cuda'
metric = evaluate.load("accuracy")
model.eval()
batch1 = next(iter(eval_dataloader))
for batch in eval_dataloader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
         # Generate outputs
        generated_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_length_output,      # Ensure generation does not exceed allocated length
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False                        # Use deterministic generation; set to True for diversity
        ) 
    aligned_preds, aligned_labels = align_predictions_labels(generated_ids, labels, tokenizer.pad_token_id, 4100)
    
    # Convert tensors to lists
    preds = aligned_preds.cpu().tolist()
    refs = aligned_labels.cpu().tolist()
    
    
    # Flatten the lists for overall token-level accuracy
    preds_flat = [token for seq in preds for token in seq]
    refs_flat = [token for seq in refs for token in seq]
    
    metric.add_batch(predictions=preds_flat, references=refs_flat)

metric.compute()    

prediction length: 
183
label length: 
45
true prediction length: 
45
prediction length: 
183
label length: 
47
true prediction length: 
47
prediction length: 
183
label length: 
45
true prediction length: 
45
prediction length: 
183
label length: 
51
true prediction length: 
51
prediction length: 
183
label length: 
103
true prediction length: 
103
prediction length: 
183
label length: 
45
true prediction length: 
45
prediction length: 
183
label length: 
165
true prediction length: 
165
prediction length: 
183
label length: 
70
true prediction length: 
70
prediction length: 
183
label length: 
45
true prediction length: 
45
prediction length: 
183
label length: 
165
true prediction length: 
165
prediction length: 
183
label length: 
98
true prediction length: 
98
prediction length: 
183
label length: 
53
true prediction length: 
53
prediction length: 
183
label length: 
183
true prediction length: 
183
prediction length: 
183
label length: 
101
true prediction length: 
101
prediction

{'accuracy': 0.16079393398751116}

In [92]:
batch_size, pred_len = generated_ids.size()
print(pred_len)
#print(generated_ids)
text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
out = text.split("assistant\n")[1]
inp = text.split("assistant\n")[0]
encode_out = tokenizer(out, return_tensors='pt', padding=True, truncation=True, max_length=4100)['input_ids']
encode_inp = tokenizer(inp + "assistant\n", return_tensors='pt', padding=True, truncation=True, max_length=4100)['input_ids']
batch_size, pred_len = encode_out.size()
print(pred_len)
encode_out = encode_out[:, :10]
batch_size, pred_len = encode_out.size()





_, label_len = labels.size()
if pred_len < label_len:
  # Pad predictions
  padding = torch.full((batch_size, label_len - pred_len), tokenizer.pad_token_id, dtype=encode_out.dtype).to(encode_out.device)
  encode_out = torch.cat([encode_out, padding], dim=1)
elif pred_len > label_len:
  # Truncate predictions
  encode_out = encode_out[:, :label_len]

batch_size, pred_len = encode_out.size()
print(encode_out)
print(pred_len)
print(label_len)
text = tokenizer.decode(encode_out[0], skip_special_tokens=True)
print(text)



2998
183
tensor([[  4913,  20008,  66582,  28176,   3252,  59259,  78047,   2198,  36335,
            675, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643]])
45
45
{"nodes":[{"visibility":"PUBLIC_LITERAL","qualifiedName


In [79]:
print(text)

<|im_start|>system
You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format.<|im_end|>
<|im_start|>user
Here is the incomplete UML model:
{"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"visibility":"PUBLIC_LITERAL","id":1,"eClass":"PackageImport"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Actor","name":"Actor","id":2,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Actor"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::UseCase","name":"UseCase","id":3,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"model::UseCase2","name":"UseCase2","id":4,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"UseCase"},{"isDerived":f