# Model and Data import

In [20]:
# Load the tokenizer and the causal language model for one checkpoint.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single place for the checkpoint id so tokenizer and model cannot drift apart.
MODEL_ID = "m-a-p/OpenCodeInterpreter-DS-6.7B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.66s/it]


In [1]:
import os

def extract_data(file_path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one stripped string per line, in file order.
    """
    with open(file_path, "r") as source:
        return [raw_line.strip() for raw_line in source]


def save_translations(translations, filename):
    """Write each translation on its own line to ``results/<filename>.txt``.

    The ``results`` directory is created when it does not exist yet.

    Args:
        translations: Iterable of items to write; each one is formatted into
            the file followed by a newline.
        filename: Base name of the output file, without the ``.txt`` suffix.
    """
    output_dir = 'results'
    # Opening a file for writing fails with FileNotFoundError when its parent
    # directory is missing, so make sure the directory exists first.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f'{filename}.txt')

    with open(file_path, 'w') as file:
        for item in translations:
            file.write(f"{item}\n")



# Build the benchmark paths with os.path.join: a literal backslash is only a
# path separator on Windows, so the raw-string form fails elsewhere.
python_data_path = os.path.join('benchmark', 'G-TransEval', 'Python.test')
java_data_path = os.path.join('benchmark', 'G-TransEval', 'Java.test')

python_data = extract_data(python_data_path)
java_data = extract_data(java_data_path)


In [57]:
# Only the tokenizer is loaded here; measuring lengths does not use the model.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("m-a-p/OpenCodeInterpreter-DS-6.7B")

# Longest Java sample measured in tokens (stays 0 when java_data is empty).
max_lll = 0
for sample in java_data:
    token_ids = tokenizer(sample, return_tensors="pt")["input_ids"][0]
    max_lll = max(max_lll, len(token_ids))

max_lll

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


512

# Translation extraction

In [2]:
import re

def extract_functions(file):
    """Extract one code snippet per translation from a raw translations file.

    The file content is split on the ``# END OF TRANSLATION`` delimiter and
    the text after the last delimiter is discarded. For each remaining chunk:

    * if it contains a closed Markdown ``python`` fenced block, the content
      of the first such block is used;
    * otherwise the text between the first and second ``def `` (prefixed
      with ``def ``), cut at the first ``#``, is used;
    * if neither can be found, an empty string is used so the result still
      has exactly one entry per translation.

    Args:
        file: Path of the translations file.

    Returns:
        A list of strings, one per translation, in file order.
    """
    # Use a distinct name for the handle so the ``file`` path is not shadowed.
    with open(file, 'r') as handle:
        data = handle.read()
    split_data = data.split('\n\n# END OF TRANSLATION\n\n')[:-1]

    fence_pattern = re.compile(r'```python\n(.*?)```', re.DOTALL)
    all_list = []

    for translation in split_data:
        fenced = fence_pattern.search(translation)
        if fenced:
            all_list.append(fenced.group(1))
            continue

        # No complete fenced block (absent or never closed): fall back to
        # locating the function by its ``def`` keyword.
        pieces = translation.split('def ')
        if len(pieces) < 2:
            # Nothing extractable; keep a placeholder to preserve alignment
            # between translations and their positions in the file.
            all_list.append('')
            continue

        translated_function = 'def ' + pieces[1]
        all_list.append(translated_function.split('#')[0])

    return all_list


def prepare_for_unit(data):
    """Return the translations with every newline replaced by ``NEW_LINE``.

    Strings are immutable, so the converted values are collected into a new
    list and returned; the input is left untouched.

    Args:
        data: Iterable of translation strings.

    Returns:
        A list of the converted strings, in input order.
    """
    return [translation.replace('\n', 'NEW_LINE') for translation in data]

In [3]:
# Parse the baseline translation file into a list of extracted snippets.
data = extract_functions('baseline_translation.txt')

In [None]:
# Preview only the first few entries instead of dumping the whole list.
data[:5]

In [4]:
# Inspect the first extracted snippet as a quick sanity check.
data[0]

'def greatestCommonDivisor(a, b):\n    if a == 0 or b == 0:\n        return a + b\n    if a == b:\n        return a\n    if a > b:\n        return greatestCommonDivisor(a % b, b)\n    else:\n        return greatestCommonDivisor(a, b % a)'

# Translation evaluation metrics

In [45]:
from codebleu import calc_codebleu

# Score an empty prediction against a small reference function.
reference = "def sum ( first , second ) :\n return second + first"
prediction = ""

# Equal weighting of the four score components.
component_weights = (0.25, 0.25, 0.25, 0.25)
result = calc_codebleu(
    [reference],
    [prediction],
    lang="python",
    weights=component_weights,
    tokenizer=None,
)
# NOTE(review): the recorded output reports codebleu 0.25 although every
# component score is 0 — confirm how the library combines the components
# before relying on this value for empty predictions.
print(result)


{'codebleu': 0.25, 'ngram_match_score': 0, 'weighted_ngram_match_score': 0, 'syntax_match_score': 0.0, 'dataflow_match_score': 0.0}


# Parameters manipulation and count

In [90]:
# Freeze every parameter, then re-enable gradients for layer 0 only, so that
# layer is the sole trainable part of the model.
model.requires_grad_(False)
model.model.layers[0].requires_grad_(True)

# To train additional layers, call requires_grad_(True) on the corresponding
# entries of model.model.layers in the same way.

In [96]:
# Count all parameters and the trainable subset in a single pass.
total_param = 0
trainable_param = 0
for p in model.parameters():
    size = p.numel()
    total_param += size
    if p.requires_grad:
        trainable_param += size

percentages = trainable_param / total_param * 100

print(f"Percentage of trainable parameters: {percentages:.2f}%")

Percentage of trainable parameters: 3.00%


In [98]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a table of the trainable parameters and their share of the model.

    Only parameters whose ``requires_grad`` flag is set are listed. After the
    table, the total trainable count and its percentage of all parameters
    are printed.

    Args:
        model: Object exposing ``named_parameters()`` and ``parameters()``.

    Returns:
        None.
    """
    table = PrettyTable(["Modules", "Parameters"])

    # Collect (name, size) for every trainable parameter, in model order.
    trainable_sizes = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    for name, size in trainable_sizes:
        table.add_row([name, size])
    trainable_params = sum(size for _, size in trainable_sizes)

    print(table)
    print(f"Total Trainable Params: {trainable_params}")

    total_param = sum(p.numel() for p in model.parameters())
    percentage = trainable_params/total_param * 100
    print(f"\nPercentage of trainable parameters: {percentage:.2f}%")
    
# Print the per-parameter breakdown for the current freeze configuration.
count_parameters(model)

+------------------------------------------------+------------+
|                    Modules                     | Parameters |
+------------------------------------------------+------------+
|     model.layers.0.self_attn.q_proj.weight     |  16777216  |
|     model.layers.0.self_attn.k_proj.weight     |  16777216  |
|     model.layers.0.self_attn.v_proj.weight     |  16777216  |
|     model.layers.0.self_attn.o_proj.weight     |  16777216  |
|      model.layers.0.mlp.gate_proj.weight       |  45088768  |
|       model.layers.0.mlp.up_proj.weight        |  45088768  |
|      model.layers.0.mlp.down_proj.weight       |  45088768  |
|     model.layers.0.input_layernorm.weight      |    4096    |
| model.layers.0.post_attention_layernorm.weight |    4096    |
+------------------------------------------------+------------+
Total Trainable Params: 202383360

Percentage of trainable parameters: 3.00%
