In [None]:
!pip install transformers[torch]

### load the data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pickle
# Locations of the pre-processed pickles on the mounted Drive.
train_file = '/content/drive/MyDrive/Colab_Notebooks/CL/modified_train_data_400.pkl'
valid_file = '/content/drive/MyDrive/Colab_Notebooks/CL/modified_valid_data_400.pkl'
code_file = '/content/drive/MyDrive/Colab_Notebooks/CL/mod_train_code_400.pkl'
text_file = '/content/drive/MyDrive/Colab_Notebooks/CL/mod_train_text_400.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
loaded = []
for path in (train_file, valid_file, code_file, text_file):
    with open(path, 'rb') as file:
        loaded.append(pickle.load(file))
train, valid, codes, texts = loaded

### CL method

#### text length

In [None]:
# Difficulty signal: character length of every description in `texts`.
text_length = list(map(len, texts))
method = text_length

#### rare token

In [None]:
from transformers import AutoTokenizer
from collections import Counter

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")

# Corpus-wide token frequencies over all descriptions.
counter = Counter()
for description in texts:
    counter.update(tokenizer.tokenize(description))

# Tokens seen fewer than `threshold` times in the whole corpus count as rare.
threshold = 5
rare_tokens = [token for token in counter if counter[token] < threshold]

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Difficulty signal: number of rare tokens in each description.
# Membership is tested against a set: `rare_tokens` is a list, and scanning it
# once per token made this cell quadratic in practice.
rare_token_set = set(rare_tokens)
rare_counts = [
    sum(1 for word in tokenizer.tokenize(example) if word in rare_token_set)
    for example in texts
]


In [None]:
method = rare_counts

#### text perplexity

In [None]:
# Cached perplexity scores stored on Drive (presumably one per training
# sample, aligned with `train` -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
perplexity_file = '/content/drive/MyDrive/Colab_Notebooks/CL/mod_perplexity_400.pkl'
with open(perplexity_file, 'rb') as file:
    perplexity = pickle.load(file)

# Use these scores as the curriculum ordering signal.
method = perplexity

#### code length

In [None]:
# Difficulty signal: character length of every target program in `codes`.
code_length = list(map(len, codes))
method = code_length

#### number of code lines

In [None]:
def structured_code(code):
    """Rebuild indented source text from NEW_LINE / INDENT / DEDENT markers.

    The input is a single string in which line breaks are written as
    ``NEW_LINE`` and block structure as space-separated ``INDENT`` /
    ``DEDENT`` marker tokens.

    Args:
        code: Flattened program text.

    Returns:
        tuple[str, int]: the reconstructed source (4 spaces per level, lines
        joined with newlines) and the number of lines it contains.

    Notes:
        * The segment after the last ``NEW_LINE`` is discarded, as before; in
          the examples in this notebook it only holds trailing ``DEDENT``
          markers.
        * Markers are matched as whole tokens, so a bare ``INDENT``/``DEDENT``
          at the end of a line is removed too, and an unbalanced ``DEDENT``
          can no longer chop real characters off the code (depth stops at 0).
        * NOTE(review): this notebook later redefines ``structured_code`` with
          a different return type (string only); re-run this cell before the
          "number of code lines" cell.
    """
    indent_unit = ' ' * 4
    depth = 0
    rendered = []
    for segment in code.split('NEW_LINE')[:-1]:
        tokens = segment.strip().split(' ')
        # A line opens at most one new level, however many markers it carries.
        if 'INDENT' in tokens:
            depth += 1
        # Every DEDENT closes one level; never go left of column 0.
        depth = max(depth - tokens.count('DEDENT'), 0)
        body = ' '.join(t for t in tokens if t not in ('INDENT', 'DEDENT')).strip()
        rendered.append(indent_unit * depth + body)

    return '\n'.join(rendered), len(rendered)

In [None]:
# Difficulty signal: number of source lines per program.
# Relies on the structured_code variant that returns (source, line_count).
lines = [structured_code(code)[1] for code in codes]
len(lines)

8485

In [None]:
method = lines

#### variable number

In [None]:
def structured_code(code):
    """Rebuild indented source text from NEW_LINE / INDENT / DEDENT markers.

    The input is a single string in which line breaks are written as
    ``NEW_LINE`` and block structure as space-separated ``INDENT`` /
    ``DEDENT`` marker tokens.

    Args:
        code: Flattened program text.

    Returns:
        str: the reconstructed source, 4 spaces per indentation level, lines
        joined with newlines.

    Notes:
        * The segment after the last ``NEW_LINE`` is discarded, as before; in
          the examples in this notebook it only holds trailing ``DEDENT``
          markers.
        * Markers are matched as whole tokens, so a bare ``INDENT``/``DEDENT``
          at the end of a line is removed too, and an unbalanced ``DEDENT``
          can no longer chop real characters off the code (depth stops at 0).
        * NOTE(review): this definition shadows the earlier tuple-returning
          ``structured_code`` defined in the "number of code lines" section.
    """
    indent_unit = ' ' * 4
    depth = 0
    rendered = []
    for segment in code.split('NEW_LINE')[:-1]:
        tokens = segment.strip().split(' ')
        # A line opens at most one new level, however many markers it carries.
        if 'INDENT' in tokens:
            depth += 1
        # Every DEDENT closes one level; never go left of column 0.
        depth = max(depth - tokens.count('DEDENT'), 0)
        body = ' '.join(t for t in tokens if t not in ('INDENT', 'DEDENT')).strip()
        rendered.append(indent_unit * depth + body)

    return '\n'.join(rendered)

In [None]:
import ast
## count the number of variables
def count_variables(code):
    """Count name-binding occurrences in a piece of Python source.

    Every ``ast.Name`` node in store context is counted, so a name that is
    assigned several times contributes once per assignment.

    Args:
        code: Python source text; ``ast.parse`` raises ``SyntaxError`` if it
            does not parse.

    Returns:
        int: number of store-context name nodes.
    """
    return sum(
        1
        for node in ast.walk(ast.parse(code))
        if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Store)
    )

In [None]:
### the flattened programs must be re-indented before they can be parsed
variable_counts = [count_variables(structured_code(code)) for code in codes]
print(len(variable_counts))

8485


In [None]:
method = variable_counts

#### alpha ratio

In [None]:
def alphanumeric_ratio(code):
    """Percentage of alphanumeric tokens that are purely alphabetic.

    The text is split on any whitespace. Splitting on single spaces only left
    the newline glued to the last token of every line, so those tokens were
    never counted.

    Args:
        code: Source text (may span several lines).

    Returns:
        float: ``100 * alphabetic_tokens / alphanumeric_tokens``, or ``0.0``
        when the text contains no alphanumeric token at all (previously a
        ``ZeroDivisionError``).
    """
    count = 0
    total = 0
    for token in code.split():
        if token.isalpha():
            count += 1
        if token.isalnum():
            total += 1
    if total == 0:
        return 0.0
    return count*100 / total

# Compute the alphabetic/alphanumeric ratio of every re-indented program.
alpha_ratio = []
for code in codes:
    alpha_ratio.append(alphanumeric_ratio(structured_code(code)))

In [None]:
method = alpha_ratio

#### perplexity of generated code

In [None]:
# Cached perplexity scores for the code side, stored on Drive (presumably one
# per training sample, aligned with `train` -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
code_perplexity_file = '/content/drive/MyDrive/Colab_Notebooks/CL/mod_perplexity_code.pkl'
with open(code_perplexity_file, 'rb') as file:
    code_perplexity = pickle.load(file)

# Use these scores as the curriculum ordering signal.
method = code_perplexity

#### codebleu score (from large to small) ❗️❗️❗️

In [None]:
### order by codebleu
# Cached CodeBLEU scores stored on Drive (presumably one per training sample,
# aligned with `train` -- TODO confirm).
# NOTE(review): the section heading asks for a large-to-small ordering, but the
# bucketing cells below rank in ascending order unless their commented-out
# descending variant is enabled.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
codebleu_file = '/content/drive/MyDrive/Colab_Notebooks/CL/mod_codebleu_400.pkl'
with open(codebleu_file, 'rb') as file:
    codebleu = pickle.load(file)

# Use these scores as the curriculum ordering signal.
method = codebleu

#### combination

In [None]:
from scipy.stats import rankdata
import numpy as np

# Two-feature curriculum: `method` orders the main buckets, `random_method`
# orders the buckets that replay samples are later drawn from.
# Change `method` / `random_method` here to try other feature pairs.
num_buckets = 6
method = variable_counts
rank = rankdata(method, 'ordinal')
random_method = perplexity
random_rank = rankdata(random_method, 'ordinal')


def _split_into_buckets(ranks, samples, n_buckets):
    """Distribute `samples` over `n_buckets` rank bands of equal width.

    A sample lands in the first bucket j whose upper bound
    ``(j + 1) / n_buckets * len(ranks)`` is at least its rank (ascending
    order; flip the comparison and the loop direction for a descending
    curriculum).

    Returns:
        (counts, members): per-bucket sample counts (numpy array) and the
        per-bucket lists of samples.
    """
    counts = np.zeros(n_buckets)
    members = [[] for _ in range(n_buckets)]
    total = len(ranks)
    for position, sample_rank in enumerate(ranks):
        for bucket in range(n_buckets):
            if sample_rank <= (bucket + 1) / n_buckets * total:
                counts[bucket] += 1
                members[bucket].append(samples[position])
                break
    return counts, members


# first feature buckets
buckets_counts, buckets_lists = _split_into_buckets(rank, train, num_buckets)

# second feature buckets
random_buckets_counts, random_buckets_lists = _split_into_buckets(random_rank, train, num_buckets)

In [None]:
# Seed NumPy's global RNG so the sampled curriculum is identical on every run.
CURRICULUM_SEED = 42
np.random.seed(CURRICULUM_SEED)

num_random = 1/(num_buckets + 1)   # fraction of each earlier pool that is replayed
passes = [1 for _ in range(num_buckets)]

subset_list = [[] for _ in range(num_buckets)]
random_selected_list = [[] for _ in range(num_buckets)]

#### select the random data from previous buckets
# Stage i trains on its own first-feature bucket plus a sample drawn from every
# earlier second-feature pool. Each pool also keeps the samples replayed into
# it, so it grows from stage to stage.
# NOTE: np.random.choice samples with replacement by default, so a replayed
# sample may appear more than once.
for i in range(num_buckets):
    random_lists = []
    for j in range(0, i):
        random_list = np.random.choice(random_selected_list[j], int(num_random*len(random_selected_list[j])))
        random_lists = np.concatenate((random_lists, random_list))
    random_selected_list[i] = np.concatenate((np.array(random_buckets_lists[i]), random_lists))
    subset_list[i] = np.concatenate((np.array(buckets_lists[i]), random_lists))
    np.random.shuffle(subset_list[i])    # shuffle the data within the subset


buckets_size = [len(subset) for subset in subset_list]
# Stages are concatenated in order: easiest stage first.
data = np.concatenate(subset_list)

In [None]:
data.shape, buckets_size

((12150,), [1414, 1616, 1846, 2109, 2410, 2755])

#### split the buckets and select the random

In [None]:
from scipy.stats import rankdata
import numpy as np

# Rank every sample by the selected difficulty signal (1 = easiest; ties are
# broken by position because of the 'ordinal' method).
rank = rankdata(method, 'ordinal')
num_buckets = 6
buckets_counts = np.zeros(num_buckets)
buckets_lists = [[] for _ in range(num_buckets)]
length = [[] for _ in range(num_buckets)]   # raw signal values per bucket, for checking

for i, sample in enumerate(rank):
    # Ascending curriculum: first bucket whose upper rank bound covers the sample.
    # For a descending curriculum iterate j from num_buckets-1 down to 0 and
    # test `sample >= j/num_buckets*len(rank)` instead.
    for j in range(num_buckets):
        if sample <= (j+1)/num_buckets*len(rank):
            length[j].append(method[i])
            buckets_counts[j] += 1
            buckets_lists[j].append(train[i])
            break

# Every training sample must land in exactly one bucket.
assert sum(buckets_counts) == len(train)
# Buckets must be ordered: nothing in bucket i may exceed anything in bucket i+1.
# (The previous check compared min(i) with max(i+1), which holds for almost any split.)
for i in range(num_buckets-1):
    if length[i] and length[i+1]:
        assert max(length[i]) <= min(length[i+1])

In [None]:
min(length[0])

0.0

In [None]:
# Seed NumPy's global RNG so the sampled curriculum is identical on every run.
CURRICULUM_SEED = 42
np.random.seed(CURRICULUM_SEED)

num_random = 1/(num_buckets + 1)   # fraction of each earlier subset that is replayed
passes = [1 for _ in range(num_buckets)]

subset_list = [[] for _ in range(num_buckets)]

#### select the random data from previous buckets
# Stage i trains on its own bucket plus a sample drawn from every earlier
# stage's subset (which itself already contains replayed samples).
# NOTE: np.random.choice samples with replacement by default, so a replayed
# sample may appear more than once.
for i in range(num_buckets):
    random_lists = []
    for j in range(0, i):
        random_list = np.random.choice(subset_list[j], int(num_random*len(subset_list[j])))
        random_lists = np.concatenate((random_lists, random_list))
    subset_list[i] = np.concatenate((np.array(buckets_lists[i]), random_lists))
    np.random.shuffle(subset_list[i])    # shuffle the data within the subset


buckets_size = [len(subset) for subset in subset_list]
# Stages are concatenated in order: easiest stage first.
data = np.concatenate(subset_list)

In [None]:
data.shape, buckets_size

((12150,), [1414, 1616, 1846, 2109, 2410, 2755])

#### randomly selected

In [None]:
# Pad the curriculum up to 800 batches of 16 samples with samples drawn at
# random (with replacement) from the full training set; the padding is
# appended after the last curriculum stage.
n = 16 * 800 - len(data)
print(n)
random_data = np.random.choice(train, size=n, replace=True)
dataset = np.concatenate((data, random_data))

650


### CL training

In [6]:
import torch
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Minimal map-style dataset over any indexable collection of samples."""

    def __init__(self, data):
        # Kept as-is: no copy, no conversion.
        self.data = data

    def __len__(self):
        """Number of samples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at `index`, unchanged."""
        item = self.data[index]
        return item

# NOTE: `train_data` wraps `train` in its original order -- it is NOT the
# curriculum-ordered `dataset` array built in the "CL method" section.
train_data = MyDataset(train)
valid_data = MyDataset(valid)

In [None]:
from transformers import T5ForConditionalGeneration, DataCollatorWithPadding, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-large-ntp-py')
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-large-ntp-py')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler, SequentialSampler

class CustomTrainer(Trainer):
    """Trainer that feeds the training samples strictly in their stored order.

    The stock Trainer shuffles the training set, which would destroy the
    easy-to-hard ordering of a curriculum. This subclass takes the training
    set through the keyword-only `custom_train_dataset` argument and serves it
    sequentially.
    """

    def __init__(self, *args, custom_train_dataset, **kwargs):
        super().__init__(*args, **kwargs)
        self.custom_train_dataset = custom_train_dataset

    def get_train_dataloader(self):
        """Return a non-shuffling DataLoader over `custom_train_dataset`.

        The batch size comes from the training arguments instead of being
        hard-coded. No collate_fn is passed, so PyTorch's default collation is
        used (unchanged from before).
        """
        return DataLoader(self.custom_train_dataset,
                          batch_size=self.args.train_batch_size,
                          shuffle=False,
                          sampler=SequentialSampler(self.custom_train_dataset),
                          )

targs = TrainingArguments('alpha_ratio_formal_800steps',
                          num_train_epochs=1, logging_steps=50, save_steps=50,
                          do_eval=True, evaluation_strategy='steps',
                          #gradient_accumulation_steps=4,
                          gradient_checkpointing=True, fp16=True,
                          eval_accumulation_steps=10,
                          per_device_eval_batch_size=16, per_device_train_batch_size=16,
                          metric_for_best_model='eval_loss', greater_is_better=False,
                          load_best_model_at_end=True, save_total_limit=1)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the curriculum-ordered array `dataset` built in the "CL method"
# section. Passing `train_data` here (the original-order training set) would
# silently run a plain one-epoch fine-tuning instead of the curriculum; the
# logged 800 steps correspond to len(dataset) / 16.
cl_train_data = MyDataset(dataset)

trainer = CustomTrainer(
    model=model,
    args=targs,
    custom_train_dataset=cl_train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
50,0.738,0.518352
100,0.534,0.493199
150,0.496,0.471707
200,0.4733,0.454059
250,0.4702,0.44465
300,0.455,0.430026
350,0.4443,0.426577
400,0.4321,0.417873
450,0.4239,0.410607
500,0.415,0.408473


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TrainOutput(global_step=800, training_loss=0.4551690435409546, metrics={'train_runtime': 1236.1123, 'train_samples_per_second': 10.355, 'train_steps_per_second': 0.647, 'total_flos': 2.165047296e+16, 'train_loss': 0.4551690435409546, 'epoch': 1.0})

### Baseline

In [4]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler, SequentialSampler
from transformers import T5ForConditionalGeneration, DataCollatorWithPadding, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-large-ntp-py')
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-large-ntp-py')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [9]:
from transformers import Trainer, TrainingArguments
# Set up a second, standard Trainer for the baseline; it is run after the CL part has finished.
# Problem: this needs more CUDA memory.
from transformers import DataCollatorWithPadding

# Baseline configuration: 5 epochs with 4-step gradient accumulation;
# evaluation and checkpointing every 50 steps, keeping the checkpoint with the
# lowest eval_loss.
args = TrainingArguments('baseline_5epoch',
                          num_train_epochs=5, logging_steps=50, save_steps=50,
                          do_eval=True, evaluation_strategy='steps',
                          gradient_accumulation_steps=4,
                          gradient_checkpointing=True, fp16=True,
                          eval_accumulation_steps=10,
                          per_device_eval_batch_size=16, per_device_train_batch_size=16,
                          metric_for_best_model='eval_loss', greater_is_better=False,
                          load_best_model_at_end=True, save_total_limit=1)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Stock Trainer over `train_data` (the training set in its original order).
trainer_normal = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [10]:
trainer_normal.train()

Step,Training Loss,Validation Loss
50,0.3218,0.399728


KeyboardInterrupt: ignored

### Evaluation

#### load the model and dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install codebleu
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install timeout_decorator

Collecting codebleu
  Downloading codebleu-0.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tree-sitter<1.0.0,>=0.20.0 (from codebleu)
  Downloading tree_sitter-0.20.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.3/484.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter, codebleu
Successfully installed codebleu-0.2.1 tree-sitter-0.20.2
Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
max_input_length = 400
max_target_length = 400

def tokenize_function(examples):
  """Tokenize a batch of description/code pairs for seq2seq training or evaluation.

  Args:
    examples: Batch mapping with a 'text' column (descriptions, used as model
      input) and a 'code' column (programs, used as targets).

  Returns:
    The tokenizer output for the descriptions, padded/truncated to
    `max_input_length`, with an added "labels" entry holding the target token
    ids padded/truncated to `max_target_length`. Padding positions in the
    labels are replaced by -100.

  Relies on the module-level `tokenizer`.
  """
  codes = examples['code']
  docstrings = examples['text']

  # encode the descriptions (model input)
  model_inputs = tokenizer(docstrings, max_length=max_input_length, padding="max_length", truncation=True)

  # encode the target programs
  labels = tokenizer(codes, max_length=max_target_length, padding="max_length", truncation=True).input_ids

  # important: replace the padding token ids by -100 so that they are ignored
  # by the CrossEntropyLoss. The pad id is read from the tokenizer instead of
  # assuming it is 0.
  pad_id = tokenizer.pad_token_id
  model_inputs["labels"] = [
      [label if label != pad_id else -100 for label in labels_example]
      for labels_example in labels
  ]

  return model_inputs

In [5]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
# Fine-tuned checkpoint to evaluate.
checkpoint = '/content/drive/MyDrive/Colab_Notebooks/CL/baseline_5epoch/checkpoint-500'
# Use the tokenizer of the model that was fine-tuned (codet5-large-ntp-py, as
# in the training cells) rather than the one shipped with codet5-base, so that
# encoding and decoding match the ids the checkpoint was trained on.
tokenizer = AutoTokenizer.from_pretrained('Salesforce/codet5-large-ntp-py')
mymodel = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [6]:
from datasets import load_dataset
# Test split of the program-level Python text-to-code task.
test_dataset = load_dataset('codeparrot/xlcost-text-to-code','Python-program-level', split='test')
# Tokenize in batches; adds input_ids / attention_mask / labels columns.
test_data = test_dataset.map(tokenize_function, batched=True)
# Format only the model tensors as torch tensors.
test_data.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

Downloading builder script:   0%|          | 0.00/7.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/570k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/887 [00:00<?, ? examples/s]

In [7]:
model=mymodel.to('cuda')

#### generate the code

In [None]:
from tqdm.auto import tqdm

generated_code = []
progress_bar = tqdm(test_data)

### generate one program per test description (beam search, 3 beams)
for test in progress_bar:
    # Add the batch dimension without hard-coding the padded length.
    input_ids = test['input_ids'].unsqueeze(0).to('cuda')
    # Pass the tokenizer's mask explicitly so the padded positions are ignored.
    attention_mask = test['attention_mask'].unsqueeze(0).to('cuda')
    # generate
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=1024, num_beams=3)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_code.append(pred)

  0%|          | 0/887 [00:00<?, ?it/s]

In [None]:
import pickle
# Persist the predictions produced by the generation cell above.
# NOTE(review): the file name encodes the experiment; update it to match the
# checkpoint that was actually evaluated.
filename = '/content/drive/MyDrive/Colab_Notebooks/cl_codet5/generated_code_alpha_ratio_formal_800steps.pkl'
with open(filename, 'wb') as file:
    pickle.dump(generated_code, file)

In [None]:
import pickle
# Alternative input: reload earlier predictions instead of generating them.
# NOTE(review): this overwrites `generated_code`; run only one of the load
# cells, the one for the experiment being scored.
filename = '/content/drive/MyDrive/Colab_Notebooks/cl_codet5/generated_text_perplexity_variable_800steps.pkl'
with open(filename, 'rb') as file:
    generated_code = pickle.load(file)

In [None]:
import pickle
# Alternative input: reload earlier predictions instead of generating them.
# NOTE(review): this overwrites `generated_code`; run only one of the load
# cells, the one for the experiment being scored.
filename = '/content/drive/MyDrive/Colab_Notebooks/CL/generated_code_3beams/generated_code_codebleu_variable_number.pkl'
with open(filename, 'rb') as file:
    generated_code = pickle.load(file)

In [None]:
import pickle
# Reload another set of earlier predictions under its own name.
# NOTE(review): the evaluation cells below read `generated_code`, not
# `generated_variable_perplexity`.
filename = '/content/drive/MyDrive/Colab_Notebooks/CL/generated_code_3beams/variable_text_perplexity_1500steps.pkl'
with open(filename, 'rb') as file:
    generated_variable_perplexity = pickle.load(file)

#### error rate

In [None]:
def write_code_to_file(code_content, file_path='/content/code.py'):
    """Write `code_content` to a script file, replacing any previous content.

    Args:
        code_content: Source text to write.
        file_path: Destination path. Defaults to '/content/code.py', the
            location previously hard-coded here.
    """
    with open(file_path, "w", encoding='utf-8') as file:
        file.write(code_content)

In [None]:
# transform the flattened code back into properly indented source
def structured_code(code):
    """Rebuild indented source text from NEW_LINE / INDENT / DEDENT markers.

    The input is a single string in which line breaks are written as
    ``NEW_LINE`` and block structure as space-separated ``INDENT`` /
    ``DEDENT`` marker tokens.

    Args:
        code: Flattened program text.

    Returns:
        str: the reconstructed source, 4 spaces per indentation level, lines
        joined with newlines.

    Notes:
        * The segment after the last ``NEW_LINE`` is discarded, as before; in
          the examples in this notebook it only holds trailing ``DEDENT``
          markers.
        * Markers are matched as whole tokens, so a bare ``INDENT``/``DEDENT``
          at the end of a line is removed too, and an unbalanced ``DEDENT``
          can no longer chop real characters off the code (depth stops at 0).
    """
    indent_unit = ' ' * 4
    depth = 0
    rendered = []
    for segment in code.split('NEW_LINE')[:-1]:
        tokens = segment.strip().split(' ')
        # A line opens at most one new level, however many markers it carries.
        if 'INDENT' in tokens:
            depth += 1
        # Every DEDENT closes one level; never go left of column 0.
        depth = max(depth - tokens.count('DEDENT'), 0)
        body = ' '.join(t for t in tokens if t not in ('INDENT', 'DEDENT')).strip()
        rendered.append(indent_unit * depth + body)

    return '\n'.join(rendered)

In [None]:
import pytest
import subprocess
from tqdm.auto import tqdm
from timeout_decorator import timeout


def unit_test():
    """Execute every generated program and count failures and timeouts.

    Each entry of the module-level `generated_code` is re-indented with
    `structured_code`, written with `write_code_to_file` and run in a fresh
    interpreter with a 15 second limit.

    SECURITY: this runs model-generated code without any sandbox; only use it
    in a disposable environment.

    Returns:
        tuple: (running_err, out_of_time_err, run_err, time_err) -- the number
        of programs that exited with a non-zero status, the number that hit
        the time limit, and the corresponding lists of sample indices.
    """
    import sys  # local import: the interpreter path is only needed here

    # Must be the file write_code_to_file writes to by default.
    script_path = '/content/code.py'
    time_limit = 15  # seconds per program

    running_err = 0
    out_of_time_err = 0
    time_err, run_err = [], []

    for i, pred in enumerate(tqdm(generated_code)):
        write_code_to_file(structured_code(pred))

        try:
            # sys.executable keeps the child on the kernel's interpreter; the
            # output is captured as bytes (it is never inspected) so odd
            # output cannot raise a decoding error.
            process = subprocess.run([sys.executable, script_path],
                                     capture_output=True, timeout=time_limit)
        except subprocess.TimeoutExpired:
            # Only a real timeout is counted here; other errors now surface
            # instead of being reported as "out of time".
            time_err.append(i)
            out_of_time_err += 1
            print('out of time:', i)
            continue

        if process.returncode != 0:
            run_err.append(i)
            running_err += 1
            print('running error:', i)

    return running_err, out_of_time_err, run_err, time_err

In [None]:
running_err, out_of_time_err, run_err, time_err = unit_test()

  0%|          | 0/887 [00:00<?, ?it/s]

running error: 2
running error: 48
running error: 61
running error: 66
running error: 72
running error: 73
running error: 99
running error: 105
running error: 108
running error: 115
running error: 117
running error: 120
running error: 125
running error: 132
running error: 140
running error: 169
running error: 174
running error: 194
running error: 200
running error: 203
running error: 213
running error: 224
running error: 232
running error: 233
running error: 234
running error: 268
running error: 301
running error: 318
running error: 326
running error: 329
running error: 367
running error: 369
running error: 375
running error: 376
running error: 381
running error: 384
running error: 394
running error: 433
running error: 438
running error: 445
running error: 498
running error: 501
running error: 504
running error: 511
running error: 512
running error: 514
running error: 521
running error: 527
running error: 530
running error: 542
running error: 546
running error: 548
running error: 559
r

In [None]:
running_err, out_of_time_err

(70, 0)

#### codebleu

In [None]:
!pip install codebleu

In [None]:
import numpy as np
from tqdm.auto import tqdm
from codebleu import calc_codebleu
import evaluate

bleu_metric = evaluate.load("bleu")
em_metric = evaluate.load('exact_match')
print(len(generated_code))

def cal_metrics():
  """Score `generated_code` against the test references.

  Computes CodeBLEU, BLEU and exact match per sample on the re-indented
  programs and averages them over the samples that could be scored.

  Returns:
    tuple: (scores, error, n) where `scores` maps 'codebleu', 'bleu' and 'em'
    to percentages rounded to 4 decimals, `error` is the number of samples for
    which a metric raised, and `n` is the number of samples scored.

  Relies on the module-level `test_data`, `generated_code`, `bleu_metric`,
  `em_metric` and `structured_code`.
  """
  target = test_data['code']
  progress_bar = tqdm(range(len(target)))

  codebleu = 0
  bleu = 0
  em = 0
  error = 0
  n = 0
  for i, raw_reference in enumerate(target):
      prediction = structured_code(generated_code[i])
      reference = structured_code(raw_reference)
      try:
          # calc_codebleu takes (references, predictions).
          sample_codebleu = calc_codebleu([reference], [prediction], lang="python",
                                          weights=(0.25, 0.25, 0.25, 0.25))['codebleu']
          # BLEU is not symmetric: the generated program is the prediction,
          # the dataset program the reference (they used to be swapped).
          sample_bleu = bleu_metric.compute(predictions=[prediction], references=[reference])['bleu']
          sample_em = em_metric.compute(predictions=[prediction], references=[reference])['exact_match']
      except Exception:
          # Best effort: a sample a metric cannot handle is counted and skipped.
          error += 1
      else:
          # Accumulate only once all three metrics succeeded, so the sums and
          # `n` always describe the same set of samples.
          codebleu += sample_codebleu
          bleu += sample_bleu
          em += sample_em
          n += 1

      progress_bar.update(1)

      if (i+1) % 50 == 0:
          scored = max(n, 1)   # same denominator as the final report
          result = {'codebleu': round(codebleu*100/scored, 4),
                    'bleu': round(bleu*100/scored, 4),
                    'em': round(em*100/scored, 4)}
          print(i+1, 'Steps:', result)

  scored = max(n, 1)   # avoid dividing by zero when nothing could be scored
  codebleu = round(codebleu*100 / scored, 4)
  bleu = round(bleu*100 / scored, 4)
  em = round(em*100 / scored, 4)
  return {'codebleu': codebleu,
          'bleu': bleu,
          'em':em}, error, n

In [None]:
result, error, n = cal_metrics()

In [None]:
result, error

#### test example

In [None]:
num = 5
print(num)
test_example = test_dataset[num]
text = test_example['text']
code = test_example['code']
print('Doc:', text)
print('Original Code:', code)

5
Doc: Check if the remainder of N | Function to check if a number holds the condition ( N - 1 ) ! % N = N - 1 ; Corner cases ; Number divisible by 2 or 3 are not prime ; Iterate from 5 and keep checking for prime ; Function to check the expression for the value N ; Driver code
Original Code: def isPrime ( n ) : NEW_LINE INDENT if ( n == 1 ) : NEW_LINE INDENT return True NEW_LINE DEDENT if ( n <= 3 ) : NEW_LINE INDENT return True NEW_LINE DEDENT if ( ( n % 2 == 0 ) or ( n % 3 == 0 ) ) : NEW_LINE INDENT return False NEW_LINE DEDENT i = 5 NEW_LINE while ( i * i <= n ) : NEW_LINE INDENT if ( ( n % i == 0 ) or ( n % ( i + 2 ) == 0 ) ) : NEW_LINE INDENT return False ; NEW_LINE i += 6 NEW_LINE DEDENT DEDENT return true ; NEW_LINE DEDENT def checkExpression ( n ) : NEW_LINE INDENT if ( isPrime ( n ) ) : NEW_LINE INDENT print ( " Yes " ) NEW_LINE DEDENT else : NEW_LINE INDENT print ( " No " ) NEW_LINE DEDENT DEDENT if __name__ == ' _ _ main _ _ ' : NEW_LINE INDENT N = 3 NEW_LINE checkExpressio

In [None]:
code = structured_code(code)
print(code)

def isPrime ( n ) :
    if ( n == 1 ) :
        return True
    if ( n <= 3 ) :
        return True
    if ( ( n % 2 == 0 ) or ( n % 3 == 0 ) ) :
        return False
    i = 5
    while ( i * i <= n ) :
        if ( ( n % i == 0 ) or ( n % ( i + 2 ) == 0 ) ) :
            return False ;
            i += 6
    return true ;
def checkExpression ( n ) :
    if ( isPrime ( n ) ) :
        print ( " Yes " )
    else :
        print ( " No " )
if __name__ == ' _ _ main _ _ ' :
    N = 3
    checkExpression ( N )


In [None]:
# prepare for the model
# The description is tokenized here without padding or truncation (no
# max_length is passed), unlike the batch generation loop.
input_ids = tokenizer(test_example['text'], return_tensors='pt').input_ids.to('cuda')
# generate
outputs = mymodel.generate(input_ids, max_length=1024, num_beams=3)
# Decode the first returned sequence, dropping special tokens.
pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated code:", pred)

Generated code: def check ( n ) : NEW_LINE INDENT if ( n <= 1 ) : NEW_LINE INDENT return False NEW_LINE DEDENT if ( n <= 3 ) : NEW_LINE INDENT return True NEW_LINE DEDENT if ( n % 2 == 0 or n % 3 == 0 ) : NEW_LINE INDENT return False NEW_LINE DEDENT for i in range ( 5, int ( n ** ( 1 / 2 ) ) + 1, 6 ) : NEW_LINE INDENT if ( n % i == 0 or n % ( i + 2 ) == 0 ) : NEW_LINE INDENT return False NEW_LINE DEDENT DEDENT return True NEW_LINE DEDENT def checkExpression ( N ) : NEW_LINE INDENT if ( check ( N ) ) : NEW_LINE INDENT print ( " Yes " ) NEW_LINE DEDENT else : NEW_LINE INDENT print ( " No " ) NEW_LINE DEDENT DEDENT if __name__ =='_ _ main _ _': NEW_LINE INDENT N = 12 NEW_LINE checkExpression ( N ) NEW_LINE DEDENT


In [None]:
pred = structured_code(pred)
print(pred)

def check ( n ) :
    if ( n <= 1 ) :
        return False
    if ( n <= 3 ) :
        return True
    if ( n % 2 == 0 or n % 3 == 0 ) :
        return False
    for i in range ( 5, int ( n ** ( 1 / 2 ) ) + 1, 6 ) :
        if ( n % i == 0 or n % ( i + 2 ) == 0 ) :
            return False
    return True
def checkExpression ( N ) :
    if ( check ( N ) ) :
        print ( " Yes " )
    else :
        print ( " No " )
if __name__ =='_ _ main _ _':
    N = 12
    checkExpression ( N )


In [None]:
from codebleu import calc_codebleu
calc_codebleu([code], [pred], 'python')

{'codebleu': 0.579665577828515,
 'ngram_match_score': 0.554067938037583,
 'weighted_ngram_match_score': 0.5645943732764771,
 'syntax_match_score': 0.72,
 'dataflow_match_score': 0.48}