In [2]:
from datasets import load_from_disk
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
)
import torch
import random

In [3]:
# Fix the number of CPU threads PyTorch may use to a single, named value.
NUM_TORCH_THREADS = 16
torch.set_num_threads(NUM_TORCH_THREADS)

In [4]:
# The tokenizer is looked up by the 'Salesforce/codet5-small' identifier, while
# the model weights are read from a checkpoint directory given by absolute path.
TOKENIZER_NAME = 'Salesforce/codet5-small'
CHECKPOINT_DIR = '/data/nicolasmaier/model/codet5-finetuned-split/checkpoint-312000'

tokenizer = RobertaTokenizer.from_pretrained(TOKENIZER_NAME)
model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT_DIR)

In [5]:
# Use the first CUDA device when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)

# Place the model on the selected device; later cells call it as `model_gpu`.
model_gpu = model.to(device)

cuda:0


In [6]:
# Two datasets saved on disk: `dataset` is loaded from the "clean" directory and
# `dataset_split` from the "split" directory.
CLEAN_DATASET_DIR = "/data/nicolasmaier/dataset/hf_clean_dataset"
SPLIT_DATASET_DIR = "/data/nicolasmaier/dataset/hf_split_dataset"

dataset = load_from_disk(CLEAN_DATASET_DIR)
dataset_split = load_from_disk(SPLIT_DATASET_DIR)

# Show both summaries, one after the other.
print(dataset, dataset_split, sep='\n')

DatasetDict({
    train: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 425631
    })
    valid: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14634
    })
    test: Dataset({
        features: ['code', 'contents', 'xmi', 'originalLine', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 25156
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 7931293
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 255994
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'idx'],
        num_rows: 476050
    })
})


In [19]:
# Pick one test example and tokenize its "contents" field as the encoder input.
example_orig = dataset["test"][42]
input = tokenizer(example_orig["contents"], return_tensors='pt').to(device)

# Token ids that seed the decoder on the first pass. Per the original note they
# correspond to the text:
#<?xml version="1.0" encoding="ASCII
prev_outputs = [12880, 2902, 1177, 1546, 21, 18, 20, 6, 2688, 1546, 13756]

all_outputs = []

# Chunked generation, at most 50 passes. On each pass the decoder is primed with
# token 1 followed by the tokens carried over from the previous pass.
for i in range(50):
    print(i, end=' ')
    decoder_input = [1] + prev_outputs
    decoder_input_ids = torch.tensor(decoder_input).unsqueeze(0).to(device)
    outputs = model_gpu.generate(
        input.input_ids,
        max_length=801,
        num_beams=5,
        early_stopping=True,
        decoder_input_ids=decoder_input_ids,
    )[0].tolist()

    # Keep the 200 tokens that follow the leading token; everything after
    # position 200 becomes the decoder prefix of the next pass.
    all_outputs.extend(outputs[1:201])
    prev_outputs = outputs[201:]

    # Once the EOS id shows up, flush the remaining tail and stop.
    if model.config.eos_token_id in outputs:
        all_outputs.extend(prev_outputs)
        print()
        break


0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 


In [10]:
# Dump fields of one test example to text files for manual inspection.
example_orig = dataset["test"][50]

# Write with an explicit UTF-8 encoding so the dump does not depend on the
# platform's default text encoding (non-ASCII characters in the XMI or source
# text would otherwise fail on some systems).
with open('debug_0.txt', 'w', encoding='utf-8') as f:
    f.write(example_orig["xmi"])

# NOTE(review): the generation cell in this notebook builds `all_outputs` from
# test example 42, while this cell reads example 50 -- align the indices before
# re-enabling this dump.
#with open('debug_1.txt', 'w', encoding='utf-8') as f:
#    f.write(tokenizer.decode(all_outputs))

with open('debug_2.txt', 'w', encoding='utf-8') as f:
    f.write(example_orig["code"])


In [27]:
# Show a random training row: its token ids followed by its source code.
# randrange() excludes the upper bound; randint(0, len(...)) includes it and
# could return len(...) itself, which is out of range for indexing.
i = random.randrange(len(dataset["train"]))
train_row = dataset["train"][i]  # fetch the row once instead of once per field
print(train_row["input_ids"], '\n\n    ' + train_row["code"])

[1, 5610, 1018, 5610, 31, 203, 203, 482, 667, 5916, 797, 288, 203, 565, 632, 6618, 203, 565, 1071, 6320, 1138, 5365, 1253, 4995, 1138, 12, 691, 590, 16, 203, 29159, 514, 29455, 16, 203, 29159, 514, 21522, 16, 203, 29159, 514, 2653, 16, 203, 29159, 514, 3021, 16, 203, 29159, 1525, 1384, 16, 203, 29159, 1525, 1800, 13, 1216, 13367, 288, 203, 3639, 1815, 29455, 480, 446, 31, 203, 3639, 1815, 21522, 480, 446, 31, 203, 3639, 1815, 2653, 480, 446, 31, 203, 3639, 1815, 3021, 480, 446, 31, 203, 203, 3639, 3877, 1339, 273, 7183, 12, 2293, 16, 29455, 16, 21522, 1769, 203, 3639, 2358, 18, 19282, 6560, 18, 78, 3353, 18, 2425, 18, 2271, 18, 1138, 843, 273, 17698, 12, 4923, 16, 3021, 16, 1339, 1769, 203, 3639, 1993, 7800, 6158, 12, 2293, 16, 1339, 18, 24805, 1733, 9334, 843, 1769, 203, 203, 3639, 2358, 18, 19282, 6560, 18, 78, 3353, 18, 2425, 18, 2271, 18, 23583, 563, 273, 843, 18, 338, 7446, 5621, 203, 3639, 514, 4995, 273, 563, 18, 588, 5365, 5621, 203, 3639, 327, 394, 6320, 1138, 5365, 1253, 12, 