In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from transformers import RobertaTokenizer, T5ForConditionalGeneration, T5Config, T5Model

import torch
import torch.nn as nn

from tqdm import tqdm

from helper import get_j_c_data_loaders, to_device, get_device, plot_data



In [2]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------

# Device handle returned by the project helper; passed to to_device() later.
device = get_device()

# Batching -- change depending on the GPU Colab gives you.
BATCH_SIZE = 128

# Sequence-length limits.
SOURCE_LEN = 200
MAX_SEQ_LEN = 200

# Beam width.
# NOTE(review): the sample() cell uses a beam width of 5, not this constant --
# confirm which value is intended.
BEAM_SIZE = 10

# Optimisation hyper-parameters.
LEARNING_RATE = 5e-4
EPOCHS = 7

# Release cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()

In [3]:

from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
)

# Local directory the model weights are loaded from.
model_name_or_path = 'models/java2doc'

# The tokenizer is taken from the public CodeT5 base checkpoint.
# NOTE(review): presumably the local checkpoint shares this vocabulary -- confirm.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

# Load the seq2seq model and hand it to the project helper together with `device`.
model = to_device(
    T5ForConditionalGeneration.from_pretrained(model_name_or_path),
    device,
)

# Train / validation / test loaders, built with the batch size and tokenizer above.
train_dl, valid_dl, test_dl = get_j_c_data_loaders(BATCH_SIZE, tokenizer)

In [4]:
def sample(clf, dl, num_beams=5):
    """Generate one decoded string per sequence produced for the batches in ``dl``.

    Args:
        clf: Model whose ``generate`` method is called on every batch.
            (Previously this argument was ignored and the global ``model``
            was always used.)
        dl: Iterable of batches. Each batch is indexed with ``'j_ids'``
            (passed as the input ids) and ``'j_mask'`` (passed as the
            attention mask).
        num_beams: Beam width forwarded to ``generate``. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A flat list of strings in iteration order, each obtained by decoding
        one generated sequence with special tokens skipped.

    Notes:
        Relies on the notebook-level ``tokenizer`` and ``MAX_SEQ_LEN``.
    """
    sent_data = []
    # Generation is inference only, so gradient tracking is disabled throughout.
    with torch.no_grad():
        for data in tqdm(dl):
            gen = clf.generate(
                data['j_ids'],
                max_length=MAX_SEQ_LEN,
                attention_mask=data['j_mask'],
                num_beams=num_beams,
            )
            sent_data.extend(
                tokenizer.decode(entry, skip_special_tokens=True) for entry in gen
            )

    return sent_data

In [5]:

# Run generation over each split; results are kept in train / valid / test order.
doc_data = [
    sample(model, train_dl),
    sample(model, valid_dl),
    sample(model, test_dl),
]

100%|██████████| 81/81 [07:19<00:00,  5.43s/it]
100%|██████████| 4/4 [00:22<00:00,  5.65s/it]
100%|██████████| 8/8 [01:00<00:00,  7.54s/it]


In [6]:
dataset_path = "../../datasets/"

# Write each split's generated strings to its own text file, one item per line.
# The file names are paired positionally with the entries of `doc_data`.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform's default encoding.
# NOTE(review): an item that itself contains a newline will span several lines
# in the output file -- confirm whether downstream readers rely on a strict
# one-line-per-item layout.
for name, d_data in zip(('train.doc.txt', 'valid.doc.txt', 'test.doc.txt'), tuple(doc_data)):
    with open(dataset_path + name, 'w', encoding='utf-8') as f:
        for item in d_data:
            f.write("%s\n" % item)