# Create a Tensor Dataset to Load Java-C# Code Pairs

In [49]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Make the project's helper modules (imported below) importable from this notebook.
# They are expected in the `py_scripts` folder of the parent directory.
module_path = os.path.abspath(os.path.join('..'))
# Build the path with os.path.join so the separator is correct on every OS
# (a hard-coded "\\" only works on Windows).
scripts_path = os.path.join(module_path, 'py_scripts')
# Guard on the exact entry that gets appended, so re-running this cell does not
# keep adding duplicate entries to sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

import numpy as np
import pandas as pd

from helper import to_device, get_device, get_j_c_data_loaders
from JCDataSet import JCDataSet

from transformers import RobertaTokenizer, RobertaModel

import torch
import torchvision as thv
from torchvision.utils import make_grid
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as data_utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
# GLOBALS
# Mini-batch size shared by every DataLoader built in this notebook.
BATCH_SIZE = 8

# Device object returned by the project helper; later cells pass it to `to_device`.
device = get_device()


### Get Data from dataset and load into respective dataframes

In [41]:
def load_code_pairs(split, data_dir='../../datasets'):
    """Load one split of the parallel Java/C# corpus into a DataFrame.

    The corpus stores one code snippet per line in two files that are aligned
    line by line: ``<split>.java-cs.txt.cs`` and ``<split>.java-cs.txt.java``.

    The files are read as raw text lines rather than through ``pd.read_csv``:
    a CSV parser interprets quote characters, turns NA-like tokens (for
    example ``null``) into NaN and skips blank lines independently per file,
    any of which can corrupt snippets or silently shift the pairing. Recent
    pandas versions also reject ``'\\n'`` as a delimiter.

    Parameters
    ----------
    split : str
        Split name used as the file prefix, e.g. ``'train'``, ``'valid'`` or ``'test'``.
    data_dir : str, optional
        Directory that contains the dataset files.

    Returns
    -------
    pandas.DataFrame
        Columns ``cs_code`` and ``j_code``; row ``i`` holds line ``i`` of each file.

    Raises
    ------
    ValueError
        If the two files do not contain the same number of lines.
    """
    columns = {}
    for column, extension in (('cs_code', 'cs'), ('j_code', 'java')):
        path = os.path.join(data_dir, f'{split}.java-cs.txt.{extension}')
        with open(path, encoding='utf-8') as source_file:
            # Strip only the line terminator; any other whitespace belongs to the snippet.
            columns[column] = [line.rstrip('\n') for line in source_file]

    n_cs, n_java = len(columns['cs_code']), len(columns['j_code'])
    if n_cs != n_java:
        # A mismatch means the pairs are misaligned; fail loudly instead of padding with NaN.
        raise ValueError(
            f"'{split}' split is misaligned: {n_cs} C# lines vs {n_java} Java lines"
        )
    return pd.DataFrame(columns)


train_df = load_code_pairs('train')
valid_df = load_code_pairs('valid')
test_df = load_code_pairs('test')

train_df.head()

Unnamed: 0,cs_code,j_code
0,public virtual ListSpeechSynthesisTasksRespons...,public ListSpeechSynthesisTasksResult listSpee...
1,public virtual UpdateJourneyStateResponse Upda...,public UpdateJourneyStateResult updateJourneyS...
2,public void RemovePresentationFormat(){Mutable...,public void removePresentationFormat() {remove...
3,"public CellRangeAddressList(int firstRow, int ...","public CellRangeAddressList(int firstRow, int ..."
4,public virtual void delete(int key){int i = bi...,public void delete(int key) {int i = binarySea...


In [42]:
# Tokenizer loaded from the "Salesforce/codet5-base-multi-sum" checkpoint.
# NOTE(review): the tokenizer is passed through the project's `to_device` helper;
# what that does for a tokenizer is not visible here - confirm it is needed.
code_tokenizer = to_device(
    RobertaTokenizer.from_pretrained("Salesforce/codet5-base-multi-sum"),
    device,
)


def _token_count(code):
    """Return how many tokens `code_tokenizer` produces for one code snippet."""
    return len(code_tokenizer.tokenize(code))


# Per-snippet token counts for both languages of the training split;
# used below to choose the maximum sequence length.
token_df = pd.DataFrame({
    'cs_train_tokens': train_df['cs_code'].map(_token_count),
    'j_train_tokens': train_df['j_code'].map(_token_count),
})

Unnamed: 0,cs_train_tokens,j_train_tokens
0,"[public, Ġvirtual, ĠList, Spe, ech, Sy, nt, he...","[public, ĠList, Spe, ech, Sy, nt, hesis, T, as..."
1,"[public, Ġvirtual, ĠUpdate, J, ourney, State, ...","[public, ĠUpdate, J, ourney, State, Result, Ġu..."
2,"[public, Ġvoid, ĠRemove, Present, ation, Forma...","[public, Ġvoid, Ġremove, Present, ation, Forma..."
3,"[public, ĠCell, Range, Address, List, (, int, ...","[public, ĠCell, Range, Address, List, (, int, ..."
4,"[public, Ġvirtual, Ġvoid, Ġdelete, (, int, Ġke...","[public, Ġvoid, Ġdelete, (, int, Ġkey, ), Ġ{, ..."
...,...,...
10290,"[public, Ġvirtual, ĠDelete, Ad, m, Channel, Re...","[public, ĠDelete, Ad, m, Channel, Result, Ġdel..."
10291,"[public, ĠSet, Sub, scription, Attributes, Req...","[public, ĠSet, Sub, scription, Attributes, Req..."
10292,"[public, Ġvirtual, Ġvoid, ĠUn, safe, Write, (,...","[public, Ġvoid, Ġunsafe, Write, (, char, Ġb, )..."
10293,"[public, Ġoverride, Ġbool, Ġremove, (, object,...","[@, Override, Ġpublic, Ġboolean, Ġremove, (, O..."


In [43]:
# Quantile of the per-snippet token counts used to size the model input.
SEQ_LEN_QUANTILE = 0.98
# Sequence length handed to the datasets below: the measured quantile, rounded up by hand.
MAX_SEQ_LEN = 200

# Quantile of each token-count column (C# and Java), averaged across the columns.
token_count = token_df.quantile(SEQ_LEN_QUANTILE).mean()
# The message is built from the constants above so it cannot drift from the values in use.
print(f"The {SEQ_LEN_QUANTILE * 100:.0f}th quantile on the dataset:", token_count,
      "Which will be adjusted to", MAX_SEQ_LEN)

The 97th quantile on the dataset: 190.09000000000015 Which will be adjusted to 200


In [46]:
# Wrap each split's DataFrame in the project's JCDataSet, sharing the tokenizer
# and the maximum sequence length chosen above.
training_set, validation_set, testing_set = [
    JCDataSet(split_df, code_tokenizer, MAX_SEQ_LEN)
    for split_df in (train_df, valid_df, test_df)
]

# Options common to all three loaders; only the training loader shuffles.
loader_options = dict(num_workers=0, pin_memory=True)
train_dl = DataLoader(training_set, BATCH_SIZE, shuffle=True, **loader_options)
val_dl = DataLoader(validation_set, BATCH_SIZE, **loader_options)
test_dl = DataLoader(testing_set, BATCH_SIZE, **loader_options)

# NOTE(review): `to_device` is a project helper; presumably it wraps each loader so
# batches end up on `device` - confirm against helper.py.
train_dl, val_dl, test_dl = (
    to_device(loader, device) for loader in (train_dl, val_dl, test_dl)
)